From c4519b85fb061865a60a83ef634f1169f89028d6 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 26 Mar 2026 21:14:23 +0100
Subject: [PATCH 1/8] Upgrade code to StormCrawler 3.5.1
Upgrade dependencies and Maven plugins to latest available versions
Migrate from Elasticsearch to OpenSearch
---
bin/{ES_IndexInit.sh => OS_IndexInit.sh} | 16 ++-
conf/crawler-conf.yaml | 12 +-
conf/crawler.flux | 24 ++--
conf/es-conf.yaml | 2 +-
pom.xml | 107 +++++++-----------
.../stormcrawler/filter/FastURLFilter.java | 12 +-
.../stormcrawler/news/CrawlTopology.java | 35 +++---
.../stormcrawler/news/FeedDetectorBolt.java | 18 +--
.../news/NewsSiteMapParserBolt.java | 29 +++--
.../stormcrawler/news/PreFilterBolt.java | 11 +-
.../news/PunycodeURLNormalizer.java | 4 +-
.../news/bootstrap/BootstrapTopology.java | 28 ++---
.../news/bootstrap/FeedLinkParseFilter.java | 10 +-
.../bootstrap/NewsSiteMapDetectorBolt.java | 16 +--
.../resources/bootstrap-parsefilters.json | 6 +-
src/main/resources/bootstrap-urlfilters.json | 12 +-
src/main/resources/inject-urlfilters.json | 14 +--
src/main/resources/parsefilters.json | 4 +-
src/main/resources/pre-urlfilters.json | 2 +-
src/main/resources/urlfilters.json | 14 +--
.../stormcrawler/FastURLFilterTest.java | 4 +-
.../news/NewsSiteMapParserTest.java | 6 +-
22 files changed, 188 insertions(+), 198 deletions(-)
rename bin/{ES_IndexInit.sh => OS_IndexInit.sh} (76%)
diff --git a/bin/ES_IndexInit.sh b/bin/OS_IndexInit.sh
similarity index 76%
rename from bin/ES_IndexInit.sh
rename to bin/OS_IndexInit.sh
index 394faf7..60e72ec 100755
--- a/bin/ES_IndexInit.sh
+++ b/bin/OS_IndexInit.sh
@@ -1,5 +1,17 @@
-# modified version of
-# https://github.com/DigitalPebble/storm-crawler/blob/master/external/elasticsearch/ES_IndexInit.sh
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
ESHOST="http://localhost:9200"
#ESCREDENTIALS="-u elastic:passwordhere"
diff --git a/conf/crawler-conf.yaml b/conf/crawler-conf.yaml
index 00032e1..ef0f478 100644
--- a/conf/crawler-conf.yaml
+++ b/conf/crawler-conf.yaml
@@ -19,8 +19,8 @@ config:
# mandatory when using Flux
topology.kryo.register:
- - com.digitalpebble.stormcrawler.Metadata
- - com.digitalpebble.stormcrawler.persistence.Status
+ - org.apache.stormcrawler.Metadata
+ - org.apache.stormcrawler.persistence.Status
topology.backpressure.enable: false
@@ -31,7 +31,7 @@ config:
topology.metrics.consumer.register:
- class: "org.apache.storm.metric.LoggingMetricsConsumer"
parallelism.hint: 1
- - class: "com.digitalpebble.stormcrawler.elasticsearch.metrics.MetricsConsumer"
+ - class: "org.apache.stormcrawler.elasticsearch.metrics.MetricsConsumer"
parallelism.hint: 1
# status index and fetcher queues are partitioned by domain
@@ -71,8 +71,8 @@ config:
http.timeout: 30000
# use okhttp
- http.protocol.implementation: com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol
- https.protocol.implementation: com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol
+ http.protocol.implementation: org.apache.stormcrawler.protocol.okhttp.HttpProtocol
+ https.protocol.implementation: org.apache.stormcrawler.protocol.okhttp.HttpProtocol
# do not fail on unknown SSL certificates
http.trust.everything: true
@@ -103,7 +103,7 @@ config:
fetcher.max.urls.in.queues: 6000
# fetch Scheduler implementation
- scheduler.class: "com.digitalpebble.stormcrawler.persistence.AdaptiveScheduler"
+ scheduler.class: "org.apache.stormcrawler.persistence.AdaptiveScheduler"
# AdaptiveScheduler properties
scheduler.adaptive.setLastModified: true
# frequently changing feeds or news sitemaps are refetched after 90 min.
diff --git a/conf/crawler.flux b/conf/crawler.flux
index 5367184..73af7c7 100644
--- a/conf/crawler.flux
+++ b/conf/crawler.flux
@@ -21,7 +21,7 @@ config:
components:
- id: "WARCFileNameFormat"
- className: "com.digitalpebble.stormcrawler.warc.WARCFileNameFormat"
+ className: "org.apache.stormcrawler.warc.WARCFileNameFormat"
configMethods:
- name: "withPath"
args:
@@ -30,7 +30,7 @@ components:
args:
- "CC-NEWS"
- id: "WARCFileRotationPolicy"
- className: "com.digitalpebble.stormcrawler.warc.FileTimeSizeRotationPolicy"
+ className: "org.apache.stormcrawler.warc.FileTimeSizeRotationPolicy"
constructorArgs:
- 1024
- MB
@@ -77,10 +77,10 @@ components:
spouts:
- id: "spout"
- className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout"
+ className: "org.apache.stormcrawler.elasticsearch.persistence.AggregationSpout"
parallelism: 16
- id: "filespout"
- className: "com.digitalpebble.stormcrawler.spout.FileSpout"
+ className: "org.apache.stormcrawler.spout.FileSpout"
parallelism: 1
constructorArgs:
- "/path/to/seeds/"
@@ -89,7 +89,7 @@ spouts:
bolts:
- id: "filter"
- className: "com.digitalpebble.stormcrawler.bolt.URLFilterBolt"
+ className: "org.apache.stormcrawler.bolt.URLFilterBolt"
parallelism: 1
- id: "prefilter"
className: "org.commoncrawl.stormcrawler.news.PreFilterBolt"
@@ -97,22 +97,22 @@ bolts:
constructorArgs:
- "pre-urlfilters.json"
- id: "partitioner"
- className: "com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt"
+ className: "org.apache.stormcrawler.bolt.URLPartitionerBolt"
parallelism: 1
- id: "fetcher"
- className: "com.digitalpebble.stormcrawler.bolt.FetcherBolt"
+ className: "org.apache.stormcrawler.bolt.FetcherBolt"
parallelism: 1
- id: "sitemap"
className: "org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt"
parallelism: 1
- id: "feed"
- className: "com.digitalpebble.stormcrawler.bolt.FeedParserBolt"
+ className: "org.apache.stormcrawler.bolt.FeedParserBolt"
parallelism: 1
- id: "ssbolt"
- className: "com.digitalpebble.stormcrawler.indexing.DummyIndexer"
+ className: "org.apache.stormcrawler.indexing.DummyIndexer"
parallelism: 1
- id: "warc"
- className: "com.digitalpebble.stormcrawler.warc.WARCHdfsBolt"
+ className: "org.apache.stormcrawler.warc.WARCHdfsBolt"
parallelism: 1
configMethods:
- name: "withFileNameFormat"
@@ -129,7 +129,7 @@ bolts:
args:
- "warc"
- id: "status"
- className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt"
+ className: "org.apache.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt"
parallelism: 1
streams:
@@ -219,7 +219,7 @@ streams:
streamId: "status"
type: CUSTOM
customClass:
- className: "com.digitalpebble.stormcrawler.util.URLStreamGrouping"
+ className: "org.apache.stormcrawler.util.URLStreamGrouping"
constructorArgs:
- "byDomain"
diff --git a/conf/es-conf.yaml b/conf/es-conf.yaml
index 9b02a56..7ea8b8b 100644
--- a/conf/es-conf.yaml
+++ b/conf/es-conf.yaml
@@ -74,7 +74,7 @@ config:
es.status.recentDate.min.gap: -1
topology.metrics.consumer.register:
- - class: "com.digitalpebble.stormcrawler.elasticsearch.metrics.MetricsConsumer"
+ - class: "org.apache.stormcrawler.elasticsearch.metrics.MetricsConsumer"
parallelism.hint: 1
#whitelist:
# - "fetcher_counter"
diff --git a/pom.xml b/pom.xml
index db70413..c470e83 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,10 +1,32 @@
-
+
+
+
+4.0.0org.commoncrawl.stormcrawler.newscrawler
- 2.10.0
+ 3.5.1jar
@@ -17,14 +39,14 @@
UTF-8
- 2.10
- 2.5.0
- 1.12.467
+ 3.5.1
+ 2.8.4
+ 1.12.7972.11.1
- 1.1
- 5.5.0
- 2.26.3
- 4.13
+ 1.6
+ 5.23.0
+ 3.0.1
+ 4.13.2
@@ -32,16 +54,16 @@
org.apache.maven.pluginsmaven-compiler-plugin
- 3.11.0
+ 3.15.0
- 11
- 11
+ 17
+ 17org.codehaus.mojoexec-maven-plugin
- 3.1.0
+ 3.6.3
@@ -59,7 +81,7 @@
org.apache.maven.pluginsmaven-shade-plugin
- 3.5.0
+ 3.6.2package
@@ -105,29 +127,13 @@
-
-
- org.owasp
- dependency-check-maven
- 6.1.0
-
- true
-
-
-
-
- aggregate
-
-
-
-
- com.digitalpebble.stormcrawler
- storm-crawler-core
+ org.apache.stormcrawler
+ stormcrawler-core${stormcrawler.version}
@@ -144,30 +150,15 @@
- com.digitalpebble.stormcrawler
- storm-crawler-elasticsearch
+ org.apache.stormcrawler
+ stormcrawler-opensearch${stormcrawler.version}
- com.digitalpebble.stormcrawler
- storm-crawler-warc
+ org.apache.stormcrawler
+ stormcrawler-warc${stormcrawler.version}
-
-
-
- jdk.tools
- jdk.tools
-
-
-
-
-
-
- com.fasterxml.jackson.core
- jackson-databind
- ${jackson-databind.version}
-
com.github.crawler-commonscrawler-commons
@@ -182,8 +173,8 @@
- com.digitalpebble.stormcrawler
- storm-crawler-core
+ org.apache.stormcrawler
+ stormcrawler-core${stormcrawler.version}test-jartest
@@ -210,14 +201,4 @@
test
-
-
-
-
- commons-io
- commons-io
- 2.11.0
-
-
-
diff --git a/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java b/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java
index 6478ec1..7c52716 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java
@@ -30,7 +30,11 @@
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.stormcrawler.JSONResource;
+import org.apache.stormcrawler.Metadata;
+import org.apache.stormcrawler.filtering.URLFilter;
+import org.apache.stormcrawler.util.ConfUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -39,10 +43,6 @@
import com.amazonaws.services.s3.model.GetObjectRequest;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.S3Object;
-import com.digitalpebble.stormcrawler.JSONResource;
-import com.digitalpebble.stormcrawler.Metadata;
-import com.digitalpebble.stormcrawler.filtering.URLFilter;
-import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonMappingException;
import com.fasterxml.jackson.databind.JsonNode;
@@ -419,4 +419,4 @@ public boolean match(URL url) {
return pattern.matcher(haystack).find();
}
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java
index ce21c7a..54c69eb 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java
@@ -24,26 +24,25 @@
import org.apache.storm.topology.BoltDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
+import org.apache.stormcrawler.ConfigurableTopology;
+import org.apache.stormcrawler.Constants;
+import org.apache.stormcrawler.bolt.FeedParserBolt;
+import org.apache.stormcrawler.bolt.FetcherBolt;
+import org.apache.stormcrawler.bolt.URLFilterBolt;
+import org.apache.stormcrawler.bolt.URLPartitionerBolt;
+import org.apache.stormcrawler.indexing.DummyIndexer;
+import org.apache.stormcrawler.opensearch.persistence.AggregationSpout;
+import org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt;
+import org.apache.stormcrawler.protocol.AbstractHttpProtocol;
+import org.apache.stormcrawler.spout.FileSpout;
+import org.apache.stormcrawler.util.ConfUtils;
+import org.apache.stormcrawler.util.URLStreamGrouping;
+import org.apache.stormcrawler.warc.FileTimeSizeRotationPolicy;
+import org.apache.stormcrawler.warc.FileTimeSizeRotationPolicy.Units;
+import org.apache.stormcrawler.warc.WARCFileNameFormat;
+import org.apache.stormcrawler.warc.WARCHdfsBolt;
import org.slf4j.LoggerFactory;
-import com.digitalpebble.stormcrawler.ConfigurableTopology;
-import com.digitalpebble.stormcrawler.Constants;
-import com.digitalpebble.stormcrawler.bolt.FeedParserBolt;
-import com.digitalpebble.stormcrawler.bolt.FetcherBolt;
-import com.digitalpebble.stormcrawler.bolt.URLFilterBolt;
-import com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt;
-import com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout;
-import com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt;
-import com.digitalpebble.stormcrawler.indexing.DummyIndexer;
-import com.digitalpebble.stormcrawler.protocol.AbstractHttpProtocol;
-import com.digitalpebble.stormcrawler.spout.FileSpout;
-import com.digitalpebble.stormcrawler.util.ConfUtils;
-import com.digitalpebble.stormcrawler.util.URLStreamGrouping;
-import com.digitalpebble.stormcrawler.warc.FileTimeSizeRotationPolicy;
-import com.digitalpebble.stormcrawler.warc.FileTimeSizeRotationPolicy.Units;
-import com.digitalpebble.stormcrawler.warc.WARCFileNameFormat;
-import com.digitalpebble.stormcrawler.warc.WARCHdfsBolt;
-
/**
* Dummy topology to play with the spouts and bolts on ElasticSearch
*/
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java
index bdf5f58..c365525 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java
@@ -21,15 +21,15 @@
import org.apache.storm.tuple.Values;
import org.slf4j.LoggerFactory;
-import com.digitalpebble.stormcrawler.Constants;
-import com.digitalpebble.stormcrawler.Metadata;
-import com.digitalpebble.stormcrawler.bolt.FeedParserBolt;
-import com.digitalpebble.stormcrawler.parse.ParseData;
-import com.digitalpebble.stormcrawler.parse.ParseFilter;
-import com.digitalpebble.stormcrawler.parse.ParseFilters;
-import com.digitalpebble.stormcrawler.parse.ParseResult;
-import com.digitalpebble.stormcrawler.persistence.Status;
-import com.digitalpebble.stormcrawler.protocol.HttpHeaders;
+import org.apache.stormcrawler.Constants;
+import org.apache.stormcrawler.Metadata;
+import org.apache.stormcrawler.bolt.FeedParserBolt;
+import org.apache.stormcrawler.parse.ParseData;
+import org.apache.stormcrawler.parse.ParseFilter;
+import org.apache.stormcrawler.parse.ParseFilters;
+import org.apache.stormcrawler.parse.ParseResult;
+import org.apache.stormcrawler.persistence.Status;
+import org.apache.http.HttpHeaders;
/** Detect RSS and Atom feeds, but do not parse and extract links */
@SuppressWarnings("serial")
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java
index 42f4de3..3c4cf55 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java
@@ -13,7 +13,7 @@
*/
package org.commoncrawl.stormcrawler.news;
-import static com.digitalpebble.stormcrawler.Constants.StatusStreamName;
+import static org.apache.stormcrawler.Constants.StatusStreamName;
import java.io.IOException;
import java.net.URL;
@@ -25,28 +25,27 @@
import java.util.List;
import java.util.Map;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.http.HttpHeaders;
import org.apache.storm.metric.api.MeanReducer;
import org.apache.storm.metric.api.ReducedMetric;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
+import org.apache.stormcrawler.Constants;
+import org.apache.stormcrawler.Metadata;
+import org.apache.stormcrawler.bolt.SiteMapParserBolt;
+import org.apache.stormcrawler.parse.Outlink;
+import org.apache.stormcrawler.parse.ParseData;
+import org.apache.stormcrawler.parse.ParseFilter;
+import org.apache.stormcrawler.parse.ParseFilters;
+import org.apache.stormcrawler.parse.ParseResult;
+import org.apache.stormcrawler.persistence.DefaultScheduler;
+import org.apache.stormcrawler.persistence.Status;
+import org.apache.stormcrawler.util.ConfUtils;
import org.slf4j.LoggerFactory;
-import com.digitalpebble.stormcrawler.Constants;
-import com.digitalpebble.stormcrawler.Metadata;
-import com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt;
-import com.digitalpebble.stormcrawler.parse.Outlink;
-import com.digitalpebble.stormcrawler.parse.ParseData;
-import com.digitalpebble.stormcrawler.parse.ParseFilter;
-import com.digitalpebble.stormcrawler.parse.ParseFilters;
-import com.digitalpebble.stormcrawler.parse.ParseResult;
-import com.digitalpebble.stormcrawler.persistence.DefaultScheduler;
-import com.digitalpebble.stormcrawler.persistence.Status;
-import com.digitalpebble.stormcrawler.protocol.HttpHeaders;
-import com.digitalpebble.stormcrawler.util.ConfUtils;
-
import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.Namespace;
import crawlercommons.sitemaps.SiteMap;
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java
index 5a880da..b986506 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java
@@ -4,7 +4,7 @@
import java.lang.invoke.MethodHandles;
import java.util.Map;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
@@ -12,13 +12,12 @@
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
+import org.apache.stormcrawler.Metadata;
+import org.apache.stormcrawler.filtering.URLFilters;
+import org.apache.stormcrawler.persistence.Status;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.digitalpebble.stormcrawler.Metadata;
-import com.digitalpebble.stormcrawler.filtering.URLFilters;
-import com.digitalpebble.stormcrawler.persistence.Status;
-
/**
* Variant of the URLFilterBolt to go upstream of the fetching to catch anything
* before it goes further into the topology. If filtered, a URL gets an ERROR
@@ -34,7 +33,7 @@ public class PreFilterBolt extends BaseRichBolt {
private final String filterConfigFile;
- private static final String _s = com.digitalpebble.stormcrawler.Constants.StatusStreamName;
+ private static final String _s = org.apache.stormcrawler.Constants.StatusStreamName;
public PreFilterBolt(String filterConfigFile) {
this.filterConfigFile = filterConfigFile;
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java b/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java
index a28e2e9..4adf03d 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java
@@ -18,8 +18,8 @@
import java.net.URL;
import java.util.Map;
-import com.digitalpebble.stormcrawler.Metadata;
-import com.digitalpebble.stormcrawler.filtering.URLFilter;
+import org.apache.stormcrawler.Metadata;
+import org.apache.stormcrawler.filtering.URLFilter;
import com.fasterxml.jackson.databind.JsonNode;
public class PunycodeURLNormalizer extends URLFilter {
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java
index 51aa116..821551e 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java
@@ -23,19 +23,19 @@
import org.commoncrawl.stormcrawler.news.FeedDetectorBolt;
import org.slf4j.LoggerFactory;
-import com.digitalpebble.stormcrawler.ConfigurableTopology;
-import com.digitalpebble.stormcrawler.Constants;
-import com.digitalpebble.stormcrawler.bolt.FetcherBolt;
-import com.digitalpebble.stormcrawler.bolt.JSoupParserBolt;
-import com.digitalpebble.stormcrawler.bolt.URLFilterBolt;
-import com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt;
-import com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout;
-import com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt;
-import com.digitalpebble.stormcrawler.indexing.DummyIndexer;
-import com.digitalpebble.stormcrawler.spout.FileSpout;
-import com.digitalpebble.stormcrawler.util.ConfUtils;
-import com.digitalpebble.stormcrawler.util.URLStreamGrouping;
-import com.digitalpebble.stormcrawler.warc.WARCHdfsBolt;
+import org.apache.stormcrawler.ConfigurableTopology;
+import org.apache.stormcrawler.Constants;
+import org.apache.stormcrawler.bolt.FetcherBolt;
+import org.apache.stormcrawler.bolt.JSoupParserBolt;
+import org.apache.stormcrawler.bolt.URLFilterBolt;
+import org.apache.stormcrawler.bolt.URLPartitionerBolt;
+import org.apache.stormcrawler.opensearch.persistence.AggregationSpout;
+import org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt;
+import org.apache.stormcrawler.indexing.DummyIndexer;
+import org.apache.stormcrawler.spout.FileSpout;
+import org.apache.stormcrawler.util.ConfUtils;
+import org.apache.stormcrawler.util.URLStreamGrouping;
+import org.apache.stormcrawler.warc.WARCHdfsBolt;
/**
* Dummy topology to play with the spouts and bolts on ElasticSearch
@@ -114,4 +114,4 @@ protected int run(String[] args) {
return submit(conf, builder);
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java
index bc36e64..5707189 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java
@@ -18,10 +18,10 @@
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
-import com.digitalpebble.stormcrawler.bolt.FeedParserBolt;
-import com.digitalpebble.stormcrawler.parse.Outlink;
-import com.digitalpebble.stormcrawler.parse.ParseResult;
-import com.digitalpebble.stormcrawler.parse.filter.LinkParseFilter;
+import org.apache.stormcrawler.bolt.FeedParserBolt;
+import org.apache.stormcrawler.parse.Outlink;
+import org.apache.stormcrawler.parse.ParseResult;
+import org.apache.stormcrawler.parse.filter.LinkParseFilter;
/**
* ParseFilter which extracts exclusively RSS links via Xpath, all other links
@@ -67,4 +67,4 @@ public static void logLinks(ParseResult parse, String URL, String message) {
}
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/NewsSiteMapDetectorBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/NewsSiteMapDetectorBolt.java
index 2b773be..1160201 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/NewsSiteMapDetectorBolt.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/NewsSiteMapDetectorBolt.java
@@ -23,14 +23,14 @@
import org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt;
import org.slf4j.LoggerFactory;
-import com.digitalpebble.stormcrawler.Constants;
-import com.digitalpebble.stormcrawler.Metadata;
-import com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt;
-import com.digitalpebble.stormcrawler.parse.ParseData;
-import com.digitalpebble.stormcrawler.parse.ParseFilter;
-import com.digitalpebble.stormcrawler.parse.ParseFilters;
-import com.digitalpebble.stormcrawler.parse.ParseResult;
-import com.digitalpebble.stormcrawler.persistence.Status;
+import org.apache.stormcrawler.Constants;
+import org.apache.stormcrawler.Metadata;
+import org.apache.stormcrawler.bolt.SiteMapParserBolt;
+import org.apache.stormcrawler.parse.ParseData;
+import org.apache.stormcrawler.parse.ParseFilter;
+import org.apache.stormcrawler.parse.ParseFilters;
+import org.apache.stormcrawler.parse.ParseResult;
+import org.apache.stormcrawler.persistence.Status;
/**
* Detector for
Date: Thu, 26 Mar 2026 21:34:02 +0100
Subject: [PATCH 2/8] Sync GitHub CI build workflow with upstream StormCrawler
Prepare for renaming of development branch to "main".
---
.github/workflows/maven.yml | 67 ++++++++++++++++++++++++++++++-------
1 file changed, 54 insertions(+), 13 deletions(-)
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
index 10acad1..dc52ef3 100644
--- a/.github/workflows/maven.yml
+++ b/.github/workflows/maven.yml
@@ -1,26 +1,67 @@
-# This workflow will build a Java project with Maven, and cache/restore any dependencies to improve the workflow execution time
-# For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
name: Java CI with Maven
on:
+ # Run CI on pushes to "main" or on pull requests targeting "main".
push:
- branches: [ master ]
+ branches:
+ - main
pull_request:
- branches: [ master ]
+ branches:
+ - main
jobs:
- build:
-
+ rat:
runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ - uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
+ with:
+ path: ~/.m2/repository
+ key: rat-maven-${{ hashFiles('**/pom.xml') }}
+ - name: Set up JDK 17
+ uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0
+ with:
+ distribution: adopt
+ java-version: 17
+ - name: Build with Maven
+ run: mvn -B --no-transfer-progress -Prat -DskipTests verify -Dskip.format.code=false
+ build:
+ needs: rat
+ runs-on: ${{ matrix.os }}
+ continue-on-error: ${{ matrix.experimental }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ java: [ 17 ]
+ experimental: [false]
steps:
- - uses: actions/checkout@v2
- - name: Set up JDK 8
- uses: actions/setup-java@v2
+ - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+ - uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
+ with:
+ path: ~/.m2/repository
+ key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
+ restore-keys: |
+ ${{ runner.os }}-maven-
+ - name: Set up JDK ${{ matrix.java }}
+ uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0
with:
- java-version: '8'
- distribution: 'adopt'
- cache: maven
+ distribution: adopt
+ java-version: ${{ matrix.java }}
- name: Build with Maven
- run: mvn -B package --file pom.xml
+ run: mvn -B --no-transfer-progress package --file pom.xml -DCI_ENV=true verify
From 6bd1e31c8cc1139823611949857e6c60701f1b8c Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 31 Mar 2026 16:44:17 +0200
Subject: [PATCH 3/8] Replace Dockerfile by docker compose config
Make the configuration work.
---
Dockerfile | 134 +++--------------------
README.md | 102 +++++++++--------
bin/OS_IndexInit.sh | 123 +++------------------
bin/dashboards/OS_ImportDashboards.sh | 29 +++++
bin/dashboards/metrics.ndjson | 10 ++
bin/dashboards/status.ndjson | 5 +
bin/dashboards/storm.ndjson | 5 +
bin/run-crawler.sh | 43 --------
bin/{es_status => status} | 0
conf/crawler-conf.yaml | 23 +++-
conf/crawler.flux | 10 +-
conf/es-conf.yaml | 83 --------------
conf/opensearch-conf.yaml | 126 +++++++++++++++++++++
docker-compose.yaml | 122 +++++++++++++++++++++
etc/supervisor/conf.d/elasticsearch.conf | 8 --
etc/supervisor/conf.d/kibana.conf | 7 --
etc/sysctl.d/60-elasticsearch.conf | 7 --
seeds/feeds.txt | 29 +----
src/main/resources/indexer.mapping | 40 +++++++
src/main/resources/metrics.mapping | 40 +++++++
src/main/resources/status.mapping | 39 +++++++
21 files changed, 535 insertions(+), 450 deletions(-)
create mode 100755 bin/dashboards/OS_ImportDashboards.sh
create mode 100644 bin/dashboards/metrics.ndjson
create mode 100644 bin/dashboards/status.ndjson
create mode 100644 bin/dashboards/storm.ndjson
delete mode 100755 bin/run-crawler.sh
rename bin/{es_status => status} (100%)
delete mode 100644 conf/es-conf.yaml
create mode 100644 conf/opensearch-conf.yaml
create mode 100644 docker-compose.yaml
delete mode 100644 etc/supervisor/conf.d/elasticsearch.conf
delete mode 100644 etc/supervisor/conf.d/kibana.conf
delete mode 100644 etc/sysctl.d/60-elasticsearch.conf
create mode 100644 src/main/resources/indexer.mapping
create mode 100644 src/main/resources/metrics.mapping
create mode 100644 src/main/resources/status.mapping
diff --git a/Dockerfile b/Dockerfile
index eaad382..683df8f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,133 +1,29 @@
-FROM ubuntu:22.04
+FROM storm:2.8.4
RUN apt-get update -qq && \
- apt-get upgrade -yq && \
-# apt-mark hold openjdk-11-jre-headless && \
apt-get install -yq --no-install-recommends \
- apt-transport-https \
- apt-utils \
- ca-certificates \
curl \
- git-core \
- gnupg \
jq \
less \
- maven \
-# openjdk-8-jdk-headless \
- sudo \
- supervisor \
- wget \
- tar \
- vim
-# zookeeperd
+ vim
#
-# Elasticsearch and Kibana
+# news-crawler
#
-ENV ES_VERSION=7.10.2
-RUN wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch \
- | apt-key add -
-RUN echo "deb https://artifacts.elastic.co/packages/7.x/apt stable main" \
- >> /etc/apt/sources.list.d/elasticsearch-7.x.list
-RUN apt-get update -qq && \
- apt-get install -yq --no-install-recommends \
- elasticsearch=$ES_VERSION \
- kibana=$ES_VERSION
-RUN ln -s /usr/share/elasticsearch/bin/elasticsearch /usr/bin/elasticsearch
-RUN ln -s /usr/share/kibana/bin/kibana /usr/bin/kibana
-USER root
-# system configuration, see https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html
-ADD etc/sysctl.d/60-elasticsearch.conf /etc/sysctl.d/60-elasticsearch.conf
-ADD etc/supervisor/conf.d/elasticsearch.conf /etc/supervisor/conf.d/elasticsearch.conf
-ADD etc/supervisor/conf.d/kibana.conf /etc/supervisor/conf.d/kibana.conf
-RUN chmod -R 644 /etc/sysctl.d/60-elasticsearch.conf /etc/supervisor/conf.d/*.conf
-ENV ES_HEAP_SIZE=20g
-# set Elasticsearch data path
-RUN sed -Ei 's@^path\.data: .*@path.data: /data/elasticsearch@' /etc/elasticsearch/elasticsearch.yml
-# TODO: enable updates via scripting
-
-
-# Zookeeper
-
-ENV ZOOKEEPER_VERSION=3.8.3
-RUN wget -q -O - https://downloads.apache.org/zookeeper/zookeeper-$ZOOKEEPER_VERSION/apache-zookeeper-$ZOOKEEPER_VERSION-bin.tar.gz \
- | sudo tar -xzf - -C /opt
-ENV ZOOKEEPER_HOME=/opt/apache-zookeeper-$ZOOKEEPER_VERSION-bin
-RUN ln -s $ZOOKEEPER_HOME/conf/zoo_sample.cfg $ZOOKEEPER_HOME/conf/zoo.cfg
-# prevent ZK's admin UI to run on 8080
-RUN echo "admin.enableServer=false" >> $ZOOKEEPER_HOME/conf/zoo.cfg
-RUN ln -s $ZOOKEEPER_HOME /usr/share/zookeeper
-
-#
-# Apache Storm
-#
-ENV STORM_VERSION=2.5.0
-COPY downloads/apache-storm-$STORM_VERSION.tar.gz /tmp/apache-storm-$STORM_VERSION.tar.gz
-RUN tar -xzf /tmp/apache-storm-$STORM_VERSION.tar.gz -C /opt
-RUN rm /tmp/apache-storm-$STORM_VERSION.tar.gz
-ENV STORM_HOME /opt/apache-storm-$STORM_VERSION
-RUN groupadd storm && \
- useradd --gid storm --home-dir /home/storm \
- --create-home --shell /bin/bash storm && \
- chown -R storm:storm $STORM_HOME && \
- mkdir /var/log/storm && \
- chown -R storm:storm /var/log/storm
-RUN ln -s /var/log/storm $STORM_HOME/logs
-RUN ln -s $STORM_HOME/bin/storm /usr/bin/storm
-
-ADD etc/supervisor/conf.d/storm-*.conf /etc/supervisor/conf.d/
-ADD etc/supervisor/conf.d/zookeeper.conf /etc/supervisor/conf.d/
-RUN chmod -R 644 /etc/supervisor/conf.d/*.conf
+ENV CRAWLER_VERSION=3.5.1
-
-#
-# Storm crawler / news crawler
-#
-ENV CRAWLER_VERSION=2.10.0
-RUN groupadd ubuntu && \
- useradd --gid ubuntu --home-dir /home/ubuntu \
- --create-home --shell /bin/bash ubuntu && \
- chown -R ubuntu:ubuntu /home/ubuntu
-USER ubuntu
-WORKDIR /home/ubuntu
-RUN mkdir news-crawler/ && \
- mkdir news-crawler/conf/ && \
- mkdir news-crawler/lib/ && \
- mkdir news-crawler/bin/ && \
- mkdir news-crawler/seeds/ && \
- chmod -R a+rx news-crawler/
+RUN mkdir /news-crawler/ && \
+ mkdir /news-crawler/conf/ && \
+ mkdir /news-crawler/lib/ && \
+ mkdir /news-crawler/bin/ && \
+ chmod -R a+rx /news-crawler/
# add the news crawler uber-jar
-ADD target/crawler-$CRAWLER_VERSION.jar news-crawler/lib/crawler.jar
+ADD target/crawler-$CRAWLER_VERSION.jar /news-crawler/lib/crawler.jar
# and configuration files
-ADD conf/*.* news-crawler/conf/
-ADD seeds/*.txt news-crawler/seeds/
-ADD bin/*.sh news-crawler/bin/
-ADD bin/es_status news-crawler/bin/
-
-USER root
-RUN chown -R ubuntu:ubuntu /home/ubuntu && \
- chmod -R a+r /home/ubuntu && \
- chmod u+x news-crawler/bin/*
-
-
-# Ports:
-# 8080 - Storm UI
-# 9200 - Elasticsearch http
-# 9300 - Elasticsearch java
-# 5601 - Kibana
-EXPOSE 8080 9200 9300 5601
-
-# volumes for persistent data
-USER root
-RUN mkdir /data
-RUN mkdir /data/elasticsearch && chown elasticsearch:elasticsearch /data/elasticsearch
-VOLUME ["/data/elasticsearch"]
-RUN mkdir /data/warc && chown storm:storm /data/warc
-VOLUME ["/data/warc"]
-
-# start all services
-CMD ["/usr/bin/supervisord"]
+ADD conf/*.* /news-crawler/conf/
+ADD bin/*.sh /news-crawler/bin/
+ADD bin/status /news-crawler/bin/
-# launch the crawl
-# CMD ["/home/ubuntu/news-crawler/bin/run-crawler.sh"]
+USER storm
+WORKDIR /news-crawler/
diff --git a/README.md b/README.md
index 58f0d4d..ad990bf 100644
--- a/README.md
+++ b/README.md
@@ -1,30 +1,29 @@
-# NEWS-CRAWL
+# News Crawler
-Crawler for news based on [StormCrawler](https://stormcrawler.net/). Produces WARC files to be stored as part of the [Common Crawl](https://commoncrawl.org/). The data is hosted as [AWS Open Data Set](https://registry.opendata.aws/) – if you want to use the data and not the crawler software please read [the announcement of the news dataset](https://commoncrawl.org/2016/10/news-dataset-available/).
+Crawler for news based on [StormCrawler](https://stormcrawler.apache.org/). Produces WARC files to be stored as part of the [Common Crawl](https://commoncrawl.org/). The data is hosted as [AWS Open Data Set](https://registry.opendata.aws/) – if you want to use the data and not the crawler software please read [the announcement of the news dataset](https://commoncrawl.org/2016/10/news-dataset-available/).
-Prerequisites
--------------
+## Prerequisites
-* Install Elasticsearch 7.10.2 (ev. also Kibana)
-* Install Apache Storm 2.5.0
-* Start Elasticsearch and Storm
-* Build ES indices by running `bin/ES_IndexInit.sh`
+* Install OpenSearch 2.19.4
+* Install Apache Storm 2.8.4
+* Start OpenSearch and Storm
+* Create the OpenSearch indices by running [bin/OS_IndexInit.sh](bin/OS_IndexInit.sh) and the dashboards by running [bin/dashboards/OS_ImportDashboards.sh](bin/dashboards/OS_ImportDashboards.sh)
-Crawler Seeds
--------------
+Alternatively, use the Docker Compose setup, see below.
-The crawler relies on [RSS](https://en.wikipedia.org/wiki/RSS)/[Atom](https://en.wikipedia.org/wiki/Atom_(Web_standard)) feeds and [news sitemaps](https://en.wikipedia.org/wiki/Sitemaps#Google_News_Sitemaps) to find links to news articles on news sites. A small collection of example seeds (feeds and sitemaps) is provided in [./seeds/](./seeds/). Adding support for news sites which do not provide a news feed or sitemap is an open issue, see [#41](//github.com/commoncrawl/news-crawl/issues/41).
+## Crawler Seeds
-Configuration
--------------
+The crawler relies on [RSS](https://en.wikipedia.org/wiki/RSS)/[Atom](https://en.wikipedia.org/wiki/Atom_(Web_standard)) feeds and [news sitemaps](https://en.wikipedia.org/wiki/Sitemaps#Google_News_Sitemaps) to find links to news articles on news sites. A small collection of example seeds (feeds and sitemaps) is provided in [./seeds/](./seeds/). Adding support for news sites which do not provide a news feed or sitemap is an open issue, see [#41](https://github.com/commoncrawl/news-crawl/issues/41).
+
+
+## Configuration
The default configuration should work out-of-the-box. The only thing to do is to configure the user agent properties send in the HTTP request header. Open the file `conf/crawler-conf.yaml` in an editor and fill in the values for `http.agent.name` and all further properties starting with the `http.agent.` prefix.
-Run the crawl
--------------
+## Run the crawl
Generate an uberjar:
``` sh
@@ -33,23 +32,23 @@ mvn clean package
And run ...
``` sh
-storm local target/crawler-2.10.0.jar --local-ttl 60 -- org.commoncrawl.stormcrawler.news.CrawlTopology -conf $PWD/conf/es-conf.yaml -conf $PWD/conf/crawler-conf.yaml $PWD/seeds/ feeds.txt
+storm local target/crawler-3.5.1.jar --local-ttl 60 -- org.commoncrawl.stormcrawler.news.CrawlTopology -conf $PWD/conf/opensearch-conf.yaml -conf $PWD/conf/crawler-conf.yaml $PWD/seeds/ feeds.txt
```
This will launch the crawl topology in local mode for 60 seconds. It will also "inject" all URLs found in the file `./seeds/feeds.txt` in the status index. The URLs point to news feeds and sitemaps from which links to news articles are extracted and fetched. The topology will create WARC files in the directory specified in the configuration under the key `warc.dir`. This directory must be created beforehand.
Of course, it's also possible to add (or remove) the seeds (feeds and sitemaps) using the Elasticsearch API. In this case, the can topology can be run without the last two arguments.
-Alternatively, the topology can be run from the [crawler.flux](./conf/crawler.flux), please see the [Storm Flux documentation](https://storm.apache.org/releases/2.5.0/flux.html). Make sure to adapt the Flux definition to your needs!
+Alternatively, the topology can be run from the [crawler.flux](./conf/crawler.flux), please see the [Storm Flux documentation](https://storm.apache.org/releases/2.8.4/flux.html). Make sure to adapt the Flux definition to your needs!
In production, you should use `storm jar ...` to run the topology in distributed mode and continuously (no time limit) including the Storm UI and logging.
-Monitor the crawl
------------------
-When the topology is running you can check that URLs have been injected and news are getting fetched on [http://localhost:9200/status/_search?pretty]. Or use StormCrawler's Kibana dashboards to monitor the crawling process. Please follow the instructions to install the templates for Kibana provided as part of [StormCrawler's Elasticsearch module documentation](//github.com/DigitalPebble/storm-crawler/tree/master/external/elasticsearch).
+## Monitor the crawl
-There is also a shell script [bin/es_status](./bin/es_status) to get aggregated counts from the status index, and to add, delete or force a re-fetch of URLs. E.g.,
+When the topology is running you can check that URLs have been injected and news are getting fetched on <http://localhost:9200/status/_search?pretty>. Or use StormCrawler's OpenSearch dashboards to monitor the crawling process on <http://localhost:5601/>.
+
+There is also a shell script [bin/status](./bin/status) to get aggregated counts from the status index, and to add, delete or force a re-fetch of URLs. E.g.,
```
$> bin/es_status aggregate_status
@@ -59,15 +58,7 @@ $> bin/es_status aggregate_status
```
-Run Crawl from Docker Container
--------------------------------
-
-First, download Apache Storm 2.5.0. from the [download page](https://storm.apache.org/downloads.html) and place it in the directory `downloads`:
-```
-STORM_VERSION=2.5.0
-mkdir downloads
-wget -q -P downloads --timestamping https://downloads.apache.org/storm/apache-storm-$STORM_VERSION/apache-storm-$STORM_VERSION.tar.gz
-```
+## Run Crawl with Docker Compose
Do not forget to create the uberjar (see above) which is included in the Docker image. Simply run:
@@ -75,31 +66,56 @@ Do not forget to create the uberjar (see above) which is included in the Docker
mvn clean package
```
-Then build the Docker image from the [Dockerfile](./Dockerfile):
+Verify that the configuration in [docker-compose.yaml](docker-compose.yaml) and in [conf/](conf/) is correct:
+- Don't forget to adapt the paths to mounted volumes used to persist data (OpenSearch indexes and WARC files).
+- Make sure to add the user agent configuration in conf/crawler-conf.yaml.
-Note: the uberjar is included in the Docker image and needs to be built first (see above).
+Then download and build the Docker images:
```
-docker build -t newscrawler:2.10.0 .
+docker compose -f docker-compose.yaml up --build --renew-anon-volumes --remove-orphans
```
-To launch an interactive container:
+Wait until the containers are running, then initialize the OpenSearch index and the dashboards:
```
-docker run --net=host \
- -v $PWD/data/elasticsearch:/data/elasticsearch \
- -v $PWD/data/warc:/data/warc \
- --rm --name newscrawler -i -t newscrawler:2.10.0 /bin/bash
+./bin/OS_IndexInit.sh
+./bin/dashboards/OS_ImportDashboards.sh
```
-NOTE: don't forget to adapt the paths to mounted volumes used to persist data on the host. Make sure to add the user agent configuration in conf/crawler-conf.yaml.
+NOTE:
+- This will delete existing indexes!
+- Make sure that the OpenSearch port 9200 is not already in use or mapped by a running OpenSearch instance. Otherwise OpenSearch commands may affect the running instance!
+
+
+To launch the topology using [Storm Flux](https://storm.apache.org/releases/2.8.4/flux.html):
+```
+docker compose run --rm news-crawler \
+ storm jar lib/crawler.jar org.apache.storm.flux.Flux --remote /news-crawler/conf/crawler.flux
+```
+Or using the Java topology:
+```
+docker compose run --rm news-crawler \
+ storm jar lib/crawler.jar -- org.commoncrawl.stormcrawler.news.CrawlTopology \
+ /data/seeds '*' -conf conf/opensearch-conf.yaml -conf conf/crawler-conf.yaml
+```
-CAVEAT: Make sure that the Elasticsearch port 9200 is not already in use or mapped by a running ES instance. Otherwise Elasticsearch commands may affect the running instance!
+After 1-2 minutes if everything is up, connect to OpenSearch on port [9200](http://localhost:9200/) or the OpenSearch dashboards on port [5601](http://localhost:5601/).
-Once you are logged onto the Docker container, start the services and crawl with
+For inspecting the worker log files:
+```
+docker exec storm-supervisor /bin/bash -c 'cat /logs/workers-artifacts/*/*/worker.log'
+```
+To stop the topology:
```
-/home/ubuntu/news-crawler/bin/run-crawler.sh
+docker compose run --rm -ti news-crawler /bin/bash
+
+$> storm list
+Topology_name Status Num_tasks Num_workers Uptime_secs Topology_Id Owner
+----------------------------------------------------------------------------------------
+NewsCrawl ACTIVE 48 1 146 NewsCrawl-1-1774977605 storm
+
+$> storm kill NewsCrawl
```
-After 1-2 minutes if everything is up, connect to Elasticsearch on port [9200](http://127.0.0.1:9200/) or Kibana on port [5601](http://127.0.0.1:5601/).
diff --git a/bin/OS_IndexInit.sh b/bin/OS_IndexInit.sh
index 60e72ec..81b4066 100755
--- a/bin/OS_IndexInit.sh
+++ b/bin/OS_IndexInit.sh
@@ -13,121 +13,30 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-ESHOST="http://localhost:9200"
-#ESCREDENTIALS="-u elastic:passwordhere"
+#!/bin/bash
-# deletes and recreates a status index with a bespoke schema
+# set -e
-curl $ESCREDENTIALS -s -XDELETE "$ESHOST/status/" > /dev/null
+OSHOST=${1:-"http://localhost:9200"}
+OSCREDENTIALS=${2:-"-u opensearch:passwordhere"}
-echo "Deleted status index"
+curl $OSCREDENTIALS -s -XDELETE "$OSHOST/status/" > /dev/null
+echo "Deleted 'status' index, now recreating it..."
+curl $OSCREDENTIALS -s -XPUT "$OSHOST/status" -H 'Content-Type: application/json' --upload-file src/main/resources/status.mapping
-# http://localhost:9200/status/_mapping/status?pretty
+echo
-echo "Creating status index with mapping"
+curl $OSCREDENTIALS -s -XDELETE "$OSHOST/content/" > /dev/null
+echo "Deleted 'content' index, now recreating it..."
+curl $OSCREDENTIALS -s -XPUT "$OSHOST/content" -H 'Content-Type: application/json' --upload-file src/main/resources/indexer.mapping
-curl $ESCREDENTIALS -s -XPUT $ESHOST/status -H 'Content-Type: application/json' -d '
-{
- "settings": {
- "index": {
- "number_of_shards": 16,
- "number_of_replicas": 1,
- "refresh_interval": "5s"
- }
- },
- "mappings": {
- "dynamic_templates": [{
- "metadata": {
- "path_match": "metadata.*",
- "match_mapping_type": "string",
- "mapping": {
- "type": "keyword"
- }
- }
- }],
- "_source": {
- "enabled": true
- },
- "properties": {
- "nextFetchDate": {
- "type": "date",
- "format": "dateOptionalTime"
- },
- "status": {
- "type": "keyword"
- },
- "url": {
- "type": "keyword"
- }
- }
- }
-}'
+echo
-# deletes and recreates a status index with a bespoke schema
+curl $OSCREDENTIALS -s -XDELETE "$OSHOST/metrics*/" > /dev/null
-curl $ESCREDENTIALS -s -XDELETE "$ESHOST/metrics*/" > /dev/null
-
-echo ""
-echo "Deleted metrics index"
-
-curl $ESCREDENTIALS -s -XPUT $ESHOST/_ilm/policy/14d-deletion_policy -H 'Content-Type:application/json' -d '
-{
- "policy": {
- "phases": {
- "delete": {
- "min_age": "14d",
- "actions": {
- "delete": {}
- }
- }
- }
- }
-}
-'
-
-echo "Creating metrics index with mapping"
+echo "Deleted 'metrics' index, now recreating it..."
# http://localhost:9200/metrics/_mapping/status?pretty
-curl $ESCREDENTIALS -s -XPOST $ESHOST/_template/storm-metrics-template -H 'Content-Type: application/json' -d '
-{
- "index_patterns": "metrics*",
- "settings": {
- "index": {
- "number_of_shards": 1,
- "refresh_interval": "30s"
- },
- "number_of_replicas": 0,
- "lifecycle.name": "14d-deletion_policy"
- },
- "mappings": {
- "_source": { "enabled": true },
- "properties": {
- "name": {
- "type": "keyword"
- },
- "stormId": {
- "type": "keyword"
- },
- "srcComponentId": {
- "type": "keyword"
- },
- "srcTaskId": {
- "type": "short"
- },
- "srcWorkerHost": {
- "type": "keyword"
- },
- "srcWorkerPort": {
- "type": "integer"
- },
- "timestamp": {
- "type": "date",
- "format": "dateOptionalTime"
- },
- "value": {
- "type": "double"
- }
- }
- }
-}'
+curl $OSCREDENTIALS -s -XPOST "$OSHOST/_template/metrics-template" -H 'Content-Type: application/json' --upload-file src/main/resources/metrics.mapping
+echo
diff --git a/bin/dashboards/OS_ImportDashboards.sh b/bin/dashboards/OS_ImportDashboards.sh
new file mode 100755
index 0000000..561f739
--- /dev/null
+++ b/bin/dashboards/OS_ImportDashboards.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Imports the status and metrics dashboards (.ndjson files next to this script) into OpenSearch Dashboards on localhost:5601.
+BIN=$(dirname "$0")
+
+echo "Importing status dashboard into OpenSearch Dashboards"
+curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form "file=@$BIN/status.ndjson"
+echo ""
+
+echo "Importing metrics dashboard into OpenSearch Dashboards"
+curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form "file=@$BIN/metrics.ndjson"
+echo ""
+
+# Storm internal metrics
+# curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form "file=@$BIN/storm.ndjson"
diff --git a/bin/dashboards/metrics.ndjson b/bin/dashboards/metrics.ndjson
new file mode 100644
index 0000000..20cbb2b
--- /dev/null
+++ b/bin/dashboards/metrics.ndjson
@@ -0,0 +1,10 @@
+{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"name\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcComponentId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcTaskId\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerHost\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerPort\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"stormId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"timestamp\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"value\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"timestamp","title":"metrics"},"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","mi
grationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:06:58.178Z","version":"WzMsMV0="}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:activethreads\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : # active threads","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"3\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"2\"}}],\"listeners\":{},\"title\":\"Fetcher : # active threads\"}"},"id":"Fetcher-:-#-active-threads","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.178Z","version":"WzksMV0="}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:num_queues\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : num queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{},\"spyPerPage\":10},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"3\"}}],\"listeners\":{},\"title\":\"Fetcher : num queues\"}"},"id":"Fetcher-:-num-queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.175Z","version":"WzgsMV0="}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_counter.fetched\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : pages fetched","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{},\"spyPerPage\":10},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"sum\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"3\"}}],\"listeners\":{},\"title\":\"Fetcher : pages fetched\"}"},"id":"Fetcher-:-pages-fetched","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.170Z","version":"WzcsMV0="}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:in_queues\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : URLs waiting in queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"addLegend\":false,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"mode\":\"grouped\",\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"spyPerPage\":10,\"times\":[],\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"4\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"5\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"4\"}}],\"listeners\":{},\"title\":\"Fetcher : URLs waiting in queues\"}"},"id":"Fetcher-:-URLs-waiting-in-queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.160Z","version":"WzUsMV0="}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_average_persec.bytes_fetched_perSec\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : average bytes per second","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":false,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}}],\"listeners\":{},\"title\":\"Fetcher : average bytes per second\"}"},"id":"Fetcher-:-average-bytes-per-second","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.173Z","version":"WzYsMV0="}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_average_persec.fetched_perSec\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : average pages per second","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":false,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"3\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{},\"title\":\"Fetcher : average pages per second\"}"},"id":"Fetcher-:-average-pages-per-second","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.820Z","version":"WzEwLDFd"}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_counter.bytes_fetched\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Total bytes fetched","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"sum\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"m\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{},\"title\":\"Total bytes fetched\"}"},"id":"Total-bytes-fetched","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.823Z","version":"WzExLDFd"}
+{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":{\"query_string\":{\"analyze_wildcard\":true,\"query\":\"*\"}},\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelIndex\":\"1\",\"gridData\":{\"x\":24,\"y\":20,\"w\":12,\"h\":12,\"i\":\"1\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_0\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":12,\"y\":20,\"w\":12,\"h\":12,\"i\":\"2\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_1\"},{\"panelIndex\":\"3\",\"gridData\":{\"x\":0,\"y\":0,\"w\":36,\"h\":12,\"i\":\"3\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_2\"},{\"panelIndex\":\"4\",\"gridData\":{\"x\":0,\"y\":20,\"w\":12,\"h\":12,\"i\":\"4\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_3\"},{\"panelIndex\":\"5\",\"gridData\":{\"x\":0,\"y\":40,\"w\":36,\"h\":8,\"i\":\"5\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_4\"},{\"panelIndex\":\"6\",\"gridData\":{\"x\":0,\"y\":32,\"w\":36,\"h\":8,\"i\":\"6\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_5\"},{\"panelIndex\":\"7\",\"gridData\":{\"x\":0,\"y\":12,\"w\":36,\"h\":8,\"i\":\"7\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_6\"}]","timeRestore":false,"title":"Crawl metrics","version":1},"id":"Crawl-metrics","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"Fetcher-:-#-active-threads","name":"panel_0","type":"visualization"},{"id":"Fetcher-:-num-queues","name":"panel_1","type":"visualization"},{"id":"Fetcher-:-pages-fetched","name":"panel_2","type":"visualization"},{"id":"Fetcher-:-URLs-waiting-in-queues","name":"panel_3","type":"visualization"},{"id":"Fetcher-:-average-bytes-per-second","name":"panel_4","type":"visualization"},{"id":"Fetcher-:-average-pages-per-second","name":"panel_5","type":"visualization"},{"id":"Total-bytes-fetched","name":"panel_6","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:06:58.830Z","version":"WzQsMV0="}
+{"exportedCount":9,"missingRefCount":0,"missingReferences":[]}
diff --git a/bin/dashboards/status.ndjson b/bin/dashboards/status.ndjson
new file mode 100644
index 0000000..b3d0122
--- /dev/null
+++ b/bin/dashboards/status.ndjson
@@ -0,0 +1,5 @@
+{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"key\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"metadata._redirTo\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.depth\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.error%2Ecause\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.error%2Esource\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.fetch%2Eerror%2Ecount\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.isFeed\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.isSitemap\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"na
me\":\"metadata.url%2Epath\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"nextFetchDate\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"status\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"url\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":true,\"readFromDocValues\":true}]","title":"status"},"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","migrationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:07:47.130Z","version":"WzEzLDFd"}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"status count","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"status\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{},\"title\":\"status count\"}"},"id":"status-count","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:07:48.278Z","version":"WzE1LDFd"}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Top Hosts","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"key\",\"size\":50,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{},\"title\":\"Top Hosts\"}"},"id":"Top-Hosts","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:07:48.281Z","version":"WzE2LDFd"}
+{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":\"*\",\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelRefName\":\"panel_0\",\"version\":\"7.3.0\",\"panelIndex\":\"37874bbf-6607-435a-a231-94d81e9193e7\",\"gridData\":{\"x\":0,\"y\":0,\"w\":16,\"h\":20,\"i\":\"37874bbf-6607-435a-a231-94d81e9193e7\"},\"embeddableConfig\":{}},{\"panelRefName\":\"panel_1\",\"version\":\"7.3.0\",\"panelIndex\":\"4faa5b74-1660-44f7-9227-89d900c8231e\",\"gridData\":{\"x\":16,\"y\":0,\"w\":16,\"h\":20,\"i\":\"4faa5b74-1660-44f7-9227-89d900c8231e\"},\"embeddableConfig\":{}}]","timeRestore":false,"title":"Crawl status","version":1},"id":"Crawl-status","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"status-count","name":"panel_0","type":"visualization"},{"id":"Top-Hosts","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:07:47.948Z","version":"WzE0LDFd"}
+{"exportedCount":4,"missingRefCount":0,"missingReferences":[]}
diff --git a/bin/dashboards/storm.ndjson b/bin/dashboards/storm.ndjson
new file mode 100644
index 0000000..880c232
--- /dev/null
+++ b/bin/dashboards/storm.ndjson
@@ -0,0 +1,5 @@
+{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"name\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcComponentId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcTaskId\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerHost\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerPort\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"stormId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"timestamp\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"value\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"timestamp","title":"metrics"},"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","mi
grationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:06:58.178Z","version":"WzMsMV0="}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name: \\\"__receive.population\\\"\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Storm Receive Queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"srcTaskId\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"srcComponentId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}"},"id":"Storm-Receive-Queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:09:12.875Z","version":"WzIwLDFd"}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"+srcComponentId: \\\"__system\\\" +name: memory\\\\/heap*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Memory Heap","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":true,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"name\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"split\",\"params\":{\"field\":\"srcWorkerHost\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\",\"row\":true}}],\"listeners\":{}}"},"id":"Memory-Heap","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:09:12.877Z","version":"WzIxLDFd"}
+{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":\"*\",\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelRefName\":\"panel_0\",\"version\":\"7.3.0\",\"panelIndex\":\"19123ee9-8f49-4621-a4dc-716b5ff9fcaf\",\"gridData\":{\"x\":0,\"y\":0,\"w\":32,\"h\":8,\"i\":\"19123ee9-8f49-4621-a4dc-716b5ff9fcaf\"},\"embeddableConfig\":{}},{\"panelRefName\":\"panel_1\",\"version\":\"7.3.0\",\"panelIndex\":\"5fd83542-b7e6-48e0-8679-2ffcacf453a3\",\"gridData\":{\"x\":0,\"y\":8,\"w\":32,\"h\":16,\"i\":\"5fd83542-b7e6-48e0-8679-2ffcacf453a3\"},\"embeddableConfig\":{}}]","timeRestore":false,"title":"Storm metrics","version":1},"id":"Storm-metrics","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"Storm-Receive-Queues","name":"panel_0","type":"visualization"},{"id":"Memory-Heap","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:08:33.810Z","version":"WzE5LDFd"}
+{"exportedCount":4,"missingRefCount":0,"missingReferences":[]}
\ No newline at end of file
diff --git a/bin/run-crawler.sh b/bin/run-crawler.sh
deleted file mode 100755
index ca32e47..0000000
--- a/bin/run-crawler.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-# in case volumes are on the host need to adjust permissions
-chown -R elasticsearch:elasticsearch /data/elasticsearch
-chown -R storm:storm /data/warc
-
-# export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
-
-# as root
-/usr/bin/supervisord
-
-# wait until Storm and Elasticsearch are running
-sleep 60
-
-mkdir /tmp/seeds
-cp -rf /home/ubuntu/news-crawler/seeds /tmp/
-chmod -R a+r /tmp/seeds
-
-# start the news crawler as user ubuntu
-sudo -iu ubuntu /bin/bash <<"EOF"
-
-set -e
-
-cd $HOME/news-crawler/
-
-# initialize Elasticsearch indices
-# CAVEAT: this deletes existing indices!
-bin/ES_IndexInit.sh
-sleep 10
-
-STORMCRAWLER="storm jar $PWD/lib/crawler.jar"
-
-# run the crawler
-$STORMCRAWLER -- org.commoncrawl.stormcrawler.news.CrawlTopology \
- /tmp/seeds '*' -conf $PWD/conf/es-conf.yaml -conf $PWD/conf/crawler-conf.yaml
-# alternatively running the flux
-#$STORMCRAWLER org.apache.storm.flux.Flux --remote $PWD/conf/crawler.flux
-# suppress warnings about malformed XML in sitemaps
-storm set_log_level NewsCrawl \
- -l crawlercommons.sitemaps.SiteMapParser=ERROR
-
-
-EOF
diff --git a/bin/es_status b/bin/status
similarity index 100%
rename from bin/es_status
rename to bin/status
diff --git a/conf/crawler-conf.yaml b/conf/crawler-conf.yaml
index ef0f478..eb2594b 100644
--- a/conf/crawler-conf.yaml
+++ b/conf/crawler-conf.yaml
@@ -31,7 +31,7 @@ config:
topology.metrics.consumer.register:
- class: "org.apache.storm.metric.LoggingMetricsConsumer"
parallelism.hint: 1
- - class: "org.apache.stormcrawler.elasticsearch.metrics.MetricsConsumer"
+ - class: "org.apache.stormcrawler.opensearch.metrics.MetricsConsumer"
parallelism.hint: 1
# status index and fetcher queues are partitioned by domain
@@ -70,6 +70,9 @@ config:
# increased network timeout (ms) for news sites from Asia and eastern Europe
http.timeout: 30000
+ # allowed URL protocols
+ protocols: "http,https"
+
# use okhttp
http.protocol.implementation: org.apache.stormcrawler.protocol.okhttp.HttpProtocol
https.protocol.implementation: org.apache.stormcrawler.protocol.okhttp.HttpProtocol
@@ -82,10 +85,26 @@ config:
# or transferred protocol metadata must also be prefixed.
protocol.md.prefix: "protocol."
+ # number of instances for each protocol implementation
+ protocol.instances.num: 8
+ # connection pool configuration of OkHttp protocol
+ okhttp.protocol.connection.pool:
+ # maximum number of idle connections (in addition to active connections)
+ max.idle.connections: 256
+ # maximum keep-alive time of the connections in seconds
+ connection.keep.alive: 300
+ # See also
+ # https://square.github.io/okhttp/3.x/okhttp/okhttp3/ConnectionPool.html
+ # Note that OkHttp's connection pool (v4.9.1) is not optimized for fast
+ # look-up of connections, the pool size (idle and active connections)
+ # should not exceed 1000. To allow for efficient pooling in large and
+ # diverse crawls, it's recommended to increase also the number of protocol
+ # instances, see `protocol.instances.num`.
+
# delay between successive requests to the same host/domain
# (be defensive, a delay of 5 sec. means about 1000 fetches per hour
# which should be enough even for large news sites)
- fetcher.server.delay: 6.0
+ fetcher.server.delay: 9.0
# generous max. crawl delay
# (fetch content even if the robots.txt specifies a large host-specific crawl delay:
diff --git a/conf/crawler.flux b/conf/crawler.flux
index 73af7c7..9f48a01 100644
--- a/conf/crawler.flux
+++ b/conf/crawler.flux
@@ -6,11 +6,11 @@ includes:
override: false
- resource: false
- file: "crawler-conf.yaml"
+ file: "conf/crawler-conf.yaml"
override: true
- resource: false
- file: "es-conf.yaml"
+ file: "conf/opensearch-conf.yaml"
override: true
config:
@@ -77,13 +77,13 @@ components:
spouts:
- id: "spout"
- className: "org.apache.stormcrawler.elasticsearch.persistence.AggregationSpout"
+ className: "org.apache.stormcrawler.opensearch.persistence.AggregationSpout"
parallelism: 16
- id: "filespout"
className: "org.apache.stormcrawler.spout.FileSpout"
parallelism: 1
constructorArgs:
- - "/path/to/seeds/"
+ - "/data/seeds/"
- "feeds.txt"
- true
@@ -129,7 +129,7 @@ bolts:
args:
- "warc"
- id: "status"
- className: "org.apache.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt"
+ className: "org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt"
parallelism: 1
streams:
diff --git a/conf/es-conf.yaml b/conf/es-conf.yaml
deleted file mode 100644
index 7ea8b8b..0000000
--- a/conf/es-conf.yaml
+++ /dev/null
@@ -1,83 +0,0 @@
-# configuration for Elasticsearch resources
-
-config:
- # ES metricsConsumer
- es.metrics.addresses: "http://localhost:9200"
- es.metrics.index.name: "metrics"
-
- # ES spout and persistence bolt
- es.status.addresses: "http://localhost:9200"
- es.status.index.name: "status"
- #es.status.user: "USERNAME"
- #es.status.password: "PASSWORD"
- # the routing is done on the value of 'partition.url.mode'
- es.status.routing: true
- # stores the value used for grouping the URLs as a separate field
- # needed by the spout implementations
- # also used for routing if the value above is set to true
- es.status.routing.fieldname: "metadata.hostname"
- es.status.bulkActions: 500
- es.status.flushInterval: "5s"
- es.status.concurrentRequests: 1
-
- ################
- # spout config #
- ################
-
- # positive or negative filter parsable by the Lucene Query Parser
- # es.status.filterQuery:
- # - "-(metadata.hostname:stormcrawler.net)"
- # - "-(key:digitalpebble.com)"
-
- # time in secs for which the URLs will be considered for fetching after a ack or fail
- # need a high value to avoid duplicates by URLs added multiple times to the fetcher
- # queues, should be close to
- # fetcher.max.crawl.delay * fetcher.max.queue.size
- spout.ttl.purgatory: 1200
-
- # Min time (in msecs) to allow between 2 successive queries (per bucket) to ES
- spout.min.delay.queries: 30000
-
- # Delay since previous query date (in secs) after which the nextFetchDate value will be reset to the current time
- # Setting this to -1 or a large value means that the ES will cache the results but also that less and less results
- # might be returned.
- # - should reset to avoid that a bucket with many URLs blocks incrementing the date to look
- # for next fetches for a specific bucket. May happen if a news sitemap adds 1000s of URLs.
- spout.reset.fetchdate.after: 240
-
- es.status.max.buckets: 200
- # max. URLs per bucket (= domain name): 30 sec. / 5 sec. fetch delay = 6
- # but set to a lower number for domains with longer crawl-delay
- # cf. also fetcher.max.queue.size and fetcher.max.urls.in.queues
- es.status.max.urls.per.bucket: 5
- # field to group the URLs into buckets
- es.status.bucket.field: "metadata.hostname"
- # fields to sort the URLs within a bucket
- es.status.bucket.sort.field:
- - "nextFetchDate"
- - "url"
- # field to sort the buckets
- es.status.global.sort.field: "nextFetchDate"
-
- # CollapsingSpout : limits the deep paging by resetting the start offset for the ES query
- es.status.max.start.offset: 500
-
- # AggregationSpout : sampling improves the performance on large crawls
- es.status.sample: false
-
- # max allowed duration of a query in sec
- es.status.query.timeout: -1
-
- # AggregationSpout (expert): adds this value in mins to the latest date returned in the results and
- # use it as nextFetchDate
- es.status.recentDate.increase: -1
- es.status.recentDate.min.gap: -1
-
- topology.metrics.consumer.register:
- - class: "org.apache.stormcrawler.elasticsearch.metrics.MetricsConsumer"
- parallelism.hint: 1
- #whitelist:
- # - "fetcher_counter"
- # - "fetcher_average.bytes_fetched"
- #blacklist:
- # - "__receive.*"
diff --git a/conf/opensearch-conf.yaml b/conf/opensearch-conf.yaml
new file mode 100644
index 0000000..e6d2025
--- /dev/null
+++ b/conf/opensearch-conf.yaml
@@ -0,0 +1,126 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# configuration for OpenSearch resources
+
+config:
+
+ # address to use unless a more specific one has been
+ # defined for a component
+ # also accepts a list or multiple values in a single line
+ # separated by a semi-colon e.g. "opensearch1:9200; opensearch2:9200"
+ # Note: here the address from inside the docker-compose cluster is required
+ opensearch.addresses: "http://opensearch-news-crawl:9200"
+ #opensearch.user: "USERNAME"
+ #opensearch.password: "PASSWORD"
+ opensearch.concurrentRequests: 2
+
+ # Disable TLS validation for connection to OpenSearch
+ # opensearch.disable.tls.validation: false
+
+ # Indexer bolt
+ # addresses can be specified as a full URL
+ # if not we assume that the protocol is http and the port 9200
+ opensearch.indexer.addresses: "http://opensearch-news-crawl:9200"
+ opensearch.indexer.index.name: "content"
+ # opensearch.indexer.pipeline: "_PIPELINE_"
+ opensearch.indexer.create: false
+ opensearch.indexer.bulkActions: 100
+ opensearch.indexer.flushInterval: "2s"
+ opensearch.indexer.concurrentRequests: 1
+ opensearch.indexer.sniff: true
+
+ # MetricsConsumer
+ opensearch.metrics.addresses: "http://opensearch-news-crawl:9200"
+ opensearch.metrics.index.name: "metrics"
+ opensearch.metrics.sniff: true
+
+ # Spout and persistence bolt
+ opensearch.status.addresses: "http://opensearch-news-crawl:9200"
+ opensearch.status.index.name: "status"
+ #opensearch.status.user: "USERNAME"
+ #opensearch.status.password: "PASSWORD"
+ # the routing is done on the value of 'partition.url.mode'
+ opensearch.status.routing: true
+ # stores the value used for grouping the URLs as a separate field
+ # needed by the spout implementations
+ # also used for routing if the value above is set to true
+ opensearch.status.routing.fieldname: "key"
+ opensearch.status.bulkActions: 500
+ opensearch.status.flushInterval: "5s"
+ opensearch.status.concurrentRequests: 1
+ opensearch.status.sniff: true
+
+ ################
+ # spout config #
+ ################
+
+ # positive or negative filters parsable by the Lucene Query Parser
+ # opensearch.status.filterQuery:
+ # - "-(key:stormcrawler.net)"
+ # - "-(key:stormcrawler.apache.org)"
+
+ # time in secs for which the URLs will be considered for fetching after an ack or fail
+ # need a high value to avoid duplicates by URLs added multiple times to the fetcher
+ # queues, should be close to
+ # fetcher.max.crawl.delay * fetcher.max.queue.size
+ spout.ttl.purgatory: 1200
+
+ # Min time (in msecs) to allow between 2 successive queries to OpenSearch
+ spout.min.delay.queries: 30000
+
+ # Max time (in msecs) to allow between 2 successive queries to OpenSearch
+ spout.max.delay.queries: 60000
+
+ # Delay since previous query date (in secs) after which the nextFetchDate value will be reset to the current time
+ # Setting this to -1 or a large value means that OpenSearch will cache the results but also that fewer and fewer results
+ # might be returned.
+ # - should reset to avoid that a bucket with many URLs blocks incrementing the date to look
+ # for next fetches for a specific bucket. May happen if a news sitemap adds 1000s of URLs.
+ spout.reset.fetchdate.after: 240
+
+ opensearch.status.max.buckets: 200
+ # max. URLs per bucket (= domain name): 30 sec. / 5 sec. fetch delay = 6
+ # but set to a lower number for domains with longer crawl-delay
+ # cf. also fetcher.max.queue.size and fetcher.max.urls.in.queues
+ opensearch.status.max.urls.per.bucket: 5
+ # field to group the URLs into buckets
+ opensearch.status.bucket.field: "key"
+ # fields to sort the URLs within a bucket
+ opensearch.status.bucket.sort.field:
+ - "nextFetchDate"
+ - "url"
+ # field to sort the buckets
+ opensearch.status.global.sort.field: "nextFetchDate"
+
+ # AggregationSpout : sampling improves the performance on large crawls
+ opensearch.status.sample: false
+
+ # max allowed duration of a query in sec
+ opensearch.status.query.timeout: -1
+
+ # AggregationSpout (expert): adds this value in mins to the latest date returned in the results and
+ # uses it as nextFetchDate
+ opensearch.status.recentDate.increase: -1
+ opensearch.status.recentDate.min.gap: -1
+
+ topology.metrics.consumer.register:
+ - class: "org.apache.stormcrawler.opensearch.metrics.MetricsConsumer"
+ parallelism.hint: 1
+ #whitelist:
+ # - "fetcher_counter"
+ # - "fetcher_average.bytes_fetched"
+ #blacklist:
+ # - "__receive.*"
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..24d048c
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,122 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+services:
+
+ # Apache Storm components
+ # - Zookeeper coordinates the communication between Nimbus and the Supervisors
+ zookeeper:
+ image: zookeeper:${ZOOKEEPER_VERSION:-3.9.3}
+ container_name: zookeeper
+ restart: always
+
+ # - the daemon Nimbus runs on the master node
+ storm-nimbus:
+ image: storm:${STORM_VERSION:-2.8.4}
+ container_name: storm-nimbus
+ hostname: nimbus
+ command: storm nimbus
+ depends_on:
+ - zookeeper
+ links:
+ - zookeeper
+ ports:
+ - 6627:6627
+ restart: always
+
+ # - the Supervisors run on the worker nodes
+ storm-supervisor:
+ image: storm:${STORM_VERSION:-2.8.4}
+ container_name: storm-supervisor
+ command: storm supervisor -c worker.childopts=-Xmx%HEAP-MEM%m
+ depends_on:
+ - zookeeper
+ - storm-nimbus
+ links:
+ - zookeeper
+ - storm-nimbus:nimbus
+ # supervisor launches the worker processes
+ # which need to be able to access
+ # - (in case an indexing topology is run) the
+ # OpenSearch (http://opensearch-news-crawl:9200/) and
+ - opensearch-news-crawl
+ # - the WARC output folder
+ # - and the seed folder
+ volumes:
+ - ${WARCOUTPUT:-./warcdata}:/data/warc
+ - ${SEEDDIR:-./seeds}:/data/seeds
+ restart: always
+
+ # - the Storm UI provides diagnostics about the Storm cluster
+ storm-ui:
+ image: storm:${STORM_VERSION:-2.8.4}
+ container_name: storm-ui
+ command: storm ui
+ depends_on:
+ - storm-nimbus
+ links:
+ - storm-nimbus:nimbus
+ ports:
+ - "127.0.0.1:8080:8080"
+ restart: always
+
+ opensearch-news-crawl:
+ image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-2.19.4}
+ container_name: opensearch-news-crawl
+ environment:
+ - cluster.name=opensearch-news-crawl-cluster
+ - node.name=opensearch-news-crawl
+ - discovery.type=single-node
+ - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
+ - "OPENSEARCH_JAVA_OPTS=-Xms4G -Xmx4G"
+ - plugins.security.disabled=true
+ - "DISABLE_INSTALL_DEMO_CONFIG=true"
+ volumes:
+ - ${OPENSEARCHDATA:-./opensearchdata}:/usr/share/opensearch/data
+ ulimits:
+ memlock:
+ soft: -1
+ hard: -1
+ nofile:
+ soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
+ hard: 65536
+ ports:
+ - "127.0.0.1:9200:9200" # REST API
+
+ opensearch-dashboard-news-crawl:
+ image: opensearchproject/opensearch-dashboards:${OPENSEARCH_VERSION:-2.19.4}
+ container_name: opensearch-dashboard-news-crawl
+ ports:
+ - "127.0.0.1:5601:5601"
+ expose:
+ - "5601"
+ environment:
+ - 'OPENSEARCH_HOSTS=["http://opensearch-news-crawl:9200"]'
+ - "DISABLE_SECURITY_DASHBOARDS_PLUGIN=true" # disables security dashboards plugin in OpenSearch Dashboards
+
+ # - to launch a topology
+ # - will exit on startup
+ news-crawler:
+ build: .
+ container_name: news-crawler
+ command: /bin/bash
+ depends_on:
+ - storm-nimbus
+ links:
+ - storm-nimbus:nimbus
+ volumes:
+ - ${WARCOUTPUT:-./warcdata}:/data/warc
+ - ${SEEDDIR:-./seeds}:/data/seeds
+ restart: "no"
+
diff --git a/etc/supervisor/conf.d/elasticsearch.conf b/etc/supervisor/conf.d/elasticsearch.conf
deleted file mode 100644
index a4f0020..0000000
--- a/etc/supervisor/conf.d/elasticsearch.conf
+++ /dev/null
@@ -1,8 +0,0 @@
-[program:elasticsearch]
-command=/usr/share/elasticsearch/bin/elasticsearch -Enetwork.host=127.0.0.1 -Ehttp.port=9200 -Etransport.tcp.port=9300
-numprocs=1
-autostart=true
-autorestart=true
-user=elasticsearch
-echo environment=ES_HEAP_SIZE="20g"
-environment=ES_PATH_CONF=/etc/elasticsearch
\ No newline at end of file
diff --git a/etc/supervisor/conf.d/kibana.conf b/etc/supervisor/conf.d/kibana.conf
deleted file mode 100644
index 45e1812..0000000
--- a/etc/supervisor/conf.d/kibana.conf
+++ /dev/null
@@ -1,7 +0,0 @@
-[program:kibana]
-command=/usr/share/kibana/bin/kibana -c /etc/kibana/kibana.yml
-numprocs=1
-autostart=true
-autorestart=true
-user=kibana
-directory=/usr/share/kibana/
diff --git a/etc/sysctl.d/60-elasticsearch.conf b/etc/sysctl.d/60-elasticsearch.conf
deleted file mode 100644
index ae43f01..0000000
--- a/etc/sysctl.d/60-elasticsearch.conf
+++ /dev/null
@@ -1,7 +0,0 @@
-
-# Elasticsearch settings
-# see
-# https://www.elastic.co/guide/en/elasticsearch/reference/current/setup-configuration-memory.html#swappiness
-# https://www.elastic.co/guide/en/elasticsearch/reference/current/vm-max-map-count.html
-vm.swappiness=1
-vm.max_map_count=262144
diff --git a/seeds/feeds.txt b/seeds/feeds.txt
index b74ad0e..88468a8 100644
--- a/seeds/feeds.txt
+++ b/seeds/feeds.txt
@@ -1,26 +1,3 @@
-https://www.usatoday.com/news-sitemap.xml isSitemapNews=true
-https://www.theguardian.com/sitemaps/news.xml isSitemapNews=true
-https://www.theguardian.com/international/rss isFeed=true
-https://www.theguardian.com/world/rss isFeed=true
-https://www.theguardian.com/uk/rss isFeed=true
-https://www.theguardian.com/us/rss isFeed=true
-https://www.theguardian.com/world/eu/rss isFeed=true
-https://www.theguardian.com/politics/rss isFeed=true
-https://www.theguardian.com/science/rss isFeed=true
-https://www.theguardian.com/education/rss isFeed=true
-https://www.theguardian.com/football/rss isFeed=true
-https://www.elwatannews.com/home/rssfeeds isFeed=true
-https://www.corriere.it/rss/sitemap_v2.xml isSitemapIndex=true
-https://www.repubblica.it/rss/homepage/rss2.0.xml isFeed=true
-https://www.repubblica.it/rss/economia/rss2.0.xml isFeed=true
-https://www.repubblica.it/rss/politica/rss2.0.xml isFeed=true
-https://www.lemonde.fr/sitemap_news.xml isSitemapNews=true
-https://www.lemonde.fr/economie/rss_full.xml isFeed=true
-https://www.lemonde.fr/rss/une.xml isFeed=true
-https://www.lemonde.fr/international/rss_full.xml isFeed=true
-https://www.lemonde.fr/politique/rss_full.xml isFeed=true
-https://www.lemonde.fr/livres/rss_full.xml isFeed=true
-https://www.lemonde.fr/afrique/rss_full.xml isFeed=true
-https://www.lemonde.fr/ameriques/rss_full.xml isFeed=true
-https://www.cnn.com/sitemaps/cnn/news.xml isSitemapNews=true
-https://www.bbc.com/sitemaps/https-index-com-news.xml isSitemapNews=true
+https://commoncrawl.org/blog/rss.xml
+
+# Please, add your news feeds and sitemaps below - one line, one URL.
\ No newline at end of file
diff --git a/src/main/resources/indexer.mapping b/src/main/resources/indexer.mapping
new file mode 100644
index 0000000..b788e6b
--- /dev/null
+++ b/src/main/resources/indexer.mapping
@@ -0,0 +1,40 @@
+{
+ "settings": {
+ "index": {
+ "number_of_shards": 5,
+ "number_of_replicas": 1,
+ "refresh_interval": "60s"
+ }
+ },
+ "mappings": {
+ "_source": {
+ "enabled": true
+ },
+ "properties": {
+ "content": {
+ "type": "text"
+ },
+ "description": {
+ "type": "text"
+ },
+ "domain": {
+ "type": "keyword"
+ },
+ "format": {
+ "type": "keyword"
+ },
+ "keywords": {
+ "type": "keyword"
+ },
+ "host": {
+ "type": "keyword"
+ },
+ "title": {
+ "type": "text"
+ },
+ "url": {
+ "type": "keyword"
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/main/resources/metrics.mapping b/src/main/resources/metrics.mapping
new file mode 100644
index 0000000..5b2ac15
--- /dev/null
+++ b/src/main/resources/metrics.mapping
@@ -0,0 +1,40 @@
+{
+ "index_patterns": "metrics*",
+ "settings": {
+ "index": {
+ "number_of_shards": 1,
+ "refresh_interval": "30s"
+ },
+ "number_of_replicas": 0
+ },
+ "mappings": {
+ "_source": { "enabled": true },
+ "properties": {
+ "name": {
+ "type": "keyword"
+ },
+ "stormId": {
+ "type": "keyword"
+ },
+ "srcComponentId": {
+ "type": "keyword"
+ },
+ "srcTaskId": {
+ "type": "short"
+ },
+ "srcWorkerHost": {
+ "type": "keyword"
+ },
+ "srcWorkerPort": {
+ "type": "integer"
+ },
+ "timestamp": {
+ "type": "date",
+ "format": "date_optional_time"
+ },
+ "value": {
+ "type": "double"
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/main/resources/status.mapping b/src/main/resources/status.mapping
new file mode 100644
index 0000000..e5b14fe
--- /dev/null
+++ b/src/main/resources/status.mapping
@@ -0,0 +1,39 @@
+{
+ "settings": {
+ "index": {
+ "number_of_shards": 10,
+ "number_of_replicas": 1,
+ "refresh_interval": "5s"
+ }
+ },
+ "mappings": {
+ "dynamic_templates": [{
+ "metadata": {
+ "path_match": "metadata.*",
+ "match_mapping_type": "string",
+ "mapping": {
+ "type": "keyword"
+ }
+ }
+ }],
+ "_source": {
+ "enabled": true
+ },
+ "properties": {
+ "key": {
+ "type": "keyword",
+ "index": true
+ },
+ "nextFetchDate": {
+ "type": "date",
+ "format": "date_optional_time"
+ },
+ "status": {
+ "type": "keyword"
+ },
+ "url": {
+ "type": "keyword"
+ }
+ }
+ }
+}
From 12a5b503b2ba35beb6180674cb1554f77c226c3c Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 31 Mar 2026 19:12:03 +0200
Subject: [PATCH 4/8] Pin version of Jackson: needs to be same as used by Storm
---
pom.xml | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index c470e83..9deb934 100644
--- a/pom.xml
+++ b/pom.xml
@@ -42,7 +42,7 @@ under the License.
3.5.12.8.41.12.797
- 2.11.1
+ 2.18.11.65.23.03.0.1
@@ -171,6 +171,23 @@ under the License.
${aws.version}
+
+
+ com.fasterxml.jackson.core
+ jackson-annotations
+ ${jackson.version}
+
+
+ com.fasterxml.jackson.core
+ jackson-core
+ ${jackson.version}
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+ ${jackson.version}
+
+
org.apache.stormcrawler
From 0d68ae50016c595104992fe85d5d931534ac084b Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 11 Jun 2026 19:24:11 +0200
Subject: [PATCH 5/8] Upgrade to StormCrawler 3.6.0, Storm 2.8.8
---
Dockerfile | 4 ++--
README.md | 12 ++++++------
bin/status | 8 ++++----
conf/crawler-conf.yaml | 7 +++++++
conf/crawler.flux | 2 +-
docker-compose.yaml | 12 ++++++------
pom.xml | 19 ++++---------------
.../stormcrawler/news/CrawlTopology.java | 2 +-
8 files changed, 31 insertions(+), 35 deletions(-)
diff --git a/Dockerfile b/Dockerfile
index 683df8f..6b469b3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM storm:2.8.4
+FROM storm:2.8.8
RUN apt-get update -qq && \
apt-get install -yq --no-install-recommends \
@@ -10,7 +10,7 @@ RUN apt-get update -qq && \
#
# news-crawler
#
-ENV CRAWLER_VERSION=3.5.1
+ENV CRAWLER_VERSION=3.6.0
RUN mkdir /news-crawler/ && \
mkdir /news-crawler/conf/ && \
diff --git a/README.md b/README.md
index ad990bf..2d18792 100644
--- a/README.md
+++ b/README.md
@@ -5,8 +5,8 @@ Crawler for news based on [StormCrawler](https://stormcrawler.apache.org/). Prod
## Prerequisites
-* Install OpenSearch 2.19.4
-* Install Apache Storm 2.8.4
+* Install OpenSearch 2.19.5
+* Install Apache Storm 2.8.8
* Start OpenSearch and Storm
* Create the OpenSearch indices by running [bin/OS_IndexInit.sh](bin/OS_IndexInit.sh) and the dashboards by [OS_ImportDashboards.sh](bin/OS_ImportDashboards.sh)
@@ -32,14 +32,14 @@ mvn clean package
And run ...
``` sh
-storm local target/crawler-3.5.1.jar --local-ttl 60 -- org.commoncrawl.stormcrawler.news.CrawlTopology -conf $PWD/conf/opensearch-conf.yaml -conf $PWD/conf/crawler-conf.yaml $PWD/seeds/ feeds.txt
+storm local target/crawler-3.6.0.jar --local-ttl 60 -- org.commoncrawl.stormcrawler.news.CrawlTopology -conf $PWD/conf/opensearch-conf.yaml -conf $PWD/conf/crawler-conf.yaml $PWD/seeds/ feeds.txt
```
This will launch the crawl topology in local mode for 60 seconds. It will also "inject" all URLs found in the file `./seeds/feeds.txt` in the status index. The URLs point to news feeds and sitemaps from which links to news articles are extracted and fetched. The topology will create WARC files in the directory specified in the configuration under the key `warc.dir`. This directory must be created beforehand.
-Of course, it's also possible to add (or remove) the seeds (feeds and sitemaps) using the Elasticsearch API. In this case, the can topology can be run without the last two arguments.
+Of course, it's also possible to add (or remove) the seeds (feeds and sitemaps) using the OpenSearch API. In this case, the topology can be run without the last two arguments.
-Alternatively, the topology can be run from the [crawler.flux](./conf/crawler.flux), please see the [Storm Flux documentation](https://storm.apache.org/releases/2.8.4/flux.html). Make sure to adapt the Flux definition to your needs!
+Alternatively, the topology can be run from the [crawler.flux](./conf/crawler.flux), please see the [Storm Flux documentation](https://storm.apache.org/releases/2.8.8/flux.html). Make sure to adapt the Flux definition to your needs!
In production, you should use `storm jar ...` to run the topology in distributed mode and continuously (no time limit) including the Storm UI and logging.
@@ -88,7 +88,7 @@ NOTE:
- Make sure that the OpenSearch port 9200 is not already in use or mapped by a running OpenSearch instance. Otherwise OpenSearch commands may affect the running instance!
-To launch the topology using [Storm Flux](https://storm.apache.org/releases/2.8.4/flux.html):
+To launch the topology using [Storm Flux](https://storm.apache.org/releases/2.8.8/flux.html):
```
docker compose run --rm news-crawler \
storm jar lib/crawler.jar org.apache.storm.flux.Flux --remote /news-crawler/conf/crawler.flux
diff --git a/bin/status b/bin/status
index 410c4d1..978fb5a 100755
--- a/bin/status
+++ b/bin/status
@@ -5,14 +5,14 @@ __ES_STATUS_URL_DEFAULT='http://localhost:9200/status'
function ____show_help() {
echo "$0 [-v|-V] [-C] []"
echo
- echo "Query StormCrawler's Elasticsearch status index"
+ echo "Query StormCrawler's Elasticsearch or OpenSearch status index"
echo " with help of curl, jq and bash"
echo
echo "Global options"
echo " -h show detailed help"
echo " -v verbose, print commands before execution"
echo " -V very verbose"
- echo " -D dry run, do not execute request to ES (use in combination with -v)"
+ echo " -D dry run, do not execute request (use in combination with -v)"
echo " -C colorize JSON output"
echo
echo "Commands"
@@ -134,12 +134,12 @@ ES_STATUS_URL=${ES_STATUS_URL:-$__ES_STATUS_URL_DEFAULT}
set -e
-# current time in Elasticsearch date format
+# current time in Elasticsearch/OpenSearch date format
function ____now () {
date -u '+%Y-%m-%dT%H:%M:%S.000Z'
}
-# given date in Elasticsearch date format
+# given date in Elasticsearch/OpenSearch date format
function ____date () {
date -d"$1" -u '+%Y-%m-%dT%H:%M:%S.000Z'
}
diff --git a/conf/crawler-conf.yaml b/conf/crawler-conf.yaml
index eb2594b..6bbd52c 100644
--- a/conf/crawler-conf.yaml
+++ b/conf/crawler-conf.yaml
@@ -77,6 +77,13 @@ config:
http.protocol.implementation: org.apache.stormcrawler.protocol.okhttp.HttpProtocol
https.protocol.implementation: org.apache.stormcrawler.protocol.okhttp.HttpProtocol
+ # the http/https protocol versions to use, in order of preference
+ # - the WARC writer handles HTTP/1.1 and HTTP/2 (cf. storm-crawler#1010)
+ # - okhttp does not support HTTP/1.0 requests (it supports responses however)
+ # http.protocol.versions:
+ # - "h2"
+ # - "http/1.1"
+
# do not fail on unknown SSL certificates
http.trust.everything: true
diff --git a/conf/crawler.flux b/conf/crawler.flux
index 9f48a01..4d390a5 100644
--- a/conf/crawler.flux
+++ b/conf/crawler.flux
@@ -45,7 +45,7 @@ components:
- name: "put"
args:
- "software"
- - "StormCrawler 2.10 https://stormcrawler.net/"
+ - "StormCrawler 3.6.0 https://stormcrawler.apache.org/"
- name: "put"
args:
- "description"
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 24d048c..8d8d1fd 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -17,13 +17,13 @@ services:
# Apache Storm components
# - Zookeeper coordinates the communication between Nimbus and the Supervisors
zookeeper:
- image: zookeeper:${ZOOKEEPER_VERSION:-3.9.3}
+ image: zookeeper:${ZOOKEEPER_VERSION:-3.9.4}
container_name: zookeeper
restart: always
# - the daemon Nimbus runs on the master node
storm-nimbus:
- image: storm:${STORM_VERSION:-2.8.4}
+ image: storm:${STORM_VERSION:-2.8.8}
container_name: storm-nimbus
hostname: nimbus
command: storm nimbus
@@ -37,7 +37,7 @@ services:
# - the Supervisors run on the worker nodes
storm-supervisor:
- image: storm:${STORM_VERSION:-2.8.4}
+ image: storm:${STORM_VERSION:-2.8.8}
container_name: storm-supervisor
command: storm supervisor -c worker.childopts=-Xmx%HEAP-MEM%m
depends_on:
@@ -60,7 +60,7 @@ services:
# - the Storm UI provides diagnostics about the Storm cluster
storm-ui:
- image: storm:${STORM_VERSION:-2.8.4}
+ image: storm:${STORM_VERSION:-2.8.8}
container_name: storm-ui
command: storm ui
depends_on:
@@ -72,7 +72,7 @@ services:
restart: always
opensearch-news-crawl:
- image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-2.19.4}
+ image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-2.19.5}
container_name: opensearch-news-crawl
environment:
- cluster.name=opensearch-news-crawl-cluster
@@ -95,7 +95,7 @@ services:
- "127.0.0.1:9200:9200" # REST API
opensearch-dashboard-news-crawl:
- image: opensearchproject/opensearch-dashboards:${OPENSEARCH_VERSION:-2.19.4}
+ image: opensearchproject/opensearch-dashboards:${OPENSEARCH_VERSION:-2.19.5}
container_name: opensearch-dashboard-news-crawl
ports:
- "127.0.0.1:5601:5601"
diff --git a/pom.xml b/pom.xml
index 9deb934..0508210 100644
--- a/pom.xml
+++ b/pom.xml
@@ -26,7 +26,7 @@ under the License.
4.0.0org.commoncrawl.stormcrawler.newscrawler
- 3.5.1
+ 3.6.0jar
@@ -39,10 +39,10 @@ under the License.
UTF-8
- 3.5.1
- 2.8.4
+ 3.6.0
+ 2.8.81.12.797
- 2.18.1
+ 2.21.31.65.23.03.0.1
@@ -171,17 +171,6 @@ under the License.
${aws.version}
-
-
- com.fasterxml.jackson.core
- jackson-annotations
- ${jackson.version}
-
-
- com.fasterxml.jackson.core
- jackson-core
- ${jackson.version}
- com.fasterxml.jackson.corejackson-databind
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java
index 54c69eb..3a3c018 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java
@@ -44,7 +44,7 @@
import org.slf4j.LoggerFactory;
/**
- * Dummy topology to play with the spouts and bolts on ElasticSearch
+ * Dummy topology to play with the spouts and bolts on OpenSearch
*/
public class CrawlTopology extends ConfigurableTopology {
From 98e531a7b1b64057dce9bbbd8ada5e86fc6be02d Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 12 Jun 2026 09:18:41 +0200
Subject: [PATCH 6/8] Consistently name components and containers in docker
compose configuration
Add suffix "-news-crawl" to all Storm / Zookeeper container names to avoid
name collisions with other StormCrawler setups running on the same system.
---
README.md | 2 +-
docker-compose.yaml | 14 +++++++-------
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/README.md b/README.md
index 2d18792..4efbf08 100644
--- a/README.md
+++ b/README.md
@@ -104,7 +104,7 @@ After 1-2 minutes if everything is up, connect to OpenSearch on port [9200](http
For inspecting the worker log files:
```
-docker exec storm-supervisor /bin/bash -c 'cat /logs/workers-artifacts/*/*/worker.log'
+docker exec storm-supervisor-news-crawl /bin/bash -c 'cat /logs/workers-artifacts/*/*/worker.log'
```
To stop the topology:
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 8d8d1fd..31bbc05 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -18,13 +18,13 @@ services:
# - Zookeeper coordinates the communication between Nimbus and the Supervisors
zookeeper:
image: zookeeper:${ZOOKEEPER_VERSION:-3.9.4}
- container_name: zookeeper
+ container_name: zookeeper-news-crawl
restart: always
# - the daemon Nimbus runs on the master node
storm-nimbus:
image: storm:${STORM_VERSION:-2.8.8}
- container_name: storm-nimbus
+ container_name: storm-nimbus-news-crawl
hostname: nimbus
command: storm nimbus
depends_on:
@@ -38,7 +38,7 @@ services:
# - the Supervisors run on the worker nodes
storm-supervisor:
image: storm:${STORM_VERSION:-2.8.8}
- container_name: storm-supervisor
+ container_name: storm-supervisor-news-crawl
command: storm supervisor -c worker.childopts=-Xmx%HEAP-MEM%m
depends_on:
- zookeeper
@@ -50,7 +50,7 @@ services:
# which need to be able to access
# - (in case a indexing topology is run) the
# OpenSearch (http://opensearch:9200/) and
- - opensearch-news-crawl
+ - opensearch
# - the WARC output folder
# - and the seed folder
volumes:
@@ -61,7 +61,7 @@ services:
# - the Storm UI provides diagnostics about the Storm cluster
storm-ui:
image: storm:${STORM_VERSION:-2.8.8}
- container_name: storm-ui
+ container_name: storm-ui-news-crawl
command: storm ui
depends_on:
- storm-nimbus
@@ -71,7 +71,7 @@ services:
- "127.0.0.1:8080:8080"
restart: always
- opensearch-news-crawl:
+ opensearch:
image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-2.19.5}
container_name: opensearch-news-crawl
environment:
@@ -94,7 +94,7 @@ services:
ports:
- "127.0.0.1:9200:9200" # REST API
- opensearch-dashboard-news-crawl:
+ opensearch-dashboard:
image: opensearchproject/opensearch-dashboards:${OPENSEARCH_VERSION:-2.19.5}
container_name: opensearch-dashboard-news-crawl
ports:
From f57b83d39d417845e7ac73b5fe139db16d4a5c87 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 12 Jun 2026 12:18:14 +0200
Subject: [PATCH 7/8] Add GitHub Dependabot configuration file
---
.github/dependabot.yml | 31 +++++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)
create mode 100644 .github/dependabot.yml
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..4205e52
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+version: 2
+updates:
+ - package-ecosystem: maven
+ directory: "/"
+ schedule:
+ interval: weekly
+ open-pull-requests-limit: 5
+ ignore:
+ # Jackson libs must be in sync with the version required by Storm
+ - dependency-name: "com.fasterxml.jackson*"
+
+ - package-ecosystem: "github-actions"
+ directory: "/"
+ schedule:
+ interval: weekly
From e6e4d7466bbb1c75034569fb3822a86b55a4b6c7 Mon Sep 17 00:00:00 2001
From: Luca <15426+lfoppiano@users.noreply.github.com>
Date: Tue, 16 Jun 2026 19:37:36 +0200
Subject: [PATCH 8/8] Add gitignore and automatic code formatting (#71)
- Added Cosium formatter
- Added .gitignore
- Fixed Licence headers (start with /* instead of /**)
---
.github/workflows/maven.yml | 2 +
.gitignore | 5 +
.mvn/jvm.config | 8 +
README.md | 17 +-
pom.xml | 217 +++---
.../stormcrawler/filter/FastURLFilter.java | 629 +++++++++---------
.../stormcrawler/news/ContentDetector.java | 31 +-
.../stormcrawler/news/CrawlTopology.java | 182 ++---
.../stormcrawler/news/FeedDetectorBolt.java | 34 +-
.../news/NewsSiteMapParserBolt.java | 235 ++++---
.../stormcrawler/news/PreFilterBolt.java | 136 ++--
.../news/PunycodeURLNormalizer.java | 14 +-
.../news/bootstrap/BootstrapTopology.java | 43 +-
.../news/bootstrap/FeedLinkParseFilter.java | 26 +-
.../bootstrap/NewsSiteMapDetectorBolt.java | 43 +-
.../stormcrawler/FastURLFilterTest.java | 84 ++-
.../news/NewsSiteMapParserTest.java | 115 ++--
17 files changed, 954 insertions(+), 867 deletions(-)
create mode 100644 .gitignore
create mode 100644 .mvn/jvm.config
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
index dc52ef3..5c79a4c 100644
--- a/.github/workflows/maven.yml
+++ b/.github/workflows/maven.yml
@@ -63,5 +63,7 @@ jobs:
with:
distribution: adopt
java-version: ${{ matrix.java }}
+ - name: Check code formatting
+ run: mvn -B --no-transfer-progress com.cosium.code:git-code-format-maven-plugin:validate-code-format -Dskip.format.code=false
- name: Build with Maven
run: mvn -B --no-transfer-progress package --file pom.xml -DCI_ENV=true verify
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..37effdb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.idea
+target
+opensearchdata
+warcdata
+.java-version
\ No newline at end of file
diff --git a/.mvn/jvm.config b/.mvn/jvm.config
new file mode 100644
index 0000000..87ae20c
--- /dev/null
+++ b/.mvn/jvm.config
@@ -0,0 +1,8 @@
+--add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED
+--add-exports jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED
+--add-exports jdk.compiler/com.sun.tools.javac.main=ALL-UNNAMED
+--add-exports jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED
+--add-exports jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED
+--add-exports jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED
+--add-opens jdk.compiler/com.sun.tools.javac.code=ALL-UNNAMED
+--add-opens jdk.compiler/com.sun.tools.javac.comp=ALL-UNNAMED
diff --git a/README.md b/README.md
index 4efbf08..874da21 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ Crawler for news based on [StormCrawler](https://stormcrawler.apache.org/). Prod
## Prerequisites
-
+* JVM 17 or higher
* Install OpenSearch 2.19.5
* Install Apache Storm 2.8.8
* Start OpenSearch and Storm
@@ -119,3 +119,18 @@ NewsCrawl ACTIVE 48 1 146 NewsCrawl-1
$> storm kill NewsCrawl
```
+## Note for developers
+
+Please format your code before submitting a PR with
+
+```
+mvn git-code-format:format-code -Dgcf.globPattern="**/*" -Dskip.format.code=false
+```
+
+You can enable pre-commit format hooks by running:
+
+```
+mvn clean install -Dskip.format.code=false
+```
+
+
diff --git a/pom.xml b/pom.xml
index 0508210..69b7487 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,5 +1,4 @@
-
-
-
+4.0.0org.commoncrawl.stormcrawler.newscrawler3.6.0jar
+
+ https://github.com/commoncrawl/news-crawlApache License, Version 2.0
@@ -35,8 +36,6 @@ under the License.
- https://github.com/commoncrawl/news-crawl
-
UTF-83.6.0
@@ -47,89 +46,10 @@ under the License.
5.23.03.0.14.13.2
+ 5.4
+ true
-
-
-
- org.apache.maven.plugins
- maven-compiler-plugin
- 3.15.0
-
- 17
- 17
-
-
-
- org.codehaus.mojo
- exec-maven-plugin
- 3.6.3
-
-
-
- exec
-
-
-
-
- java
- true
- false
- compile
-
-
-
- org.apache.maven.plugins
- maven-shade-plugin
- 3.6.2
-
-
- package
-
- shade
-
-
- false
-
-
-
- org.apache.storm.flux.Flux
-
-
-
-
-
-
-
-
-
- *:*
-
- META-INF/*.SF
- META-INF/*.DSA
- META-INF/*.RSA
-
-
-
-
- org.apache.storm:flux-core
-
- org/apache/commons/**
- org/apache/http/**
- org/yaml/**
-
-
-
-
-
-
-
-
-
-
org.apache.stormcrawler
@@ -207,4 +127,125 @@ under the License.
test
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.15.0
+
+ 17
+ 17
+
+
+
+ org.codehaus.mojo
+ exec-maven-plugin
+ 3.6.3
+
+ java
+ true
+ false
+ compile
+
+
+
+
+ exec
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-shade-plugin
+ 3.6.2
+
+
+
+ shade
+
+ package
+
+ false
+
+
+
+ org.apache.storm.flux.Flux
+
+
+
+
+
+
+
+
+
+ *:*
+
+ META-INF/*.SF
+ META-INF/*.DSA
+ META-INF/*.RSA
+
+
+
+
+ org.apache.storm:flux-core
+
+ org/apache/commons/**
+ org/apache/http/**
+ org/yaml/**
+
+
+
+
+
+
+
+
+ com.cosium.code
+ git-code-format-maven-plugin
+ ${git-code-format-maven-plugin.version}
+
+
+
+ install-formatter-hook
+
+ install-hooks
+
+
+
+
+ validate-code-format
+
+ validate-code-format
+
+
+
+
+
+
+ com.cosium.code
+ google-java-format
+ ${git-code-format-maven-plugin.version}
+
+
+
+ ${skip.format.code}
+
+ true
+ false
+ false
+ false
+
+
+
+
+
diff --git a/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java b/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java
index 7c52716..9b2ee32 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -16,6 +16,16 @@
*/
package org.commoncrawl.stormcrawler.filter;
+import com.amazonaws.services.s3.AmazonS3;
+import com.amazonaws.services.s3.AmazonS3ClientBuilder;
+import com.amazonaws.services.s3.model.GetObjectRequest;
+import com.amazonaws.services.s3.model.ObjectMetadata;
+import com.amazonaws.services.s3.model.S3Object;
+import com.fasterxml.jackson.core.JsonParseException;
+import com.fasterxml.jackson.databind.JsonMappingException;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.google.common.collect.LinkedHashMultimap;
+import com.google.common.collect.Multimap;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
@@ -29,7 +39,6 @@
import java.util.TimerTask;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
-
import org.apache.commons.lang3.StringUtils;
import org.apache.stormcrawler.JSONResource;
import org.apache.stormcrawler.Metadata;
@@ -38,28 +47,15 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.amazonaws.services.s3.AmazonS3;
-import com.amazonaws.services.s3.AmazonS3ClientBuilder;
-import com.amazonaws.services.s3.model.GetObjectRequest;
-import com.amazonaws.services.s3.model.ObjectMetadata;
-import com.amazonaws.services.s3.model.S3Object;
-import com.fasterxml.jackson.core.JsonParseException;
-import com.fasterxml.jackson.databind.JsonMappingException;
-import com.fasterxml.jackson.databind.JsonNode;
-import com.google.common.collect.LinkedHashMultimap;
-import com.google.common.collect.Multimap;
-
/**
- * Version of the FastURLFilter that can load from a text representation instead
- * of the JSON that the SC version handles. Can also reload periodically and get
- * its content from S3.
- *
- * Filters URLs based on a file of regular expressions using host/domains
- * matching first. The default policy is to accept a URL if no matches are
- * found.
+ * Version of the FastURLFilter that can load from a text representation instead of the JSON that
+ * the SC version handles. Can also reload periodically and get its content from S3.
+ *
+ *
Filters URLs based on a file of regular expressions using host/domains matching first. The
+ * default policy is to accept a URL if no matches are found.
+ *
+ *
- *
- * Host rules are evaluated before Domain rules. For
- * Host rules the entire host name of a URL must match while the
- * domain names in Domain rules are considered as matches if the
- * domain is a suffix of the host name (consisting of complete host name parts).
- * Shorter domain suffixes are checked first, a single dot
- * "." as "domain name" can be used to specify
- * global rules applied to every URL.
- *
- * E.g., for "www.example.com" the rules given above are looked up in the
- * following order:
+ *
+ * Host rules are evaluated before Domain rules. For Host
+ * rules the entire host name of a URL must match while the domain names in Domain
+ * rules are considered as matches if the domain is a suffix of the host name (consisting of
+ * complete host name parts). Shorter domain suffixes are checked first, a single dot ".
+ * " as "domain name" can be used to specify global rules applied to every
+ * URL.
+ *
+ *
E.g., for "www.example.com" the rules given above are looked up in the following order:
+ *
*
- *
check "www.example.com" whether host-based rules exist and whether one of
- * them matches
- *
check "www.example.com" for domain-based rules
- *
check "example.com" for domain-based rules
- *
check "com" for domain-based rules
- *
check for global rules ("Domain .")
+ *
check "www.example.com" whether host-based rules exist and whether one of them matches
+ *
check "www.example.com" for domain-based rules
+ *
check "example.com" for domain-based rules
+ *
check "com" for domain-based rules
+ *
check for global rules ("Domain .")
*
- * The first matching rule will reject the URL and no further rules are checked.
- * If no rule matches the URL is accepted. URLs without a host name (e.g.,
- * file:/path/file.txt are checked for global rules only. URLs
- * which fail to be parsed as {@link java.net.URL} are always rejected.
- *
- * For rules either the URL path (DenyPath) or path and query
- * (DenyPathQuery) are checked whether the given
- * {@link java.util.regex Java Regular expression} is found (see
- * {@link java.util.regex.Matcher#find()}) in the URL path (and query).
- *
- * Rules are applied in the order of their definition. For better performance,
- * regular expressions which are simpler/faster or match more URLs should be
- * defined earlier.
- *
- * Comments in the rule file start with the # character and reach
- * until the end of the line.
- *
- * The rules file is defined via the property urlfilter.fast.file,
- * the default name is fast-urlfilter.txt.
+ *
+ * The first matching rule will reject the URL and no further rules are checked. If no rule matches
+ * the URL is accepted. URLs without a host name (e.g., file:/path/file.txt are checked
+ * for global rules only. URLs which fail to be parsed as {@link java.net.URL} are always rejected.
+ *
+ *
For rules either the URL path (DenyPath) or path and query (DenyPathQuery
+ * ) are checked whether the given {@link java.util.regex Java Regular expression} is found
+ * (see {@link java.util.regex.Matcher#find()}) in the URL path (and query).
+ *
+ *
Rules are applied in the order of their definition. For better performance, regular
+ * expressions which are simpler/faster or match more URLs should be defined earlier.
+ *
+ *
Comments in the rule file start with the # character and reach until the end of
+ * the line.
+ *
+ *
The rules file is defined via the property urlfilter.fast.file, the default name
+ * is fast-urlfilter.txt.
*/
-public class FastURLFilter extends URLFilter implements JSONResource {
+public class FastURLFilter extends URLFilter implements JSONResource {
- protected static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+ protected static final Logger LOG =
+ LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public static final String URLFILTER_FAST_FILE = "urlfilter.fast.file";
private Multimap hostRules = LinkedHashMultimap.create();
@@ -121,302 +114,320 @@ public class FastURLFilter extends URLFilter implements JSONResource {
private String resourceFile;
- private static final Pattern CATCH_ALL_RULE = Pattern.compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$");
+ private static final Pattern CATCH_ALL_RULE =
+ Pattern.compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$");
private String resourceETAG;
public void configure(@SuppressWarnings("rawtypes") Map stormConf, JsonNode filterParams) {
- // read from conf first
- int refreshRate = ConfUtils.getInt(stormConf, "fast.urlfilter.refresh", -1);
- this.resourceFile = ConfUtils.getString(stormConf, "fast.urlfilter.file", null);
-
- // then from the param file (which needs recompiling in case of change)
- if (filterParams != null) {
- JsonNode node = filterParams.get("file");
- if (node != null && node.isTextual() && this.resourceFile == null) {
- this.resourceFile = node.asText();
- }
- node = filterParams.get("refresh");
- if (node != null && node.isInt() && refreshRate == -1) {
- refreshRate = node.asInt();
- }
- }
-
- try {
- loadJSONResources();
- } catch (Exception e) {
- LOG.error("Exception while loading resources", e);
- }
-
- if (refreshRate != -1) {
- LOG.info("Filter set to reload from {} every {} sec", getResourceFile(), refreshRate);
- new Timer().schedule(new TimerTask() {
- public void run() {
- LOG.info("Reloading resources");
- try {
- loadJSONResources();
- } catch (Exception e) {
- LOG.error("Can't load resources", e);
- }
- }
- }, refreshRate * 1000, refreshRate * 1000);
- }
+ // read from conf first
+ int refreshRate = ConfUtils.getInt(stormConf, "fast.urlfilter.refresh", -1);
+ this.resourceFile = ConfUtils.getString(stormConf, "fast.urlfilter.file", null);
+
+ // then from the param file (which needs recompiling in case of change)
+ if (filterParams != null) {
+ JsonNode node = filterParams.get("file");
+ if (node != null && node.isTextual() && this.resourceFile == null) {
+ this.resourceFile = node.asText();
+ }
+ node = filterParams.get("refresh");
+ if (node != null && node.isInt() && refreshRate == -1) {
+ refreshRate = node.asInt();
+ }
+ }
+
+ try {
+ loadJSONResources();
+ } catch (Exception e) {
+ LOG.error("Exception while loading resources", e);
+ }
+
+ if (refreshRate != -1) {
+ LOG.info("Filter set to reload from {} every {} sec", getResourceFile(), refreshRate);
+ new Timer()
+ .schedule(
+ new TimerTask() {
+ public void run() {
+ LOG.info("Reloading resources");
+ try {
+ loadJSONResources();
+ } catch (Exception e) {
+ LOG.error("Can't load resources", e);
+ }
+ }
+ },
+ refreshRate * 1000,
+ refreshRate * 1000);
+ }
}
/**
* Load the resources from the JSON file in the uber jar or from S3
- *
+ *
* @throws Exception
- **/
+ */
@Override
public void loadJSONResources() throws Exception {
- InputStream inputStream = null;
- AmazonS3 s3client = null;
- try {
- if (getResourceFile().startsWith("s3://")) {
- // try loading from S3
- s3client = AmazonS3ClientBuilder.standard().build();
- java.net.URI uri = new java.net.URI(getResourceFile());
-
- String bucketName = uri.getHost();
- // remove the first "/"
- String path = uri.getPath().substring(1);
-
- // optimisation - avoid a full reload if the resource has not changed
- ObjectMetadata metadata = s3client.getObjectMetadata(bucketName, path);
- final String ETAG = metadata.getETag();
- if (ETAG != null && ETAG.equals(resourceETAG)) {
- LOG.info("Unchanged ETAG for {} - skipping reload", getResourceFile());
- return;
- } else {
- resourceETAG = ETAG;
- }
-
- final S3Object object = s3client.getObject(new GetObjectRequest(bucketName, path));
- inputStream = object.getObjectContent();
- } else {
- inputStream = getClass().getClassLoader().getResourceAsStream(getResourceFile());
- if (inputStream == null) {
- LOG.error("Can't load conf from {}", getResourceFile());
- return;
- }
- }
- if (getResourceFile().endsWith(".gz")) {
- inputStream = new GZIPInputStream(inputStream);
- }
-
- loadJSONResources(new BufferedInputStream(inputStream));
- } finally {
- if (inputStream != null) {
- inputStream.close();
- }
- if (s3client != null) {
- s3client.shutdown();
- }
- }
+ InputStream inputStream = null;
+ AmazonS3 s3client = null;
+ try {
+ if (getResourceFile().startsWith("s3://")) {
+ // try loading from S3
+ s3client = AmazonS3ClientBuilder.standard().build();
+ java.net.URI uri = new java.net.URI(getResourceFile());
+
+ String bucketName = uri.getHost();
+ // remove the first "/"
+ String path = uri.getPath().substring(1);
+
+ // optimisation - avoid a full reload if the resource has not changed
+ ObjectMetadata metadata = s3client.getObjectMetadata(bucketName, path);
+ final String ETAG = metadata.getETag();
+ if (ETAG != null && ETAG.equals(resourceETAG)) {
+ LOG.info("Unchanged ETAG for {} - skipping reload", getResourceFile());
+ return;
+ } else {
+ resourceETAG = ETAG;
+ }
+
+ final S3Object object = s3client.getObject(new GetObjectRequest(bucketName, path));
+ inputStream = object.getObjectContent();
+ } else {
+ inputStream = getClass().getClassLoader().getResourceAsStream(getResourceFile());
+ if (inputStream == null) {
+ LOG.error("Can't load conf from {}", getResourceFile());
+ return;
+ }
+ }
+ if (getResourceFile().endsWith(".gz")) {
+ inputStream = new GZIPInputStream(inputStream);
+ }
+
+ loadJSONResources(new BufferedInputStream(inputStream));
+ } finally {
+ if (inputStream != null) {
+ inputStream.close();
+ }
+ if (s3client != null) {
+ s3client.shutdown();
+ }
+ }
}
@Override
public void loadJSONResources(InputStream inputStream)
- throws JsonParseException, JsonMappingException, IOException {
- long start = System.currentTimeMillis();
-
- try (Reader r = new InputStreamReader(inputStream)) {
- reloadRules(r);
- }
-
- long end = System.currentTimeMillis();
- LOG.info("Loaded {} hostrules and {} domain rules in {} msec from {}", hostRules.size(), domainRules.size(),
- (end - start), resourceFile);
+ throws JsonParseException, JsonMappingException, IOException {
+ long start = System.currentTimeMillis();
+
+ try (Reader r = new InputStreamReader(inputStream)) {
+ reloadRules(r);
+ }
+
+ long end = System.currentTimeMillis();
+ LOG.info(
+ "Loaded {} hostrules and {} domain rules in {} msec from {}",
+ hostRules.size(),
+ domainRules.size(),
+ (end - start),
+ resourceFile);
}
@Override
public String getResourceFile() {
- return resourceFile;
+ return resourceFile;
}
@Override
public String filter(URL sourceUrl, Metadata sourceMetadata, String urlToFilter) {
- synchronized (this) {
- URL u;
-
- try {
- u = new URL(urlToFilter);
- } catch (Exception e) {
- LOG.debug("Rejected {} because failed to parse as URL: {}", urlToFilter, e.getMessage());
- return null;
- }
-
- String hostname = u.getHost();
-
- // first check for host-specific rules
- for (Rule rule : hostRules.get(hostname)) {
- if (rule.match(u)) {
- return null;
- }
- }
-
- // also look up domain rules for host name
- for (Rule rule : domainRules.get(hostname)) {
- if (rule.match(u)) {
- return null;
- }
- }
-
- // check suffixes of host name from longer to shorter:
- // subdomains, domain, top-level domain
- int start = 0;
- int pos;
- while ((pos = hostname.indexOf('.', start)) != -1) {
- start = pos + 1;
- String domain = hostname.substring(start);
- for (Rule rule : domainRules.get(domain)) {
- if (rule.match(u)) {
- return null;
- }
- }
- }
-
- // finally check "global" rules defined for `Domain .`
- for (Rule rule : domainRules.get(".")) {
- if (rule.match(u)) {
- return null;
- }
- }
-
- // no reject rules found
- return urlToFilter;
- }
+ synchronized (this) {
+ URL u;
+
+ try {
+ u = new URL(urlToFilter);
+ } catch (Exception e) {
+ LOG.debug(
+ "Rejected {} because failed to parse as URL: {}",
+ urlToFilter,
+ e.getMessage());
+ return null;
+ }
+
+ String hostname = u.getHost();
+
+ // first check for host-specific rules
+ for (Rule rule : hostRules.get(hostname)) {
+ if (rule.match(u)) {
+ return null;
+ }
+ }
+
+ // also look up domain rules for host name
+ for (Rule rule : domainRules.get(hostname)) {
+ if (rule.match(u)) {
+ return null;
+ }
+ }
+
+ // check suffixes of host name from longer to shorter:
+ // subdomains, domain, top-level domain
+ int start = 0;
+ int pos;
+ while ((pos = hostname.indexOf('.', start)) != -1) {
+ start = pos + 1;
+ String domain = hostname.substring(start);
+ for (Rule rule : domainRules.get(domain)) {
+ if (rule.match(u)) {
+ return null;
+ }
+ }
+ }
+
+ // finally check "global" rules defined for `Domain .`
+ for (Rule rule : domainRules.get(".")) {
+ if (rule.match(u)) {
+ return null;
+ }
+ }
+
+ // no reject rules found
+ return urlToFilter;
+ }
}
private void reloadRules(Reader rules) throws IOException {
- synchronized (this) {
- domainRules.clear();
- hostRules.clear();
-
- BufferedReader reader = new BufferedReader(rules);
-
- String current = null;
- boolean host = false;
- int lineno = 0;
-
- String line;
- try {
- while ((line = reader.readLine()) != null) {
- lineno++;
- line = line.trim();
-
- if (line.indexOf("#") != -1) {
- // strip comments
- line = line.substring(0, line.indexOf("#")).trim();
- }
-
- if (StringUtils.isBlank(line)) {
- continue;
- }
-
- if (line.startsWith("Host")) {
- host = true;
- current = line.split("\\s+")[1];
- } else if (line.startsWith("Domain")) {
- host = false;
- current = line.split("\\s+")[1];
- } else {
- if (current == null) {
- continue;
- }
-
- Rule rule = null;
- try {
- if (CATCH_ALL_RULE.matcher(line).matches()) {
- rule = DenyAllRule.getInstance();
- } else if (line.startsWith("DenyPathQuery")) {
- rule = new DenyPathQueryRule(line.split("\\s+")[1]);
- } else if (line.startsWith("DenyPath")) {
- rule = new DenyPathRule(line.split("\\s+")[1]);
- } else {
- LOG.warn("Problem reading rule on line {}: {}", lineno, line);
- continue;
- }
- } catch (Exception e) {
- LOG.warn("Problem reading rule on line {}: {} - {}", lineno, line, e.getMessage());
- continue;
- }
-
- if (host) {
- LOG.trace("Adding host rule [{}] [{}]", current, rule);
- hostRules.put(current, rule);
- } else {
- LOG.trace("Adding domain rule [{}] [{}]", current, rule);
- domainRules.put(current, rule);
- }
- }
- }
-
- } catch (IOException e) {
- LOG.warn("Caught exception while reading rules file at line {}: {}", lineno, e.getMessage());
- throw e;
- }
- }
+ synchronized (this) {
+ domainRules.clear();
+ hostRules.clear();
+
+ BufferedReader reader = new BufferedReader(rules);
+
+ String current = null;
+ boolean host = false;
+ int lineno = 0;
+
+ String line;
+ try {
+ while ((line = reader.readLine()) != null) {
+ lineno++;
+ line = line.trim();
+
+ if (line.indexOf("#") != -1) {
+ // strip comments
+ line = line.substring(0, line.indexOf("#")).trim();
+ }
+
+ if (StringUtils.isBlank(line)) {
+ continue;
+ }
+
+ if (line.startsWith("Host")) {
+ host = true;
+ current = line.split("\\s+")[1];
+ } else if (line.startsWith("Domain")) {
+ host = false;
+ current = line.split("\\s+")[1];
+ } else {
+ if (current == null) {
+ continue;
+ }
+
+ Rule rule = null;
+ try {
+ if (CATCH_ALL_RULE.matcher(line).matches()) {
+ rule = DenyAllRule.getInstance();
+ } else if (line.startsWith("DenyPathQuery")) {
+ rule = new DenyPathQueryRule(line.split("\\s+")[1]);
+ } else if (line.startsWith("DenyPath")) {
+ rule = new DenyPathRule(line.split("\\s+")[1]);
+ } else {
+ LOG.warn("Problem reading rule on line {}: {}", lineno, line);
+ continue;
+ }
+ } catch (Exception e) {
+ LOG.warn(
+ "Problem reading rule on line {}: {} - {}",
+ lineno,
+ line,
+ e.getMessage());
+ continue;
+ }
+
+ if (host) {
+ LOG.trace("Adding host rule [{}] [{}]", current, rule);
+ hostRules.put(current, rule);
+ } else {
+ LOG.trace("Adding domain rule [{}] [{}]", current, rule);
+ domainRules.put(current, rule);
+ }
+ }
+ }
+
+ } catch (IOException e) {
+ LOG.warn(
+ "Caught exception while reading rules file at line {}: {}",
+ lineno,
+ e.getMessage());
+ throw e;
+ }
+ }
}
public static class Rule {
- protected Pattern pattern;
+ protected Pattern pattern;
- Rule() {
- }
+ Rule() {}
- public Rule(String regex) {
- pattern = Pattern.compile(regex);
- }
+ public Rule(String regex) {
+ pattern = Pattern.compile(regex);
+ }
- public boolean match(URL url) {
- return pattern.matcher(url.toString()).find();
- }
+ public boolean match(URL url) {
+ return pattern.matcher(url.toString()).find();
+ }
- public String toString() {
- return pattern.toString();
- }
+ public String toString() {
+ return pattern.toString();
+ }
}
public static class DenyPathRule extends Rule {
- public DenyPathRule(String regex) {
- super(regex);
- }
-
- public boolean match(URL url) {
- String haystack = url.getPath();
- return pattern.matcher(haystack).find();
- }
+ public DenyPathRule(String regex) {
+ super(regex);
+ }
+
+ public boolean match(URL url) {
+ String haystack = url.getPath();
+ return pattern.matcher(haystack).find();
+ }
}
/** Rule for DenyPath .* or DenyPath .? */
public static class DenyAllRule extends Rule {
- private static Rule instance = new DenyAllRule(".");
+ private static Rule instance = new DenyAllRule(".");
- private DenyAllRule(String regex) {
- super(regex);
- }
+ private DenyAllRule(String regex) {
+ super(regex);
+ }
- public static Rule getInstance() {
- return instance;
- }
+ public static Rule getInstance() {
+ return instance;
+ }
- public boolean match(URL url) {
- return true;
- }
+ public boolean match(URL url) {
+ return true;
+ }
}
public static class DenyPathQueryRule extends Rule {
- public DenyPathQueryRule(String regex) {
- super(regex);
- }
-
- public boolean match(URL url) {
- String haystack = url.getFile();
- return pattern.matcher(haystack).find();
- }
+ public DenyPathQueryRule(String regex) {
+ super(regex);
+ }
+
+ public boolean match(URL url) {
+ String haystack = url.getFile();
+ return pattern.matcher(haystack).find();
+ }
}
}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/ContentDetector.java b/src/main/java/org/commoncrawl/stormcrawler/news/ContentDetector.java
index 1c3a4a5..9979781 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/ContentDetector.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/ContentDetector.java
@@ -13,33 +13,28 @@
*/
package org.commoncrawl.stormcrawler.news;
+import com.google.common.primitives.Bytes;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
-import com.google.common.primitives.Bytes;
-
public class ContentDetector {
protected byte[][][] clues;
protected int maxOffset;
/**
- * Set up detector to detect content sniffing for a set of clue strings in a
- * prefix of the binary content.
+ * Set up detector to detect content sniffing for a set of clue strings in a prefix of the
+ * binary content.
*
- * @param clues
- * nested list of literal clues. Outer list defines an OR-group,
- * inner list contained ANDed clues required to match all, e.g.
- * the following definition would match if either
- * "clue1" and "and_clue2" are matched, or
- * alternatively "or_clue3" is found
+ * @param clues nested list of literal clues. Outer list defines an OR-group, inner list
+ * contains ANDed clues required to match all, e.g. the following definition would match if
+ * either "clue1" and "and_clue2" are matched, or alternatively
+ * "or_clue3" is found
+ *
+ * { { clue1, and_clue2 }, { or_clue3 } }
+ *
*
- *
- * { { clue1, and_clue2 }, { or_clue3 } }
- *
- *
- * @param maxOffset
- * max. offset of content prefix checked for clues
+ * @param maxOffset max. offset of content prefix checked for clues
*/
public ContentDetector(String[][] clues, int maxOffset) {
this.maxOffset = maxOffset;
@@ -60,8 +55,7 @@ public int getFirstMatch(byte[] content) {
for (int i = 0; i < clues.length; i++) {
byte[][] group = clues[i];
for (byte[] clue : group) {
- if (Bytes.indexOf(beginning, clue) == -1)
- continue OR;
+ if (Bytes.indexOf(beginning, clue) == -1) continue OR;
}
// success, all members of one group matched
return i;
@@ -72,5 +66,4 @@ public int getFirstMatch(byte[] content) {
public boolean matches(byte[] content) {
return (getFirstMatch(content) >= 0);
}
-
}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java
index 3a3c018..cc2d85a 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -14,13 +14,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package org.commoncrawl.stormcrawler.news;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
-
import org.apache.storm.topology.BoltDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
@@ -43,116 +41,126 @@
import org.apache.stormcrawler.warc.WARCHdfsBolt;
import org.slf4j.LoggerFactory;
-/**
- * Dummy topology to play with the spouts and bolts on OpenSearch
- */
+/** Dummy topology to play with the spouts and bolts on OpenSearch */
public class CrawlTopology extends ConfigurableTopology {
private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(CrawlTopology.class);
public static void main(String[] args) throws Exception {
- ConfigurableTopology.start(new CrawlTopology(), args);
+ ConfigurableTopology.start(new CrawlTopology(), args);
}
@Override
protected int run(String[] args) {
- TopologyBuilder builder = new TopologyBuilder();
+ TopologyBuilder builder = new TopologyBuilder();
- int numWorkers = ConfUtils.getInt(getConf(), "topology.workers", 1);
+ int numWorkers = ConfUtils.getInt(getConf(), "topology.workers", 1);
- // set to the real number of shards ONLY if es.status.routing is set to
- // true in the configuration
- int numShards = 16;
+ // set to the real number of shards ONLY if es.status.routing is set to
+ // true in the configuration
+ int numShards = 16;
- if (args.length >= 2) {
- // arguments include seed directory and file pattern
- LOG.info("Injecting seeds from {} by pattern {}", args[0], args[1]);
- builder.setSpout("filespout", new FileSpout(args[0], args[1], true));
- Fields key = new Fields("url");
+ if (args.length >= 2) {
+ // arguments include seed directory and file pattern
+ LOG.info("Injecting seeds from {} by pattern {}", args[0], args[1]);
+ builder.setSpout("filespout", new FileSpout(args[0], args[1], true));
+ Fields key = new Fields("url");
- builder.setBolt("filter", new URLFilterBolt()).fieldsGrouping("filespout", Constants.StatusStreamName, key);
- }
+ builder.setBolt("filter", new URLFilterBolt())
+ .fieldsGrouping("filespout", Constants.StatusStreamName, key);
+ }
- builder.setSpout("spout", new AggregationSpout(), numShards);
+ builder.setSpout("spout", new AggregationSpout(), numShards);
- builder.setBolt("prefilter", new PreFilterBolt("pre-urlfilters.json"), numWorkers).shuffleGrouping("spout");
+ builder.setBolt("prefilter", new PreFilterBolt("pre-urlfilters.json"), numWorkers)
+ .shuffleGrouping("spout");
- builder.setBolt("partitioner", new URLPartitionerBolt(), numWorkers).shuffleGrouping("prefilter");
+ builder.setBolt("partitioner", new URLPartitionerBolt(), numWorkers)
+ .shuffleGrouping("prefilter");
- builder.setBolt("fetch", new FetcherBolt(), numWorkers).fieldsGrouping("partitioner", new Fields("key"));
+ builder.setBolt("fetch", new FetcherBolt(), numWorkers)
+ .fieldsGrouping("partitioner", new Fields("key"));
- builder.setBolt("sitemap", new NewsSiteMapParserBolt(), numWorkers).setNumTasks(2)
- .localOrShuffleGrouping("fetch");
+ builder.setBolt("sitemap", new NewsSiteMapParserBolt(), numWorkers)
+ .setNumTasks(2)
+ .localOrShuffleGrouping("fetch");
- builder.setBolt("feed", new FeedParserBolt(), numWorkers).setNumTasks(4).localOrShuffleGrouping("sitemap");
+ builder.setBolt("feed", new FeedParserBolt(), numWorkers)
+ .setNumTasks(4)
+ .localOrShuffleGrouping("sitemap");
- // don't need to parse the pages but need to update their status
- builder.setBolt("ssb", new DummyIndexer(), numWorkers).localOrShuffleGrouping("feed");
+ // don't need to parse the pages but need to update their status
+ builder.setBolt("ssb", new DummyIndexer(), numWorkers).localOrShuffleGrouping("feed");
- WARCHdfsBolt warcbolt = getWarcBolt("CC-NEWS");
+ WARCHdfsBolt warcbolt = getWarcBolt("CC-NEWS");
- // take it from feed default output so that the feed files themselves
- // don't get included - unless we want them too of course!
- builder.setBolt("warc", warcbolt, numWorkers).localOrShuffleGrouping("feed");
-
- final Fields furl = new Fields("url");
+ // take it from feed default output so that the feed files themselves
+ // don't get included - unless we want them too of course!
+ builder.setBolt("warc", warcbolt, numWorkers).localOrShuffleGrouping("feed");
- BoltDeclarer statusBolt = builder.setBolt("status", new StatusUpdaterBolt(), numWorkers)
- .fieldsGrouping("fetch", Constants.StatusStreamName, furl)
- .fieldsGrouping("sitemap", Constants.StatusStreamName, furl)
- .fieldsGrouping("feed", Constants.StatusStreamName, furl)
- .fieldsGrouping("ssb", Constants.StatusStreamName, furl)
- .fieldsGrouping("prefilter", Constants.StatusStreamName, furl);
-
- if (args.length >= 2) {
- statusBolt.customGrouping("filter", Constants.StatusStreamName, new URLStreamGrouping());
- }
- statusBolt.setNumTasks(numShards);
+ final Fields furl = new Fields("url");
- return submit(conf, builder);
+ BoltDeclarer statusBolt =
+ builder.setBolt("status", new StatusUpdaterBolt(), numWorkers)
+ .fieldsGrouping("fetch", Constants.StatusStreamName, furl)
+ .fieldsGrouping("sitemap", Constants.StatusStreamName, furl)
+ .fieldsGrouping("feed", Constants.StatusStreamName, furl)
+ .fieldsGrouping("ssb", Constants.StatusStreamName, furl)
+ .fieldsGrouping("prefilter", Constants.StatusStreamName, furl);
+
+ if (args.length >= 2) {
+ statusBolt.customGrouping(
+ "filter", Constants.StatusStreamName, new URLStreamGrouping());
+ }
+ statusBolt.setNumTasks(numShards);
+
+ return submit(conf, builder);
}
protected WARCHdfsBolt getWarcBolt(String filePrefix) {
- // path is absolute
- String warcFilePath = ConfUtils.getString(getConf(), "warc.dir", "/data/warc");
-
- WARCFileNameFormat fileNameFormat = new WARCFileNameFormat();
- fileNameFormat.withPath(warcFilePath);
- fileNameFormat.withPrefix(filePrefix);
-
- Map fields = new LinkedHashMap<>();
- fields.put("software", "StormCrawler 2.10 https://stormcrawler.net/");
- fields.put("description", "News crawl for Common Crawl");
- String userAgent = AbstractHttpProtocol.getAgentString(getConf());
- fields.put("http-header-user-agent", userAgent);
- fields.put("http-header-from", ConfUtils.getString(getConf(), "http.agent.email"));
- String robotsTxtParser = "checked by crawler-commons " + crawlercommons.CrawlerCommons.getVersion()
- + " (https://github.com/crawler-commons/crawler-commons)";
- fields.put("robots", robotsTxtParser);
- fields.put("format", "WARC File Format 1.1");
- fields.put("conformsTo", "https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/");
-
- WARCHdfsBolt warcbolt = (WARCHdfsBolt) new WARCHdfsBolt();
- warcbolt.withConfigKey("warc");
- warcbolt.withFileNameFormat(fileNameFormat);
- warcbolt.withHeader(fields);
- warcbolt.withRequestRecords();
-
- // use RawLocalFileSystem (instead of ChecksumFileSystem) to avoid that
- // WARC files are truncated if the topology is stopped because of a
- // delayed sync of the default ChecksumFileSystem
- Map hdfsConf = new HashMap<>();
- hdfsConf.put("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
- getConf().put("warc", hdfsConf);
-
- // will rotate if reaches size or time limit
- int maxMB = ConfUtils.getInt(getConf(), "warc.rotation.policy.max-mb", 1024);
- int maxMinutes = ConfUtils.getInt(getConf(), "warc.rotation.policy.max-minutes", 1440);
- FileTimeSizeRotationPolicy rotpol = new FileTimeSizeRotationPolicy(maxMB, Units.MB);
- rotpol.setTimeRotationInterval(maxMinutes, FileTimeSizeRotationPolicy.TimeUnit.MINUTES);
- warcbolt.withRotationPolicy(rotpol);
-
- return warcbolt;
+ // path is absolute
+ String warcFilePath = ConfUtils.getString(getConf(), "warc.dir", "/data/warc");
+
+ WARCFileNameFormat fileNameFormat = new WARCFileNameFormat();
+ fileNameFormat.withPath(warcFilePath);
+ fileNameFormat.withPrefix(filePrefix);
+
+ Map fields = new LinkedHashMap<>();
+ fields.put("software", "StormCrawler 2.10 https://stormcrawler.net/");
+ fields.put("description", "News crawl for Common Crawl");
+ String userAgent = AbstractHttpProtocol.getAgentString(getConf());
+ fields.put("http-header-user-agent", userAgent);
+ fields.put("http-header-from", ConfUtils.getString(getConf(), "http.agent.email"));
+ String robotsTxtParser =
+ "checked by crawler-commons "
+ + crawlercommons.CrawlerCommons.getVersion()
+ + " (https://github.com/crawler-commons/crawler-commons)";
+ fields.put("robots", robotsTxtParser);
+ fields.put("format", "WARC File Format 1.1");
+ fields.put(
+ "conformsTo",
+ "https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/");
+
+ WARCHdfsBolt warcbolt = (WARCHdfsBolt) new WARCHdfsBolt();
+ warcbolt.withConfigKey("warc");
+ warcbolt.withFileNameFormat(fileNameFormat);
+ warcbolt.withHeader(fields);
+ warcbolt.withRequestRecords();
+
+ // use RawLocalFileSystem (instead of ChecksumFileSystem) to avoid that
+ // WARC files are truncated if the topology is stopped because of a
+ // delayed sync of the default ChecksumFileSystem
+ Map hdfsConf = new HashMap<>();
+ hdfsConf.put("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
+ getConf().put("warc", hdfsConf);
+
+ // will rotate if reaches size or time limit
+ int maxMB = ConfUtils.getInt(getConf(), "warc.rotation.policy.max-mb", 1024);
+ int maxMinutes = ConfUtils.getInt(getConf(), "warc.rotation.policy.max-minutes", 1440);
+ FileTimeSizeRotationPolicy rotpol = new FileTimeSizeRotationPolicy(maxMB, Units.MB);
+ rotpol.setTimeRotationInterval(maxMinutes, FileTimeSizeRotationPolicy.TimeUnit.MINUTES);
+ warcbolt.withRotationPolicy(rotpol);
+
+ return warcbolt;
}
-
}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java
index c365525..e607dc2 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java
@@ -14,13 +14,11 @@
package org.commoncrawl.stormcrawler.news;
import java.util.Map;
-
+import org.apache.http.HttpHeaders;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
-import org.slf4j.LoggerFactory;
-
import org.apache.stormcrawler.Constants;
import org.apache.stormcrawler.Metadata;
import org.apache.stormcrawler.bolt.FeedParserBolt;
@@ -29,28 +27,23 @@
import org.apache.stormcrawler.parse.ParseFilters;
import org.apache.stormcrawler.parse.ParseResult;
import org.apache.stormcrawler.persistence.Status;
-import org.apache.http.HttpHeaders;
+import org.slf4j.LoggerFactory;
/** Detect RSS and Atom feeds, but do not parse and extract links */
@SuppressWarnings("serial")
public class FeedDetectorBolt extends FeedParserBolt {
- private static final org.slf4j.Logger LOG = LoggerFactory
- .getLogger(FeedDetectorBolt.class);
+ private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(FeedDetectorBolt.class);
- public static final String[] mimeTypeClues = {
- "rss+xml", "atom+xml", "text/rss"
- };
+ public static final String[] mimeTypeClues = {"rss+xml", "atom+xml", "text/rss"};
- public static String[][] contentClues = { { "<{}> for {}",
- ct, url);
+ LOG.info("Feed detected from content type <{}> for {}", ct, url);
break;
}
}
@@ -90,8 +82,8 @@ public void execute(Tuple tuple) {
parseData.setMetadata(metadata);
parseFilters.filter(url, content, null, parse);
// emit status
- collector.emit(Constants.StatusStreamName, tuple,
- new Values(url, metadata, Status.FETCHED));
+ collector.emit(
+ Constants.StatusStreamName, tuple, new Values(url, metadata, Status.FETCHED));
} else {
// pass on
collector.emit(tuple, tuple.getValues());
@@ -100,11 +92,9 @@ public void execute(Tuple tuple) {
}
@Override
- @SuppressWarnings({ "rawtypes" })
- public void prepare(Map stormConf, TopologyContext context,
- OutputCollector collect) {
+ @SuppressWarnings({"rawtypes"})
+ public void prepare(Map stormConf, TopologyContext context, OutputCollector collect) {
super.prepare(stormConf, context, collect);
parseFilters = ParseFilters.fromConf(stormConf);
}
-
}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java
index 3c4cf55..187d086 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java
@@ -15,6 +15,18 @@
import static org.apache.stormcrawler.Constants.StatusStreamName;
+import crawlercommons.sitemaps.AbstractSiteMap;
+import crawlercommons.sitemaps.Namespace;
+import crawlercommons.sitemaps.SiteMap;
+import crawlercommons.sitemaps.SiteMapIndex;
+import crawlercommons.sitemaps.SiteMapParser;
+import crawlercommons.sitemaps.SiteMapURL;
+import crawlercommons.sitemaps.SiteMapURL.ChangeFrequency;
+import crawlercommons.sitemaps.UnknownFormatException;
+import crawlercommons.sitemaps.extension.Extension;
+import crawlercommons.sitemaps.extension.ExtensionMetadata;
+import crawlercommons.sitemaps.extension.LinkAttributes;
+import crawlercommons.sitemaps.extension.NewsAttributes;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
@@ -24,7 +36,6 @@
import java.util.Iterator;
import java.util.List;
import java.util.Map;
-
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHeaders;
import org.apache.storm.metric.api.MeanReducer;
@@ -46,56 +57,45 @@
import org.apache.stormcrawler.util.ConfUtils;
import org.slf4j.LoggerFactory;
-import crawlercommons.sitemaps.AbstractSiteMap;
-import crawlercommons.sitemaps.Namespace;
-import crawlercommons.sitemaps.SiteMap;
-import crawlercommons.sitemaps.SiteMapIndex;
-import crawlercommons.sitemaps.SiteMapParser;
-import crawlercommons.sitemaps.SiteMapURL;
-import crawlercommons.sitemaps.SiteMapURL.ChangeFrequency;
-import crawlercommons.sitemaps.UnknownFormatException;
-import crawlercommons.sitemaps.extension.Extension;
-import crawlercommons.sitemaps.extension.ExtensionMetadata;
-import crawlercommons.sitemaps.extension.LinkAttributes;
-import crawlercommons.sitemaps.extension.NewsAttributes;
-
-
/**
- * ParserBolt for news
+ * ParserBolt for news
* sitemaps.
*/
@SuppressWarnings("serial")
public class NewsSiteMapParserBolt extends SiteMapParserBolt {
// TODO:
- // this is a modified copy of c.d.s.bolt.SiteMapParserBolt
- // - make parent class extensible and overridable
- // modifications:
- // - detect and process only Google news sitemaps
- // - or a sitemapindex because some subsitemaps may
- // be news sitemaps
- // - pass "isSitemapNews" to status metadata
+ // this is a modified copy of c.d.s.bolt.SiteMapParserBolt
+ // - make parent class extensible and overridable
+ // modifications:
+ // - detect and process only Google news sitemaps
+ // - or a sitemapindex because some subsitemaps may
+ // be news sitemaps
+ // - pass "isSitemapNews" to status metadata
public static enum SitemapType {
- NEWS, INDEX, SITEMAP
+ NEWS,
+ INDEX,
+ SITEMAP
}
public static final String isSitemapNewsKey = "isSitemapNews";
public static final String isSitemapIndexKey = "isSitemapIndex";
+
/**
- * A sitemap (not necessarily a news sitemap) which is verified to contain
- * links to news articles. Necessary to crawl news sites which provide a
- * sitemap but neither a news feed or sitemap.
+ * A sitemap (not necessarily a news sitemap) which is verified to contain links to news
+ * articles. Necessary to crawl news sites which provide a sitemap but neither a news feed
+ * nor a news sitemap.
*/
public static final String isSitemapVerifiedKey = "isSitemapVerified";
- private static final org.slf4j.Logger LOG = LoggerFactory
- .getLogger(NewsSiteMapParserBolt.class);
+ private static final org.slf4j.Logger LOG =
+ LoggerFactory.getLogger(NewsSiteMapParserBolt.class);
/* content clues for news sitemaps, sitemap indexes or any sitemaps */
public static String[][] contentClues;
public static int contentCluesSitemapNewsMatchUpTo = -1;
public static int contentCluesSitemapIndexMatchUpTo = -1;
+
static {
int cluesSize = Namespace.NEWS.length + 1 + 1 + Namespace.SITEMAP_LEGACY.length;
contentClues = new String[cluesSize][1];
@@ -129,7 +129,7 @@ public static enum SitemapType {
private ReducedMetric averagedMetrics;
- /** Delay in minutes used for scheduling sub-sitemaps **/
+ /** Delay in minutes used for scheduling sub-sitemaps */
private int scheduleSitemapsWithDelay = -1;
@Override
@@ -140,14 +140,10 @@ public void execute(Tuple tuple) {
byte[] content = tuple.getBinaryByField("content");
String url = tuple.getStringByField("url");
- boolean isSitemap = Boolean.valueOf(
- metadata.getFirstValue(SiteMapParserBolt.isSitemapKey));
- boolean isNewsSitemap = Boolean
- .valueOf(metadata.getFirstValue(isSitemapNewsKey));
- boolean isSitemapIndex = Boolean
- .valueOf(metadata.getFirstValue(isSitemapIndexKey));
- boolean isSitemapVerified = Boolean
- .valueOf(metadata.getFirstValue(isSitemapVerifiedKey));
+ boolean isSitemap = Boolean.valueOf(metadata.getFirstValue(SiteMapParserBolt.isSitemapKey));
+ boolean isNewsSitemap = Boolean.valueOf(metadata.getFirstValue(isSitemapNewsKey));
+ boolean isSitemapIndex = Boolean.valueOf(metadata.getFirstValue(isSitemapIndexKey));
+ boolean isSitemapVerified = Boolean.valueOf(metadata.getFirstValue(isSitemapVerifiedKey));
if (sniffContent) {
SitemapType type = detectContent(url, content);
@@ -183,14 +179,16 @@ public void execute(Tuple tuple) {
if (isNewsSitemap || isSitemapIndex || isSitemapVerified) {
/*
- * remove the isSitemap key from metadata to avoid that the default
- * sitemap fetch interval is applied to news sitemaps, sitemap
- * indexes and verified sitemaps
+ * remove the isSitemap key from metadata to avoid that the default sitemap
+ * fetch interval is applied to news sitemaps, sitemap indexes and verified
+ * sitemaps
*/
metadata.remove(isSitemapKey);
} else {
if (isSitemap) {
- collector.emit(Constants.StatusStreamName, tuple,
+ collector.emit(
+ Constants.StatusStreamName,
+ tuple,
new Values(url, metadata, Status.FETCHED));
} else {
// not a sitemap, just pass it on
@@ -217,8 +215,8 @@ public void execute(Tuple tuple) {
metadata.setValue(Constants.STATUS_ERROR_SOURCE, "sitemap parsing");
metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
metadata.remove("numLinks");
- collector.emit(Constants.StatusStreamName, tuple, new Values(url,
- metadata, Status.ERROR));
+ collector.emit(
+ Constants.StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
collector.ack(tuple);
return;
}
@@ -232,15 +230,12 @@ public void execute(Tuple tuple) {
parseFilters.filter(url, content, null, parse);
} catch (RuntimeException e) {
- String errorMessage = "Exception while running parse filters on "
- + url + ": " + e;
+ String errorMessage = "Exception while running parse filters on " + url + ": " + e;
LOG.error(errorMessage);
- metadata.setValue(Constants.STATUS_ERROR_SOURCE,
- "content filtering");
+ metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content filtering");
metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
metadata.remove("numLinks");
- collector.emit(StatusStreamName, tuple, new Values(url, metadata,
- Status.ERROR));
+ collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
collector.ack(tuple);
return;
}
@@ -263,8 +258,7 @@ public void execute(Tuple tuple) {
ol.getMetadata().setValue(isSitemapVerifiedKey, "true");
}
}
- Values v = new Values(ol.getTargetURL(), ol.getMetadata(),
- Status.DISCOVERED);
+ Values v = new Values(ol.getTargetURL(), ol.getMetadata(), Status.DISCOVERED);
collector.emit(Constants.StatusStreamName, tuple, v);
}
@@ -272,8 +266,8 @@ public void execute(Tuple tuple) {
metadata.setValue("numLinks", String.valueOf(outlinks.size()));
// marking the main URL as successfully fetched
- collector.emit(Constants.StatusStreamName, tuple, new Values(url,
- metadata, Status.FETCHED));
+ collector.emit(
+ Constants.StatusStreamName, tuple, new Values(url, metadata, Status.FETCHED));
collector.ack(tuple);
}
@@ -291,12 +285,10 @@ public SitemapType detectContent(String url, byte[] content) {
if (match >= 0) {
// a sitemap, need to detect type of sitemap
if (match <= contentCluesSitemapNewsMatchUpTo) {
- LOG.info("{} detected as news sitemap based on content",
- url);
+ LOG.info("{} detected as news sitemap based on content", url);
return SitemapType.NEWS;
} else if (match <= contentCluesSitemapIndexMatchUpTo) {
- LOG.info("{} detected as sitemap index based on content",
- url);
+ LOG.info("{} detected as sitemap index based on content", url);
return SitemapType.INDEX;
} else {
return SitemapType.SITEMAP;
@@ -317,12 +309,15 @@ private boolean recentlyModified(Date lastModified) {
return true;
}
- protected AbstractSiteMap parseSiteMap(String url, byte[] content,
- String contentType, Metadata parentMetadata, List links)
+ protected AbstractSiteMap parseSiteMap(
+ String url,
+ byte[] content,
+ String contentType,
+ Metadata parentMetadata,
+ List links)
throws UnknownFormatException, IOException {
- SiteMapParser parser = new SiteMapParser(strictModeSitemaps,
- allowPartialSitemaps);
+ SiteMapParser parser = new SiteMapParser(strictModeSitemaps, allowPartialSitemaps);
parser.setStrictNamespace(true);
parser.addAcceptedNamespace(Namespace.SITEMAP_LEGACY);
parser.addAcceptedNamespace(Namespace.EMPTY);
@@ -334,8 +329,7 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
long start = System.currentTimeMillis();
AbstractSiteMap siteMap;
// let the parser guess what the mimetype is
- if (StringUtils.isBlank(contentType)
- || contentType.contains("octet-stream")) {
+ if (StringUtils.isBlank(contentType) || contentType.contains("octet-stream")) {
siteMap = parser.parseSiteMap(content, sURL);
} else {
siteMap = parser.parseSiteMap(contentType, content, sURL);
@@ -351,8 +345,8 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
Collection subsitemaps = smi.getSitemaps();
int delay = 0;
/*
- * keep the subsitemaps as outlinks they will be fetched and parsed
- * in the following steps
+ * keep the subsitemaps as outlinks they will be fetched and parsed in the
+ * following steps
*/
Iterator iter = subsitemaps.iterator();
while (iter.hasNext()) {
@@ -365,13 +359,21 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
linksSkippedNotRecentlyModified++;
LOG.debug(
"{} has a modified date {} which is more than {} hours old",
- target, lastModified.toString(),
+ target,
+ lastModified.toString(),
filterHoursSinceModified);
continue;
}
- Outlink ol = filterOutlink(sURL, target, parentMetadata,
- isSitemapKey, "true", isSitemapNewsKey, "false");
+ Outlink ol =
+ filterOutlink(
+ sURL,
+ target,
+ parentMetadata,
+ isSitemapKey,
+ "true",
+ isSitemapNewsKey,
+ "false");
if (ol == null) {
continue;
}
@@ -379,9 +381,8 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
// add a delay
if (this.scheduleSitemapsWithDelay > 0) {
if (delay > 0) {
- ol.getMetadata().setValue(
- DefaultScheduler.DELAY_METADATA,
- Integer.toString(delay));
+ ol.getMetadata()
+ .setValue(DefaultScheduler.DELAY_METADATA, Integer.toString(delay));
}
delay += this.scheduleSitemapsWithDelay;
}
@@ -389,15 +390,19 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
links.add(ol);
LOG.debug("{} : [sitemap] {}", url, target);
}
- LOG.info("Sitemap index (found {} sitemaps, {} skipped): {}",
- linksFound, linksSkippedNotRecentlyModified, url);
+ LOG.info(
+ "Sitemap index (found {} sitemaps, {} skipped): {}",
+ linksFound,
+ linksSkippedNotRecentlyModified,
+ url);
}
// sitemap files
else {
SiteMap sm = (SiteMap) siteMap;
Collection sitemapURLs = sm.getSiteMapUrls();
Iterator iter = sitemapURLs.iterator();
- sitemap_urls: while (iter.hasNext()) {
+ sitemap_urls:
+ while (iter.hasNext()) {
linksFound++;
SiteMapURL smurl = iter.next();
// TODO handle priority in metadata
@@ -414,11 +419,12 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
linksSkippedNotRecentlyModified++;
LOG.debug(
"{} has a modified date {} which is more than {} hours old",
- target, lastModified, filterHoursSinceModified);
+ target,
+ lastModified,
+ filterHoursSinceModified);
continue;
}
- ExtensionMetadata[] newsAttrs = smurl
- .getAttributesForExtension(Extension.NEWS);
+ ExtensionMetadata[] newsAttrs = smurl.getAttributesForExtension(Extension.NEWS);
if (newsAttrs != null) {
// filter based on news publication date
// 2008-12-23
@@ -429,7 +435,9 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
linksSkippedNotRecentlyModified++;
LOG.debug(
"{} has a news publication date {} which is more than {} hours old",
- target, pubDate, filterHoursSinceModified);
+ target,
+ pubDate,
+ filterHoursSinceModified);
continue sitemap_urls;
}
}
@@ -437,8 +445,7 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
}
// add alternative language links
- ExtensionMetadata[] linkAttrs = smurl
- .getAttributesForExtension(Extension.LINKS);
+ ExtensionMetadata[] linkAttrs = smurl.getAttributesForExtension(Extension.LINKS);
if (linkAttrs != null) {
for (ExtensionMetadata attr : linkAttrs) {
LinkAttributes linkAttr = (LinkAttributes) attr;
@@ -451,17 +458,30 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
// skip href links duplicating sitemap URL
continue;
}
- Outlink ol = filterOutlink(sURL, hrefStr,
- parentMetadata, isSitemapKey, "false",
- isSitemapNewsKey, "false");
+ Outlink ol =
+ filterOutlink(
+ sURL,
+ hrefStr,
+ parentMetadata,
+ isSitemapKey,
+ "false",
+ isSitemapNewsKey,
+ "false");
if (ol != null) {
links.add(ol);
}
}
}
- Outlink ol = filterOutlink(sURL, target, parentMetadata,
- isSitemapKey, "false", isSitemapNewsKey, "false");
+ Outlink ol =
+ filterOutlink(
+ sURL,
+ target,
+ parentMetadata,
+ isSitemapKey,
+ "false",
+ isSitemapNewsKey,
+ "false");
if (ol == null) {
continue;
}
@@ -469,34 +489,33 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
links.add(ol);
LOG.debug("{} : [sitemap] {}", url, target);
}
- LOG.info("Sitemap (found {} links, {} skipped): {}", linksFound,
- linksSkippedNotRecentlyModified, url);
+ LOG.info(
+ "Sitemap (found {} links, {} skipped): {}",
+ linksFound,
+ linksSkippedNotRecentlyModified,
+ url);
}
return siteMap;
}
@Override
- @SuppressWarnings({ "rawtypes", "unchecked" })
- public void prepare(Map stormConf, TopologyContext context,
- OutputCollector collector) {
+ @SuppressWarnings({"rawtypes", "unchecked"})
+ public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
super.prepare(stormConf, context, collector);
- sniffContent = ConfUtils.getBoolean(stormConf,
- "sitemap.sniffContent", false);
- filterHoursSinceModified = ConfUtils.getInt(stormConf,
- "sitemap.filter.hours.since.modified", -1);
+ sniffContent = ConfUtils.getBoolean(stormConf, "sitemap.sniffContent", false);
+ filterHoursSinceModified =
+ ConfUtils.getInt(stormConf, "sitemap.filter.hours.since.modified", -1);
parseFilters = ParseFilters.fromConf(stormConf);
- int maxOffsetGuess = ConfUtils.getInt(stormConf, "sitemap.offset.guess",
- 1024);
- contentDetector = new ContentDetector(
- NewsSiteMapParserBolt.contentClues, maxOffsetGuess);
- rssContentDetector = new ContentDetector(
- FeedDetectorBolt.contentClues, maxOffsetGuess);
- averagedMetrics = context.registerMetric(
- "news_sitemap_average_processing_time",
- new ReducedMetric(new MeanReducer()), 30);
- scheduleSitemapsWithDelay = ConfUtils.getInt(stormConf,
- "sitemap.schedule.delay", scheduleSitemapsWithDelay);
+ int maxOffsetGuess = ConfUtils.getInt(stormConf, "sitemap.offset.guess", 1024);
+ contentDetector = new ContentDetector(NewsSiteMapParserBolt.contentClues, maxOffsetGuess);
+ rssContentDetector = new ContentDetector(FeedDetectorBolt.contentClues, maxOffsetGuess);
+ averagedMetrics =
+ context.registerMetric(
+ "news_sitemap_average_processing_time",
+ new ReducedMetric(new MeanReducer()),
+ 30);
+ scheduleSitemapsWithDelay =
+ ConfUtils.getInt(stormConf, "sitemap.schedule.delay", scheduleSitemapsWithDelay);
}
-
}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java
index b986506..18106c3 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java
@@ -1,9 +1,22 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.commoncrawl.stormcrawler.news;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Map;
-
import org.apache.commons.lang3.StringUtils;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
@@ -19,68 +32,67 @@
import org.slf4j.LoggerFactory;
/**
- * Variant of the URLFilterBolt to go upstream of the fetching to catch anything
- * before it goes further into the topology. If filtered, a URL gets an ERROR
- * status.
+ * Variant of the URLFilterBolt to go upstream of the fetching to catch anything before it goes
+ * further into the topology. If filtered, a URL gets an ERROR status.
*/
public class PreFilterBolt extends BaseRichBolt {
- protected static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
-
- private URLFilters urlFilters;
-
- protected OutputCollector collector;
-
- private final String filterConfigFile;
-
- private static final String _s = org.apache.stormcrawler.Constants.StatusStreamName;
-
- public PreFilterBolt(String filterConfigFile) {
- this.filterConfigFile = filterConfigFile;
- }
-
- @Override
- public void execute(Tuple input) {
-
- // must have at least a URL and metadata
- String urlString = input.getStringByField("url");
- Metadata metadata = (Metadata) input.getValueByField("metadata");
-
- String filtered = urlFilters.filter(null, null, urlString);
- if (StringUtils.isBlank(filtered)) {
- LOG.debug("URL rejected: {}", urlString);
- // emit with an error to the status stream
- metadata.addValue("error.cause", "Filtered");
- Values v = new Values(urlString, metadata, Status.ERROR);
- collector.emit(_s, input, v);
- collector.ack(input);
- return;
- }
-
- // pass to std out
- Values v = new Values(urlString, metadata);
- collector.emit(input, v);
- collector.ack(input);
- }
-
- @Override
- public void declareOutputFields(OutputFieldsDeclarer declarer) {
- declarer.declareStream(_s, new Fields("url", "metadata", "status"));
- declarer.declare(new Fields("url", "metadata"));
- }
-
- @Override
- public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
- this.collector = collector;
- if (filterConfigFile != null) {
- try {
- urlFilters = new URLFilters(stormConf, filterConfigFile);
- } catch (IOException e) {
- throw new RuntimeException("Can't load filters from " + filterConfigFile);
- }
- } else {
- urlFilters = URLFilters.fromConf(stormConf);
- }
- }
-
+ protected static final Logger LOG =
+ LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+ private URLFilters urlFilters;
+
+ protected OutputCollector collector;
+
+ private final String filterConfigFile;
+
+ private static final String _s = org.apache.stormcrawler.Constants.StatusStreamName;
+
+ public PreFilterBolt(String filterConfigFile) {
+ this.filterConfigFile = filterConfigFile;
+ }
+
+ @Override
+ public void execute(Tuple input) {
+
+ // must have at least a URL and metadata
+ String urlString = input.getStringByField("url");
+ Metadata metadata = (Metadata) input.getValueByField("metadata");
+
+ String filtered = urlFilters.filter(null, null, urlString);
+ if (StringUtils.isBlank(filtered)) {
+ LOG.debug("URL rejected: {}", urlString);
+ // emit with an error to the status stream
+ metadata.addValue("error.cause", "Filtered");
+ Values v = new Values(urlString, metadata, Status.ERROR);
+ collector.emit(_s, input, v);
+ collector.ack(input);
+ return;
+ }
+
+ // not filtered: pass the URL on via the default stream
+ Values v = new Values(urlString, metadata);
+ collector.emit(input, v);
+ collector.ack(input);
+ }
+
+ @Override
+ public void declareOutputFields(OutputFieldsDeclarer declarer) {
+ declarer.declareStream(_s, new Fields("url", "metadata", "status"));
+ declarer.declare(new Fields("url", "metadata"));
+ }
+
+    @Override
+    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
+        this.collector = collector;
+        if (filterConfigFile != null) {
+            try {
+                urlFilters = new URLFilters(stormConf, filterConfigFile);
+            } catch (IOException e) { // chain the cause so the reason of the failure is kept
+                throw new RuntimeException("Can't load filters from " + filterConfigFile, e);
+            }
+        } else {
+            urlFilters = URLFilters.fromConf(stormConf);
+        }
+    }
}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java b/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java
index 4adf03d..114477c 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java
@@ -13,20 +13,18 @@
*/
package org.commoncrawl.stormcrawler.news;
+import com.fasterxml.jackson.databind.JsonNode;
import java.net.IDN;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
-
import org.apache.stormcrawler.Metadata;
import org.apache.stormcrawler.filtering.URLFilter;
-import com.fasterxml.jackson.databind.JsonNode;
public class PunycodeURLNormalizer extends URLFilter {
@Override
- public void configure(Map stormConf, JsonNode filterParams) {
- }
+ public void configure(Map stormConf, JsonNode filterParams) {}
private boolean isAscii(String str) {
char[] chars = str.toCharArray();
@@ -39,8 +37,7 @@ private boolean isAscii(String str) {
}
@Override
- public String filter(URL sourceUrl, Metadata sourceMetadata,
- String urlToFilter) {
+ public String filter(URL sourceUrl, Metadata sourceMetadata, String urlToFilter) {
try {
URL url = new URL(urlToFilter);
String hostName = url.getHost();
@@ -51,12 +48,11 @@ public String filter(URL sourceUrl, Metadata sourceMetadata,
if (hostName.equals(url.getHost())) {
return urlToFilter;
}
- urlToFilter = new URL(url.getProtocol(), hostName, url.getPort(),
- url.getFile()).toString();
+ urlToFilter =
+ new URL(url.getProtocol(), hostName, url.getPort(), url.getFile()).toString();
} catch (MalformedURLException e) {
return null;
}
return urlToFilter;
}
-
}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java
index 821551e..14f45ff 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -19,31 +19,27 @@
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
-import org.commoncrawl.stormcrawler.news.CrawlTopology;
-import org.commoncrawl.stormcrawler.news.FeedDetectorBolt;
-import org.slf4j.LoggerFactory;
-
import org.apache.stormcrawler.ConfigurableTopology;
import org.apache.stormcrawler.Constants;
import org.apache.stormcrawler.bolt.FetcherBolt;
import org.apache.stormcrawler.bolt.JSoupParserBolt;
import org.apache.stormcrawler.bolt.URLFilterBolt;
import org.apache.stormcrawler.bolt.URLPartitionerBolt;
+import org.apache.stormcrawler.indexing.DummyIndexer;
import org.apache.stormcrawler.opensearch.persistence.AggregationSpout;
import org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt;
-import org.apache.stormcrawler.indexing.DummyIndexer;
import org.apache.stormcrawler.spout.FileSpout;
import org.apache.stormcrawler.util.ConfUtils;
import org.apache.stormcrawler.util.URLStreamGrouping;
import org.apache.stormcrawler.warc.WARCHdfsBolt;
+import org.commoncrawl.stormcrawler.news.CrawlTopology;
+import org.commoncrawl.stormcrawler.news.FeedDetectorBolt;
+import org.slf4j.LoggerFactory;
-/**
- * Dummy topology to play with the spouts and bolts on ElasticSearch
- */
+/** Dummy topology to play with the spouts and bolts on OpenSearch */
public class BootstrapTopology extends CrawlTopology {
- private static final org.slf4j.Logger LOG = LoggerFactory
- .getLogger(BootstrapTopology.class);
+ private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(BootstrapTopology.class);
public static void main(String[] args) throws Exception {
ConfigurableTopology.start(new BootstrapTopology(), args);
@@ -53,11 +49,14 @@ public static void main(String[] args) throws Exception {
protected int run(String[] args) {
TopologyBuilder builder = new TopologyBuilder();
- LOG.debug("sitemap.sniffContent: {}",
+ LOG.debug(
+ "sitemap.sniffContent: {}",
ConfUtils.getBoolean(getConf(), "sitemap.sniffContent", false));
- LOG.info("sitemap.sniffContent: {}",
+ LOG.info(
+ "sitemap.sniffContent: {}",
ConfUtils.getBoolean(getConf(), "sitemap.sniffContent", false));
- LOG.warn("sitemap.sniffContent: {}",
+ LOG.warn(
+ "sitemap.sniffContent: {}",
ConfUtils.getBoolean(getConf(), "sitemap.sniffContent", false));
int numWorkers = ConfUtils.getInt(getConf(), "topology.workers", 1);
@@ -69,12 +68,11 @@ protected int run(String[] args) {
if (args.length >= 2) {
// arguments include seed directory and file pattern
LOG.info("Injecting seeds from {} by pattern {}", args[0], args[1]);
- builder.setSpout("filespout",
- new FileSpout(args[0], args[1], true));
+ builder.setSpout("filespout", new FileSpout(args[0], args[1], true));
Fields key = new Fields("url");
- builder.setBolt("filter", new URLFilterBolt()).fieldsGrouping(
- "filespout", Constants.StatusStreamName, key);
+ builder.setBolt("filter", new URLFilterBolt())
+ .fieldsGrouping("filespout", Constants.StatusStreamName, key);
}
builder.setSpout("spout", new AggregationSpout(), numShards);
@@ -91,12 +89,10 @@ protected int run(String[] args) {
builder.setBolt("feed", new FeedDetectorBolt(), numWorkers)
.localOrShuffleGrouping("sitemap");
- builder.setBolt("parse", new JSoupParserBolt())
- .localOrShuffleGrouping("feed");
+ builder.setBolt("parse", new JSoupParserBolt()).localOrShuffleGrouping("feed");
// don't need to parse the pages but need to update their status
- builder.setBolt("ssb", new DummyIndexer(), numWorkers)
- .localOrShuffleGrouping("parse");
+ builder.setBolt("ssb", new DummyIndexer(), numWorkers).localOrShuffleGrouping("parse");
WARCHdfsBolt warcbolt = getWarcBolt("CC-NEWS-BOOTSTRAP");
@@ -109,8 +105,7 @@ protected int run(String[] args) {
.localOrShuffleGrouping("parse", Constants.StatusStreamName)
.localOrShuffleGrouping("ssb", Constants.StatusStreamName)
.setNumTasks(numShards)
- .customGrouping("filter", Constants.StatusStreamName,
- new URLStreamGrouping());
+ .customGrouping("filter", Constants.StatusStreamName, new URLStreamGrouping());
return submit(conf, builder);
}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java
index 5707189..bb50291 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java
@@ -14,19 +14,18 @@
package org.commoncrawl.stormcrawler.news.bootstrap;
import java.util.ArrayList;
-
-import org.slf4j.LoggerFactory;
-import org.w3c.dom.DocumentFragment;
-
import org.apache.stormcrawler.bolt.FeedParserBolt;
import org.apache.stormcrawler.parse.Outlink;
import org.apache.stormcrawler.parse.ParseResult;
import org.apache.stormcrawler.parse.filter.LinkParseFilter;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.DocumentFragment;
/**
- * ParseFilter which extracts exclusively RSS links via Xpath, all other links
- * are skipped. See {@link LinkParseFilter} how to register and configure in
- * parsefilters.json. A configuration snippet:
+ * ParseFilter which extracts exclusively RSS links via Xpath, all other links are skipped. See
+ * {@link LinkParseFilter} how to register and configure in parsefilters.json. A configuration
+ * snippet:
+ *
*
* {
* "class": "org.commoncrawl.stormcrawler.news.bootstrap.FeedLinkParseFilter",
@@ -41,12 +40,10 @@
*/
public class FeedLinkParseFilter extends LinkParseFilter {
- private static final org.slf4j.Logger LOG = LoggerFactory
- .getLogger(FeedLinkParseFilter.class);
+ private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(FeedLinkParseFilter.class);
@Override
- public void filter(String URL, byte[] content, DocumentFragment doc,
- ParseResult parse) {
+ public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) {
// skip existing links
logLinks(parse, URL, "Skipped links");
@@ -60,11 +57,8 @@ public void filter(String URL, byte[] content, DocumentFragment doc,
public static void logLinks(ParseResult parse, String URL, String message) {
if (LOG.isDebugEnabled() && parse.getOutlinks().size() > 0) {
- if (!message.isEmpty())
- LOG.debug("{} for {}:", message, URL);
- for (Outlink outlink : parse.getOutlinks())
- LOG.debug(outlink.getTargetURL());
+ if (!message.isEmpty()) LOG.debug("{} for {}:", message, URL);
+ for (Outlink outlink : parse.getOutlinks()) LOG.debug(outlink.getTargetURL());
}
}
-
}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/NewsSiteMapDetectorBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/NewsSiteMapDetectorBolt.java
index 1160201..f0af134 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/NewsSiteMapDetectorBolt.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/NewsSiteMapDetectorBolt.java
@@ -14,15 +14,10 @@
package org.commoncrawl.stormcrawler.news.bootstrap;
import java.util.Map;
-
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
-import org.commoncrawl.stormcrawler.news.ContentDetector;
-import org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt;
-import org.slf4j.LoggerFactory;
-
import org.apache.stormcrawler.Constants;
import org.apache.stormcrawler.Metadata;
import org.apache.stormcrawler.bolt.SiteMapParserBolt;
@@ -31,25 +26,26 @@
import org.apache.stormcrawler.parse.ParseFilters;
import org.apache.stormcrawler.parse.ParseResult;
import org.apache.stormcrawler.persistence.Status;
+import org.commoncrawl.stormcrawler.news.ContentDetector;
+import org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt;
+import org.slf4j.LoggerFactory;
/**
- * Detector for news
+ * Detector for news
* sitemaps and also sitemaps.
*/
@SuppressWarnings("serial")
public class NewsSiteMapDetectorBolt extends SiteMapParserBolt {
- private static final org.slf4j.Logger LOG = LoggerFactory
- .getLogger(NewsSiteMapDetectorBolt.class);
+ private static final org.slf4j.Logger LOG =
+ LoggerFactory.getLogger(NewsSiteMapDetectorBolt.class);
protected static final int maxOffsetContentGuess = 1024;
- private static ContentDetector contentDetector = new ContentDetector(
- NewsSiteMapParserBolt.contentClues, maxOffsetContentGuess);
+ private static ContentDetector contentDetector =
+ new ContentDetector(NewsSiteMapParserBolt.contentClues, maxOffsetContentGuess);
private ParseFilter parseFilters;
-
@Override
public void execute(Tuple tuple) {
Metadata metadata = (Metadata) tuple.getValueByField("metadata");
@@ -57,10 +53,9 @@ public void execute(Tuple tuple) {
byte[] content = tuple.getBinaryByField("content");
String url = tuple.getStringByField("url");
- boolean isSitemap = Boolean.valueOf(
- metadata.getFirstValue(SiteMapParserBolt.isSitemapKey));
- boolean isNewsSitemap = Boolean.valueOf(
- metadata.getFirstValue(NewsSiteMapParserBolt.isSitemapNewsKey));
+ boolean isSitemap = Boolean.valueOf(metadata.getFirstValue(SiteMapParserBolt.isSitemapKey));
+ boolean isNewsSitemap =
+ Boolean.valueOf(metadata.getFirstValue(NewsSiteMapParserBolt.isSitemapNewsKey));
if (!isNewsSitemap || !isSitemap) {
int match = contentDetector.getFirstMatch(content);
@@ -70,10 +65,8 @@ public void execute(Tuple tuple) {
metadata.setValue(SiteMapParserBolt.isSitemapKey, "true");
if (match <= NewsSiteMapParserBolt.contentCluesSitemapNewsMatchUpTo) {
isNewsSitemap = true;
- LOG.info("{} detected as news sitemap based on content",
- url);
- metadata.setValue(NewsSiteMapParserBolt.isSitemapNewsKey,
- "true");
+ LOG.info("{} detected as news sitemap based on content", url);
+ metadata.setValue(NewsSiteMapParserBolt.isSitemapNewsKey, "true");
}
}
}
@@ -85,8 +78,8 @@ public void execute(Tuple tuple) {
parseData.setMetadata(metadata);
parseFilters.filter(url, content, null, parse);
// emit status
- collector.emit(Constants.StatusStreamName, tuple,
- new Values(url, metadata, Status.FETCHED));
+ collector.emit(
+ Constants.StatusStreamName, tuple, new Values(url, metadata, Status.FETCHED));
} else {
// pass on
collector.emit(tuple, tuple.getValues());
@@ -95,11 +88,9 @@ public void execute(Tuple tuple) {
}
@Override
- @SuppressWarnings({ "rawtypes", "unchecked" })
- public void prepare(Map stormConf, TopologyContext context,
- OutputCollector collect) {
+ @SuppressWarnings({"rawtypes", "unchecked"})
+ public void prepare(Map stormConf, TopologyContext context, OutputCollector collect) {
super.prepare(stormConf, context, collect);
parseFilters = ParseFilters.fromConf(stormConf);
}
-
}
diff --git a/src/test/java/org/commoncrawl/stormcrawler/FastURLFilterTest.java b/src/test/java/org/commoncrawl/stormcrawler/FastURLFilterTest.java
index 64df0ba..58aaa04 100644
--- a/src/test/java/org/commoncrawl/stormcrawler/FastURLFilterTest.java
+++ b/src/test/java/org/commoncrawl/stormcrawler/FastURLFilterTest.java
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to DigitalPebble Ltd under one or more contributor license agreements. See the NOTICE
* file distributed with this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0 (the "License");
@@ -14,74 +14,72 @@
*/
package org.commoncrawl.stormcrawler;
+import com.fasterxml.jackson.databind.node.JsonNodeFactory;
+import com.fasterxml.jackson.databind.node.ObjectNode;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
-
+import org.apache.stormcrawler.Metadata;
+import org.apache.stormcrawler.filtering.URLFilter;
import org.commoncrawl.stormcrawler.filter.FastURLFilter;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
-import org.apache.stormcrawler.Metadata;
-import org.apache.stormcrawler.filtering.URLFilter;
-import com.fasterxml.jackson.databind.node.JsonNodeFactory;
-import com.fasterxml.jackson.databind.node.ObjectNode;
-
public class FastURLFilterTest {
protected static URLFilter filter;
@BeforeClass
public static void init() {
- filter = createFilter("fast-urlfilter.txt");
+ filter = createFilter("fast-urlfilter.txt");
}
public static FastURLFilter createFilter(String fileName) {
- ObjectNode filterParams = new ObjectNode(JsonNodeFactory.instance);
- filterParams.put("file", fileName);
- FastURLFilter filter = new FastURLFilter();
- Map conf = new HashMap<>();
- conf.put("fast.urlfilter.refresh", 10);
- filter.configure(conf, filterParams);
- return filter;
+ ObjectNode filterParams = new ObjectNode(JsonNodeFactory.instance);
+ filterParams.put("file", fileName);
+ FastURLFilter filter = new FastURLFilter();
+ Map conf = new HashMap<>();
+ conf.put("fast.urlfilter.refresh", 10);
+ filter.configure(conf, filterParams);
+ return filter;
}
@Test
public void testHostFilter() throws MalformedURLException {
- URL url = new URL("http://may.go.com/image.jpg");
- Metadata metadata = new Metadata();
- String filterResult = filter.filter(url, metadata, url.toExternalForm());
- Assert.assertEquals(url.toString(), filterResult);
-
- url = new URL("http://no.go.com/");
- filterResult = filter.filter(url, metadata, url.toExternalForm());
- Assert.assertEquals(null, filterResult);
+ URL url = new URL("http://may.go.com/image.jpg");
+ Metadata metadata = new Metadata();
+ String filterResult = filter.filter(url, metadata, url.toExternalForm());
+ Assert.assertEquals(url.toString(), filterResult);
+
+ url = new URL("http://no.go.com/");
+ filterResult = filter.filter(url, metadata, url.toExternalForm());
+ Assert.assertEquals(null, filterResult);
}
@Test
public void testDomainNotAllowed() throws MalformedURLException {
- URL url = new URL("http://domainnotallowed.com/forum/search.php");
- Metadata metadata = new Metadata();
- String filterResult = filter.filter(url, metadata, url.toExternalForm());
- Assert.assertEquals(null, filterResult);
-
- url = new URL("http://domainnotallowed.com/");
- filterResult = filter.filter(url, metadata, url.toExternalForm());
- Assert.assertEquals(null, filterResult);
-
- url = new URL("http://partiallyallowed.com/");
- filterResult = filter.filter(url, metadata, url.toExternalForm());
- Assert.assertEquals(url.toString(), filterResult);
-
- url = new URL("http://partiallyallowed.com/verbotten");
- filterResult = filter.filter(url, metadata, url.toExternalForm());
- Assert.assertEquals(null, filterResult);
+ URL url = new URL("http://domainnotallowed.com/forum/search.php");
+ Metadata metadata = new Metadata();
+ String filterResult = filter.filter(url, metadata, url.toExternalForm());
+ Assert.assertEquals(null, filterResult);
+
+ url = new URL("http://domainnotallowed.com/");
+ filterResult = filter.filter(url, metadata, url.toExternalForm());
+ Assert.assertEquals(null, filterResult);
+
+ url = new URL("http://partiallyallowed.com/");
+ filterResult = filter.filter(url, metadata, url.toExternalForm());
+ Assert.assertEquals(url.toString(), filterResult);
+
+ url = new URL("http://partiallyallowed.com/verbotten");
+ filterResult = filter.filter(url, metadata, url.toExternalForm());
+ Assert.assertEquals(null, filterResult);
- // allowed
- url = new URL("http://digitalpebble.com/");
- filterResult = filter.filter(url, metadata, url.toExternalForm());
- Assert.assertEquals(url.toString(), filterResult);
+ // allowed
+ url = new URL("http://digitalpebble.com/");
+ filterResult = filter.filter(url, metadata, url.toExternalForm());
+ Assert.assertEquals(url.toString(), filterResult);
}
}
diff --git a/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java b/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java
index db73d67..b0a0d5a 100644
--- a/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java
+++ b/src/test/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserTest.java
@@ -16,6 +16,7 @@
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
+import crawlercommons.sitemaps.UnknownFormatException;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
@@ -25,75 +26,83 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-
import org.apache.commons.io.IOUtils;
-import org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt.SitemapType;
-import org.junit.Before;
-import org.junit.Test;
-
import org.apache.stormcrawler.Metadata;
import org.apache.stormcrawler.parse.Outlink;
import org.apache.stormcrawler.parse.ParsingTester;
-
-import crawlercommons.sitemaps.UnknownFormatException;
+import org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt.SitemapType;
+import org.junit.Before;
+import org.junit.Test;
public class NewsSiteMapParserTest extends ParsingTester {
@Before
public void setupParserBolt() {
- setupParserBolt(new NewsSiteMapParserBolt());
- Map config = new HashMap<>();
- config.put("sitemap.sniffContent", true);
- // allow items published during the last week
- config.put("sitemap.filter.hours.since.modified", 168);
- prepareParserBolt("test.parsefilters.json", config);
+ setupParserBolt(new NewsSiteMapParserBolt());
+ Map config = new HashMap<>();
+ config.put("sitemap.sniffContent", true);
+ // allow items published during the last week
+ config.put("sitemap.filter.hours.since.modified", 168);
+ prepareParserBolt("test.parsefilters.json", config);
}
@Test
public void testSiteMapParser() throws IOException, UnknownFormatException {
- String url = "https://example.org/sitemap-news.xml";
- byte[] content = readContent("sitemap-news.xml");
- String contentType = "";
- Metadata parentMetadata = new Metadata();
- List links = new ArrayList<>();
-
- SitemapType type = ((NewsSiteMapParserBolt) bolt).detectContent(url, content);
- assertEquals(SitemapType.NEWS, type);
-
- ((NewsSiteMapParserBolt) bolt).parseSiteMap(url, content, contentType, parentMetadata, links);
-
- // unmodified sitemap:
- // - publication date is far in the past, link should be skipped
- // 2008-12-23
- assertEquals("Outdated link not skipped", 0, links.size());
-
- // now set the publication date to yesterday
- LocalDateTime yesterday = LocalDateTime.now().minusDays(1);
- content = (new String(content, StandardCharsets.UTF_8))
- .replace("2008-12-23", ""
- + yesterday.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + "")
- .getBytes(StandardCharsets.UTF_8);
- ((NewsSiteMapParserBolt) bolt).parseSiteMap(url, content, contentType, parentMetadata, links);
-
- assertEquals("Expected one and one additional link - image links are ignored", 2,
- links.size());
+ String url = "https://example.org/sitemap-news.xml";
+ byte[] content = readContent("sitemap-news.xml");
+ String contentType = "";
+ Metadata parentMetadata = new Metadata();
+ List links = new ArrayList<>();
+
+ SitemapType type = ((NewsSiteMapParserBolt) bolt).detectContent(url, content);
+ assertEquals(SitemapType.NEWS, type);
+
+ ((NewsSiteMapParserBolt) bolt)
+ .parseSiteMap(url, content, contentType, parentMetadata, links);
+
+ // unmodified sitemap:
+ // - publication date is far in the past, link should be skipped
+ // 2008-12-23
+ assertEquals("Outdated link not skipped", 0, links.size());
+
+ // now set the publication date to yesterday
+ LocalDateTime yesterday = LocalDateTime.now().minusDays(1);
+ content =
+ (new String(content, StandardCharsets.UTF_8))
+ .replace(
+ "2008-12-23",
+ ""
+ + yesterday.format(
+ DateTimeFormatter.ofPattern("yyyy-MM-dd"))
+ + "")
+ .getBytes(StandardCharsets.UTF_8);
+ ((NewsSiteMapParserBolt) bolt)
+ .parseSiteMap(url, content, contentType, parentMetadata, links);
+
+ assertEquals(
+ "Expected one and one additional link - image links are ignored",
+ 2,
+ links.size());
}
protected byte[] readContent(String filename) throws IOException {
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- IOUtils.copy(getClass().getClassLoader().getResourceAsStream(filename), baos);
- return baos.toByteArray();
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ IOUtils.copy(getClass().getClassLoader().getResourceAsStream(filename), baos);
+ return baos.toByteArray();
}
- @Test
- public void testFeedWithSitemapNamespace() throws IOException, UnknownFormatException {
- String url = "https://example.org/feed.xml";
- byte[] content = readContent("feed-with-sitemap-namespace.xml");
- SitemapType type = ((NewsSiteMapParserBolt) bolt).detectContent(url, content);
- assertNotEquals("RSS feed with sitemap namespace should not be detected as sitemap",
- SitemapType.NEWS, type);
- assertNotEquals("RSS feed with sitemap namespace should not be detected as sitemap",
- SitemapType.SITEMAP, type);
- }
-
+ @Test
+ public void testFeedWithSitemapNamespace() throws IOException, UnknownFormatException {
+ String url = "https://example.org/feed.xml";
+ byte[] content = readContent("feed-with-sitemap-namespace.xml");
+ SitemapType type = ((NewsSiteMapParserBolt) bolt).detectContent(url, content);
+ assertNotEquals(
+ "RSS feed with sitemap namespace should not be detected as sitemap",
+ SitemapType.NEWS,
+ type);
+ assertNotEquals(
+ "RSS feed with sitemap namespace should not be detected as sitemap",
+ SitemapType.SITEMAP,
+ type);
+ }
}