From 0d68ae50016c595104992fe85d5d931534ac084b Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 11 Jun 2026 19:24:11 +0200 Subject: [PATCH 1/2] Upgrade to StormCrawler 3.6.0, Storm 2.8.8 --- Dockerfile | 4 ++-- README.md | 12 ++++++------ bin/status | 8 ++++---- conf/crawler-conf.yaml | 7 +++++++ conf/crawler.flux | 2 +- docker-compose.yaml | 12 ++++++------ pom.xml | 19 ++++--------------- .../stormcrawler/news/CrawlTopology.java | 2 +- 8 files changed, 31 insertions(+), 35 deletions(-) diff --git a/Dockerfile b/Dockerfile index 683df8f..6b469b3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM storm:2.8.4 +FROM storm:2.8.8 RUN apt-get update -qq && \ apt-get install -yq --no-install-recommends \ @@ -10,7 +10,7 @@ RUN apt-get update -qq && \ # # news-crawler # -ENV CRAWLER_VERSION=3.5.1 +ENV CRAWLER_VERSION=3.6.0 RUN mkdir /news-crawler/ && \ mkdir /news-crawler/conf/ && \ diff --git a/README.md b/README.md index ad990bf..2d18792 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,8 @@ Crawler for news based on [StormCrawler](https://stormcrawler.apache.org/). Prod ## Prerequisites -* Install OpenSearch 2.19.4 -* Install Apache Storm 2.8.4 +* Install OpenSearch 2.19.5 +* Install Apache Storm 2.8.8 * Start OpenSearch and Storm * Create the OpenSearch indices by running [bin/OS_IndexInit.sh](bin/OS_IndexInit.sh) and the dashboards by [OS_ImportDashboards.sh](bin/OS_ImportDashboards.sh) @@ -32,14 +32,14 @@ mvn clean package And run ... ``` sh -storm local target/crawler-3.5.1.jar --local-ttl 60 -- org.commoncrawl.stormcrawler.news.CrawlTopology -conf $PWD/conf/opensearch-conf.yaml -conf $PWD/conf/crawler-conf.yaml $PWD/seeds/ feeds.txt +storm local target/crawler-3.6.0.jar --local-ttl 60 -- org.commoncrawl.stormcrawler.news.CrawlTopology -conf $PWD/conf/opensearch-conf.yaml -conf $PWD/conf/crawler-conf.yaml $PWD/seeds/ feeds.txt ``` This will launch the crawl topology in local mode for 60 seconds. 
It will also "inject" all URLs found in the file `./seeds/feeds.txt` in the status index. The URLs point to news feeds and sitemaps from which links to news articles are extracted and fetched. The topology will create WARC files in the directory specified in the configuration under the key `warc.dir`. This directory must be created beforehand. -Of course, it's also possible to add (or remove) the seeds (feeds and sitemaps) using the Elasticsearch API. In this case, the can topology can be run without the last two arguments. +Of course, it's also possible to add (or remove) the seeds (feeds and sitemaps) using the OpenSearch API. In this case, the topology can be run without the last two arguments. -Alternatively, the topology can be run from the [crawler.flux](./conf/crawler.flux), please see the [Storm Flux documentation](https://storm.apache.org/releases/2.8.4/flux.html). Make sure to adapt the Flux definition to your needs! +Alternatively, the topology can be run from the [crawler.flux](./conf/crawler.flux), please see the [Storm Flux documentation](https://storm.apache.org/releases/2.8.8/flux.html). Make sure to adapt the Flux definition to your needs! In production, you should use `storm jar ...` to run the topology in distributed mode and continuously (no time limit) including the Storm UI and logging. @@ -88,7 +88,7 @@ NOTE: - Make sure that the OpenSearch port 9200 is not already in use or mapped by a running OpenSearch instance. Otherwise OpenSearch commands may affect the running instance! 
-To launch the topology using [Storm Flux](https://storm.apache.org/releases/2.8.4/flux.html): +To launch the topology using [Storm Flux](https://storm.apache.org/releases/2.8.8/flux.html): ``` docker compose run --rm news-crawler \ storm jar lib/crawler.jar org.apache.storm.flux.Flux --remote /news-crawler/conf/crawler.flux diff --git a/bin/status b/bin/status index 410c4d1..978fb5a 100755 --- a/bin/status +++ b/bin/status @@ -5,14 +5,14 @@ __ES_STATUS_URL_DEFAULT='http://localhost:9200/status' function ____show_help() { echo "$0 [-v|-V] [-C] []" echo - echo "Query StormCrawler's Elasticsearch status index" + echo "Query StormCrawler's Elasticsearch or OpenSearch status index" echo " with help of curl, jq and bash" echo echo "Global options" echo " -h show detailed help" echo " -v verbose, print commands before execution" echo " -V very verbose" - echo " -D dry run, do not execute request to ES (use in combination with -v)" + echo " -D dry run, do not execute request (use in combination with -v)" echo " -C colorize JSON output" echo echo "Commands" @@ -134,12 +134,12 @@ ES_STATUS_URL=${ES_STATUS_URL:-$__ES_STATUS_URL_DEFAULT} set -e -# current time in Elasticsearch date format +# current time in Elasticsearch/OpenSearch date format function ____now () { date -u '+%Y-%m-%dT%H:%M:%S.000Z' } -# given date in Elasticsearch date format +# given date in Elasticsearch/OpenSearch date format function ____date () { date -d"$1" -u '+%Y-%m-%dT%H:%M:%S.000Z' } diff --git a/conf/crawler-conf.yaml b/conf/crawler-conf.yaml index eb2594b..6bbd52c 100644 --- a/conf/crawler-conf.yaml +++ b/conf/crawler-conf.yaml @@ -77,6 +77,13 @@ config: http.protocol.implementation: org.apache.stormcrawler.protocol.okhttp.HttpProtocol https.protocol.implementation: org.apache.stormcrawler.protocol.okhttp.HttpProtocol + # the http/https protocol versions to use, in order of preference + # - the WARC writer handles HTTP/1.1 and HTTP/2 (cf. 
storm-crawler#1010) + # - okhttp does not support HTTP/1.0 requests (it supports responses however) + # http.protocol.versions: + # - "h2" + # - "http/1.1" + # do not fail on unknown SSL certificates http.trust.everything: true diff --git a/conf/crawler.flux b/conf/crawler.flux index 9f48a01..4d390a5 100644 --- a/conf/crawler.flux +++ b/conf/crawler.flux @@ -45,7 +45,7 @@ components: - name: "put" args: - "software" - - "StormCrawler 2.10 https://stormcrawler.net/" + - "StormCrawler 3.6.0 https://stormcrawler.apache.org/" - name: "put" args: - "description" diff --git a/docker-compose.yaml b/docker-compose.yaml index 24d048c..8d8d1fd 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -17,13 +17,13 @@ services: # Apache Storm components # - Zookeeper coordinates the communication between Nimbus and the Supervisors zookeeper: - image: zookeeper:${ZOOKEEPER_VERSION:-3.9.3} + image: zookeeper:${ZOOKEEPER_VERSION:-3.9.4} container_name: zookeeper restart: always # - the daemon Nimbus runs on the master node storm-nimbus: - image: storm:${STORM_VERSION:-2.8.4} + image: storm:${STORM_VERSION:-2.8.8} container_name: storm-nimbus hostname: nimbus command: storm nimbus @@ -37,7 +37,7 @@ services: # - the Supervisors run on the worker nodes storm-supervisor: - image: storm:${STORM_VERSION:-2.8.4} + image: storm:${STORM_VERSION:-2.8.8} container_name: storm-supervisor command: storm supervisor -c worker.childopts=-Xmx%HEAP-MEM%m depends_on: @@ -60,7 +60,7 @@ services: # - the Storm UI provides diagnostics about the Storm cluster storm-ui: - image: storm:${STORM_VERSION:-2.8.4} + image: storm:${STORM_VERSION:-2.8.8} container_name: storm-ui command: storm ui depends_on: @@ -72,7 +72,7 @@ services: restart: always opensearch-news-crawl: - image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-2.19.4} + image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-2.19.5} container_name: opensearch-news-crawl environment: - cluster.name=opensearch-news-crawl-cluster @@ 
-95,7 +95,7 @@ services: - "127.0.0.1:9200:9200" # REST API opensearch-dashboard-news-crawl: - image: opensearchproject/opensearch-dashboards:${OPENSEARCH_VERSION:-2.19.4} + image: opensearchproject/opensearch-dashboards:${OPENSEARCH_VERSION:-2.19.5} container_name: opensearch-dashboard-news-crawl ports: - "127.0.0.1:5601:5601" diff --git a/pom.xml b/pom.xml index 9deb934..0508210 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ under the License. 4.0.0 org.commoncrawl.stormcrawler.news crawler - 3.5.1 + 3.6.0 jar @@ -39,10 +39,10 @@ under the License. UTF-8 - 3.5.1 - 2.8.4 + 3.6.0 + 2.8.8 1.12.797 - 2.18.1 + 2.21.3 1.6 5.23.0 3.0.1 @@ -171,17 +171,6 @@ under the License. ${aws.version} - - - com.fasterxml.jackson.core - jackson-annotations - ${jackson.version} - - - com.fasterxml.jackson.core - jackson-core - ${jackson.version} - com.fasterxml.jackson.core jackson-databind diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java index 54c69eb..3a3c018 100644 --- a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java +++ b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java @@ -44,7 +44,7 @@ import org.slf4j.LoggerFactory; /** - * Dummy topology to play with the spouts and bolts on ElasticSearch + * Dummy topology to play with the spouts and bolts on OpenSearch */ public class CrawlTopology extends ConfigurableTopology { From 98e531a7b1b64057dce9bbbd8ada5e86fc6be02d Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 12 Jun 2026 09:18:41 +0200 Subject: [PATCH 2/2] Consistently name components and containers in docker compose configuration Add suffix "-news-crawl" to all Storm / Zookeeper container names to avoid name collisions with other StormCrawler setups running on the same system. 
--- README.md | 2 +- docker-compose.yaml | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 2d18792..4efbf08 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ After 1-2 minutes if everything is up, connect to OpenSearch on port [9200](http For inspecting the worker log files: ``` -docker exec storm-supervisor /bin/bash -c 'cat /logs/workers-artifacts/*/*/worker.log' +docker exec storm-supervisor-news-crawl /bin/bash -c 'cat /logs/workers-artifacts/*/*/worker.log' ``` To stop the topology: diff --git a/docker-compose.yaml b/docker-compose.yaml index 8d8d1fd..31bbc05 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -18,13 +18,13 @@ services: # - Zookeeper coordinates the communication between Nimbus and the Supervisors zookeeper: image: zookeeper:${ZOOKEEPER_VERSION:-3.9.4} - container_name: zookeeper + container_name: zookeeper-news-crawl restart: always # - the daemon Nimbus runs on the master node storm-nimbus: image: storm:${STORM_VERSION:-2.8.8} - container_name: storm-nimbus + container_name: storm-nimbus-news-crawl hostname: nimbus command: storm nimbus depends_on: @@ -38,7 +38,7 @@ services: # - the Supervisors run on the worker nodes storm-supervisor: image: storm:${STORM_VERSION:-2.8.8} - container_name: storm-supervisor + container_name: storm-supervisor-news-crawl command: storm supervisor -c worker.childopts=-Xmx%HEAP-MEM%m depends_on: - zookeeper @@ -50,7 +50,7 @@ services: # which need to be able to access # - (in case a indexing topology is run) the # OpenSearch (http://opensearch:9200/) and - - opensearch-news-crawl + - opensearch # - the WARC output folder # - and the seed folder volumes: @@ -61,7 +61,7 @@ services: # - the Storm UI provides diagnostics about the Storm cluster storm-ui: image: storm:${STORM_VERSION:-2.8.8} - container_name: storm-ui + container_name: storm-ui-news-crawl command: storm ui depends_on: - storm-nimbus @@ -71,7 +71,7 @@ services: - 
"127.0.0.1:8080:8080" restart: always - opensearch-news-crawl: + opensearch: image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-2.19.5} container_name: opensearch-news-crawl environment: @@ -94,7 +94,7 @@ services: ports: - "127.0.0.1:9200:9200" # REST API - opensearch-dashboard-news-crawl: + opensearch-dashboard: image: opensearchproject/opensearch-dashboards:${OPENSEARCH_VERSION:-2.19.5} container_name: opensearch-dashboard-news-crawl ports: