diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..4205e52
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+version: 2
+updates:
+  - package-ecosystem: "maven"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    open-pull-requests-limit: 5
+    ignore:
+      # Jackson is excluded: its version has to stay in sync with the one Storm requires.
+      - dependency-name: "com.fasterxml.jackson*"
+
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
index 10acad1..5c79a4c 100644
--- a/.github/workflows/maven.yml
+++ b/.github/workflows/maven.yml
@@ -1,26 +1,72 @@
-# This workflow will build a Java project with Maven, and cache/restore any dependencies to improve the workflow execution time
-# For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 name: Java CI with Maven
 on:
+  # Run CI on pushes to "main" and on pull requests targeting "main".
   push:
-    branches: [ master ]
+    branches:
+      - main
   pull_request:
-    branches: [ master ]
+    branches:
+      - main
 jobs:
-  build:
-
+  # Runs the Maven "rat" profile with tests skipped; the build job below waits for it.
+  rat:
     runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+    - uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
+      with:
+        path: ~/.m2/repository
+        key: rat-maven-${{ hashFiles('**/pom.xml') }}
+    # This job defines no matrix, so the JDK version is written out literally.
+    - name: Set up JDK 17
+      uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0
+      with:
+        distribution: temurin
+        java-version: 17
+    - name: Build with Maven
+      run: mvn -B --no-transfer-progress -Prat -DskipTests verify -Dskip.format.code=false
+  # Checks code formatting, then builds and verifies for each os/java matrix entry.
+  build:
+    needs: rat
+    runs-on: ${{ matrix.os }}
+    continue-on-error: ${{ matrix.experimental }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        java: [ 17 ]
+        experimental: [false]
     steps:
-    - uses: actions/checkout@v2
-    - name: Set up JDK 8
-      uses: actions/setup-java@v2
+    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+    - uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
+      with:
+        path: ~/.m2/repository
+        key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
+        restore-keys: |
+          ${{ runner.os }}-maven-
+    - name: Set up JDK ${{ matrix.java }}
+      uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0
       with:
-        java-version: '8'
-        distribution: 'adopt'
-        cache: maven
+        distribution: temurin
+        java-version: ${{ matrix.java }}
+    - name: Check code formatting
+      run: mvn -B --no-transfer-progress com.cosium.code:git-code-format-maven-plugin:validate-code-format -Dskip.format.code=false
     - name: Build with Maven
-      run: mvn -B package --file pom.xml
+      run: mvn -B --no-transfer-progress --file pom.xml -DCI_ENV=true verify
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..37effdb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.idea
+target
+opensearchdata
+warcdata
+.java-version
\ No newline at end of file
diff --git a/.mvn/jvm.config b/.mvn/jvm.config
new file mode 100644
index 0000000..87ae20c
--- /dev/null
+++ b/.mvn/jvm.config
@@ -0,0 +1,8 @@
+--add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED
+--add-exports jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED
+--add-exports jdk.compiler/com.sun.tools.javac.main=ALL-UNNAMED
+--add-exports jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED
+--add-exports jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED
+--add-exports jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED
+--add-opens jdk.compiler/com.sun.tools.javac.code=ALL-UNNAMED
+--add-opens jdk.compiler/com.sun.tools.javac.comp=ALL-UNNAMED
diff --git a/Dockerfile b/Dockerfile
index eaad382..6b469b3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,133 +1,29 @@
-FROM ubuntu:22.04
+FROM storm:2.8.8
RUN apt-get update -qq && \
- apt-get upgrade -yq && \
-# apt-mark hold openjdk-11-jre-headless && \
apt-get install -yq --no-install-recommends \
- apt-transport-https \
- apt-utils \
- ca-certificates \
curl \
- git-core \
- gnupg \
jq \
less \
- maven \
-# openjdk-8-jdk-headless \
- sudo \
- supervisor \
- wget \
- tar \
- vim
-# zookeeperd
+ vim
#
-# Elasticsearch and Kibana
+# news-crawler
#
-ENV ES_VERSION=7.10.2
-RUN wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch \
- | apt-key add -
-RUN echo "deb https://artifacts.elastic.co/packages/7.x/apt stable main" \
- >> /etc/apt/sources.list.d/elasticsearch-7.x.list
-RUN apt-get update -qq && \
- apt-get install -yq --no-install-recommends \
- elasticsearch=$ES_VERSION \
- kibana=$ES_VERSION
-RUN ln -s /usr/share/elasticsearch/bin/elasticsearch /usr/bin/elasticsearch
-RUN ln -s /usr/share/kibana/bin/kibana /usr/bin/kibana
-USER root
-# system configuration, see https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html
-ADD etc/sysctl.d/60-elasticsearch.conf /etc/sysctl.d/60-elasticsearch.conf
-ADD etc/supervisor/conf.d/elasticsearch.conf /etc/supervisor/conf.d/elasticsearch.conf
-ADD etc/supervisor/conf.d/kibana.conf /etc/supervisor/conf.d/kibana.conf
-RUN chmod -R 644 /etc/sysctl.d/60-elasticsearch.conf /etc/supervisor/conf.d/*.conf
-ENV ES_HEAP_SIZE=20g
-# set Elasticsearch data path
-RUN sed -Ei 's@^path\.data: .*@path.data: /data/elasticsearch@' /etc/elasticsearch/elasticsearch.yml
-# TODO: enable updates via scripting
-
-
-# Zookeeper
-
-ENV ZOOKEEPER_VERSION=3.8.3
-RUN wget -q -O - https://downloads.apache.org/zookeeper/zookeeper-$ZOOKEEPER_VERSION/apache-zookeeper-$ZOOKEEPER_VERSION-bin.tar.gz \
- | sudo tar -xzf - -C /opt
-ENV ZOOKEEPER_HOME=/opt/apache-zookeeper-$ZOOKEEPER_VERSION-bin
-RUN ln -s $ZOOKEEPER_HOME/conf/zoo_sample.cfg $ZOOKEEPER_HOME/conf/zoo.cfg
-# prevent ZK's admin UI to run on 8080
-RUN echo "admin.enableServer=false" >> $ZOOKEEPER_HOME/conf/zoo.cfg
-RUN ln -s $ZOOKEEPER_HOME /usr/share/zookeeper
-
-#
-# Apache Storm
-#
-ENV STORM_VERSION=2.5.0
-COPY downloads/apache-storm-$STORM_VERSION.tar.gz /tmp/apache-storm-$STORM_VERSION.tar.gz
-RUN tar -xzf /tmp/apache-storm-$STORM_VERSION.tar.gz -C /opt
-RUN rm /tmp/apache-storm-$STORM_VERSION.tar.gz
-ENV STORM_HOME /opt/apache-storm-$STORM_VERSION
-RUN groupadd storm && \
- useradd --gid storm --home-dir /home/storm \
- --create-home --shell /bin/bash storm && \
- chown -R storm:storm $STORM_HOME && \
- mkdir /var/log/storm && \
- chown -R storm:storm /var/log/storm
-RUN ln -s /var/log/storm $STORM_HOME/logs
-RUN ln -s $STORM_HOME/bin/storm /usr/bin/storm
-
-ADD etc/supervisor/conf.d/storm-*.conf /etc/supervisor/conf.d/
-ADD etc/supervisor/conf.d/zookeeper.conf /etc/supervisor/conf.d/
-RUN chmod -R 644 /etc/supervisor/conf.d/*.conf
+ENV CRAWLER_VERSION=3.6.0
-
-#
-# Storm crawler / news crawler
-#
-ENV CRAWLER_VERSION=2.10.0
-RUN groupadd ubuntu && \
- useradd --gid ubuntu --home-dir /home/ubuntu \
- --create-home --shell /bin/bash ubuntu && \
- chown -R ubuntu:ubuntu /home/ubuntu
-USER ubuntu
-WORKDIR /home/ubuntu
-RUN mkdir news-crawler/ && \
- mkdir news-crawler/conf/ && \
- mkdir news-crawler/lib/ && \
- mkdir news-crawler/bin/ && \
- mkdir news-crawler/seeds/ && \
- chmod -R a+rx news-crawler/
+RUN mkdir /news-crawler/ && \
+ mkdir /news-crawler/conf/ && \
+ mkdir /news-crawler/lib/ && \
+ mkdir /news-crawler/bin/ && \
+ chmod -R a+rx /news-crawler/
# add the news crawler uber-jar
-ADD target/crawler-$CRAWLER_VERSION.jar news-crawler/lib/crawler.jar
+COPY target/crawler-$CRAWLER_VERSION.jar /news-crawler/lib/crawler.jar
# and configuration files
-ADD conf/*.* news-crawler/conf/
-ADD seeds/*.txt news-crawler/seeds/
-ADD bin/*.sh news-crawler/bin/
-ADD bin/es_status news-crawler/bin/
-
-USER root
-RUN chown -R ubuntu:ubuntu /home/ubuntu && \
- chmod -R a+r /home/ubuntu && \
- chmod u+x news-crawler/bin/*
-
-
-# Ports:
-# 8080 - Storm UI
-# 9200 - Elasticsearch http
-# 9300 - Elasticsearch java
-# 5601 - Kibana
-EXPOSE 8080 9200 9300 5601
-
-# volumes for persistent data
-USER root
-RUN mkdir /data
-RUN mkdir /data/elasticsearch && chown elasticsearch:elasticsearch /data/elasticsearch
-VOLUME ["/data/elasticsearch"]
-RUN mkdir /data/warc && chown storm:storm /data/warc
-VOLUME ["/data/warc"]
-
-# start all services
-CMD ["/usr/bin/supervisord"]
+COPY conf/*.* /news-crawler/conf/
+COPY bin/*.sh /news-crawler/bin/
+COPY bin/status /news-crawler/bin/
-# launch the crawl
-# CMD ["/home/ubuntu/news-crawler/bin/run-crawler.sh"]
+USER storm
+WORKDIR /news-crawler/
diff --git a/README.md b/README.md
index 58f0d4d..874da21 100644
--- a/README.md
+++ b/README.md
@@ -1,30 +1,29 @@
-# NEWS-CRAWL
+# News Crawler
-Crawler for news based on [StormCrawler](https://stormcrawler.net/). Produces WARC files to be stored as part of the [Common Crawl](https://commoncrawl.org/). The data is hosted as [AWS Open Data Set](https://registry.opendata.aws/) – if you want to use the data and not the crawler software please read [the announcement of the news dataset](https://commoncrawl.org/2016/10/news-dataset-available/).
+Crawler for news based on [StormCrawler](https://stormcrawler.apache.org/). Produces WARC files to be stored as part of the [Common Crawl](https://commoncrawl.org/). The data is hosted as [AWS Open Data Set](https://registry.opendata.aws/) – if you want to use the data and not the crawler software please read [the announcement of the news dataset](https://commoncrawl.org/2016/10/news-dataset-available/).
-Prerequisites
--------------
+## Prerequisites
+* JVM 17 or higher
+* Install OpenSearch 2.19.5
+* Install Apache Storm 2.8.8
+* Start OpenSearch and Storm
+* Create the OpenSearch indices by running [bin/OS_IndexInit.sh](bin/OS_IndexInit.sh) and import the dashboards by running [bin/dashboards/OS_ImportDashboards.sh](bin/dashboards/OS_ImportDashboards.sh)
-* Install Elasticsearch 7.10.2 (ev. also Kibana)
-* Install Apache Storm 2.5.0
-* Start Elasticsearch and Storm
-* Build ES indices by running `bin/ES_IndexInit.sh`
+Alternatively, use the Docker Compose setup, see below.
-Crawler Seeds
--------------
-The crawler relies on [RSS](https://en.wikipedia.org/wiki/RSS)/[Atom](https://en.wikipedia.org/wiki/Atom_(Web_standard)) feeds and [news sitemaps](https://en.wikipedia.org/wiki/Sitemaps#Google_News_Sitemaps) to find links to news articles on news sites. A small collection of example seeds (feeds and sitemaps) is provided in [./seeds/](./seeds/). Adding support for news sites which do not provide a news feed or sitemap is an open issue, see [#41](//github.com/commoncrawl/news-crawl/issues/41).
+## Crawler Seeds
+The crawler relies on [RSS](https://en.wikipedia.org/wiki/RSS)/[Atom](https://en.wikipedia.org/wiki/Atom_(Web_standard)) feeds and [news sitemaps](https://en.wikipedia.org/wiki/Sitemaps#Google_News_Sitemaps) to find links to news articles on news sites. A small collection of example seeds (feeds and sitemaps) is provided in [./seeds/](./seeds/). Adding support for news sites which do not provide a news feed or sitemap is an open issue, see [#41](https://github.com/commoncrawl/news-crawl/issues/41).
-Configuration
--------------
+
+## Configuration
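-The default configuration should work out-of-the-box. The only thing to do is to configure the user agent properties send in the HTTP request header. Open the file `conf/crawler-conf.yaml` in an editor and fill in the values for `http.agent.name` and all further properties starting with the `http.agent.` prefix.
+The default configuration should work out-of-the-box. The only thing to do is to configure the user agent properties sent in the HTTP request header. Open the file `conf/crawler-conf.yaml` in an editor and fill in the values for `http.agent.name` and all further properties starting with the `http.agent.` prefix.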
-Run the crawl
--------------
+## Run the crawl
Generate an uberjar:
``` sh
@@ -33,23 +32,23 @@ mvn clean package
And run ...
``` sh
-storm local target/crawler-2.10.0.jar --local-ttl 60 -- org.commoncrawl.stormcrawler.news.CrawlTopology -conf $PWD/conf/es-conf.yaml -conf $PWD/conf/crawler-conf.yaml $PWD/seeds/ feeds.txt
+storm local target/crawler-3.6.0.jar --local-ttl 60 -- org.commoncrawl.stormcrawler.news.CrawlTopology -conf $PWD/conf/opensearch-conf.yaml -conf $PWD/conf/crawler-conf.yaml $PWD/seeds/ feeds.txt
```
This will launch the crawl topology in local mode for 60 seconds. It will also "inject" all URLs found in the file `./seeds/feeds.txt` in the status index. The URLs point to news feeds and sitemaps from which links to news articles are extracted and fetched. The topology will create WARC files in the directory specified in the configuration under the key `warc.dir`. This directory must be created beforehand.
-Of course, it's also possible to add (or remove) the seeds (feeds and sitemaps) using the Elasticsearch API. In this case, the can topology can be run without the last two arguments.
+Of course, it's also possible to add (or remove) the seeds (feeds and sitemaps) using the OpenSearch API. In this case, the topology can be run without the last two arguments.
-Alternatively, the topology can be run from the [crawler.flux](./conf/crawler.flux), please see the [Storm Flux documentation](https://storm.apache.org/releases/2.5.0/flux.html). Make sure to adapt the Flux definition to your needs!
+Alternatively, the topology can be run from the [crawler.flux](./conf/crawler.flux), please see the [Storm Flux documentation](https://storm.apache.org/releases/2.8.8/flux.html). Make sure to adapt the Flux definition to your needs!
In production, you should use `storm jar ...` to run the topology in distributed mode and continuously (no time limit) including the Storm UI and logging.
-Monitor the crawl
------------------
-When the topology is running you can check that URLs have been injected and news are getting fetched on [http://localhost:9200/status/_search?pretty]. Or use StormCrawler's Kibana dashboards to monitor the crawling process. Please follow the instructions to install the templates for Kibana provided as part of [StormCrawler's Elasticsearch module documentation](//github.com/DigitalPebble/storm-crawler/tree/master/external/elasticsearch).
+## Monitor the crawl
+
+When the topology is running you can check that URLs have been injected and news articles are being fetched on <http://localhost:9200/status/_search?pretty>. Or use StormCrawler's OpenSearch dashboards to monitor the crawling process on <http://localhost:5601/>.
-There is also a shell script [bin/es_status](./bin/es_status) to get aggregated counts from the status index, and to add, delete or force a re-fetch of URLs. E.g.,
+There is also a shell script [bin/status](./bin/status) to get aggregated counts from the status index, and to add, delete or force a re-fetch of URLs. E.g.,
```
-$> bin/es_status aggregate_status
+$> bin/status aggregate_status
@@ -59,47 +58,79 @@ $> bin/es_status aggregate_status
```
-Run Crawl from Docker Container
--------------------------------
+## Run Crawl with Docker Compose
+
+Do not forget to create the uberjar (see above) which is included in the Docker image. Simply run:
-First, download Apache Storm 2.5.0. from the [download page](https://storm.apache.org/downloads.html) and place it in the directory `downloads`:
```
-STORM_VERSION=2.5.0
-mkdir downloads
-wget -q -P downloads --timestamping https://downloads.apache.org/storm/apache-storm-$STORM_VERSION/apache-storm-$STORM_VERSION.tar.gz
+mvn clean package
```
-Do not forget to create the uberjar (see above) which is included in the Docker image. Simply run:
+Verify the configuration in the file [docker-compose.yaml](docker-compose.yaml) and [conf/](conf/) is correct:
+- Don't forget to adapt the paths to mounted volumes used to persist data (OpenSearch indexes and WARC files).
+- Make sure to add the user agent configuration in conf/crawler-conf.yaml.
+
+Then download and build the Docker images:
```
-mvn clean package
+docker compose -f docker-compose.yaml up --build --renew-anon-volumes --remove-orphans
+```
+
+Wait until the containers are running, then initialize the OpenSearch index and the dashboards:
+
+```
+./bin/OS_IndexInit.sh
+./bin/dashboards/OS_ImportDashboards.sh
```
-Then build the Docker image from the [Dockerfile](./Dockerfile):
+NOTE:
+- This will delete existing indexes!
+- Make sure that the OpenSearch port 9200 is not already in use or mapped by a running OpenSearch instance. Otherwise OpenSearch commands may affect the running instance!
-Note: the uberjar is included in the Docker image and needs to be built first (see above).
+To launch the topology using [Storm Flux](https://storm.apache.org/releases/2.8.8/flux.html):
```
-docker build -t newscrawler:2.10.0 .
+docker compose run --rm news-crawler \
+ storm jar lib/crawler.jar org.apache.storm.flux.Flux --remote /news-crawler/conf/crawler.flux
+```
+Or using the Java topology:
+```
+docker compose run --rm news-crawler \
+ storm jar lib/crawler.jar -- org.commoncrawl.stormcrawler.news.CrawlTopology \
+ /data/seeds '*' -conf conf/opensearch-conf.yaml -conf conf/crawler-conf.yaml
```
-To launch an interactive container:
+After 1-2 minutes if everything is up, connect to OpenSearch on port [9200](http://localhost:9200/) or the OpenSearch dashboards on port [5601](http://localhost:5601/).
+For inspecting the worker log files:
+```
+docker exec storm-supervisor-news-crawl /bin/bash -c 'cat /logs/workers-artifacts/*/*/worker.log'
```
-docker run --net=host \
- -v $PWD/data/elasticsearch:/data/elasticsearch \
- -v $PWD/data/warc:/data/warc \
- --rm --name newscrawler -i -t newscrawler:2.10.0 /bin/bash
+
+To stop the topology:
```
+docker compose run --rm -ti news-crawler /bin/bash
+
+$> storm list
+Topology_name Status Num_tasks Num_workers Uptime_secs Topology_Id Owner
+----------------------------------------------------------------------------------------
+NewsCrawl ACTIVE 48 1 146 NewsCrawl-1-1774977605 storm
-NOTE: don't forget to adapt the paths to mounted volumes used to persist data on the host. Make sure to add the user agent configuration in conf/crawler-conf.yaml.
+$> storm kill NewsCrawl
+```
-CAVEAT: Make sure that the Elasticsearch port 9200 is not already in use or mapped by a running ES instance. Otherwise Elasticsearch commands may affect the running instance!
+## Note for developers
-Once you are logged onto the Docker container, start the services and crawl with
+Please format your code before submitting a PR with
```
-/home/ubuntu/news-crawler/bin/run-crawler.sh
+mvn git-code-format:format-code -Dgcf.globPattern="**/*" -Dskip.format.code=false
```
-After 1-2 minutes if everything is up, connect to Elasticsearch on port [9200](http://127.0.0.1:9200/) or Kibana on port [5601](http://127.0.0.1:5601/).
+You can enable pre-commit format hooks by running:
+
+```
+mvn clean install -Dskip.format.code=false
+```
+
+
diff --git a/bin/ES_IndexInit.sh b/bin/ES_IndexInit.sh
deleted file mode 100755
index 394faf7..0000000
--- a/bin/ES_IndexInit.sh
+++ /dev/null
@@ -1,121 +0,0 @@
-# modified version of
-# https://github.com/DigitalPebble/storm-crawler/blob/master/external/elasticsearch/ES_IndexInit.sh
-
-ESHOST="http://localhost:9200"
-#ESCREDENTIALS="-u elastic:passwordhere"
-
-# deletes and recreates a status index with a bespoke schema
-
-curl $ESCREDENTIALS -s -XDELETE "$ESHOST/status/" > /dev/null
-
-echo "Deleted status index"
-
-# http://localhost:9200/status/_mapping/status?pretty
-
-echo "Creating status index with mapping"
-
-curl $ESCREDENTIALS -s -XPUT $ESHOST/status -H 'Content-Type: application/json' -d '
-{
- "settings": {
- "index": {
- "number_of_shards": 16,
- "number_of_replicas": 1,
- "refresh_interval": "5s"
- }
- },
- "mappings": {
- "dynamic_templates": [{
- "metadata": {
- "path_match": "metadata.*",
- "match_mapping_type": "string",
- "mapping": {
- "type": "keyword"
- }
- }
- }],
- "_source": {
- "enabled": true
- },
- "properties": {
- "nextFetchDate": {
- "type": "date",
- "format": "dateOptionalTime"
- },
- "status": {
- "type": "keyword"
- },
- "url": {
- "type": "keyword"
- }
- }
- }
-}'
-
-# deletes and recreates a status index with a bespoke schema
-
-curl $ESCREDENTIALS -s -XDELETE "$ESHOST/metrics*/" > /dev/null
-
-echo ""
-echo "Deleted metrics index"
-
-curl $ESCREDENTIALS -s -XPUT $ESHOST/_ilm/policy/14d-deletion_policy -H 'Content-Type:application/json' -d '
-{
- "policy": {
- "phases": {
- "delete": {
- "min_age": "14d",
- "actions": {
- "delete": {}
- }
- }
- }
- }
-}
-'
-
-echo "Creating metrics index with mapping"
-
-# http://localhost:9200/metrics/_mapping/status?pretty
-curl $ESCREDENTIALS -s -XPOST $ESHOST/_template/storm-metrics-template -H 'Content-Type: application/json' -d '
-{
- "index_patterns": "metrics*",
- "settings": {
- "index": {
- "number_of_shards": 1,
- "refresh_interval": "30s"
- },
- "number_of_replicas": 0,
- "lifecycle.name": "14d-deletion_policy"
- },
- "mappings": {
- "_source": { "enabled": true },
- "properties": {
- "name": {
- "type": "keyword"
- },
- "stormId": {
- "type": "keyword"
- },
- "srcComponentId": {
- "type": "keyword"
- },
- "srcTaskId": {
- "type": "short"
- },
- "srcWorkerHost": {
- "type": "keyword"
- },
- "srcWorkerPort": {
- "type": "integer"
- },
- "timestamp": {
- "type": "date",
- "format": "dateOptionalTime"
- },
- "value": {
- "type": "double"
- }
- }
- }
-}'
-
diff --git a/bin/OS_IndexInit.sh b/bin/OS_IndexInit.sh
new file mode 100755
index 0000000..81b4066
--- /dev/null
+++ b/bin/OS_IndexInit.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Resets the OpenSearch indices used by the crawler: deletes and recreates the
+# 'status' and 'content' indices, deletes all 'metrics*' indices and installs
+# the metrics index template. Existing data in these indices is lost.
+#
+# Usage: OS_IndexInit.sh [OSHOST [OSCREDENTIALS]]
+#   OSHOST         OpenSearch base URL (default: http://localhost:9200)
+#   OSCREDENTIALS  curl authentication options (default: "-u opensearch:passwordhere")
+
+# set -e
+
+# Resolve the mapping files relative to the repository root (the parent of this
+# script's directory) so that the script can be started from any directory.
+RESOURCES="$(cd "$(dirname "$0")/.." && pwd)/src/main/resources"
+
+OSHOST=${1:-"http://localhost:9200"}
+# Expanded unquoted in the curl calls on purpose: "-u user:password" must split into two arguments.
+OSCREDENTIALS=${2:-"-u opensearch:passwordhere"}
+
+curl $OSCREDENTIALS -s -XDELETE "$OSHOST/status/" > /dev/null
+echo "Deleted 'status' index, now recreating it..."
+curl $OSCREDENTIALS -s -XPUT "$OSHOST/status" -H 'Content-Type: application/json' --upload-file "$RESOURCES/status.mapping"
+
+echo
+
+curl $OSCREDENTIALS -s -XDELETE "$OSHOST/content/" > /dev/null
+echo "Deleted 'content' index, now recreating it..."
+curl $OSCREDENTIALS -s -XPUT "$OSHOST/content" -H 'Content-Type: application/json' --upload-file "$RESOURCES/indexer.mapping"
+
+echo
+
+curl $OSCREDENTIALS -s -XDELETE "$OSHOST/metrics*/" > /dev/null
+
+echo "Deleted 'metrics' index, now recreating it..."
+
+# http://localhost:9200/metrics/_mapping/status?pretty
+curl $OSCREDENTIALS -s -XPOST "$OSHOST/_template/metrics-template" -H 'Content-Type: application/json' --upload-file "$RESOURCES/metrics.mapping"
+
+echo
diff --git a/bin/dashboards/OS_ImportDashboards.sh b/bin/dashboards/OS_ImportDashboards.sh
new file mode 100755
index 0000000..561f739
--- /dev/null
+++ b/bin/dashboards/OS_ImportDashboards.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Imports the crawler's status and metrics dashboards into OpenSearch Dashboards
+# (expected at localhost:5601) through the saved-objects import API.
+
+# Directory of this script; the .ndjson dashboard exports are read from here.
+BIN=$(dirname "$0")
+
+echo "Importing status dashboard into OpenSearch Dashboards"
+curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form "file=@$BIN/status.ndjson"
+echo ""
+
+echo "Importing metrics dashboard into OpenSearch Dashboards"
+curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form "file=@$BIN/metrics.ndjson"
+echo ""
+
+# Storm internal metrics
+# curl -X POST "localhost:5601/api/saved_objects/_import" -H "kbn-xsrf: true" --form file=@$BIN/storm.ndjson
diff --git a/bin/dashboards/metrics.ndjson b/bin/dashboards/metrics.ndjson
new file mode 100644
index 0000000..20cbb2b
--- /dev/null
+++ b/bin/dashboards/metrics.ndjson
@@ -0,0 +1,10 @@
+{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"name\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcComponentId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcTaskId\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerHost\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerPort\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"stormId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"timestamp\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"value\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"timestamp","title":"metrics"},"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","mi
grationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:06:58.178Z","version":"WzMsMV0="}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:activethreads\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : # active threads","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"3\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"2\"}}],\"listeners\":{},\"title\":\"Fetcher : # active threads\"}"},"id":"Fetcher-:-#-active-threads","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.178Z","version":"WzksMV0="}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:num_queues\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : num queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{},\"spyPerPage\":10},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"3\"}}],\"listeners\":{},\"title\":\"Fetcher : num queues\"}"},"id":"Fetcher-:-num-queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.175Z","version":"WzgsMV0="}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_counter.fetched\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : pages fetched","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{},\"spyPerPage\":10},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"sum\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"3\"}}],\"listeners\":{},\"title\":\"Fetcher : pages fetched\"}"},"id":"Fetcher-:-pages-fetched","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.170Z","version":"WzcsMV0="}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:in_queues\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : URLs waiting in queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"addLegend\":false,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"mode\":\"grouped\",\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"spyPerPage\":10,\"times\":[],\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"4\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"5\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"4\"}}],\"listeners\":{},\"title\":\"Fetcher : URLs waiting in queues\"}"},"id":"Fetcher-:-URLs-waiting-in-queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.160Z","version":"WzUsMV0="}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_average_persec.bytes_fetched_perSec\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : average bytes per second","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":false,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}}],\"listeners\":{},\"title\":\"Fetcher : average bytes per second\"}"},"id":"Fetcher-:-average-bytes-per-second","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.173Z","version":"WzYsMV0="}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_average_persec.fetched_perSec\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : average pages per second","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":false,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"3\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{},\"title\":\"Fetcher : average pages per second\"}"},"id":"Fetcher-:-average-pages-per-second","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.820Z","version":"WzEwLDFd"}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_counter.bytes_fetched\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Total bytes fetched","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"sum\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"m\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{},\"title\":\"Total bytes fetched\"}"},"id":"Total-bytes-fetched","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.823Z","version":"WzExLDFd"}
+{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":{\"query_string\":{\"analyze_wildcard\":true,\"query\":\"*\"}},\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelIndex\":\"1\",\"gridData\":{\"x\":24,\"y\":20,\"w\":12,\"h\":12,\"i\":\"1\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_0\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":12,\"y\":20,\"w\":12,\"h\":12,\"i\":\"2\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_1\"},{\"panelIndex\":\"3\",\"gridData\":{\"x\":0,\"y\":0,\"w\":36,\"h\":12,\"i\":\"3\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_2\"},{\"panelIndex\":\"4\",\"gridData\":{\"x\":0,\"y\":20,\"w\":12,\"h\":12,\"i\":\"4\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_3\"},{\"panelIndex\":\"5\",\"gridData\":{\"x\":0,\"y\":40,\"w\":36,\"h\":8,\"i\":\"5\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_4\"},{\"panelIndex\":\"6\",\"gridData\":{\"x\":0,\"y\":32,\"w\":36,\"h\":8,\"i\":\"6\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_5\"},{\"panelIndex\":\"7\",\"gridData\":{\"x\":0,\"y\":12,\"w\":36,\"h\":8,\"i\":\"7\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_6\"}]","timeRestore":false,"title":"Crawl metrics","version":1},"id":"Crawl-metrics","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"Fetcher-:-#-active-threads","name":"panel_0","type":"visualization"},{"id":"Fetcher-:-num-queues","name":"panel_1","type":"visualization"},{"id":"Fetcher-:-pages-fetched","name":"panel_2","type":"visualization"},{"id":"Fetcher-:-URLs-waiting-in-queues","name":"panel_3","type":"visualization"},{"id":"Fetcher-:-average-bytes-per-second","name":"panel_4","type":"visualization"},{"id":"Fetcher-:-average-pages-per-second","name":"panel_5","type":"visualization"},{"id":"Total-bytes-fetched","name":"panel_6","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:06:58.830Z","version":"WzQsMV0="}
+{"exportedCount":9,"missingRefCount":0,"missingReferences":[]}
diff --git a/bin/dashboards/status.ndjson b/bin/dashboards/status.ndjson
new file mode 100644
index 0000000..b3d0122
--- /dev/null
+++ b/bin/dashboards/status.ndjson
@@ -0,0 +1,5 @@
+{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"key\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"metadata._redirTo\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.depth\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.error%2Ecause\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.error%2Esource\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.fetch%2Eerror%2Ecount\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.isFeed\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.isSitemap\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"na
me\":\"metadata.url%2Epath\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"nextFetchDate\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"status\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"url\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":true,\"readFromDocValues\":true}]","title":"status"},"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","migrationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:07:47.130Z","version":"WzEzLDFd"}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"status count","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"status\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{},\"title\":\"status count\"}"},"id":"status-count","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:07:48.278Z","version":"WzE1LDFd"}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Top Hosts","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"key\",\"size\":50,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{},\"title\":\"Top Hosts\"}"},"id":"Top-Hosts","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:07:48.281Z","version":"WzE2LDFd"}
+{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":\"*\",\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelRefName\":\"panel_0\",\"version\":\"7.3.0\",\"panelIndex\":\"37874bbf-6607-435a-a231-94d81e9193e7\",\"gridData\":{\"x\":0,\"y\":0,\"w\":16,\"h\":20,\"i\":\"37874bbf-6607-435a-a231-94d81e9193e7\"},\"embeddableConfig\":{}},{\"panelRefName\":\"panel_1\",\"version\":\"7.3.0\",\"panelIndex\":\"4faa5b74-1660-44f7-9227-89d900c8231e\",\"gridData\":{\"x\":16,\"y\":0,\"w\":16,\"h\":20,\"i\":\"4faa5b74-1660-44f7-9227-89d900c8231e\"},\"embeddableConfig\":{}}]","timeRestore":false,"title":"Crawl status","version":1},"id":"Crawl-status","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"status-count","name":"panel_0","type":"visualization"},{"id":"Top-Hosts","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:07:47.948Z","version":"WzE0LDFd"}
+{"exportedCount":4,"missingRefCount":0,"missingReferences":[]}
diff --git a/bin/dashboards/storm.ndjson b/bin/dashboards/storm.ndjson
new file mode 100644
index 0000000..880c232
--- /dev/null
+++ b/bin/dashboards/storm.ndjson
@@ -0,0 +1,5 @@
+{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"name\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcComponentId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcTaskId\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerHost\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerPort\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"stormId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"timestamp\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"value\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"timestamp","title":"metrics"},"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","mi
grationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:06:58.178Z","version":"WzMsMV0="}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name: \\\"__receive.population\\\"\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Storm Receive Queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"srcTaskId\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"srcComponentId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}"},"id":"Storm-Receive-Queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:09:12.875Z","version":"WzIwLDFd"}
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"+srcComponentId: \\\"__system\\\" +name: memory\\\\/heap*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Memory Heap","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":true,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"name\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"split\",\"params\":{\"field\":\"srcWorkerHost\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\",\"row\":true}}],\"listeners\":{}}"},"id":"Memory-Heap","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:09:12.877Z","version":"WzIxLDFd"}
+{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":\"*\",\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelRefName\":\"panel_0\",\"version\":\"7.3.0\",\"panelIndex\":\"19123ee9-8f49-4621-a4dc-716b5ff9fcaf\",\"gridData\":{\"x\":0,\"y\":0,\"w\":32,\"h\":8,\"i\":\"19123ee9-8f49-4621-a4dc-716b5ff9fcaf\"},\"embeddableConfig\":{}},{\"panelRefName\":\"panel_1\",\"version\":\"7.3.0\",\"panelIndex\":\"5fd83542-b7e6-48e0-8679-2ffcacf453a3\",\"gridData\":{\"x\":0,\"y\":8,\"w\":32,\"h\":16,\"i\":\"5fd83542-b7e6-48e0-8679-2ffcacf453a3\"},\"embeddableConfig\":{}}]","timeRestore":false,"title":"Storm metrics","version":1},"id":"Storm-metrics","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"Storm-Receive-Queues","name":"panel_0","type":"visualization"},{"id":"Memory-Heap","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:08:33.810Z","version":"WzE5LDFd"}
+{"exportedCount":4,"missingRefCount":0,"missingReferences":[]}
\ No newline at end of file
diff --git a/bin/run-crawler.sh b/bin/run-crawler.sh
deleted file mode 100755
index ca32e47..0000000
--- a/bin/run-crawler.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-# in case volumes are on the host need to adjust permissions
-chown -R elasticsearch:elasticsearch /data/elasticsearch
-chown -R storm:storm /data/warc
-
-# export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
-
-# as root
-/usr/bin/supervisord
-
-# wait until Storm and Elasticsearch are running
-sleep 60
-
-mkdir /tmp/seeds
-cp -rf /home/ubuntu/news-crawler/seeds /tmp/
-chmod -R a+r /tmp/seeds
-
-# start the news crawler as user ubuntu
-sudo -iu ubuntu /bin/bash <<"EOF"
-
-set -e
-
-cd $HOME/news-crawler/
-
-# initialize Elasticsearch indices
-# CAVEAT: this deletes existing indices!
-bin/ES_IndexInit.sh
-sleep 10
-
-STORMCRAWLER="storm jar $PWD/lib/crawler.jar"
-
-# run the crawler
-$STORMCRAWLER -- org.commoncrawl.stormcrawler.news.CrawlTopology \
- /tmp/seeds '*' -conf $PWD/conf/es-conf.yaml -conf $PWD/conf/crawler-conf.yaml
-# alternatively running the flux
-#$STORMCRAWLER org.apache.storm.flux.Flux --remote $PWD/conf/crawler.flux
-# suppress warnings about malformed XML in sitemaps
-storm set_log_level NewsCrawl \
- -l crawlercommons.sitemaps.SiteMapParser=ERROR
-
-
-EOF
diff --git a/bin/es_status b/bin/status
similarity index 98%
rename from bin/es_status
rename to bin/status
index 410c4d1..978fb5a 100755
--- a/bin/es_status
+++ b/bin/status
@@ -5,14 +5,14 @@ __ES_STATUS_URL_DEFAULT='http://localhost:9200/status'
function ____show_help() {
echo "$0 [-v|-V] [-C] []"
echo
- echo "Query StormCrawler's Elasticsearch status index"
+ echo "Query StormCrawler's Elasticsearch or OpenSearch status index"
echo " with help of curl, jq and bash"
echo
echo "Global options"
echo " -h show detailed help"
echo " -v verbose, print commands before execution"
echo " -V very verbose"
- echo " -D dry run, do not execute request to ES (use in combination with -v)"
+ echo " -D dry run, do not execute request (use in combination with -v)"
echo " -C colorize JSON output"
echo
echo "Commands"
@@ -134,12 +134,12 @@ ES_STATUS_URL=${ES_STATUS_URL:-$__ES_STATUS_URL_DEFAULT}
set -e
-# current time in Elasticsearch date format
+# current time in Elasticsearch/OpenSearch date format
function ____now () {
date -u '+%Y-%m-%dT%H:%M:%S.000Z'
}
-# given date in Elasticsearch date format
+# given date in Elasticsearch/OpenSearch date format
function ____date () {
date -d"$1" -u '+%Y-%m-%dT%H:%M:%S.000Z'
}
diff --git a/conf/crawler-conf.yaml b/conf/crawler-conf.yaml
index 00032e1..6bbd52c 100644
--- a/conf/crawler-conf.yaml
+++ b/conf/crawler-conf.yaml
@@ -19,8 +19,8 @@ config:
# mandatory when using Flux
topology.kryo.register:
- - com.digitalpebble.stormcrawler.Metadata
- - com.digitalpebble.stormcrawler.persistence.Status
+ - org.apache.stormcrawler.Metadata
+ - org.apache.stormcrawler.persistence.Status
topology.backpressure.enable: false
@@ -31,7 +31,7 @@ config:
topology.metrics.consumer.register:
- class: "org.apache.storm.metric.LoggingMetricsConsumer"
parallelism.hint: 1
- - class: "com.digitalpebble.stormcrawler.elasticsearch.metrics.MetricsConsumer"
+ - class: "org.apache.stormcrawler.opensearch.metrics.MetricsConsumer"
parallelism.hint: 1
# status index and fetcher queues are partitioned by domain
@@ -70,9 +70,19 @@ config:
# increased network timeout (ms) for news sites from Asia and eastern Europe
http.timeout: 30000
+ # allowed URL protocols
+ protocols: "http,https"
+
# use okhttp
- http.protocol.implementation: com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol
- https.protocol.implementation: com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol
+ http.protocol.implementation: org.apache.stormcrawler.protocol.okhttp.HttpProtocol
+ https.protocol.implementation: org.apache.stormcrawler.protocol.okhttp.HttpProtocol
+
+ # the http/https protocol versions to use, in order of preference
+ # - the WARC writer handles HTTP/1.1 and HTTP/2 (cf. storm-crawler#1010)
+ # - okhttp does not support HTTP/1.0 requests (it supports responses however)
+ # http.protocol.versions:
+ # - "h2"
+ # - "http/1.1"
# do not fail on unknown SSL certificates
http.trust.everything: true
@@ -82,10 +92,26 @@ config:
# or transferred protocol metadata must also be prefixed.
protocol.md.prefix: "protocol."
+ # number of instances for each protocol implementation
+ protocol.instances.num: 8
+ # connection pool configuration of OkHttp protocol
+ okhttp.protocol.connection.pool:
+ # maximum number of idle connections (in addition to active connections)
+ max.idle.connections: 256
+ # maximum keep-alive time of the connections in seconds
+ connection.keep.alive: 300
+ # See also
+ # https://square.github.io/okhttp/3.x/okhttp/okhttp3/ConnectionPool.html
+ # Note that OkHttp's connection pool (v4.9.1) is not optimized for fast
+ # look-up of connections, the pool size (idle and active connections)
+ # should not exceed 1000. To allow for efficient pooling in large and
+ # diverse crawls, it's recommended to increase also the number of protocol
+ # instances, see `protocol.instances.num`.
+
# delay between successive requests to the same host/domain
# (be defensive, a delay of 5 sec. means about 1000 fetches per hour
# which should be enough even for large news sites)
- fetcher.server.delay: 6.0
+ fetcher.server.delay: 9.0
# generous max. crawl delay
# (fetch content even if the robots.txt specifies a large host-specific crawl delay:
@@ -103,7 +129,7 @@ config:
fetcher.max.urls.in.queues: 6000
# fetch Scheduler implementation
- scheduler.class: "com.digitalpebble.stormcrawler.persistence.AdaptiveScheduler"
+ scheduler.class: "org.apache.stormcrawler.persistence.AdaptiveScheduler"
# AdaptiveScheduler properties
scheduler.adaptive.setLastModified: true
# frequently changing feeds or news sitemaps are refetched after 90 min.
diff --git a/conf/crawler.flux b/conf/crawler.flux
index 5367184..4d390a5 100644
--- a/conf/crawler.flux
+++ b/conf/crawler.flux
@@ -6,11 +6,11 @@ includes:
override: false
- resource: false
- file: "crawler-conf.yaml"
+ file: "conf/crawler-conf.yaml"
override: true
- resource: false
- file: "es-conf.yaml"
+ file: "conf/opensearch-conf.yaml"
override: true
config:
@@ -21,7 +21,7 @@ config:
components:
- id: "WARCFileNameFormat"
- className: "com.digitalpebble.stormcrawler.warc.WARCFileNameFormat"
+ className: "org.apache.stormcrawler.warc.WARCFileNameFormat"
configMethods:
- name: "withPath"
args:
@@ -30,7 +30,7 @@ components:
args:
- "CC-NEWS"
- id: "WARCFileRotationPolicy"
- className: "com.digitalpebble.stormcrawler.warc.FileTimeSizeRotationPolicy"
+ className: "org.apache.stormcrawler.warc.FileTimeSizeRotationPolicy"
constructorArgs:
- 1024
- MB
@@ -45,7 +45,7 @@ components:
- name: "put"
args:
- "software"
- - "StormCrawler 2.10 https://stormcrawler.net/"
+ - "StormCrawler 3.6.0 https://stormcrawler.apache.org/"
- name: "put"
args:
- "description"
@@ -77,19 +77,19 @@ components:
spouts:
- id: "spout"
- className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout"
+ className: "org.apache.stormcrawler.opensearch.persistence.AggregationSpout"
parallelism: 16
- id: "filespout"
- className: "com.digitalpebble.stormcrawler.spout.FileSpout"
+ className: "org.apache.stormcrawler.spout.FileSpout"
parallelism: 1
constructorArgs:
- - "/path/to/seeds/"
+ - "/data/seeds/"
- "feeds.txt"
- true
bolts:
- id: "filter"
- className: "com.digitalpebble.stormcrawler.bolt.URLFilterBolt"
+ className: "org.apache.stormcrawler.bolt.URLFilterBolt"
parallelism: 1
- id: "prefilter"
className: "org.commoncrawl.stormcrawler.news.PreFilterBolt"
@@ -97,22 +97,22 @@ bolts:
constructorArgs:
- "pre-urlfilters.json"
- id: "partitioner"
- className: "com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt"
+ className: "org.apache.stormcrawler.bolt.URLPartitionerBolt"
parallelism: 1
- id: "fetcher"
- className: "com.digitalpebble.stormcrawler.bolt.FetcherBolt"
+ className: "org.apache.stormcrawler.bolt.FetcherBolt"
parallelism: 1
- id: "sitemap"
className: "org.commoncrawl.stormcrawler.news.NewsSiteMapParserBolt"
parallelism: 1
- id: "feed"
- className: "com.digitalpebble.stormcrawler.bolt.FeedParserBolt"
+ className: "org.apache.stormcrawler.bolt.FeedParserBolt"
parallelism: 1
- id: "ssbolt"
- className: "com.digitalpebble.stormcrawler.indexing.DummyIndexer"
+ className: "org.apache.stormcrawler.indexing.DummyIndexer"
parallelism: 1
- id: "warc"
- className: "com.digitalpebble.stormcrawler.warc.WARCHdfsBolt"
+ className: "org.apache.stormcrawler.warc.WARCHdfsBolt"
parallelism: 1
configMethods:
- name: "withFileNameFormat"
@@ -129,7 +129,7 @@ bolts:
args:
- "warc"
- id: "status"
- className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt"
+ className: "org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt"
parallelism: 1
streams:
@@ -219,7 +219,7 @@ streams:
streamId: "status"
type: CUSTOM
customClass:
- className: "com.digitalpebble.stormcrawler.util.URLStreamGrouping"
+ className: "org.apache.stormcrawler.util.URLStreamGrouping"
constructorArgs:
- "byDomain"
diff --git a/conf/es-conf.yaml b/conf/es-conf.yaml
deleted file mode 100644
index 9b02a56..0000000
--- a/conf/es-conf.yaml
+++ /dev/null
@@ -1,83 +0,0 @@
-# configuration for Elasticsearch resources
-
-config:
- # ES metricsConsumer
- es.metrics.addresses: "http://localhost:9200"
- es.metrics.index.name: "metrics"
-
- # ES spout and persistence bolt
- es.status.addresses: "http://localhost:9200"
- es.status.index.name: "status"
- #es.status.user: "USERNAME"
- #es.status.password: "PASSWORD"
- # the routing is done on the value of 'partition.url.mode'
- es.status.routing: true
- # stores the value used for grouping the URLs as a separate field
- # needed by the spout implementations
- # also used for routing if the value above is set to true
- es.status.routing.fieldname: "metadata.hostname"
- es.status.bulkActions: 500
- es.status.flushInterval: "5s"
- es.status.concurrentRequests: 1
-
- ################
- # spout config #
- ################
-
- # positive or negative filter parsable by the Lucene Query Parser
- # es.status.filterQuery:
- # - "-(metadata.hostname:stormcrawler.net)"
- # - "-(key:digitalpebble.com)"
-
- # time in secs for which the URLs will be considered for fetching after a ack or fail
- # need a high value to avoid duplicates by URLs added multiple times to the fetcher
- # queues, should be close to
- # fetcher.max.crawl.delay * fetcher.max.queue.size
- spout.ttl.purgatory: 1200
-
- # Min time (in msecs) to allow between 2 successive queries (per bucket) to ES
- spout.min.delay.queries: 30000
-
- # Delay since previous query date (in secs) after which the nextFetchDate value will be reset to the current time
- # Setting this to -1 or a large value means that the ES will cache the results but also that less and less results
- # might be returned.
- # - should reset to avoid that a bucket with many URLs blocks incrementing the date to look
- # for next fetches for a specific bucket. May happen if a news sitemap adds 1000s of URLs.
- spout.reset.fetchdate.after: 240
-
- es.status.max.buckets: 200
- # max. URLs per bucket (= domain name): 30 sec. / 5 sec. fetch delay = 6
- # but set to a lower number for domains with longer crawl-delay
- # cf. also fetcher.max.queue.size and fetcher.max.urls.in.queues
- es.status.max.urls.per.bucket: 5
- # field to group the URLs into buckets
- es.status.bucket.field: "metadata.hostname"
- # fields to sort the URLs within a bucket
- es.status.bucket.sort.field:
- - "nextFetchDate"
- - "url"
- # field to sort the buckets
- es.status.global.sort.field: "nextFetchDate"
-
- # CollapsingSpout : limits the deep paging by resetting the start offset for the ES query
- es.status.max.start.offset: 500
-
- # AggregationSpout : sampling improves the performance on large crawls
- es.status.sample: false
-
- # max allowed duration of a query in sec
- es.status.query.timeout: -1
-
- # AggregationSpout (expert): adds this value in mins to the latest date returned in the results and
- # use it as nextFetchDate
- es.status.recentDate.increase: -1
- es.status.recentDate.min.gap: -1
-
- topology.metrics.consumer.register:
- - class: "com.digitalpebble.stormcrawler.elasticsearch.metrics.MetricsConsumer"
- parallelism.hint: 1
- #whitelist:
- # - "fetcher_counter"
- # - "fetcher_average.bytes_fetched"
- #blacklist:
- # - "__receive.*"
diff --git a/conf/opensearch-conf.yaml b/conf/opensearch-conf.yaml
new file mode 100644
index 0000000..e6d2025
--- /dev/null
+++ b/conf/opensearch-conf.yaml
@@ -0,0 +1,126 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# configuration for OpenSearch resources
+
+config:
+
+ # address to use unless a more specific one has been
+ # defined for a component
+ # also accepts a list or multiple values in a single line
+ # separated by a semi-colon e.g. "opensearch1:9200; opensearch2:9200"
+ # Note: here the address from inside the docker-compose cluster is required
+ opensearch.addresses: "http://opensearch-news-crawl:9200"
+ #opensearch.user: "USERNAME"
+ #opensearch.password: "PASSWORD"
+ opensearch.concurrentRequests: 2
+
+ # Disable TLS validation for connection to OpenSearch
+ # opensearch.disable.tls.validation: false
+
+ # Indexer bolt
+ # addresses can be specified as a full URL
+ # if not we assume that the protocol is http and the port 9200
+ opensearch.indexer.addresses: "http://opensearch-news-crawl:9200"
+ opensearch.indexer.index.name: "content"
+ # opensearch.indexer.pipeline: "_PIPELINE_"
+ opensearch.indexer.create: false
+ opensearch.indexer.bulkActions: 100
+ opensearch.indexer.flushInterval: "2s"
+ opensearch.indexer.concurrentRequests: 1
+ opensearch.indexer.sniff: true
+
+ # MetricsConsumer
+ opensearch.metrics.addresses: "http://opensearch-news-crawl:9200"
+ opensearch.metrics.index.name: "metrics"
+ opensearch.metrics.sniff: true
+
+ # Spout and persistence bolt
+ opensearch.status.addresses: "http://opensearch-news-crawl:9200"
+ opensearch.status.index.name: "status"
+ #opensearch.status.user: "USERNAME"
+ #opensearch.status.password: "PASSWORD"
+ # the routing is done on the value of 'partition.url.mode'
+ opensearch.status.routing: true
+ # stores the value used for grouping the URLs as a separate field
+ # needed by the spout implementations
+ # also used for routing if the value above is set to true
+ opensearch.status.routing.fieldname: "key"
+ opensearch.status.bulkActions: 500
+ opensearch.status.flushInterval: "5s"
+ opensearch.status.concurrentRequests: 1
+ opensearch.status.sniff: true
+
+ ################
+ # spout config #
+ ################
+
+ # positive or negative filters parsable by the Lucene Query Parser
+ # opensearch.status.filterQuery:
+ # - "-(key:stormcrawler.net)"
+ # - "-(key:stormcrawler.apache.org)"
+
+ # time in secs for which the URLs will be considered for fetching after an ack or fail
+ # need a high value to avoid duplicates by URLs added multiple times to the fetcher
+ # queues, should be close to
+ # fetcher.max.crawl.delay * fetcher.max.queue.size
+ spout.ttl.purgatory: 1200
+
+ # Min time (in msecs) to allow between 2 successive queries to OpenSearch
+ spout.min.delay.queries: 30000
+
+ # Max time (in msecs) to allow between 2 successive queries to OpenSearch
+ spout.max.delay.queries: 60000
+
+ # Delay since previous query date (in secs) after which the nextFetchDate value will be reset to the current time
+ # Setting this to -1 or a large value means that OpenSearch will cache the results but also that fewer and fewer results
+ # might be returned.
+ # - should reset to avoid that a bucket with many URLs blocks incrementing the date to look
+ # for next fetches for a specific bucket. May happen if a news sitemap adds 1000s of URLs.
+ spout.reset.fetchdate.after: 240
+
+ opensearch.status.max.buckets: 200
+ # max. URLs per bucket (= domain name): 30 sec. / 5 sec. fetch delay = 6
+ # but set to a lower number for domains with longer crawl-delay
+ # cf. also fetcher.max.queue.size and fetcher.max.urls.in.queues
+ opensearch.status.max.urls.per.bucket: 5
+ # field to group the URLs into buckets
+ opensearch.status.bucket.field: "key"
+ # fields to sort the URLs within a bucket
+ opensearch.status.bucket.sort.field:
+ - "nextFetchDate"
+ - "url"
+ # field to sort the buckets
+ opensearch.status.global.sort.field: "nextFetchDate"
+
+ # AggregationSpout : sampling improves the performance on large crawls
+ opensearch.status.sample: false
+
+ # max allowed duration of a query in sec
+ opensearch.status.query.timeout: -1
+
+ # AggregationSpout (expert): adds this value in mins to the latest date returned in the results and
+ # use it as nextFetchDate
+ opensearch.status.recentDate.increase: -1
+ opensearch.status.recentDate.min.gap: -1
+
+ topology.metrics.consumer.register:
+ - class: "org.apache.stormcrawler.opensearch.metrics.MetricsConsumer"
+ parallelism.hint: 1
+ #whitelist:
+ # - "fetcher_counter"
+ # - "fetcher_average.bytes_fetched"
+ #blacklist:
+ # - "__receive.*"
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..31bbc05
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,122 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+services:
+
+ # Apache Storm components
+ # - Zookeeper coordinates the communication between Nimbus and the Supervisors
+ zookeeper:
+ image: zookeeper:${ZOOKEEPER_VERSION:-3.9.4}
+ container_name: zookeeper-news-crawl
+ restart: always
+
+ # - the daemon Nimbus runs on the master node
+ storm-nimbus:
+ image: storm:${STORM_VERSION:-2.8.8}
+ container_name: storm-nimbus-news-crawl
+ hostname: nimbus
+ command: storm nimbus
+ depends_on:
+ - zookeeper
+ links:
+ - zookeeper
+ ports:
+ - 6627:6627
+ restart: always
+
+ # - the Supervisors run on the worker nodes
+ storm-supervisor:
+ image: storm:${STORM_VERSION:-2.8.8}
+ container_name: storm-supervisor-news-crawl
+ command: storm supervisor -c worker.childopts=-Xmx%HEAP-MEM%m
+ depends_on:
+ - zookeeper
+ - storm-nimbus
+ links:
+ - zookeeper
+ - storm-nimbus:nimbus
+ # supervisor launches the worker processes
+ # which need to be able to access
+ # - (in case an indexing topology is run) the
+ # OpenSearch (http://opensearch:9200/) and
+ - opensearch
+ # - the WARC output folder
+ # - and the seed folder
+ volumes:
+ - ${WARCOUTPUT:-./warcdata}:/data/warc
+ - ${SEEDDIR:-./seeds}:/data/seeds
+ restart: always
+
+ # - the Storm UI provides diagnostics about the Storm cluster
+ storm-ui:
+ image: storm:${STORM_VERSION:-2.8.8}
+ container_name: storm-ui-news-crawl
+ command: storm ui
+ depends_on:
+ - storm-nimbus
+ links:
+ - storm-nimbus:nimbus
+ ports:
+ - "127.0.0.1:8080:8080"
+ restart: always
+
+ opensearch:
+ image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-2.19.5}
+ container_name: opensearch-news-crawl
+ environment:
+ - cluster.name=opensearch-news-crawl-cluster
+ - node.name=opensearch-news-crawl
+ - discovery.type=single-node
+ - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
+ - "OPENSEARCH_JAVA_OPTS=-Xms4G -Xmx4G"
+ - plugins.security.disabled=true
+ - "DISABLE_INSTALL_DEMO_CONFIG=true"
+ volumes:
+ - ${OPENSEARCHDATA:-./opensearchdata}:/usr/share/opensearch/data
+ ulimits:
+ memlock:
+ soft: -1
+ hard: -1
+ nofile:
+ soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
+ hard: 65536
+ ports:
+ - "127.0.0.1:9200:9200" # REST API
+
+ opensearch-dashboard:
+ image: opensearchproject/opensearch-dashboards:${OPENSEARCH_VERSION:-2.19.5}
+ container_name: opensearch-dashboard-news-crawl
+ ports:
+ - "127.0.0.1:5601:5601"
+ expose:
+ - "5601"
+ environment:
+ - 'OPENSEARCH_HOSTS=["http://opensearch-news-crawl:9200"]'
+ - "DISABLE_SECURITY_DASHBOARDS_PLUGIN=true" # disables security dashboards plugin in OpenSearch Dashboards
+
+ # - to launch a topology
+ # - will exit on startup
+ news-crawler:
+ build: .
+ container_name: news-crawler
+ command: /bin/bash
+ depends_on:
+ - storm-nimbus
+ links:
+ - storm-nimbus:nimbus
+ volumes:
+ - ${WARCOUTPUT:-./warcdata}:/data/warc
+ - ${SEEDDIR:-./seeds}:/data/seeds
+ restart: "no"
+
diff --git a/etc/supervisor/conf.d/elasticsearch.conf b/etc/supervisor/conf.d/elasticsearch.conf
deleted file mode 100644
index a4f0020..0000000
--- a/etc/supervisor/conf.d/elasticsearch.conf
+++ /dev/null
@@ -1,8 +0,0 @@
-[program:elasticsearch]
-command=/usr/share/elasticsearch/bin/elasticsearch -Enetwork.host=127.0.0.1 -Ehttp.port=9200 -Etransport.tcp.port=9300
-numprocs=1
-autostart=true
-autorestart=true
-user=elasticsearch
-echo environment=ES_HEAP_SIZE="20g"
-environment=ES_PATH_CONF=/etc/elasticsearch
\ No newline at end of file
diff --git a/etc/supervisor/conf.d/kibana.conf b/etc/supervisor/conf.d/kibana.conf
deleted file mode 100644
index 45e1812..0000000
--- a/etc/supervisor/conf.d/kibana.conf
+++ /dev/null
@@ -1,7 +0,0 @@
-[program:kibana]
-command=/usr/share/kibana/bin/kibana -c /etc/kibana/kibana.yml
-numprocs=1
-autostart=true
-autorestart=true
-user=kibana
-directory=/usr/share/kibana/
diff --git a/etc/sysctl.d/60-elasticsearch.conf b/etc/sysctl.d/60-elasticsearch.conf
deleted file mode 100644
index ae43f01..0000000
--- a/etc/sysctl.d/60-elasticsearch.conf
+++ /dev/null
@@ -1,7 +0,0 @@
-
-# Elasticsearch settings
-# see
-# https://www.elastic.co/guide/en/elasticsearch/reference/current/setup-configuration-memory.html#swappiness
-# https://www.elastic.co/guide/en/elasticsearch/reference/current/vm-max-map-count.html
-vm.swappiness=1
-vm.max_map_count=262144
diff --git a/pom.xml b/pom.xml
index db70413..69b7487 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,11 +1,34 @@
-
+
+
+4.0.0org.commoncrawl.stormcrawler.newscrawler
- 2.10.0
+ 3.6.0jar
+
+ https://github.com/commoncrawl/news-crawlApache License, Version 2.0
@@ -13,35 +36,119 @@
- https://github.com/commoncrawl/news-crawl
-
UTF-8
- 2.10
- 2.5.0
- 1.12.467
- 2.11.1
- 1.1
- 5.5.0
- 2.26.3
- 4.13
+ 3.6.0
+ 2.8.8
+ 1.12.797
+ 2.21.3
+ 1.6
+ 5.23.0
+ 3.0.1
+ 4.13.2
+ 5.4
+ true
+
+
+ org.apache.stormcrawler
+ stormcrawler-core
+ ${stormcrawler.version}
+
+
+
+ org.apache.storm
+ storm-client
+ ${storm.version}
+ provided
+
+
+ org.apache.storm
+ flux-core
+ ${storm.version}
+
+
+
+ org.apache.stormcrawler
+ stormcrawler-opensearch
+ ${stormcrawler.version}
+
+
+ org.apache.stormcrawler
+ stormcrawler-warc
+ ${stormcrawler.version}
+
+
+ com.github.crawler-commons
+ crawler-commons
+ ${crawler-commons.version}
+
+
+
+ com.amazonaws
+ aws-java-sdk-s3
+ ${aws.version}
+
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+ ${jackson.version}
+
+
+
+
+ org.apache.stormcrawler
+ stormcrawler-core
+ ${stormcrawler.version}
+ test-jar
+ test
+
+
+
+ org.mockito
+ mockito-core
+ ${mockito.version}
+ test
+
+
+
+ com.github.tomakehurst
+ wiremock
+ ${wiremock.version}
+ test
+
+
+
+ junit
+ junit
+ ${junit.version}
+ test
+
+
+
org.apache.maven.pluginsmaven-compiler-plugin
- 3.11.0
+ 3.15.0
- 11
- 11
+ 17
+ 17org.codehaus.mojoexec-maven-plugin
- 3.1.0
+ 3.6.3
+
+ java
+ true
+ false
+ compile
+
@@ -49,34 +156,28 @@
-
- java
- true
- false
- compile
- org.apache.maven.pluginsmaven-shade-plugin
- 3.5.0
+ 3.6.2
- packageshade
+ packagefalse
+ implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
org.apache.storm.flux.Flux
-
-
+
+
@@ -105,119 +206,46 @@
-
- org.owasp
- dependency-check-maven
- 6.1.0
-
- true
-
-
-
-
- aggregate
-
-
-
-
+ com.cosium.code
+ git-code-format-maven-plugin
+ ${git-code-format-maven-plugin.version}
+
+
+
+ install-formatter-hook
+
+ install-hooks
+
+
+
+
+ validate-code-format
+
+ validate-code-format
+
+
+
+
+
+
+ com.cosium.code
+ google-java-format
+ ${git-code-format-maven-plugin.version}
+
+
+
+ ${skip.format.code}
+
+ true
+ false
+ false
+ false
+
+
+
-
-
-
- com.digitalpebble.stormcrawler
- storm-crawler-core
- ${stormcrawler.version}
-
-
-
- org.apache.storm
- storm-client
- ${storm.version}
- provided
-
-
- org.apache.storm
- flux-core
- ${storm.version}
-
-
-
- com.digitalpebble.stormcrawler
- storm-crawler-elasticsearch
- ${stormcrawler.version}
-
-
- com.digitalpebble.stormcrawler
- storm-crawler-warc
- ${stormcrawler.version}
-
-
-
- jdk.tools
- jdk.tools
-
-
-
-
-
-
- com.fasterxml.jackson.core
- jackson-databind
- ${jackson-databind.version}
-
-
-
- com.github.crawler-commons
- crawler-commons
- ${crawler-commons.version}
-
-
-
- com.amazonaws
- aws-java-sdk-s3
- ${aws.version}
-
-
-
-
- com.digitalpebble.stormcrawler
- storm-crawler-core
- ${stormcrawler.version}
- test-jar
- test
-
-
-
- org.mockito
- mockito-core
- ${mockito.version}
- test
-
-
-
- com.github.tomakehurst
- wiremock
- ${wiremock.version}
- test
-
-
-
- junit
- junit
- ${junit.version}
- test
-
-
-
-
-
-
- commons-io
- commons-io
- 2.11.0
-
-
-
diff --git a/seeds/feeds.txt b/seeds/feeds.txt
index b74ad0e..88468a8 100644
--- a/seeds/feeds.txt
+++ b/seeds/feeds.txt
@@ -1,26 +1,3 @@
-https://www.usatoday.com/news-sitemap.xml isSitemapNews=true
-https://www.theguardian.com/sitemaps/news.xml isSitemapNews=true
-https://www.theguardian.com/international/rss isFeed=true
-https://www.theguardian.com/world/rss isFeed=true
-https://www.theguardian.com/uk/rss isFeed=true
-https://www.theguardian.com/us/rss isFeed=true
-https://www.theguardian.com/world/eu/rss isFeed=true
-https://www.theguardian.com/politics/rss isFeed=true
-https://www.theguardian.com/science/rss isFeed=true
-https://www.theguardian.com/education/rss isFeed=true
-https://www.theguardian.com/football/rss isFeed=true
-https://www.elwatannews.com/home/rssfeeds isFeed=true
-https://www.corriere.it/rss/sitemap_v2.xml isSitemapIndex=true
-https://www.repubblica.it/rss/homepage/rss2.0.xml isFeed=true
-https://www.repubblica.it/rss/economia/rss2.0.xml isFeed=true
-https://www.repubblica.it/rss/politica/rss2.0.xml isFeed=true
-https://www.lemonde.fr/sitemap_news.xml isSitemapNews=true
-https://www.lemonde.fr/economie/rss_full.xml isFeed=true
-https://www.lemonde.fr/rss/une.xml isFeed=true
-https://www.lemonde.fr/international/rss_full.xml isFeed=true
-https://www.lemonde.fr/politique/rss_full.xml isFeed=true
-https://www.lemonde.fr/livres/rss_full.xml isFeed=true
-https://www.lemonde.fr/afrique/rss_full.xml isFeed=true
-https://www.lemonde.fr/ameriques/rss_full.xml isFeed=true
-https://www.cnn.com/sitemaps/cnn/news.xml isSitemapNews=true
-https://www.bbc.com/sitemaps/https-index-com-news.xml isSitemapNews=true
+https://commoncrawl.org/blog/rss.xml
+
+# Please add your news feeds and sitemaps below, one URL per line.
\ No newline at end of file
diff --git a/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java b/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java
index 6478ec1..9b2ee32 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/filter/FastURLFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -16,6 +16,16 @@
*/
package org.commoncrawl.stormcrawler.filter;
+import com.amazonaws.services.s3.AmazonS3;
+import com.amazonaws.services.s3.AmazonS3ClientBuilder;
+import com.amazonaws.services.s3.model.GetObjectRequest;
+import com.amazonaws.services.s3.model.ObjectMetadata;
+import com.amazonaws.services.s3.model.S3Object;
+import com.fasterxml.jackson.core.JsonParseException;
+import com.fasterxml.jackson.databind.JsonMappingException;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.google.common.collect.LinkedHashMultimap;
+import com.google.common.collect.Multimap;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
@@ -29,37 +39,23 @@
import java.util.TimerTask;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
-
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.stormcrawler.JSONResource;
+import org.apache.stormcrawler.Metadata;
+import org.apache.stormcrawler.filtering.URLFilter;
+import org.apache.stormcrawler.util.ConfUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.amazonaws.services.s3.AmazonS3;
-import com.amazonaws.services.s3.AmazonS3ClientBuilder;
-import com.amazonaws.services.s3.model.GetObjectRequest;
-import com.amazonaws.services.s3.model.ObjectMetadata;
-import com.amazonaws.services.s3.model.S3Object;
-import com.digitalpebble.stormcrawler.JSONResource;
-import com.digitalpebble.stormcrawler.Metadata;
-import com.digitalpebble.stormcrawler.filtering.URLFilter;
-import com.digitalpebble.stormcrawler.util.ConfUtils;
-import com.fasterxml.jackson.core.JsonParseException;
-import com.fasterxml.jackson.databind.JsonMappingException;
-import com.fasterxml.jackson.databind.JsonNode;
-import com.google.common.collect.LinkedHashMultimap;
-import com.google.common.collect.Multimap;
-
/**
- * Version of the FastURLFilter that can load from a text representation instead
- * of the JSON that the SC version handles. Can also reload periodically and get
- * its content from S3.
- *
- * Filters URLs based on a file of regular expressions using host/domains
- * matching first. The default policy is to accept a URL if no matches are
- * found.
+ * Version of the FastURLFilter that can load from a text representation instead of the JSON that
+ * the SC version handles. Can also reload periodically and get its content from S3.
+ *
+ *
Filters URLs based on a file of regular expressions using host/domains matching first. The
+ * default policy is to accept a URL if no matches are found.
+ *
+ *
- *
- * Host rules are evaluated before Domain rules. For
- * Host rules the entire host name of a URL must match while the
- * domain names in Domain rules are considered as matches if the
- * domain is a suffix of the host name (consisting of complete host name parts).
- * Shorter domain suffixes are checked first, a single dot
- * "." as "domain name" can be used to specify
- * global rules applied to every URL.
- *
- * E.g., for "www.example.com" the rules given above are looked up in the
- * following order:
+ *
+ * Host rules are evaluated before Domain rules. For Host
+ * rules the entire host name of a URL must match while the domain names in Domain
+ * rules are considered as matches if the domain is a suffix of the host name (consisting of
+ * complete host name parts). Shorter domain suffixes are checked first, a single dot ".
+ * " as "domain name" can be used to specify global rules applied to every
+ * URL.
+ *
+ *
E.g., for "www.example.com" the rules given above are looked up in the following order:
+ *
*
- *
check "www.example.com" whether host-based rules exist and whether one of
- * them matches
- *
check "www.example.com" for domain-based rules
- *
check "example.com" for domain-based rules
- *
check "com" for domain-based rules
- *
check for global rules ("Domain .")
+ *
check "www.example.com" whether host-based rules exist and whether one of them matches
+ *
check "www.example.com" for domain-based rules
+ *
check "example.com" for domain-based rules
+ *
check "com" for domain-based rules
+ *
check for global rules ("Domain .")
*
- * The first matching rule will reject the URL and no further rules are checked.
- * If no rule matches the URL is accepted. URLs without a host name (e.g.,
- * file:/path/file.txt are checked for global rules only. URLs
- * which fail to be parsed as {@link java.net.URL} are always rejected.
- *
- * For rules either the URL path (DenyPath) or path and query
- * (DenyPathQuery) are checked whether the given
- * {@link java.util.regex Java Regular expression} is found (see
- * {@link java.util.regex.Matcher#find()}) in the URL path (and query).
- *
- * Rules are applied in the order of their definition. For better performance,
- * regular expressions which are simpler/faster or match more URLs should be
- * defined earlier.
- *
- * Comments in the rule file start with the # character and reach
- * until the end of the line.
- *
- * The rules file is defined via the property urlfilter.fast.file,
- * the default name is fast-urlfilter.txt.
+ *
+ * The first matching rule will reject the URL and no further rules are checked. If no rule matches
+ * the URL is accepted. URLs without a host name (e.g., file:/path/file.txt) are checked
+ * for global rules only. URLs which fail to be parsed as {@link java.net.URL} are always rejected.
+ *
+ *
For rules either the URL path (DenyPath) or path and query (DenyPathQuery
+ * ) are checked whether the given {@link java.util.regex Java Regular expression} is found
+ * (see {@link java.util.regex.Matcher#find()}) in the URL path (and query).
+ *
+ *
Rules are applied in the order of their definition. For better performance, regular
+ * expressions which are simpler/faster or match more URLs should be defined earlier.
+ *
+ *
Comments in the rule file start with the # character and reach until the end of
+ * the line.
+ *
+ *
The rules file is defined via the property urlfilter.fast.file, the default name
+ * is fast-urlfilter.txt.
*/
-public class FastURLFilter extends URLFilter implements JSONResource {
+public class FastURLFilter extends URLFilter implements JSONResource {
- protected static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+ protected static final Logger LOG =
+ LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public static final String URLFILTER_FAST_FILE = "urlfilter.fast.file";
private Multimap hostRules = LinkedHashMultimap.create();
@@ -121,302 +114,320 @@ public class FastURLFilter extends URLFilter implements JSONResource {
private String resourceFile;
- private static final Pattern CATCH_ALL_RULE = Pattern.compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$");
+ private static final Pattern CATCH_ALL_RULE =
+ Pattern.compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$");
private String resourceETAG;
public void configure(@SuppressWarnings("rawtypes") Map stormConf, JsonNode filterParams) {
- // read from conf first
- int refreshRate = ConfUtils.getInt(stormConf, "fast.urlfilter.refresh", -1);
- this.resourceFile = ConfUtils.getString(stormConf, "fast.urlfilter.file", null);
-
- // then from the param file (which needs recompiling in case of change)
- if (filterParams != null) {
- JsonNode node = filterParams.get("file");
- if (node != null && node.isTextual() && this.resourceFile == null) {
- this.resourceFile = node.asText();
- }
- node = filterParams.get("refresh");
- if (node != null && node.isInt() && refreshRate == -1) {
- refreshRate = node.asInt();
- }
- }
-
- try {
- loadJSONResources();
- } catch (Exception e) {
- LOG.error("Exception while loading resources", e);
- }
-
- if (refreshRate != -1) {
- LOG.info("Filter set to reload from {} every {} sec", getResourceFile(), refreshRate);
- new Timer().schedule(new TimerTask() {
- public void run() {
- LOG.info("Reloading resources");
- try {
- loadJSONResources();
- } catch (Exception e) {
- LOG.error("Can't load resources", e);
- }
- }
- }, refreshRate * 1000, refreshRate * 1000);
- }
+ // read from conf first
+ int refreshRate = ConfUtils.getInt(stormConf, "fast.urlfilter.refresh", -1);
+ this.resourceFile = ConfUtils.getString(stormConf, "fast.urlfilter.file", null);
+
+ // then from the param file (which needs recompiling in case of change)
+ if (filterParams != null) {
+ JsonNode node = filterParams.get("file");
+ if (node != null && node.isTextual() && this.resourceFile == null) {
+ this.resourceFile = node.asText();
+ }
+ node = filterParams.get("refresh");
+ if (node != null && node.isInt() && refreshRate == -1) {
+ refreshRate = node.asInt();
+ }
+ }
+
+ try {
+ loadJSONResources();
+ } catch (Exception e) {
+ LOG.error("Exception while loading resources", e);
+ }
+
+ if (refreshRate != -1) {
+ LOG.info("Filter set to reload from {} every {} sec", getResourceFile(), refreshRate);
+ new Timer()
+ .schedule(
+ new TimerTask() {
+ public void run() {
+ LOG.info("Reloading resources");
+ try {
+ loadJSONResources();
+ } catch (Exception e) {
+ LOG.error("Can't load resources", e);
+ }
+ }
+ },
+ refreshRate * 1000,
+ refreshRate * 1000);
+ }
}
/**
* Load the resources from the JSON file in the uber jar or from S3
- *
+ *
* @throws Exception
- **/
+ */
@Override
public void loadJSONResources() throws Exception {
- InputStream inputStream = null;
- AmazonS3 s3client = null;
- try {
- if (getResourceFile().startsWith("s3://")) {
- // try loading from S3
- s3client = AmazonS3ClientBuilder.standard().build();
- java.net.URI uri = new java.net.URI(getResourceFile());
-
- String bucketName = uri.getHost();
- // remove the first "/"
- String path = uri.getPath().substring(1);
-
- // optimisation - avoid a full reload if the resource has not changed
- ObjectMetadata metadata = s3client.getObjectMetadata(bucketName, path);
- final String ETAG = metadata.getETag();
- if (ETAG != null && ETAG.equals(resourceETAG)) {
- LOG.info("Unchanged ETAG for {} - skipping reload", getResourceFile());
- return;
- } else {
- resourceETAG = ETAG;
- }
-
- final S3Object object = s3client.getObject(new GetObjectRequest(bucketName, path));
- inputStream = object.getObjectContent();
- } else {
- inputStream = getClass().getClassLoader().getResourceAsStream(getResourceFile());
- if (inputStream == null) {
- LOG.error("Can't load conf from {}", getResourceFile());
- return;
- }
- }
- if (getResourceFile().endsWith(".gz")) {
- inputStream = new GZIPInputStream(inputStream);
- }
-
- loadJSONResources(new BufferedInputStream(inputStream));
- } finally {
- if (inputStream != null) {
- inputStream.close();
- }
- if (s3client != null) {
- s3client.shutdown();
- }
- }
+ InputStream inputStream = null;
+ AmazonS3 s3client = null;
+ try {
+ if (getResourceFile().startsWith("s3://")) {
+ // try loading from S3
+ s3client = AmazonS3ClientBuilder.standard().build();
+ java.net.URI uri = new java.net.URI(getResourceFile());
+
+ String bucketName = uri.getHost();
+ // remove the first "/"
+ String path = uri.getPath().substring(1);
+
+ // optimisation - avoid a full reload if the resource has not changed
+ ObjectMetadata metadata = s3client.getObjectMetadata(bucketName, path);
+ final String ETAG = metadata.getETag();
+ if (ETAG != null && ETAG.equals(resourceETAG)) {
+ LOG.info("Unchanged ETAG for {} - skipping reload", getResourceFile());
+ return;
+ } else {
+ resourceETAG = ETAG;
+ }
+
+ final S3Object object = s3client.getObject(new GetObjectRequest(bucketName, path));
+ inputStream = object.getObjectContent();
+ } else {
+ inputStream = getClass().getClassLoader().getResourceAsStream(getResourceFile());
+ if (inputStream == null) {
+ LOG.error("Can't load conf from {}", getResourceFile());
+ return;
+ }
+ }
+ if (getResourceFile().endsWith(".gz")) {
+ inputStream = new GZIPInputStream(inputStream);
+ }
+
+ loadJSONResources(new BufferedInputStream(inputStream));
+ } finally {
+ if (inputStream != null) {
+ inputStream.close();
+ }
+ if (s3client != null) {
+ s3client.shutdown();
+ }
+ }
}
@Override
public void loadJSONResources(InputStream inputStream)
- throws JsonParseException, JsonMappingException, IOException {
- long start = System.currentTimeMillis();
-
- try (Reader r = new InputStreamReader(inputStream)) {
- reloadRules(r);
- }
-
- long end = System.currentTimeMillis();
- LOG.info("Loaded {} hostrules and {} domain rules in {} msec from {}", hostRules.size(), domainRules.size(),
- (end - start), resourceFile);
+ throws JsonParseException, JsonMappingException, IOException {
+ long start = System.currentTimeMillis();
+
+ try (Reader r = new InputStreamReader(inputStream)) {
+ reloadRules(r);
+ }
+
+ long end = System.currentTimeMillis();
+ LOG.info(
+ "Loaded {} hostrules and {} domain rules in {} msec from {}",
+ hostRules.size(),
+ domainRules.size(),
+ (end - start),
+ resourceFile);
}
@Override
public String getResourceFile() {
- return resourceFile;
+ return resourceFile;
}
@Override
public String filter(URL sourceUrl, Metadata sourceMetadata, String urlToFilter) {
- synchronized (this) {
- URL u;
-
- try {
- u = new URL(urlToFilter);
- } catch (Exception e) {
- LOG.debug("Rejected {} because failed to parse as URL: {}", urlToFilter, e.getMessage());
- return null;
- }
-
- String hostname = u.getHost();
-
- // first check for host-specific rules
- for (Rule rule : hostRules.get(hostname)) {
- if (rule.match(u)) {
- return null;
- }
- }
-
- // also look up domain rules for host name
- for (Rule rule : domainRules.get(hostname)) {
- if (rule.match(u)) {
- return null;
- }
- }
-
- // check suffixes of host name from longer to shorter:
- // subdomains, domain, top-level domain
- int start = 0;
- int pos;
- while ((pos = hostname.indexOf('.', start)) != -1) {
- start = pos + 1;
- String domain = hostname.substring(start);
- for (Rule rule : domainRules.get(domain)) {
- if (rule.match(u)) {
- return null;
- }
- }
- }
-
- // finally check "global" rules defined for `Domain .`
- for (Rule rule : domainRules.get(".")) {
- if (rule.match(u)) {
- return null;
- }
- }
-
- // no reject rules found
- return urlToFilter;
- }
+ synchronized (this) {
+ URL u;
+
+ try {
+ u = new URL(urlToFilter);
+ } catch (Exception e) {
+ LOG.debug(
+ "Rejected {} because failed to parse as URL: {}",
+ urlToFilter,
+ e.getMessage());
+ return null;
+ }
+
+ String hostname = u.getHost();
+
+ // first check for host-specific rules
+ for (Rule rule : hostRules.get(hostname)) {
+ if (rule.match(u)) {
+ return null;
+ }
+ }
+
+ // also look up domain rules for host name
+ for (Rule rule : domainRules.get(hostname)) {
+ if (rule.match(u)) {
+ return null;
+ }
+ }
+
+ // check suffixes of host name from longer to shorter:
+ // subdomains, domain, top-level domain
+ int start = 0;
+ int pos;
+ while ((pos = hostname.indexOf('.', start)) != -1) {
+ start = pos + 1;
+ String domain = hostname.substring(start);
+ for (Rule rule : domainRules.get(domain)) {
+ if (rule.match(u)) {
+ return null;
+ }
+ }
+ }
+
+ // finally check "global" rules defined for `Domain .`
+ for (Rule rule : domainRules.get(".")) {
+ if (rule.match(u)) {
+ return null;
+ }
+ }
+
+ // no reject rules found
+ return urlToFilter;
+ }
}
private void reloadRules(Reader rules) throws IOException {
- synchronized (this) {
- domainRules.clear();
- hostRules.clear();
-
- BufferedReader reader = new BufferedReader(rules);
-
- String current = null;
- boolean host = false;
- int lineno = 0;
-
- String line;
- try {
- while ((line = reader.readLine()) != null) {
- lineno++;
- line = line.trim();
-
- if (line.indexOf("#") != -1) {
- // strip comments
- line = line.substring(0, line.indexOf("#")).trim();
- }
-
- if (StringUtils.isBlank(line)) {
- continue;
- }
-
- if (line.startsWith("Host")) {
- host = true;
- current = line.split("\\s+")[1];
- } else if (line.startsWith("Domain")) {
- host = false;
- current = line.split("\\s+")[1];
- } else {
- if (current == null) {
- continue;
- }
-
- Rule rule = null;
- try {
- if (CATCH_ALL_RULE.matcher(line).matches()) {
- rule = DenyAllRule.getInstance();
- } else if (line.startsWith("DenyPathQuery")) {
- rule = new DenyPathQueryRule(line.split("\\s+")[1]);
- } else if (line.startsWith("DenyPath")) {
- rule = new DenyPathRule(line.split("\\s+")[1]);
- } else {
- LOG.warn("Problem reading rule on line {}: {}", lineno, line);
- continue;
- }
- } catch (Exception e) {
- LOG.warn("Problem reading rule on line {}: {} - {}", lineno, line, e.getMessage());
- continue;
- }
-
- if (host) {
- LOG.trace("Adding host rule [{}] [{}]", current, rule);
- hostRules.put(current, rule);
- } else {
- LOG.trace("Adding domain rule [{}] [{}]", current, rule);
- domainRules.put(current, rule);
- }
- }
- }
-
- } catch (IOException e) {
- LOG.warn("Caught exception while reading rules file at line {}: {}", lineno, e.getMessage());
- throw e;
- }
- }
+ synchronized (this) {
+ domainRules.clear();
+ hostRules.clear();
+
+ BufferedReader reader = new BufferedReader(rules);
+
+ String current = null;
+ boolean host = false;
+ int lineno = 0;
+
+ String line;
+ try {
+ while ((line = reader.readLine()) != null) {
+ lineno++;
+ line = line.trim();
+
+ if (line.indexOf("#") != -1) {
+ // strip comments
+ line = line.substring(0, line.indexOf("#")).trim();
+ }
+
+ if (StringUtils.isBlank(line)) {
+ continue;
+ }
+
+ if (line.startsWith("Host")) {
+ host = true;
+ current = line.split("\\s+")[1];
+ } else if (line.startsWith("Domain")) {
+ host = false;
+ current = line.split("\\s+")[1];
+ } else {
+ if (current == null) {
+ continue;
+ }
+
+ Rule rule = null;
+ try {
+ if (CATCH_ALL_RULE.matcher(line).matches()) {
+ rule = DenyAllRule.getInstance();
+ } else if (line.startsWith("DenyPathQuery")) {
+ rule = new DenyPathQueryRule(line.split("\\s+")[1]);
+ } else if (line.startsWith("DenyPath")) {
+ rule = new DenyPathRule(line.split("\\s+")[1]);
+ } else {
+ LOG.warn("Problem reading rule on line {}: {}", lineno, line);
+ continue;
+ }
+ } catch (Exception e) {
+ LOG.warn(
+ "Problem reading rule on line {}: {} - {}",
+ lineno,
+ line,
+ e.getMessage());
+ continue;
+ }
+
+ if (host) {
+ LOG.trace("Adding host rule [{}] [{}]", current, rule);
+ hostRules.put(current, rule);
+ } else {
+ LOG.trace("Adding domain rule [{}] [{}]", current, rule);
+ domainRules.put(current, rule);
+ }
+ }
+ }
+
+ } catch (IOException e) {
+ LOG.warn(
+ "Caught exception while reading rules file at line {}: {}",
+ lineno,
+ e.getMessage());
+ throw e;
+ }
+ }
}
public static class Rule {
- protected Pattern pattern;
+ protected Pattern pattern;
- Rule() {
- }
+ Rule() {}
- public Rule(String regex) {
- pattern = Pattern.compile(regex);
- }
+ public Rule(String regex) {
+ pattern = Pattern.compile(regex);
+ }
- public boolean match(URL url) {
- return pattern.matcher(url.toString()).find();
- }
+ public boolean match(URL url) {
+ return pattern.matcher(url.toString()).find();
+ }
- public String toString() {
- return pattern.toString();
- }
+ public String toString() {
+ return pattern.toString();
+ }
}
public static class DenyPathRule extends Rule {
- public DenyPathRule(String regex) {
- super(regex);
- }
-
- public boolean match(URL url) {
- String haystack = url.getPath();
- return pattern.matcher(haystack).find();
- }
+ public DenyPathRule(String regex) {
+ super(regex);
+ }
+
+ public boolean match(URL url) {
+ String haystack = url.getPath();
+ return pattern.matcher(haystack).find();
+ }
}
/** Rule for DenyPath .* or DenyPath .? */
public static class DenyAllRule extends Rule {
- private static Rule instance = new DenyAllRule(".");
+ private static Rule instance = new DenyAllRule(".");
- private DenyAllRule(String regex) {
- super(regex);
- }
+ private DenyAllRule(String regex) {
+ super(regex);
+ }
- public static Rule getInstance() {
- return instance;
- }
+ public static Rule getInstance() {
+ return instance;
+ }
- public boolean match(URL url) {
- return true;
- }
+ public boolean match(URL url) {
+ return true;
+ }
}
public static class DenyPathQueryRule extends Rule {
- public DenyPathQueryRule(String regex) {
- super(regex);
- }
-
- public boolean match(URL url) {
- String haystack = url.getFile();
- return pattern.matcher(haystack).find();
- }
+ public DenyPathQueryRule(String regex) {
+ super(regex);
+ }
+
+ public boolean match(URL url) {
+ String haystack = url.getFile();
+ return pattern.matcher(haystack).find();
+ }
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/ContentDetector.java b/src/main/java/org/commoncrawl/stormcrawler/news/ContentDetector.java
index 1c3a4a5..9979781 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/ContentDetector.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/ContentDetector.java
@@ -13,33 +13,28 @@
*/
package org.commoncrawl.stormcrawler.news;
+import com.google.common.primitives.Bytes;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
-import com.google.common.primitives.Bytes;
-
public class ContentDetector {
protected byte[][][] clues;
protected int maxOffset;
/**
- * Set up detector to detect content sniffing for a set of clue strings in a
- * prefix of the binary content.
+ * Set up a detector which sniffs a prefix of the binary content for a set of literal clue
+ * strings.
*
- * @param clues
- * nested list of literal clues. Outer list defines an OR-group,
- * inner list contained ANDed clues required to match all, e.g.
- * the following definition would match if either
- * "clue1" and "and_clue2" are matched, or
- * alternatively "or_clue3" is found
+ * @param clues nested list of literal clues. The outer list defines an OR-group, each inner
+ * list contains ANDed clues which are all required to match, e.g. the following definition
+ * would match if either "clue1" and "and_clue2" are both matched, or alternatively
+ * "or_clue3" is found
+ *
+ * { { clue1, and_clue2 }, { or_clue3 } }
+ *
*
- *
- * { { clue1, and_clue2 }, { or_clue3 } }
- *
- *
- * @param maxOffset
- * max. offset of content prefix checked for clues
+ * @param maxOffset max. offset of content prefix checked for clues
*/
public ContentDetector(String[][] clues, int maxOffset) {
this.maxOffset = maxOffset;
@@ -60,8 +55,7 @@ public int getFirstMatch(byte[] content) {
for (int i = 0; i < clues.length; i++) {
byte[][] group = clues[i];
for (byte[] clue : group) {
- if (Bytes.indexOf(beginning, clue) == -1)
- continue OR;
+ if (Bytes.indexOf(beginning, clue) == -1) continue OR;
}
// success, all members of one group matched
return i;
@@ -72,5 +66,4 @@ public int getFirstMatch(byte[] content) {
public boolean matches(byte[] content) {
return (getFirstMatch(content) >= 0);
}
-
}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java
index ce21c7a..cc2d85a 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/CrawlTopology.java
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -14,146 +14,153 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package org.commoncrawl.stormcrawler.news;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
-
import org.apache.storm.topology.BoltDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
+import org.apache.stormcrawler.ConfigurableTopology;
+import org.apache.stormcrawler.Constants;
+import org.apache.stormcrawler.bolt.FeedParserBolt;
+import org.apache.stormcrawler.bolt.FetcherBolt;
+import org.apache.stormcrawler.bolt.URLFilterBolt;
+import org.apache.stormcrawler.bolt.URLPartitionerBolt;
+import org.apache.stormcrawler.indexing.DummyIndexer;
+import org.apache.stormcrawler.opensearch.persistence.AggregationSpout;
+import org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt;
+import org.apache.stormcrawler.protocol.AbstractHttpProtocol;
+import org.apache.stormcrawler.spout.FileSpout;
+import org.apache.stormcrawler.util.ConfUtils;
+import org.apache.stormcrawler.util.URLStreamGrouping;
+import org.apache.stormcrawler.warc.FileTimeSizeRotationPolicy;
+import org.apache.stormcrawler.warc.FileTimeSizeRotationPolicy.Units;
+import org.apache.stormcrawler.warc.WARCFileNameFormat;
+import org.apache.stormcrawler.warc.WARCHdfsBolt;
import org.slf4j.LoggerFactory;
-import com.digitalpebble.stormcrawler.ConfigurableTopology;
-import com.digitalpebble.stormcrawler.Constants;
-import com.digitalpebble.stormcrawler.bolt.FeedParserBolt;
-import com.digitalpebble.stormcrawler.bolt.FetcherBolt;
-import com.digitalpebble.stormcrawler.bolt.URLFilterBolt;
-import com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt;
-import com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout;
-import com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt;
-import com.digitalpebble.stormcrawler.indexing.DummyIndexer;
-import com.digitalpebble.stormcrawler.protocol.AbstractHttpProtocol;
-import com.digitalpebble.stormcrawler.spout.FileSpout;
-import com.digitalpebble.stormcrawler.util.ConfUtils;
-import com.digitalpebble.stormcrawler.util.URLStreamGrouping;
-import com.digitalpebble.stormcrawler.warc.FileTimeSizeRotationPolicy;
-import com.digitalpebble.stormcrawler.warc.FileTimeSizeRotationPolicy.Units;
-import com.digitalpebble.stormcrawler.warc.WARCFileNameFormat;
-import com.digitalpebble.stormcrawler.warc.WARCHdfsBolt;
-
-/**
- * Dummy topology to play with the spouts and bolts on ElasticSearch
- */
+/** Dummy topology to play with the spouts and bolts on OpenSearch */
public class CrawlTopology extends ConfigurableTopology {
private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(CrawlTopology.class);
public static void main(String[] args) throws Exception {
- ConfigurableTopology.start(new CrawlTopology(), args);
+ ConfigurableTopology.start(new CrawlTopology(), args);
}
@Override
protected int run(String[] args) {
- TopologyBuilder builder = new TopologyBuilder();
+ TopologyBuilder builder = new TopologyBuilder();
- int numWorkers = ConfUtils.getInt(getConf(), "topology.workers", 1);
+ int numWorkers = ConfUtils.getInt(getConf(), "topology.workers", 1);
- // set to the real number of shards ONLY if es.status.routing is set to
- // true in the configuration
- int numShards = 16;
+ // set to the real number of shards ONLY if opensearch.status.routing is set to
+ // true in the configuration
+ int numShards = 16;
- if (args.length >= 2) {
- // arguments include seed directory and file pattern
- LOG.info("Injecting seeds from {} by pattern {}", args[0], args[1]);
- builder.setSpout("filespout", new FileSpout(args[0], args[1], true));
- Fields key = new Fields("url");
+ if (args.length >= 2) {
+ // arguments include seed directory and file pattern
+ LOG.info("Injecting seeds from {} by pattern {}", args[0], args[1]);
+ builder.setSpout("filespout", new FileSpout(args[0], args[1], true));
+ Fields key = new Fields("url");
- builder.setBolt("filter", new URLFilterBolt()).fieldsGrouping("filespout", Constants.StatusStreamName, key);
- }
+ builder.setBolt("filter", new URLFilterBolt())
+ .fieldsGrouping("filespout", Constants.StatusStreamName, key);
+ }
- builder.setSpout("spout", new AggregationSpout(), numShards);
+ builder.setSpout("spout", new AggregationSpout(), numShards);
- builder.setBolt("prefilter", new PreFilterBolt("pre-urlfilters.json"), numWorkers).shuffleGrouping("spout");
+ builder.setBolt("prefilter", new PreFilterBolt("pre-urlfilters.json"), numWorkers)
+ .shuffleGrouping("spout");
- builder.setBolt("partitioner", new URLPartitionerBolt(), numWorkers).shuffleGrouping("prefilter");
+ builder.setBolt("partitioner", new URLPartitionerBolt(), numWorkers)
+ .shuffleGrouping("prefilter");
- builder.setBolt("fetch", new FetcherBolt(), numWorkers).fieldsGrouping("partitioner", new Fields("key"));
+ builder.setBolt("fetch", new FetcherBolt(), numWorkers)
+ .fieldsGrouping("partitioner", new Fields("key"));
- builder.setBolt("sitemap", new NewsSiteMapParserBolt(), numWorkers).setNumTasks(2)
- .localOrShuffleGrouping("fetch");
+ builder.setBolt("sitemap", new NewsSiteMapParserBolt(), numWorkers)
+ .setNumTasks(2)
+ .localOrShuffleGrouping("fetch");
- builder.setBolt("feed", new FeedParserBolt(), numWorkers).setNumTasks(4).localOrShuffleGrouping("sitemap");
+ builder.setBolt("feed", new FeedParserBolt(), numWorkers)
+ .setNumTasks(4)
+ .localOrShuffleGrouping("sitemap");
- // don't need to parse the pages but need to update their status
- builder.setBolt("ssb", new DummyIndexer(), numWorkers).localOrShuffleGrouping("feed");
+ // don't need to parse the pages but need to update their status
+ builder.setBolt("ssb", new DummyIndexer(), numWorkers).localOrShuffleGrouping("feed");
- WARCHdfsBolt warcbolt = getWarcBolt("CC-NEWS");
+ WARCHdfsBolt warcbolt = getWarcBolt("CC-NEWS");
- // take it from feed default output so that the feed files themselves
- // don't get included - unless we want them too of course!
- builder.setBolt("warc", warcbolt, numWorkers).localOrShuffleGrouping("feed");
-
- final Fields furl = new Fields("url");
+ // take it from feed default output so that the feed files themselves
+ // don't get included - unless we want them too of course!
+ builder.setBolt("warc", warcbolt, numWorkers).localOrShuffleGrouping("feed");
- BoltDeclarer statusBolt = builder.setBolt("status", new StatusUpdaterBolt(), numWorkers)
- .fieldsGrouping("fetch", Constants.StatusStreamName, furl)
- .fieldsGrouping("sitemap", Constants.StatusStreamName, furl)
- .fieldsGrouping("feed", Constants.StatusStreamName, furl)
- .fieldsGrouping("ssb", Constants.StatusStreamName, furl)
- .fieldsGrouping("prefilter", Constants.StatusStreamName, furl);
-
- if (args.length >= 2) {
- statusBolt.customGrouping("filter", Constants.StatusStreamName, new URLStreamGrouping());
- }
- statusBolt.setNumTasks(numShards);
+ final Fields furl = new Fields("url");
- return submit(conf, builder);
+ BoltDeclarer statusBolt =
+ builder.setBolt("status", new StatusUpdaterBolt(), numWorkers)
+ .fieldsGrouping("fetch", Constants.StatusStreamName, furl)
+ .fieldsGrouping("sitemap", Constants.StatusStreamName, furl)
+ .fieldsGrouping("feed", Constants.StatusStreamName, furl)
+ .fieldsGrouping("ssb", Constants.StatusStreamName, furl)
+ .fieldsGrouping("prefilter", Constants.StatusStreamName, furl);
+
+ if (args.length >= 2) {
+ statusBolt.customGrouping(
+ "filter", Constants.StatusStreamName, new URLStreamGrouping());
+ }
+ statusBolt.setNumTasks(numShards);
+
+ return submit(conf, builder);
}
protected WARCHdfsBolt getWarcBolt(String filePrefix) {
- // path is absolute
- String warcFilePath = ConfUtils.getString(getConf(), "warc.dir", "/data/warc");
-
- WARCFileNameFormat fileNameFormat = new WARCFileNameFormat();
- fileNameFormat.withPath(warcFilePath);
- fileNameFormat.withPrefix(filePrefix);
-
- Map fields = new LinkedHashMap<>();
- fields.put("software", "StormCrawler 2.10 https://stormcrawler.net/");
- fields.put("description", "News crawl for Common Crawl");
- String userAgent = AbstractHttpProtocol.getAgentString(getConf());
- fields.put("http-header-user-agent", userAgent);
- fields.put("http-header-from", ConfUtils.getString(getConf(), "http.agent.email"));
- String robotsTxtParser = "checked by crawler-commons " + crawlercommons.CrawlerCommons.getVersion()
- + " (https://github.com/crawler-commons/crawler-commons)";
- fields.put("robots", robotsTxtParser);
- fields.put("format", "WARC File Format 1.1");
- fields.put("conformsTo", "https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/");
-
- WARCHdfsBolt warcbolt = (WARCHdfsBolt) new WARCHdfsBolt();
- warcbolt.withConfigKey("warc");
- warcbolt.withFileNameFormat(fileNameFormat);
- warcbolt.withHeader(fields);
- warcbolt.withRequestRecords();
-
- // use RawLocalFileSystem (instead of ChecksumFileSystem) to avoid that
- // WARC files are truncated if the topology is stopped because of a
- // delayed sync of the default ChecksumFileSystem
- Map hdfsConf = new HashMap<>();
- hdfsConf.put("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
- getConf().put("warc", hdfsConf);
-
- // will rotate if reaches size or time limit
- int maxMB = ConfUtils.getInt(getConf(), "warc.rotation.policy.max-mb", 1024);
- int maxMinutes = ConfUtils.getInt(getConf(), "warc.rotation.policy.max-minutes", 1440);
- FileTimeSizeRotationPolicy rotpol = new FileTimeSizeRotationPolicy(maxMB, Units.MB);
- rotpol.setTimeRotationInterval(maxMinutes, FileTimeSizeRotationPolicy.TimeUnit.MINUTES);
- warcbolt.withRotationPolicy(rotpol);
-
- return warcbolt;
+ // path is absolute
+ String warcFilePath = ConfUtils.getString(getConf(), "warc.dir", "/data/warc");
+
+ WARCFileNameFormat fileNameFormat = new WARCFileNameFormat();
+ fileNameFormat.withPath(warcFilePath);
+ fileNameFormat.withPrefix(filePrefix);
+
+ Map fields = new LinkedHashMap<>();
+ fields.put("software", "StormCrawler 2.10 https://stormcrawler.net/");
+ fields.put("description", "News crawl for Common Crawl");
+ String userAgent = AbstractHttpProtocol.getAgentString(getConf());
+ fields.put("http-header-user-agent", userAgent);
+ fields.put("http-header-from", ConfUtils.getString(getConf(), "http.agent.email"));
+ String robotsTxtParser =
+ "checked by crawler-commons "
+ + crawlercommons.CrawlerCommons.getVersion()
+ + " (https://github.com/crawler-commons/crawler-commons)";
+ fields.put("robots", robotsTxtParser);
+ fields.put("format", "WARC File Format 1.1");
+ fields.put(
+ "conformsTo",
+ "https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/");
+
+ WARCHdfsBolt warcbolt = (WARCHdfsBolt) new WARCHdfsBolt();
+ warcbolt.withConfigKey("warc");
+ warcbolt.withFileNameFormat(fileNameFormat);
+ warcbolt.withHeader(fields);
+ warcbolt.withRequestRecords();
+
+ // use RawLocalFileSystem (instead of ChecksumFileSystem) to avoid that
+ // WARC files are truncated if the topology is stopped because of a
+ // delayed sync of the default ChecksumFileSystem
+ Map hdfsConf = new HashMap<>();
+ hdfsConf.put("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
+ getConf().put("warc", hdfsConf);
+
+ // will rotate if reaches size or time limit
+ int maxMB = ConfUtils.getInt(getConf(), "warc.rotation.policy.max-mb", 1024);
+ int maxMinutes = ConfUtils.getInt(getConf(), "warc.rotation.policy.max-minutes", 1440);
+ FileTimeSizeRotationPolicy rotpol = new FileTimeSizeRotationPolicy(maxMB, Units.MB);
+ rotpol.setTimeRotationInterval(maxMinutes, FileTimeSizeRotationPolicy.TimeUnit.MINUTES);
+ warcbolt.withRotationPolicy(rotpol);
+
+ return warcbolt;
}
-
}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java
index bdf5f58..e607dc2 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/FeedDetectorBolt.java
@@ -14,43 +14,36 @@
package org.commoncrawl.stormcrawler.news;
import java.util.Map;
-
+import org.apache.http.HttpHeaders;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
+import org.apache.stormcrawler.Constants;
+import org.apache.stormcrawler.Metadata;
+import org.apache.stormcrawler.bolt.FeedParserBolt;
+import org.apache.stormcrawler.parse.ParseData;
+import org.apache.stormcrawler.parse.ParseFilter;
+import org.apache.stormcrawler.parse.ParseFilters;
+import org.apache.stormcrawler.parse.ParseResult;
+import org.apache.stormcrawler.persistence.Status;
import org.slf4j.LoggerFactory;
-import com.digitalpebble.stormcrawler.Constants;
-import com.digitalpebble.stormcrawler.Metadata;
-import com.digitalpebble.stormcrawler.bolt.FeedParserBolt;
-import com.digitalpebble.stormcrawler.parse.ParseData;
-import com.digitalpebble.stormcrawler.parse.ParseFilter;
-import com.digitalpebble.stormcrawler.parse.ParseFilters;
-import com.digitalpebble.stormcrawler.parse.ParseResult;
-import com.digitalpebble.stormcrawler.persistence.Status;
-import com.digitalpebble.stormcrawler.protocol.HttpHeaders;
-
/** Detect RSS and Atom feeds, but do not parse and extract links */
@SuppressWarnings("serial")
public class FeedDetectorBolt extends FeedParserBolt {
- private static final org.slf4j.Logger LOG = LoggerFactory
- .getLogger(FeedDetectorBolt.class);
+ private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(FeedDetectorBolt.class);
- public static final String[] mimeTypeClues = {
- "rss+xml", "atom+xml", "text/rss"
- };
+ public static final String[] mimeTypeClues = {"rss+xml", "atom+xml", "text/rss"};
- public static String[][] contentClues = { { "<{}> for {}",
- ct, url);
+ LOG.info("Feed detected from content type <{}> for {}", ct, url);
break;
}
}
@@ -90,8 +82,8 @@ public void execute(Tuple tuple) {
parseData.setMetadata(metadata);
parseFilters.filter(url, content, null, parse);
// emit status
- collector.emit(Constants.StatusStreamName, tuple,
- new Values(url, metadata, Status.FETCHED));
+ collector.emit(
+ Constants.StatusStreamName, tuple, new Values(url, metadata, Status.FETCHED));
} else {
// pass on
collector.emit(tuple, tuple.getValues());
@@ -100,11 +92,9 @@ public void execute(Tuple tuple) {
}
@Override
- @SuppressWarnings({ "rawtypes" })
- public void prepare(Map stormConf, TopologyContext context,
- OutputCollector collect) {
+ @SuppressWarnings({"rawtypes"})
+ public void prepare(Map stormConf, TopologyContext context, OutputCollector collect) {
super.prepare(stormConf, context, collect);
parseFilters = ParseFilters.fromConf(stormConf);
}
-
}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java
index 42f4de3..187d086 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/NewsSiteMapParserBolt.java
@@ -13,8 +13,20 @@
*/
package org.commoncrawl.stormcrawler.news;
-import static com.digitalpebble.stormcrawler.Constants.StatusStreamName;
+import static org.apache.stormcrawler.Constants.StatusStreamName;
+import crawlercommons.sitemaps.AbstractSiteMap;
+import crawlercommons.sitemaps.Namespace;
+import crawlercommons.sitemaps.SiteMap;
+import crawlercommons.sitemaps.SiteMapIndex;
+import crawlercommons.sitemaps.SiteMapParser;
+import crawlercommons.sitemaps.SiteMapURL;
+import crawlercommons.sitemaps.SiteMapURL.ChangeFrequency;
+import crawlercommons.sitemaps.UnknownFormatException;
+import crawlercommons.sitemaps.extension.Extension;
+import crawlercommons.sitemaps.extension.ExtensionMetadata;
+import crawlercommons.sitemaps.extension.LinkAttributes;
+import crawlercommons.sitemaps.extension.NewsAttributes;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
@@ -24,79 +36,66 @@
import java.util.Iterator;
import java.util.List;
import java.util.Map;
-
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.http.HttpHeaders;
import org.apache.storm.metric.api.MeanReducer;
import org.apache.storm.metric.api.ReducedMetric;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
+import org.apache.stormcrawler.Constants;
+import org.apache.stormcrawler.Metadata;
+import org.apache.stormcrawler.bolt.SiteMapParserBolt;
+import org.apache.stormcrawler.parse.Outlink;
+import org.apache.stormcrawler.parse.ParseData;
+import org.apache.stormcrawler.parse.ParseFilter;
+import org.apache.stormcrawler.parse.ParseFilters;
+import org.apache.stormcrawler.parse.ParseResult;
+import org.apache.stormcrawler.persistence.DefaultScheduler;
+import org.apache.stormcrawler.persistence.Status;
+import org.apache.stormcrawler.util.ConfUtils;
import org.slf4j.LoggerFactory;
-import com.digitalpebble.stormcrawler.Constants;
-import com.digitalpebble.stormcrawler.Metadata;
-import com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt;
-import com.digitalpebble.stormcrawler.parse.Outlink;
-import com.digitalpebble.stormcrawler.parse.ParseData;
-import com.digitalpebble.stormcrawler.parse.ParseFilter;
-import com.digitalpebble.stormcrawler.parse.ParseFilters;
-import com.digitalpebble.stormcrawler.parse.ParseResult;
-import com.digitalpebble.stormcrawler.persistence.DefaultScheduler;
-import com.digitalpebble.stormcrawler.persistence.Status;
-import com.digitalpebble.stormcrawler.protocol.HttpHeaders;
-import com.digitalpebble.stormcrawler.util.ConfUtils;
-
-import crawlercommons.sitemaps.AbstractSiteMap;
-import crawlercommons.sitemaps.Namespace;
-import crawlercommons.sitemaps.SiteMap;
-import crawlercommons.sitemaps.SiteMapIndex;
-import crawlercommons.sitemaps.SiteMapParser;
-import crawlercommons.sitemaps.SiteMapURL;
-import crawlercommons.sitemaps.SiteMapURL.ChangeFrequency;
-import crawlercommons.sitemaps.UnknownFormatException;
-import crawlercommons.sitemaps.extension.Extension;
-import crawlercommons.sitemaps.extension.ExtensionMetadata;
-import crawlercommons.sitemaps.extension.LinkAttributes;
-import crawlercommons.sitemaps.extension.NewsAttributes;
-
-
/**
- * ParserBolt for news
+ * ParserBolt for news
* sitemaps.
*/
@SuppressWarnings("serial")
public class NewsSiteMapParserBolt extends SiteMapParserBolt {
// TODO:
- // this is a modified copy of c.d.s.bolt.SiteMapParserBolt
- // - make parent class extensible and overridable
- // modifications:
- // - detect and process only Google news sitemaps
- // - or a sitemapindex because some subsitemaps may
- // be news sitemaps
- // - pass "isSitemapNews" to status metadata
+ // this is a modified copy of org.apache.stormcrawler.bolt.SiteMapParserBolt
+ // - make parent class extensible and overridable
+ // modifications:
+ // - detect and process only Google news sitemaps
+ // - or a sitemapindex because some subsitemaps may
+ // be news sitemaps
+ // - pass "isSitemapNews" to status metadata
public static enum SitemapType {
- NEWS, INDEX, SITEMAP
+ NEWS,
+ INDEX,
+ SITEMAP
}
public static final String isSitemapNewsKey = "isSitemapNews";
public static final String isSitemapIndexKey = "isSitemapIndex";
+
/**
- * A sitemap (not necessarily a news sitemap) which is verified to contain
- * links to news articles. Necessary to crawl news sites which provide a
- * sitemap but neither a news feed or sitemap.
+ * A sitemap (not necessarily a news sitemap) which is verified to contain links to news
+ * articles. Necessary to crawl news sites which provide a sitemap but neither a news feed nor
+ * a news sitemap.
*/
public static final String isSitemapVerifiedKey = "isSitemapVerified";
- private static final org.slf4j.Logger LOG = LoggerFactory
- .getLogger(NewsSiteMapParserBolt.class);
+ private static final org.slf4j.Logger LOG =
+ LoggerFactory.getLogger(NewsSiteMapParserBolt.class);
/* content clues for news sitemaps, sitemap indexes or any sitemaps */
public static String[][] contentClues;
public static int contentCluesSitemapNewsMatchUpTo = -1;
public static int contentCluesSitemapIndexMatchUpTo = -1;
+
static {
int cluesSize = Namespace.NEWS.length + 1 + 1 + Namespace.SITEMAP_LEGACY.length;
contentClues = new String[cluesSize][1];
@@ -130,7 +129,7 @@ public static enum SitemapType {
private ReducedMetric averagedMetrics;
- /** Delay in minutes used for scheduling sub-sitemaps **/
+ /** Delay in minutes used for scheduling sub-sitemaps */
private int scheduleSitemapsWithDelay = -1;
@Override
@@ -141,14 +140,10 @@ public void execute(Tuple tuple) {
byte[] content = tuple.getBinaryByField("content");
String url = tuple.getStringByField("url");
- boolean isSitemap = Boolean.valueOf(
- metadata.getFirstValue(SiteMapParserBolt.isSitemapKey));
- boolean isNewsSitemap = Boolean
- .valueOf(metadata.getFirstValue(isSitemapNewsKey));
- boolean isSitemapIndex = Boolean
- .valueOf(metadata.getFirstValue(isSitemapIndexKey));
- boolean isSitemapVerified = Boolean
- .valueOf(metadata.getFirstValue(isSitemapVerifiedKey));
+ boolean isSitemap = Boolean.valueOf(metadata.getFirstValue(SiteMapParserBolt.isSitemapKey));
+ boolean isNewsSitemap = Boolean.valueOf(metadata.getFirstValue(isSitemapNewsKey));
+ boolean isSitemapIndex = Boolean.valueOf(metadata.getFirstValue(isSitemapIndexKey));
+ boolean isSitemapVerified = Boolean.valueOf(metadata.getFirstValue(isSitemapVerifiedKey));
if (sniffContent) {
SitemapType type = detectContent(url, content);
@@ -184,14 +179,16 @@ public void execute(Tuple tuple) {
if (isNewsSitemap || isSitemapIndex || isSitemapVerified) {
/*
- * remove the isSitemap key from metadata to avoid that the default
- * sitemap fetch interval is applied to news sitemaps, sitemap
- * indexes and verified sitemaps
+ * remove the isSitemap key from metadata so that the default sitemap fetch
+ * interval is not applied to news sitemaps, sitemap indexes and verified
+ * sitemaps
*/
metadata.remove(isSitemapKey);
} else {
if (isSitemap) {
- collector.emit(Constants.StatusStreamName, tuple,
+ collector.emit(
+ Constants.StatusStreamName,
+ tuple,
new Values(url, metadata, Status.FETCHED));
} else {
// not a sitemap, just pass it on
@@ -218,8 +215,8 @@ public void execute(Tuple tuple) {
metadata.setValue(Constants.STATUS_ERROR_SOURCE, "sitemap parsing");
metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
metadata.remove("numLinks");
- collector.emit(Constants.StatusStreamName, tuple, new Values(url,
- metadata, Status.ERROR));
+ collector.emit(
+ Constants.StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
collector.ack(tuple);
return;
}
@@ -233,15 +230,12 @@ public void execute(Tuple tuple) {
parseFilters.filter(url, content, null, parse);
} catch (RuntimeException e) {
- String errorMessage = "Exception while running parse filters on "
- + url + ": " + e;
+ String errorMessage = "Exception while running parse filters on " + url + ": " + e;
LOG.error(errorMessage);
- metadata.setValue(Constants.STATUS_ERROR_SOURCE,
- "content filtering");
+ metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content filtering");
metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
metadata.remove("numLinks");
- collector.emit(StatusStreamName, tuple, new Values(url, metadata,
- Status.ERROR));
+ collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
collector.ack(tuple);
return;
}
@@ -264,8 +258,7 @@ public void execute(Tuple tuple) {
ol.getMetadata().setValue(isSitemapVerifiedKey, "true");
}
}
- Values v = new Values(ol.getTargetURL(), ol.getMetadata(),
- Status.DISCOVERED);
+ Values v = new Values(ol.getTargetURL(), ol.getMetadata(), Status.DISCOVERED);
collector.emit(Constants.StatusStreamName, tuple, v);
}
@@ -273,8 +266,8 @@ public void execute(Tuple tuple) {
metadata.setValue("numLinks", String.valueOf(outlinks.size()));
// marking the main URL as successfully fetched
- collector.emit(Constants.StatusStreamName, tuple, new Values(url,
- metadata, Status.FETCHED));
+ collector.emit(
+ Constants.StatusStreamName, tuple, new Values(url, metadata, Status.FETCHED));
collector.ack(tuple);
}
@@ -292,12 +285,10 @@ public SitemapType detectContent(String url, byte[] content) {
if (match >= 0) {
// a sitemap, need to detect type of sitemap
if (match <= contentCluesSitemapNewsMatchUpTo) {
- LOG.info("{} detected as news sitemap based on content",
- url);
+ LOG.info("{} detected as news sitemap based on content", url);
return SitemapType.NEWS;
} else if (match <= contentCluesSitemapIndexMatchUpTo) {
- LOG.info("{} detected as sitemap index based on content",
- url);
+ LOG.info("{} detected as sitemap index based on content", url);
return SitemapType.INDEX;
} else {
return SitemapType.SITEMAP;
@@ -318,12 +309,15 @@ private boolean recentlyModified(Date lastModified) {
return true;
}
- protected AbstractSiteMap parseSiteMap(String url, byte[] content,
- String contentType, Metadata parentMetadata, List links)
+ protected AbstractSiteMap parseSiteMap(
+ String url,
+ byte[] content,
+ String contentType,
+ Metadata parentMetadata,
+ List links)
throws UnknownFormatException, IOException {
- SiteMapParser parser = new SiteMapParser(strictModeSitemaps,
- allowPartialSitemaps);
+ SiteMapParser parser = new SiteMapParser(strictModeSitemaps, allowPartialSitemaps);
parser.setStrictNamespace(true);
parser.addAcceptedNamespace(Namespace.SITEMAP_LEGACY);
parser.addAcceptedNamespace(Namespace.EMPTY);
@@ -335,8 +329,7 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
long start = System.currentTimeMillis();
AbstractSiteMap siteMap;
// let the parser guess what the mimetype is
- if (StringUtils.isBlank(contentType)
- || contentType.contains("octet-stream")) {
+ if (StringUtils.isBlank(contentType) || contentType.contains("octet-stream")) {
siteMap = parser.parseSiteMap(content, sURL);
} else {
siteMap = parser.parseSiteMap(contentType, content, sURL);
@@ -352,8 +345,8 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
Collection subsitemaps = smi.getSitemaps();
int delay = 0;
/*
- * keep the subsitemaps as outlinks they will be fetched and parsed
- * in the following steps
+ * keep the subsitemaps as outlinks; they will be fetched and parsed in the
+ * following steps
*/
Iterator iter = subsitemaps.iterator();
while (iter.hasNext()) {
@@ -366,13 +359,21 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
linksSkippedNotRecentlyModified++;
LOG.debug(
"{} has a modified date {} which is more than {} hours old",
- target, lastModified.toString(),
+ target,
+ lastModified.toString(),
filterHoursSinceModified);
continue;
}
- Outlink ol = filterOutlink(sURL, target, parentMetadata,
- isSitemapKey, "true", isSitemapNewsKey, "false");
+ Outlink ol =
+ filterOutlink(
+ sURL,
+ target,
+ parentMetadata,
+ isSitemapKey,
+ "true",
+ isSitemapNewsKey,
+ "false");
if (ol == null) {
continue;
}
@@ -380,9 +381,8 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
// add a delay
if (this.scheduleSitemapsWithDelay > 0) {
if (delay > 0) {
- ol.getMetadata().setValue(
- DefaultScheduler.DELAY_METADATA,
- Integer.toString(delay));
+ ol.getMetadata()
+ .setValue(DefaultScheduler.DELAY_METADATA, Integer.toString(delay));
}
delay += this.scheduleSitemapsWithDelay;
}
@@ -390,15 +390,19 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
links.add(ol);
LOG.debug("{} : [sitemap] {}", url, target);
}
- LOG.info("Sitemap index (found {} sitemaps, {} skipped): {}",
- linksFound, linksSkippedNotRecentlyModified, url);
+ LOG.info(
+ "Sitemap index (found {} sitemaps, {} skipped): {}",
+ linksFound,
+ linksSkippedNotRecentlyModified,
+ url);
}
// sitemap files
else {
SiteMap sm = (SiteMap) siteMap;
Collection sitemapURLs = sm.getSiteMapUrls();
Iterator iter = sitemapURLs.iterator();
- sitemap_urls: while (iter.hasNext()) {
+ sitemap_urls:
+ while (iter.hasNext()) {
linksFound++;
SiteMapURL smurl = iter.next();
// TODO handle priority in metadata
@@ -415,11 +419,12 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
linksSkippedNotRecentlyModified++;
LOG.debug(
"{} has a modified date {} which is more than {} hours old",
- target, lastModified, filterHoursSinceModified);
+ target,
+ lastModified,
+ filterHoursSinceModified);
continue;
}
- ExtensionMetadata[] newsAttrs = smurl
- .getAttributesForExtension(Extension.NEWS);
+ ExtensionMetadata[] newsAttrs = smurl.getAttributesForExtension(Extension.NEWS);
if (newsAttrs != null) {
// filter based on news publication date
// 2008-12-23
@@ -430,7 +435,9 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
linksSkippedNotRecentlyModified++;
LOG.debug(
"{} has a news publication date {} which is more than {} hours old",
- target, pubDate, filterHoursSinceModified);
+ target,
+ pubDate,
+ filterHoursSinceModified);
continue sitemap_urls;
}
}
@@ -438,8 +445,7 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
}
// add alternative language links
- ExtensionMetadata[] linkAttrs = smurl
- .getAttributesForExtension(Extension.LINKS);
+ ExtensionMetadata[] linkAttrs = smurl.getAttributesForExtension(Extension.LINKS);
if (linkAttrs != null) {
for (ExtensionMetadata attr : linkAttrs) {
LinkAttributes linkAttr = (LinkAttributes) attr;
@@ -452,17 +458,30 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
// skip href links duplicating sitemap URL
continue;
}
- Outlink ol = filterOutlink(sURL, hrefStr,
- parentMetadata, isSitemapKey, "false",
- isSitemapNewsKey, "false");
+ Outlink ol =
+ filterOutlink(
+ sURL,
+ hrefStr,
+ parentMetadata,
+ isSitemapKey,
+ "false",
+ isSitemapNewsKey,
+ "false");
if (ol != null) {
links.add(ol);
}
}
}
- Outlink ol = filterOutlink(sURL, target, parentMetadata,
- isSitemapKey, "false", isSitemapNewsKey, "false");
+ Outlink ol =
+ filterOutlink(
+ sURL,
+ target,
+ parentMetadata,
+ isSitemapKey,
+ "false",
+ isSitemapNewsKey,
+ "false");
if (ol == null) {
continue;
}
@@ -470,34 +489,33 @@ protected AbstractSiteMap parseSiteMap(String url, byte[] content,
links.add(ol);
LOG.debug("{} : [sitemap] {}", url, target);
}
- LOG.info("Sitemap (found {} links, {} skipped): {}", linksFound,
- linksSkippedNotRecentlyModified, url);
+ LOG.info(
+ "Sitemap (found {} links, {} skipped): {}",
+ linksFound,
+ linksSkippedNotRecentlyModified,
+ url);
}
return siteMap;
}
@Override
- @SuppressWarnings({ "rawtypes", "unchecked" })
- public void prepare(Map stormConf, TopologyContext context,
- OutputCollector collector) {
+ @SuppressWarnings({"rawtypes", "unchecked"})
+ public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
super.prepare(stormConf, context, collector);
- sniffContent = ConfUtils.getBoolean(stormConf,
- "sitemap.sniffContent", false);
- filterHoursSinceModified = ConfUtils.getInt(stormConf,
- "sitemap.filter.hours.since.modified", -1);
+ sniffContent = ConfUtils.getBoolean(stormConf, "sitemap.sniffContent", false);
+ filterHoursSinceModified =
+ ConfUtils.getInt(stormConf, "sitemap.filter.hours.since.modified", -1);
parseFilters = ParseFilters.fromConf(stormConf);
- int maxOffsetGuess = ConfUtils.getInt(stormConf, "sitemap.offset.guess",
- 1024);
- contentDetector = new ContentDetector(
- NewsSiteMapParserBolt.contentClues, maxOffsetGuess);
- rssContentDetector = new ContentDetector(
- FeedDetectorBolt.contentClues, maxOffsetGuess);
- averagedMetrics = context.registerMetric(
- "news_sitemap_average_processing_time",
- new ReducedMetric(new MeanReducer()), 30);
- scheduleSitemapsWithDelay = ConfUtils.getInt(stormConf,
- "sitemap.schedule.delay", scheduleSitemapsWithDelay);
+ int maxOffsetGuess = ConfUtils.getInt(stormConf, "sitemap.offset.guess", 1024);
+ contentDetector = new ContentDetector(NewsSiteMapParserBolt.contentClues, maxOffsetGuess);
+ rssContentDetector = new ContentDetector(FeedDetectorBolt.contentClues, maxOffsetGuess);
+ averagedMetrics =
+ context.registerMetric(
+ "news_sitemap_average_processing_time",
+ new ReducedMetric(new MeanReducer()),
+ 30);
+ scheduleSitemapsWithDelay =
+ ConfUtils.getInt(stormConf, "sitemap.schedule.delay", scheduleSitemapsWithDelay);
}
-
}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java b/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java
index 5a880da..18106c3 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/PreFilterBolt.java
@@ -1,10 +1,23 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.commoncrawl.stormcrawler.news;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Map;
-
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
@@ -12,76 +25,74 @@
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
+import org.apache.stormcrawler.Metadata;
+import org.apache.stormcrawler.filtering.URLFilters;
+import org.apache.stormcrawler.persistence.Status;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.digitalpebble.stormcrawler.Metadata;
-import com.digitalpebble.stormcrawler.filtering.URLFilters;
-import com.digitalpebble.stormcrawler.persistence.Status;
-
/**
- * Variant of the URLFilterBolt to go upstream of the fetching to catch anything
- * before it goes further into the topology. If filtered, a URL gets an ERROR
- * status.
+ * Variant of the URLFilterBolt to go upstream of the fetching to catch anything before it goes
+ * further into the topology. If filtered, a URL gets an ERROR status.
*/
public class PreFilterBolt extends BaseRichBolt {
- protected static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
-
- private URLFilters urlFilters;
-
- protected OutputCollector collector;
-
- private final String filterConfigFile;
-
- private static final String _s = com.digitalpebble.stormcrawler.Constants.StatusStreamName;
-
- public PreFilterBolt(String filterConfigFile) {
- this.filterConfigFile = filterConfigFile;
- }
-
- @Override
- public void execute(Tuple input) {
-
- // must have at least a URL and metadata
- String urlString = input.getStringByField("url");
- Metadata metadata = (Metadata) input.getValueByField("metadata");
-
- String filtered = urlFilters.filter(null, null, urlString);
- if (StringUtils.isBlank(filtered)) {
- LOG.debug("URL rejected: {}", urlString);
- // emit with an error to the status stream
- metadata.addValue("error.cause", "Filtered");
- Values v = new Values(urlString, metadata, Status.ERROR);
- collector.emit(_s, input, v);
- collector.ack(input);
- return;
- }
-
- // pass to std out
- Values v = new Values(urlString, metadata);
- collector.emit(input, v);
- collector.ack(input);
- }
-
- @Override
- public void declareOutputFields(OutputFieldsDeclarer declarer) {
- declarer.declareStream(_s, new Fields("url", "metadata", "status"));
- declarer.declare(new Fields("url", "metadata"));
- }
-
- @Override
- public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
- this.collector = collector;
- if (filterConfigFile != null) {
- try {
- urlFilters = new URLFilters(stormConf, filterConfigFile);
- } catch (IOException e) {
- throw new RuntimeException("Can't load filters from " + filterConfigFile);
- }
- } else {
- urlFilters = URLFilters.fromConf(stormConf);
- }
- }
-
+ protected static final Logger LOG =
+ LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+ private URLFilters urlFilters;
+
+ protected OutputCollector collector;
+
+ private final String filterConfigFile;
+
+ private static final String _s = org.apache.stormcrawler.Constants.StatusStreamName;
+
+ public PreFilterBolt(String filterConfigFile) {
+ this.filterConfigFile = filterConfigFile;
+ }
+
+ @Override
+ public void execute(Tuple input) {
+
+ // must have at least a URL and metadata
+ String urlString = input.getStringByField("url");
+ Metadata metadata = (Metadata) input.getValueByField("metadata");
+
+ String filtered = urlFilters.filter(null, null, urlString);
+ if (StringUtils.isBlank(filtered)) {
+ LOG.debug("URL rejected: {}", urlString);
+ // emit with an error to the status stream
+ metadata.addValue("error.cause", "Filtered");
+ Values v = new Values(urlString, metadata, Status.ERROR);
+ collector.emit(_s, input, v);
+ collector.ack(input);
+ return;
+ }
+
+ // pass to std out
+ Values v = new Values(urlString, metadata);
+ collector.emit(input, v);
+ collector.ack(input);
+ }
+
+ @Override
+ public void declareOutputFields(OutputFieldsDeclarer declarer) {
+ declarer.declareStream(_s, new Fields("url", "metadata", "status"));
+ declarer.declare(new Fields("url", "metadata"));
+ }
+
+ @Override
+ public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
+ this.collector = collector;
+ if (filterConfigFile != null) {
+ try {
+ urlFilters = new URLFilters(stormConf, filterConfigFile);
+ } catch (IOException e) {
+ throw new RuntimeException("Can't load filters from " + filterConfigFile, e);
+ }
+ } else {
+ urlFilters = URLFilters.fromConf(stormConf);
+ }
+ }
}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java b/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java
index a28e2e9..114477c 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/PunycodeURLNormalizer.java
@@ -13,20 +13,18 @@
*/
package org.commoncrawl.stormcrawler.news;
+import com.fasterxml.jackson.databind.JsonNode;
import java.net.IDN;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
-
-import com.digitalpebble.stormcrawler.Metadata;
-import com.digitalpebble.stormcrawler.filtering.URLFilter;
-import com.fasterxml.jackson.databind.JsonNode;
+import org.apache.stormcrawler.Metadata;
+import org.apache.stormcrawler.filtering.URLFilter;
public class PunycodeURLNormalizer extends URLFilter {
@Override
- public void configure(Map stormConf, JsonNode filterParams) {
- }
+ public void configure(Map stormConf, JsonNode filterParams) {}
private boolean isAscii(String str) {
char[] chars = str.toCharArray();
@@ -39,8 +37,7 @@ private boolean isAscii(String str) {
}
@Override
- public String filter(URL sourceUrl, Metadata sourceMetadata,
- String urlToFilter) {
+ public String filter(URL sourceUrl, Metadata sourceMetadata, String urlToFilter) {
try {
URL url = new URL(urlToFilter);
String hostName = url.getHost();
@@ -51,12 +48,11 @@ public String filter(URL sourceUrl, Metadata sourceMetadata,
if (hostName.equals(url.getHost())) {
return urlToFilter;
}
- urlToFilter = new URL(url.getProtocol(), hostName, url.getPort(),
- url.getFile()).toString();
+ urlToFilter =
+ new URL(url.getProtocol(), hostName, url.getPort(), url.getFile()).toString();
} catch (MalformedURLException e) {
return null;
}
return urlToFilter;
}
-
}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java
index 51aa116..14f45ff 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/BootstrapTopology.java
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -19,31 +19,27 @@
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
+import org.apache.stormcrawler.ConfigurableTopology;
+import org.apache.stormcrawler.Constants;
+import org.apache.stormcrawler.bolt.FetcherBolt;
+import org.apache.stormcrawler.bolt.JSoupParserBolt;
+import org.apache.stormcrawler.bolt.URLFilterBolt;
+import org.apache.stormcrawler.bolt.URLPartitionerBolt;
+import org.apache.stormcrawler.indexing.DummyIndexer;
+import org.apache.stormcrawler.opensearch.persistence.AggregationSpout;
+import org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt;
+import org.apache.stormcrawler.spout.FileSpout;
+import org.apache.stormcrawler.util.ConfUtils;
+import org.apache.stormcrawler.util.URLStreamGrouping;
+import org.apache.stormcrawler.warc.WARCHdfsBolt;
import org.commoncrawl.stormcrawler.news.CrawlTopology;
import org.commoncrawl.stormcrawler.news.FeedDetectorBolt;
import org.slf4j.LoggerFactory;
-import com.digitalpebble.stormcrawler.ConfigurableTopology;
-import com.digitalpebble.stormcrawler.Constants;
-import com.digitalpebble.stormcrawler.bolt.FetcherBolt;
-import com.digitalpebble.stormcrawler.bolt.JSoupParserBolt;
-import com.digitalpebble.stormcrawler.bolt.URLFilterBolt;
-import com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt;
-import com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout;
-import com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt;
-import com.digitalpebble.stormcrawler.indexing.DummyIndexer;
-import com.digitalpebble.stormcrawler.spout.FileSpout;
-import com.digitalpebble.stormcrawler.util.ConfUtils;
-import com.digitalpebble.stormcrawler.util.URLStreamGrouping;
-import com.digitalpebble.stormcrawler.warc.WARCHdfsBolt;
-
-/**
- * Dummy topology to play with the spouts and bolts on ElasticSearch
- */
+/** Dummy topology to play with the spouts and bolts on OpenSearch */
public class BootstrapTopology extends CrawlTopology {
- private static final org.slf4j.Logger LOG = LoggerFactory
- .getLogger(BootstrapTopology.class);
+ private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(BootstrapTopology.class);
public static void main(String[] args) throws Exception {
ConfigurableTopology.start(new BootstrapTopology(), args);
@@ -53,11 +49,14 @@ public static void main(String[] args) throws Exception {
protected int run(String[] args) {
TopologyBuilder builder = new TopologyBuilder();
- LOG.debug("sitemap.sniffContent: {}",
+ LOG.debug(
+ "sitemap.sniffContent: {}",
ConfUtils.getBoolean(getConf(), "sitemap.sniffContent", false));
- LOG.info("sitemap.sniffContent: {}",
+ LOG.info(
+ "sitemap.sniffContent: {}",
ConfUtils.getBoolean(getConf(), "sitemap.sniffContent", false));
- LOG.warn("sitemap.sniffContent: {}",
+ LOG.warn(
+ "sitemap.sniffContent: {}",
ConfUtils.getBoolean(getConf(), "sitemap.sniffContent", false));
int numWorkers = ConfUtils.getInt(getConf(), "topology.workers", 1);
@@ -69,12 +68,11 @@ protected int run(String[] args) {
if (args.length >= 2) {
// arguments include seed directory and file pattern
LOG.info("Injecting seeds from {} by pattern {}", args[0], args[1]);
- builder.setSpout("filespout",
- new FileSpout(args[0], args[1], true));
+ builder.setSpout("filespout", new FileSpout(args[0], args[1], true));
Fields key = new Fields("url");
- builder.setBolt("filter", new URLFilterBolt()).fieldsGrouping(
- "filespout", Constants.StatusStreamName, key);
+ builder.setBolt("filter", new URLFilterBolt())
+ .fieldsGrouping("filespout", Constants.StatusStreamName, key);
}
builder.setSpout("spout", new AggregationSpout(), numShards);
@@ -91,12 +89,10 @@ protected int run(String[] args) {
builder.setBolt("feed", new FeedDetectorBolt(), numWorkers)
.localOrShuffleGrouping("sitemap");
- builder.setBolt("parse", new JSoupParserBolt())
- .localOrShuffleGrouping("feed");
+ builder.setBolt("parse", new JSoupParserBolt()).localOrShuffleGrouping("feed");
// don't need to parse the pages but need to update their status
- builder.setBolt("ssb", new DummyIndexer(), numWorkers)
- .localOrShuffleGrouping("parse");
+ builder.setBolt("ssb", new DummyIndexer(), numWorkers).localOrShuffleGrouping("parse");
WARCHdfsBolt warcbolt = getWarcBolt("CC-NEWS-BOOTSTRAP");
@@ -109,9 +105,8 @@ protected int run(String[] args) {
.localOrShuffleGrouping("parse", Constants.StatusStreamName)
.localOrShuffleGrouping("ssb", Constants.StatusStreamName)
.setNumTasks(numShards)
- .customGrouping("filter", Constants.StatusStreamName,
- new URLStreamGrouping());
+ .customGrouping("filter", Constants.StatusStreamName, new URLStreamGrouping());
return submit(conf, builder);
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java
index bc36e64..bb50291 100644
--- a/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java
+++ b/src/main/java/org/commoncrawl/stormcrawler/news/bootstrap/FeedLinkParseFilter.java
@@ -14,19 +14,18 @@
package org.commoncrawl.stormcrawler.news.bootstrap;
import java.util.ArrayList;
-
+import org.apache.stormcrawler.bolt.FeedParserBolt;
+import org.apache.stormcrawler.parse.Outlink;
+import org.apache.stormcrawler.parse.ParseResult;
+import org.apache.stormcrawler.parse.filter.LinkParseFilter;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
-import com.digitalpebble.stormcrawler.bolt.FeedParserBolt;
-import com.digitalpebble.stormcrawler.parse.Outlink;
-import com.digitalpebble.stormcrawler.parse.ParseResult;
-import com.digitalpebble.stormcrawler.parse.filter.LinkParseFilter;
-
/**
- * ParseFilter which extracts exclusively RSS links via Xpath, all other links
- * are skipped. See {@link LinkParseFilter} how to register and configure in
- * parsefilters.json. A configuration snippet:
+ * ParseFilter which extracts exclusively RSS links via Xpath, all other links are skipped. See
+ * {@link LinkParseFilter} for how to register and configure it in parsefilters.json. A
+ * configuration snippet:
+ *
*