Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/maven.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,5 +63,7 @@ jobs:
with:
distribution: adopt
java-version: ${{ matrix.java }}
- name: Check code formatting
run: mvn -B --no-transfer-progress com.cosium.code:git-code-format-maven-plugin:validate-code-format -Dskip.format.code=false
- name: Build with Maven
run: mvn -B --no-transfer-progress package --file pom.xml -DCI_ENV=true verify
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
.idea
target
opensearchdata
warcdata
.java-version
8 changes: 8 additions & 0 deletions .mvn/jvm.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
--add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED
--add-exports jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED
--add-exports jdk.compiler/com.sun.tools.javac.main=ALL-UNNAMED
--add-exports jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED
--add-exports jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED
--add-exports jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED
--add-opens jdk.compiler/com.sun.tools.javac.code=ALL-UNNAMED
--add-opens jdk.compiler/com.sun.tools.javac.comp=ALL-UNNAMED
17 changes: 16 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Crawler for news based on [StormCrawler](https://stormcrawler.apache.org/). Prod


## Prerequisites

* JVM 17 or higher
* Install OpenSearch 2.19.5
* Install Apache Storm 2.8.8
* Start OpenSearch and Storm
Expand Down Expand Up @@ -119,3 +119,18 @@ NewsCrawl ACTIVE 48 1 146 NewsCrawl-1
$> storm kill NewsCrawl
```

## Note for developers

Please format your code before submitting a PR with

```
mvn git-code-format:format-code -Dgcf.globPattern="**/*" -Dskip.format.code=false
```

You can enable pre-commit format hooks by running:

```
mvn clean install -Dskip.format.code=false
```


217 changes: 129 additions & 88 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>

<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
Expand All @@ -18,25 +17,25 @@ KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<project
xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">

<modelVersion>4.0.0</modelVersion>
<groupId>org.commoncrawl.stormcrawler.news</groupId>
<artifactId>crawler</artifactId>
<version>3.6.0</version>
<packaging>jar</packaging>

<url>https://github.com/commoncrawl/news-crawl</url>
<licenses>
<license>
<name>Apache License, Version 2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
</license>
</licenses>

<url>https://github.com/commoncrawl/news-crawl</url>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<stormcrawler.version>3.6.0</stormcrawler.version>
Expand All @@ -47,89 +46,10 @@ under the License.
<mockito.version>5.23.0</mockito.version>
<wiremock.version>3.0.1</wiremock.version>
<junit.version>4.13.2</junit.version>
<git-code-format-maven-plugin.version>5.4</git-code-format-maven-plugin.version>
<skip.format.code>true</skip.format.code>
</properties>

<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.15.0</version>
<configuration>
<source>17</source>
<target>17</target>
</configuration>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>3.6.3</version>
<executions>
<execution>
<goals>
<goal>exec</goal>
</goals>
</execution>
</executions>
<configuration>
<executable>java</executable>
<includeProjectDependencies>true</includeProjectDependencies>
<includePluginDependencies>false</includePluginDependencies>
<classpathScope>compile</classpathScope>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.6.2</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<createDependencyReducedPom>false</createDependencyReducedPom>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>org.apache.storm.flux.Flux</mainClass>
<manifestEntries>
<Change></Change>
<Build-Date></Build-Date>
</manifestEntries>
</transformer>
</transformers>
<!-- The filters below are necessary if you want to include the Tika
module -->
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
<filter>
<!-- https://issues.apache.org/jira/browse/STORM-2428 -->
<artifact>org.apache.storm:flux-core</artifact>
<excludes>
<exclude>org/apache/commons/**</exclude>
<exclude>org/apache/http/**</exclude>
<exclude>org/yaml/**</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>

<dependencies>
<dependency>
<groupId>org.apache.stormcrawler</groupId>
Expand Down Expand Up @@ -207,4 +127,125 @@ under the License.
<scope>test</scope>
</dependency>
</dependencies>

<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.15.0</version>
<configuration>
<source>17</source>
<target>17</target>
</configuration>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>3.6.3</version>
<configuration>
<executable>java</executable>
<includeProjectDependencies>true</includeProjectDependencies>
<includePluginDependencies>false</includePluginDependencies>
<classpathScope>compile</classpathScope>
</configuration>
<executions>
<execution>
<goals>
<goal>exec</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.6.2</version>
<executions>
<execution>
<goals>
<goal>shade</goal>
</goals>
<phase>package</phase>
<configuration>
<createDependencyReducedPom>false</createDependencyReducedPom>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>org.apache.storm.flux.Flux</mainClass>
<manifestEntries>
<Change/>
<Build-Date/>
</manifestEntries>
</transformer>
</transformers>
<!-- The filters below are necessary if you want to include the Tika
module -->
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
<filter>
<!-- https://issues.apache.org/jira/browse/STORM-2428 -->
<artifact>org.apache.storm:flux-core</artifact>
<excludes>
<exclude>org/apache/commons/**</exclude>
<exclude>org/apache/http/**</exclude>
<exclude>org/yaml/**</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>com.cosium.code</groupId>
<artifactId>git-code-format-maven-plugin</artifactId>
<version>${git-code-format-maven-plugin.version}</version>
<executions>
<!-- On commit, format the modified java files -->
<execution>
<id>install-formatter-hook</id>
<goals>
<goal>install-hooks</goal>
</goals>
</execution>
<!-- On Maven verify phase, fail if any file (including
unmodified)
is badly formatted -->
<execution>
<id>validate-code-format</id>
<goals>
<goal>validate-code-format</goal>
</goals>
</execution>
</executions>
<dependencies>
<!-- Enable https://github.com/google/google-java-format -->
<dependency>
<groupId>com.cosium.code</groupId>
<artifactId>google-java-format</artifactId>
<version>${git-code-format-maven-plugin.version}</version>
</dependency>
</dependencies>
<configuration>
<skip>${skip.format.code}</skip>
<formatterOptions>
<googleJavaFormat.aosp>true</googleJavaFormat.aosp>
<googleJavaFormat.fixImportsOnly>false</googleJavaFormat.fixImportsOnly>
<googleJavaFormat.skipSortingImports>false</googleJavaFormat.skipSortingImports>
<googleJavaFormat.skipRemovingUnusedImports>false</googleJavaFormat.skipRemovingUnusedImports>
</formatterOptions>
</configuration>
</plugin>
</plugins>
</build>
</project>
Loading