Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ under the License.
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<stormcrawler.version>3.6.0</stormcrawler.version>
<storm.version>2.8.8</storm.version>
<aws.version>1.12.797</aws.version>
<aws.version>2.46.7</aws.version>
<jackson.version>2.21.3</jackson.version>
<crawler-commons.version>1.6</crawler-commons.version>
<mockito.version>5.23.0</mockito.version>
Expand All @@ -50,6 +50,18 @@ under the License.
<skip.format.code>true</skip.format.code>
</properties>

<dependencyManagement>
<dependencies>
<dependency>
<groupId>software.amazon.awssdk</groupId>
<artifactId>bom</artifactId>
<version>${aws.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>

<dependencies>
<dependency>
<groupId>org.apache.stormcrawler</groupId>
Expand Down Expand Up @@ -86,9 +98,8 @@ under the License.
</dependency>

<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-s3</artifactId>
<version>${aws.version}</version>
<groupId>software.amazon.awssdk</groupId>
<artifactId>s3</artifactId>
</dependency>

<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,6 @@
*/
package org.commoncrawl.stormcrawler.filter;

import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.GetObjectRequest;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.S3Object;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonMappingException;
import com.fasterxml.jackson.databind.JsonNode;
Expand All @@ -46,6 +41,10 @@
import org.apache.stormcrawler.util.ConfUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.GetObjectRequest;
import software.amazon.awssdk.services.s3.model.HeadObjectRequest;
import software.amazon.awssdk.services.s3.model.HeadObjectResponse;

/**
* Version of the FastURLFilter that can load from a text representation instead of the JSON that
Expand Down Expand Up @@ -170,29 +169,40 @@ public void run() {
@Override
public void loadJSONResources() throws Exception {
InputStream inputStream = null;
AmazonS3 s3client = null;
S3Client s3client = null;
try {
if (getResourceFile().startsWith("s3://")) {
// try loading from S3
s3client = AmazonS3ClientBuilder.standard().build();
s3client = S3Client.builder().build();
java.net.URI uri = new java.net.URI(getResourceFile());

String bucketName = uri.getHost();
// remove the first "/"
String path = uri.getPath().substring(1);

// optimisation - avoid a full reload if the resource has not changed
ObjectMetadata metadata = s3client.getObjectMetadata(bucketName, path);
final String ETAG = metadata.getETag();
HeadObjectResponse headResponse =
s3client.headObject(
HeadObjectRequest.builder()
.bucket(bucketName)
.key(path)
.build()
);
final String ETAG = headResponse.eTag();
if (ETAG != null && ETAG.equals(resourceETAG)) {
LOG.info("Unchanged ETAG for {} - skipping reload", getResourceFile());
return;
} else {
resourceETAG = ETAG;
}

final S3Object object = s3client.getObject(new GetObjectRequest(bucketName, path));
inputStream = object.getObjectContent();
inputStream =
s3client.getObject(
GetObjectRequest.builder()
.bucket(bucketName)
.key(path)
.build()
);
} else {
inputStream = getClass().getClassLoader().getResourceAsStream(getResourceFile());
if (inputStream == null) {
Expand All @@ -210,7 +220,7 @@ public void loadJSONResources() throws Exception {
inputStream.close();
}
if (s3client != null) {
s3client.shutdown();
s3client.close();
}
}
}
Expand Down
Loading