From 3bf55416a8fe8cf2394175157d68f7fbc1cb9ab7 Mon Sep 17 00:00:00 2001
From: Lewis John McGibbney
Date: Sat, 21 Feb 2026 08:42:16 -0800
Subject: [PATCH 1/8] NUTCH-3154 Implement integration testing framework for
Nutch IndexWriter plugins using Testcontainers (#895)
---
.github/workflows/master-build.yml | 6 +
build.xml | 4 +
ivy/ivy.xml | 4 +
src/plugin/build-plugin.xml | 31 +++-
src/plugin/build.xml | 11 ++
src/plugin/indexer-elastic/ivy.xml | 5 +-
.../elastic/ElasticIndexWriterIT.java | 113 +++++++++++++++
src/plugin/indexer-kafka/ivy.xml | 1 +
.../indexwriter/kafka/KafkaIndexWriterIT.java | 96 ++++++++++++
src/plugin/indexer-rabbit/ivy.xml | 6 +-
.../rabbit/RabbitIndexWriterIT.java | 90 ++++++++++++
src/plugin/indexer-solr/ivy.xml | 1 +
.../indexwriter/solr/SolrIndexWriterIT.java | 137 ++++++++++++++++++
.../nutch/indexer/AbstractIndexWriterIT.java | 90 ++++++++++++
.../indexer/IndexWriterIntegrationTest.java | 53 +++++++
15 files changed, 644 insertions(+), 4 deletions(-)
create mode 100644 src/plugin/indexer-elastic/src/test/org/apache/nutch/indexwriter/elastic/ElasticIndexWriterIT.java
create mode 100644 src/plugin/indexer-kafka/src/test/org/apache/nutch/indexwriter/kafka/KafkaIndexWriterIT.java
create mode 100644 src/plugin/indexer-rabbit/src/test/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriterIT.java
create mode 100644 src/plugin/indexer-solr/src/test/org/apache/nutch/indexwriter/solr/SolrIndexWriterIT.java
create mode 100644 src/test/org/apache/nutch/indexer/AbstractIndexWriterIT.java
create mode 100644 src/test/org/apache/nutch/indexer/IndexWriterIntegrationTest.java
diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml
index d73bb3a693..a8675bf22a 100644
--- a/.github/workflows/master-build.yml
+++ b/.github/workflows/master-build.yml
@@ -104,6 +104,8 @@ jobs:
- 'src/testresources/**'
plugins:
- 'src/plugin/**'
+ indexer_plugins:
+ - 'src/plugin/indexer-*/**'
buildconf:
- 'build.xml'
- 'ivy/ivy.xml'
@@ -120,6 +122,10 @@ jobs:
- name: test plugins
if: ${{ steps.filter.outputs.plugins == 'true' && steps.filter.outputs.core == 'false' && steps.filter.outputs.buildconf == 'false' }}
run: ant clean test-plugins -buildfile build.xml
+ # run indexer integration tests when indexer plugin files change (Docker required, ubuntu-latest only)
+ - name: test indexer integration
+ if: ${{ steps.filter.outputs.indexer_plugins == 'true' && matrix.os == 'ubuntu-latest' }}
+ run: ant clean test-indexer-integration -buildfile build.xml
- name: Check for test results
id: check_tests
if: always() && matrix.os == 'ubuntu-latest'
diff --git a/build.xml b/build.xml
index d8ee908824..37b8f4cd2a 100644
--- a/build.xml
+++ b/build.xml
@@ -535,6 +535,10 @@
+
+
+
+
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 9d396ee7b1..dfdb81c0fb 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -151,6 +151,10 @@
+
+
+
+
diff --git a/src/plugin/build-plugin.xml b/src/plugin/build-plugin.xml
index b0aca71038..6ea8d86201 100755
--- a/src/plugin/build-plugin.xml
+++ b/src/plugin/build-plugin.xml
@@ -189,7 +189,7 @@
-
+
Tests failed!
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Indexer integration tests failed!
+
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 81d3ece682..160febb6cf 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -174,6 +174,17 @@
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/plugin/indexer-elastic/ivy.xml b/src/plugin/indexer-elastic/ivy.xml
index ee812a225c..04c1a071d0 100644
--- a/src/plugin/indexer-elastic/ivy.xml
+++ b/src/plugin/indexer-elastic/ivy.xml
@@ -36,7 +36,10 @@
-
+
+
+
+
diff --git a/src/plugin/indexer-elastic/src/test/org/apache/nutch/indexwriter/elastic/ElasticIndexWriterIT.java b/src/plugin/indexer-elastic/src/test/org/apache/nutch/indexwriter/elastic/ElasticIndexWriterIT.java
new file mode 100644
index 0000000000..0479213c3f
--- /dev/null
+++ b/src/plugin/indexer-elastic/src/test/org/apache/nutch/indexwriter/elastic/ElasticIndexWriterIT.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.elastic;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.http.HttpHost;
+import org.apache.nutch.indexer.AbstractIndexWriterIT;
+import org.apache.nutch.indexer.IndexWriter;
+import org.apache.nutch.indexer.IndexWriterParams;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.util.NutchConfiguration;
+import org.elasticsearch.action.get.GetRequest;
+import org.elasticsearch.action.get.GetResponse;
+import org.elasticsearch.client.RequestOptions;
+import org.elasticsearch.client.RestClient;
+import org.elasticsearch.client.RestHighLevelClient;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.elasticsearch.ElasticsearchContainer;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Integration tests for ElasticIndexWriter using Testcontainers.
+ */
+@Testcontainers(disabledWithoutDocker = true)
+public class ElasticIndexWriterIT extends AbstractIndexWriterIT {
+
+ private static final String ELASTICSEARCH_IMAGE =
+ "docker.elastic.co/elasticsearch/elasticsearch:7.10.2";
+
+ @Container
+ private static final ElasticsearchContainer elasticsearchContainer =
+ new ElasticsearchContainer(ELASTICSEARCH_IMAGE)
+ .withEnv("discovery.type", "single-node")
+ .withEnv("xpack.security.enabled", "false");
+
+ private ElasticIndexWriter indexWriter;
+ private Configuration conf;
+
+ @Override
+ public void setUpIndexWriter() throws Exception {
+ conf = NutchConfiguration.create();
+ indexWriter = new ElasticIndexWriter();
+ indexWriter.setConf(conf);
+
+ Map params = new HashMap<>();
+ params.put(ElasticConstants.HOSTS, elasticsearchContainer.getHost());
+ params.put(ElasticConstants.PORT, String.valueOf(elasticsearchContainer.getMappedPort(9200)));
+ params.put(ElasticConstants.INDEX, "test-index");
+ params.put(ElasticConstants.SCHEME, "http");
+
+ IndexWriterParams writerParams = new IndexWriterParams(params);
+ indexWriter.open(writerParams);
+ }
+
+ @Override
+ public void tearDownIndexWriter() throws Exception {
+ if (indexWriter != null) {
+ try {
+ indexWriter.close();
+ } catch (Exception e) {
+ // Ignore if open() failed and close state is invalid
+ }
+ indexWriter = null;
+ }
+ }
+
+ @Override
+ public IndexWriter getIndexWriter() {
+ return indexWriter;
+ }
+
+ @Override
+ public boolean supportsDelete() {
+ return true;
+ }
+
+ @Override
+ public void verifyDocumentWritten(String docId, String expectedTitle) throws Exception {
+ try (RestHighLevelClient client = new RestHighLevelClient(
+ RestClient.builder(
+ new HttpHost(elasticsearchContainer.getHost(),
+ elasticsearchContainer.getMappedPort(9200),
+ "http")))) {
+ GetRequest getRequest = new GetRequest("test-index", docId);
+ GetResponse getResponse = client.get(getRequest, RequestOptions.DEFAULT);
+ assertTrue(getResponse.isExists(), "Document should exist in index");
+ assertNotNull(getResponse.getSource());
+ assertEquals(expectedTitle, getResponse.getSource().get("title"));
+ }
+ }
+}
diff --git a/src/plugin/indexer-kafka/ivy.xml b/src/plugin/indexer-kafka/ivy.xml
index d6157d953e..ffba6746d1 100644
--- a/src/plugin/indexer-kafka/ivy.xml
+++ b/src/plugin/indexer-kafka/ivy.xml
@@ -37,6 +37,7 @@
+
diff --git a/src/plugin/indexer-kafka/src/test/org/apache/nutch/indexwriter/kafka/KafkaIndexWriterIT.java b/src/plugin/indexer-kafka/src/test/org/apache/nutch/indexwriter/kafka/KafkaIndexWriterIT.java
new file mode 100644
index 0000000000..4f6a306d46
--- /dev/null
+++ b/src/plugin/indexer-kafka/src/test/org/apache/nutch/indexwriter/kafka/KafkaIndexWriterIT.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.kafka;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.AbstractIndexWriterIT;
+import org.apache.nutch.indexer.IndexWriter;
+import org.apache.nutch.indexer.IndexWriterParams;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.util.NutchConfiguration;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+import org.testcontainers.kafka.KafkaContainer;
+
+/**
+ * Integration tests for KafkaIndexWriter using Testcontainers.
+ */
+@Testcontainers(disabledWithoutDocker = true)
+public class KafkaIndexWriterIT extends AbstractIndexWriterIT {
+
+ private static final String KAFKA_IMAGE = "apache/kafka-native:3.8.0";
+ private static final String TEST_TOPIC = "nutch-indexer-test";
+
+ @Container
+ private static final KafkaContainer kafkaContainer =
+ new KafkaContainer(KAFKA_IMAGE);
+
+ private KafkaIndexWriter indexWriter;
+ private Configuration conf;
+
+ @Override
+ public void setUpIndexWriter() throws Exception {
+ conf = NutchConfiguration.create();
+ indexWriter = new KafkaIndexWriter();
+ indexWriter.setConf(conf);
+
+ String bootstrapServers = kafkaContainer.getBootstrapServers();
+ String hostPort = bootstrapServers.contains("://")
+ ? bootstrapServers.substring(bootstrapServers.indexOf("://") + 3)
+ : bootstrapServers;
+ String[] parts = hostPort.split(":");
+ String host = parts[0];
+ int port = Integer.parseInt(parts[1]);
+
+ Map params = new HashMap<>();
+ params.put(KafkaConstants.HOST, host);
+ params.put(KafkaConstants.PORT, String.valueOf(port));
+ params.put(KafkaConstants.TOPIC, TEST_TOPIC);
+ params.put(KafkaConstants.VALUE_SERIALIZER,
+ "org.apache.kafka.connect.json.JsonSerializer");
+ params.put(KafkaConstants.KEY_SERIALIZER,
+ "org.apache.kafka.common.serialization.StringSerializer");
+
+ IndexWriterParams writerParams = new IndexWriterParams(params);
+ indexWriter.open(writerParams);
+ }
+
+ @Override
+ public void tearDownIndexWriter() throws Exception {
+ if (indexWriter != null) {
+ try {
+ indexWriter.close();
+ } catch (Exception e) {
+ // Ignore if open() failed and close state is invalid
+ }
+ indexWriter = null;
+ }
+ }
+
+ @Override
+ public IndexWriter getIndexWriter() {
+ return indexWriter;
+ }
+
+ @Override
+ public boolean supportsDelete() {
+ return false;
+ }
+}
diff --git a/src/plugin/indexer-rabbit/ivy.xml b/src/plugin/indexer-rabbit/ivy.xml
index 81822a0fb7..54930331cc 100644
--- a/src/plugin/indexer-rabbit/ivy.xml
+++ b/src/plugin/indexer-rabbit/ivy.xml
@@ -35,5 +35,9 @@
-
+
+
+
+
+
diff --git a/src/plugin/indexer-rabbit/src/test/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriterIT.java b/src/plugin/indexer-rabbit/src/test/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriterIT.java
new file mode 100644
index 0000000000..ed7d055350
--- /dev/null
+++ b/src/plugin/indexer-rabbit/src/test/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriterIT.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.rabbit;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.AbstractIndexWriterIT;
+import org.apache.nutch.indexer.IndexWriter;
+import org.apache.nutch.indexer.IndexWriterParams;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.util.NutchConfiguration;
+import org.testcontainers.containers.RabbitMQContainer;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+/**
+ * Integration tests for RabbitIndexWriter using Testcontainers.
+ */
+@Testcontainers(disabledWithoutDocker = true)
+public class RabbitIndexWriterIT extends AbstractIndexWriterIT {
+
+ private static final String RABBITMQ_IMAGE = "rabbitmq:3.13-management";
+
+ @Container
+ private static final RabbitMQContainer rabbitContainer =
+ new RabbitMQContainer(RABBITMQ_IMAGE);
+
+ private RabbitIndexWriter indexWriter;
+ private Configuration conf;
+
+ @Override
+ public void setUpIndexWriter() throws Exception {
+ conf = NutchConfiguration.create();
+ indexWriter = new RabbitIndexWriter();
+ indexWriter.setConf(conf);
+
+ Map params = new HashMap<>();
+ params.put(RabbitMQConstants.SERVER_URI, rabbitContainer.getAmqpUrl());
+ params.put(RabbitMQConstants.EXCHANGE_NAME, "nutch-indexer-test");
+ params.put(RabbitMQConstants.ROUTING_KEY, "indexer");
+ params.put(RabbitMQConstants.COMMIT_MODE, "single");
+ params.put(RabbitMQConstants.COMMIT_SIZE, "10");
+ params.put(RabbitMQConstants.BINDING, "true");
+ params.put(RabbitMQConstants.QUEUE_NAME, "nutch-indexer-queue");
+ params.put(RabbitMQConstants.EXCHANGE_OPTIONS, "type=direct,durable=true");
+ params.put(RabbitMQConstants.QUEUE_OPTIONS,
+ "durable=true,exclusive=false,auto-delete=false");
+
+ IndexWriterParams writerParams = new IndexWriterParams(params);
+ indexWriter.open(writerParams);
+ }
+
+ @Override
+ public void tearDownIndexWriter() throws Exception {
+ if (indexWriter != null) {
+ try {
+ indexWriter.close();
+ } catch (Exception e) {
+ // Ignore if open() failed and close state is invalid
+ }
+ indexWriter = null;
+ }
+ }
+
+ @Override
+ public IndexWriter getIndexWriter() {
+ return indexWriter;
+ }
+
+ @Override
+ public boolean supportsDelete() {
+ return true;
+ }
+}
diff --git a/src/plugin/indexer-solr/ivy.xml b/src/plugin/indexer-solr/ivy.xml
index 99a713c18b..4d2120955c 100644
--- a/src/plugin/indexer-solr/ivy.xml
+++ b/src/plugin/indexer-solr/ivy.xml
@@ -38,6 +38,7 @@
+
diff --git a/src/plugin/indexer-solr/src/test/org/apache/nutch/indexwriter/solr/SolrIndexWriterIT.java b/src/plugin/indexer-solr/src/test/org/apache/nutch/indexwriter/solr/SolrIndexWriterIT.java
new file mode 100644
index 0000000000..dcd88bdacb
--- /dev/null
+++ b/src/plugin/indexer-solr/src/test/org/apache/nutch/indexwriter/solr/SolrIndexWriterIT.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.solr;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.IndexerMapReduce;
+import org.apache.nutch.indexer.AbstractIndexWriterIT;
+import org.apache.nutch.indexer.IndexWriter;
+import org.apache.nutch.indexer.IndexWriterParams;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.solr.client.solrj.SolrClient;
+import org.apache.solr.client.solrj.impl.Http2SolrClient;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+import org.testcontainers.solr.SolrContainer;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Integration tests for SolrIndexWriter using Testcontainers.
+ */
+@Testcontainers(disabledWithoutDocker = true)
+public class SolrIndexWriterIT extends AbstractIndexWriterIT {
+
+ private static final String SOLR_IMAGE = "solr:8.11.2";
+ private static final String COLLECTION = "nutch-test";
+
+ @Container
+ private static final SolrContainer solrContainer =
+ new SolrContainer(SOLR_IMAGE).withCollection(COLLECTION);
+
+ private SolrIndexWriter indexWriter;
+ private Configuration conf;
+
+ @Override
+ public void setUpIndexWriter() throws Exception {
+ conf = NutchConfiguration.create();
+ conf.setBoolean(IndexerMapReduce.INDEXER_DELETE, false);
+
+ indexWriter = new SolrIndexWriter();
+ indexWriter.setConf(conf);
+
+ String solrUrl = "http://" + solrContainer.getHost() + ":"
+ + solrContainer.getSolrPort() + "/solr/" + COLLECTION;
+
+ Map params = new HashMap<>();
+ params.put(SolrConstants.SERVER_TYPE, "http");
+ params.put(SolrConstants.SERVER_URLS, solrUrl);
+ params.put(SolrConstants.COLLECTION, COLLECTION);
+ params.put(SolrConstants.COMMIT_SIZE, "100");
+
+ IndexWriterParams writerParams = new IndexWriterParams(params);
+ indexWriter.open(writerParams);
+ }
+
+ @Override
+ public void tearDownIndexWriter() throws Exception {
+ if (indexWriter != null) {
+ try {
+ indexWriter.close();
+ } catch (Exception e) {
+ // Ignore if open() failed and close state is invalid
+ }
+ indexWriter = null;
+ }
+ }
+
+ @Override
+ public IndexWriter getIndexWriter() {
+ return indexWriter;
+ }
+
+ @Override
+ public boolean supportsDelete() {
+ return true;
+ }
+
+ @Override
+ public void verifyDocumentWritten(String docId, String expectedTitle) throws Exception {
+ try (SolrClient client = new Http2SolrClient.Builder(
+ "http://" + solrContainer.getHost() + ":"
+ + solrContainer.getSolrPort() + "/solr/" + COLLECTION).build()) {
+ ModifiableSolrParams queryParams = new ModifiableSolrParams();
+ queryParams.set("q", "id:" + docId);
+ QueryResponse response = client.query(queryParams);
+ assertTrue(response.getResults().getNumFound() >= 1,
+ "Document should exist in Solr");
+ Object titleValue = response.getResults().get(0).getFieldValue("title");
+ String title = titleValue instanceof Collection
+ ? ((Collection>) titleValue).iterator().next().toString()
+ : titleValue.toString();
+ assertEquals(expectedTitle, title);
+ }
+ }
+
+ @Override
+ public IndexWriter prepareWriterForDeleteTest() throws Exception {
+ tearDownIndexWriter();
+
+ Configuration deleteConf = NutchConfiguration.create();
+ deleteConf.setBoolean(IndexerMapReduce.INDEXER_DELETE, true);
+ SolrIndexWriter deleteWriter = new SolrIndexWriter();
+ deleteWriter.setConf(deleteConf);
+
+ String solrUrl = "http://" + solrContainer.getHost() + ":"
+ + solrContainer.getSolrPort() + "/solr/" + COLLECTION;
+ Map params = new HashMap<>();
+ params.put(SolrConstants.SERVER_TYPE, "http");
+ params.put(SolrConstants.SERVER_URLS, solrUrl);
+ params.put(SolrConstants.COLLECTION, COLLECTION);
+ deleteWriter.open(new IndexWriterParams(params));
+
+ return deleteWriter;
+ }
+}
diff --git a/src/test/org/apache/nutch/indexer/AbstractIndexWriterIT.java b/src/test/org/apache/nutch/indexer/AbstractIndexWriterIT.java
new file mode 100644
index 0000000000..b0bf6e0239
--- /dev/null
+++ b/src/test/org/apache/nutch/indexer/AbstractIndexWriterIT.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+
+import org.apache.nutch.indexer.NutchDocument;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+/**
+ * Abstract base for IndexWriter integration tests. Provides common test logic
+ * for write/commit and delete operations.
+ */
+@Testcontainers(disabledWithoutDocker = true)
+public abstract class AbstractIndexWriterIT implements IndexWriterIntegrationTest {
+
+ @BeforeEach
+ void setUp() throws Exception {
+ setUpIndexWriter();
+ }
+
+ @AfterEach
+ void tearDown() throws Exception {
+ tearDownIndexWriter();
+ }
+
+ @Test
+ void testWriteAndCommitDocument() throws Exception {
+ NutchDocument doc = createTestDocument("test-doc-1", "Test Document",
+ "This is a test document for integration testing.");
+ assertDoesNotThrow(() -> getIndexWriter().write(doc));
+ assertDoesNotThrow(() -> getIndexWriter().commit());
+ tearDownIndexWriter();
+ verifyDocumentWritten("test-doc-1", "Test Document");
+ }
+
+ @Test
+ void testDeleteDocument() throws Exception {
+ if (!supportsDelete()) {
+ return;
+ }
+ String docId = "test-doc-to-delete";
+ NutchDocument doc = createTestDocument(docId, "Document to Delete", "");
+
+ IndexWriter writer = getIndexWriter();
+ writer.write(doc);
+ writer.commit();
+
+ IndexWriter deleteWriter = prepareWriterForDeleteTest();
+ if (deleteWriter == null) {
+ deleteWriter = writer;
+ }
+ final IndexWriter writerForDelete = deleteWriter;
+ assertDoesNotThrow(() -> writerForDelete.delete(docId));
+ assertDoesNotThrow(() -> writerForDelete.commit());
+ if (deleteWriter != writer) {
+ try {
+ deleteWriter.close();
+ } catch (Exception e) {
+ // Ignore
+ }
+ }
+ }
+
+ /** Create a NutchDocument with id, title, and content. */
+ protected NutchDocument createTestDocument(String id, String title, String content) {
+ NutchDocument doc = new NutchDocument();
+ doc.add("id", id);
+ doc.add("title", title);
+ doc.add("content", content);
+ return doc;
+ }
+}
diff --git a/src/test/org/apache/nutch/indexer/IndexWriterIntegrationTest.java b/src/test/org/apache/nutch/indexer/IndexWriterIntegrationTest.java
new file mode 100644
index 0000000000..c6f1027da9
--- /dev/null
+++ b/src/test/org/apache/nutch/indexer/IndexWriterIntegrationTest.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+/**
+ * Contract for IndexWriter integration tests. Implementations run against
+ * real backends via Testcontainers.
+ */
+public interface IndexWriterIntegrationTest {
+
+ /** Open the index writer before tests. */
+ void setUpIndexWriter() throws Exception;
+
+ /** Close the index writer after tests. */
+ void tearDownIndexWriter() throws Exception;
+
+ /** The IndexWriter under test. */
+ IndexWriter getIndexWriter();
+
+ /** Whether this writer supports document deletion (e.g. Kafka does not). */
+ boolean supportsDelete();
+
+ /**
+ * Optional verification that a document was indexed.
+ * Default no-op; override for Elastic, Solr.
+ */
+ default void verifyDocumentWritten(String docId, String expectedTitle) throws Exception {
+ // no-op
+ }
+
+ /**
+ * Optional writer configured for delete operations. Used when the main
+ * writer has delete disabled (e.g. Solr requires INDEXER_DELETE=true).
+ * Default returns null to use {@link #getIndexWriter()}.
+ */
+ default IndexWriter prepareWriterForDeleteTest() throws Exception {
+ return null;
+ }
+}
From d0099c025b0c799d55d9437310af711f58a07a57 Mon Sep 17 00:00:00 2001
From: Luca <15426+lfoppiano@users.noreply.github.com>
Date: Fri, 27 Feb 2026 00:25:16 +0000
Subject: [PATCH 2/8] [NUTCH-3160] Remove System.exit(..) from reusable code
(#903)
---
.../apache/nutch/metrics/NutchMetrics.java | 3 +++
.../nutch/tools/CommonCrawlDataDumper.java | 8 ++++++-
.../tools/TestCommonCrawlDataDumper.java | 21 +++++++++++++++++++
3 files changed, 31 insertions(+), 1 deletion(-)
diff --git a/src/java/org/apache/nutch/metrics/NutchMetrics.java b/src/java/org/apache/nutch/metrics/NutchMetrics.java
index ccb2d70ed3..14979803a1 100644
--- a/src/java/org/apache/nutch/metrics/NutchMetrics.java
+++ b/src/java/org/apache/nutch/metrics/NutchMetrics.java
@@ -86,6 +86,9 @@ private NutchMetrics() {
/** Counter group for WARC export operations. */
public static final String GROUP_WARC_EXPORTER = "nutch_warc_exporter";
+ /** Counter group for Common Crawl data dumper tool. */
+ public static final String GROUP_COMMONCRAWL_DUMPER = "nutch_commoncrawl_dumper";
+
/** Counter group for domain statistics operations. */
public static final String GROUP_DOMAIN_STATS = "nutch_domain_stats";
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
index d5d5035e89..8e37c21fcf 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
@@ -67,6 +67,8 @@
import org.apache.nutch.crawl.LinkDbReader;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.ErrorTracker;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.DumpFileUtil;
import org.apache.nutch.util.NutchConfiguration;
@@ -188,6 +190,7 @@ public class CommonCrawlDataDumper extends NutchTool implements Tool {
private GzipCompressorOutputStream gzipOutput = null;
private TarArchiveOutputStream tarOutput = null;
private ArrayList fileList = null;
+ private ErrorTracker errorTracker;
/**
* Main method for invoking this tool
@@ -210,6 +213,7 @@ public static void main(String[] args) throws Exception {
* @param config A populated {@link CommonCrawlConfig}
*/
public CommonCrawlDataDumper(CommonCrawlConfig config) {
+ this();
this.config = config;
}
@@ -217,6 +221,7 @@ public CommonCrawlDataDumper(CommonCrawlConfig config) {
* Constructor
*/
public CommonCrawlDataDumper() {
+ this.errorTracker = new ErrorTracker(NutchMetrics.GROUP_COMMONCRAWL_DUMPER);
}
/**
@@ -274,7 +279,8 @@ public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip,
if (parts == null || parts.size() == 0) {
LOG.error( "No segment directories found in {} ",
segmentRootDir.getAbsolutePath());
- System.exit(1);
+ this.errorTracker.recordError(ErrorTracker.ErrorType.OTHER);
+ return;
}
LOG.info("Found {} segment parts", parts.size());
if (gzip && !warc) {
diff --git a/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java b/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
index fee72b65a5..8124fe20b1 100644
--- a/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
+++ b/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
@@ -17,14 +17,19 @@
package org.apache.nutch.tools;
import java.io.File;
+import java.lang.reflect.Field;
import java.nio.file.Files;
import java.util.Collection;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.FileFilterUtils;
+import org.apache.nutch.metrics.ErrorTracker;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
/**
*
@@ -106,6 +111,22 @@ public void testDump() throws Exception {
}
+ @Test
+ public void testDumpWithNoSegmentDirectoriesRecordsOtherError() throws Exception {
+ File emptySegmentDir = Files.createTempDirectory("empty-segments").toFile();
+ File outputDir = Files.createTempDirectory("dump-output").toFile();
+
+ ErrorTracker mockErrorTracker = mock(ErrorTracker.class);
+ CommonCrawlDataDumper dumper = new CommonCrawlDataDumper();
+ Field errorTrackerField = CommonCrawlDataDumper.class.getDeclaredField("errorTracker");
+ errorTrackerField.setAccessible(true);
+ errorTrackerField.set(dumper, mockErrorTracker);
+
+ dumper.dump(outputDir, emptySegmentDir, null, false, null, false, "", false);
+
+ verify(mockErrorTracker, times(1)).recordError(ErrorTracker.ErrorType.OTHER);
+ }
+
private boolean hasFile(String fileName, Collection files) {
for (File f : files) {
if (f.getName().equals(fileName)) {
From 0ea90c3ec9cf16110bf1b473311ebcb64aa04a6d Mon Sep 17 00:00:00 2001
From: Lewis John McGibbney
Date: Mon, 13 Apr 2026 19:17:22 -0700
Subject: [PATCH 3/8] NUTCH-3168 Sandbox Commons JEXL usage in crawl and index
pipelines (#909)
---
conf/nutch-default.xml | 18 +-
docker/README.md | 6 +
.../org/apache/nutch/crawl/CrawlDbReader.java | 2 +-
.../org/apache/nutch/crawl/Generator.java | 14 +-
.../org/apache/nutch/hostdb/ReadHostDb.java | 9 +-
src/java/org/apache/nutch/util/JexlUtil.java | 151 +++++++++++++++--
.../nutch/exchange/jexl/JexlExchange.java | 3 +-
.../indexer/jexl/JexlIndexingFilter.java | 2 +-
.../org/apache/nutch/util/TestJexlUtil.java | 160 ++++++++++++++++++
9 files changed, 334 insertions(+), 31 deletions(-)
create mode 100644 src/test/org/apache/nutch/util/TestJexlUtil.java
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index cdac434830..7a8a73f81b 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1707,7 +1707,10 @@
plugins
Directories where Nutch plugins are located. Each
element may be a relative or absolute path. If absolute, it is used
- as is. If relative, it is searched for on the classpath.
+ as is. If relative, it is searched for on the classpath.
+ For secure deployments, treat these directories as trusted code: use
+ read-only filesystem permissions or immutable images so untrusted
+ parties cannot add or replace plugin JARs or plugin.xml files.
@@ -2367,6 +2370,19 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
each property value is always an array of Strings (so if you expect one value, use [0])
* doc - contains all the NutchFields from the NutchDocument.
each property value is always an array of Objects.
+ Expressions are evaluated in a sandboxed JEXL engine (see also
+ nutch.jexl.disable.sandbox).
+
+
+
+
+ nutch.jexl.disable.sandbox
+ false
+ If true, disables the Commons JEXL sandbox and the restriction
+ on the JEXL "new" operator for all Nutch JEXL expressions (index filter,
+ generator, hostdb filter, crawl_db_reader, exchange-jexl, etc.). This is
+ unsafe and should only be used in fully trusted environments when a
+ legitimate expression cannot be expressed under the default sandbox.
diff --git a/docker/README.md b/docker/README.md
index 80e1a1d6d9..720fdf8165 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -56,6 +56,12 @@ $(boot2docker shellinit | grep export) #may not be necessary
docker build -t apache/nutch . --build-arg BUILD_MODE=2 --build-arg SERVER_PORT=8081 --build-arg SERVER_HOST=0.0.0.0 --build-arg WEBAPP_PORT=8080
```
+## Security and plugin directories
+
+Nutch loads executable code from the directories configured as `plugin.folders` (see `nutch-default.xml`). For production and shared images, treat those paths as **trusted**: mount them read-only where possible, rebuild images to change plugins, and run the crawl process under a dedicated low-privilege user so the filesystem cannot be abused to drop unexpected JARs or `plugin.xml` files into that tree.
+
+User-defined JEXL in configuration (for example `index.jexl.filter`, generator expressions, and `hostdb.filter.expression`) is evaluated in a **sandboxed** engine by default. The property `nutch.jexl.disable.sandbox` disables that protection and must not be set in untrusted environments.
+
## Usage
If not already running, start docker
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 03cf0fbd39..57e684374c 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -906,7 +906,7 @@ public void setup(
retry = config.getInt("retry", -1);
if (config.get("expr", null) != null) {
- expr = JexlUtil.parseExpression(config.get("expr", null));
+ expr = JexlUtil.parseExpression(config, config.get("expr", null));
}
sample = config.getFloat("sample", 1);
}
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 102ce39b94..aa8cfcbbfa 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -227,7 +227,7 @@ public void setup(
if (!restrictStatusString.isEmpty()) {
restrictStatus = CrawlDatum.getStatusByName(restrictStatusString);
}
- expr = JexlUtil.parseExpression(conf.get(GENERATOR_EXPR, null));
+ expr = JexlUtil.parseExpression(conf, conf.get(GENERATOR_EXPR, null));
// Initialize error tracker with cached counters
errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context);
// Initialize cached counter references
@@ -453,10 +453,10 @@ public void setup(Context context) throws IOException {
URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
if (conf.get(GENERATOR_HOSTDB) != null) {
- maxCountExpr = JexlUtil
- .parseExpression(conf.get(GENERATOR_MAX_COUNT_EXPR, null));
- fetchDelayExpr = JexlUtil
- .parseExpression(conf.get(GENERATOR_FETCH_DELAY_EXPR, null));
+ maxCountExpr = JexlUtil.parseExpression(conf,
+ conf.get(GENERATOR_MAX_COUNT_EXPR, null));
+ fetchDelayExpr = JexlUtil.parseExpression(conf,
+ conf.get(GENERATOR_FETCH_DELAY_EXPR, null));
}
// Initialize error tracker with cached counters
errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context);
@@ -871,7 +871,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
* maximum number of segments to generate
* @param expr
* a Jexl expression to use in the Generator job.
- * @see JexlUtil#parseExpression(String)
+ * @see JexlUtil#parseExpression(Configuration, String)
* @throws IOException
* if an I/O exception occurs.
* @see LockUtil#createLockFile(Configuration, Path, boolean)
@@ -922,7 +922,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
* @param hostdb
* name of a hostdb from which to execute Jexl expressions in a bid
* to determine the maximum URL count and/or fetch delay per host.
- * @see JexlUtil#parseExpression(String)
+ * @see JexlUtil#parseExpression(Configuration, String)
* @throws IOException
* if an I/O exception occurs.
* @see LockUtil#createLockFile(Configuration, Path, boolean)
diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java
index 9f2e4a384e..23d94bc881 100644
--- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java
+++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java
@@ -44,11 +44,10 @@
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.SegmentReaderUtil;
-import org.apache.commons.jexl3.JexlBuilder;
import org.apache.commons.jexl3.JexlContext;
import org.apache.commons.jexl3.JexlScript;
-import org.apache.commons.jexl3.JexlEngine;
import org.apache.commons.jexl3.MapContext;
+import org.apache.nutch.util.JexlUtil;
/**
* @see Commons
@@ -77,11 +76,7 @@ public void setup(Context context) {
fieldHeader = context.getConfiguration().getBoolean(HOSTDB_DUMP_HEADER, true);
String expr = context.getConfiguration().get(HOSTDB_FILTER_EXPRESSION);
if (expr != null) {
- // Create or retrieve a JexlEngine
- JexlEngine jexl = new JexlBuilder().silent(true).strict(true).create();
-
- // Create an expression object
- this.expr = jexl.createScript(expr);
+ this.expr = JexlUtil.parseExpression(context.getConfiguration(), expr);
}
}
diff --git a/src/java/org/apache/nutch/util/JexlUtil.java b/src/java/org/apache/nutch/util/JexlUtil.java
index 549aebc419..29e8a4f204 100644
--- a/src/java/org/apache/nutch/util/JexlUtil.java
+++ b/src/java/org/apache/nutch/util/JexlUtil.java
@@ -23,33 +23,159 @@
import org.apache.commons.jexl3.JexlBuilder;
import org.apache.commons.jexl3.JexlEngine;
+import org.apache.commons.jexl3.JexlFeatures;
import org.apache.commons.jexl3.JexlScript;
+import org.apache.commons.jexl3.introspection.JexlSandbox;
import org.apache.commons.lang3.time.DateUtils;
+import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * Utility methods for handling JEXL expressions
+ * Utility methods for handling JEXL expressions used in crawl and index
+ * pipelines. Expressions are evaluated under a {@link JexlSandbox} with
+ * {@link JexlFeatures#newInstance(boolean)} disabled so arbitrary classes cannot
+ * be instantiated from user-supplied configuration.
*/
public class JexlUtil {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
+ /**
+ * When {@code true}, JEXL parsing skips the sandbox (unsafe). For trusted
+ * environments only; not recommended.
+ */
+ public static final String DISABLE_SANDBOX_KEY = "nutch.jexl.disable.sandbox";
+
/** Supported format for date parsing yyyy-MM-ddTHH:mm:ssZ */
- private static final Pattern DATE_PATTERN = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
+ private static final Pattern DATE_PATTERN = Pattern
+ .compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
+
+ /**
+ * Classes and interfaces that may be introspected when evaluating Nutch JEXL
+ * scripts. Default-deny sandbox: anything not listed is blocked.
+ */
+ private static final String[] SANDBOX_ALLOW_CLASSES = {
+ "java.lang.String",
+ "java.lang.Boolean",
+ "java.lang.Byte",
+ "java.lang.Character",
+ "java.lang.Short",
+ "java.lang.Integer",
+ "java.lang.Long",
+ "java.lang.Float",
+ "java.lang.Double",
+ "java.lang.Number",
+ "java.lang.Math",
+ "java.lang.Comparable",
+ "java.lang.CharSequence",
+ "java.util.Map",
+ "java.util.List",
+ "java.util.Collection",
+ "java.util.Set",
+ "java.util.SortedMap",
+ "java.util.SortedSet",
+ "java.util.Iterator",
+ "java.lang.Iterable",
+ "java.util.AbstractList",
+ "java.util.AbstractCollection",
+ "java.util.AbstractMap",
+ "java.util.AbstractSet",
+ "java.util.ArrayList",
+ "java.util.LinkedList",
+ "java.util.HashMap",
+ "java.util.LinkedHashMap",
+ "java.util.HashSet",
+ "java.util.LinkedHashSet",
+ "java.util.TreeMap",
+ "java.util.TreeSet",
+ "java.util.Collections",
+ "java.util.Arrays",
+ "java.util.regex.Pattern",
+ "java.util.regex.Matcher",
+ "org.apache.commons.jexl3.MapContext",
+ "org.apache.nutch.indexer.NutchDocument",
+ "org.apache.nutch.indexer.NutchField",
+ };
+
+ private static volatile JexlEngine sandboxedEngine;
+ private static volatile JexlEngine legacyEngine;
+
+ private JexlUtil() {
+ }
+
+ private static JexlSandbox createSandbox() {
+ JexlSandbox sandbox = new JexlSandbox(false);
+ for (String name : SANDBOX_ALLOW_CLASSES) {
+ sandbox.allow(name);
+ }
+ return sandbox;
+ }
+
+ private static JexlFeatures createFeatures() {
+ return new JexlFeatures(JexlFeatures.createDefault()).newInstance(false);
+ }
+
+ private static JexlEngine getSandboxedEngine() {
+ if (sandboxedEngine == null) {
+ synchronized (JexlUtil.class) {
+ if (sandboxedEngine == null) {
+ sandboxedEngine = new JexlBuilder().silent(true).strict(true)
+ .sandbox(createSandbox()).features(createFeatures()).create();
+ }
+ }
+ }
+ return sandboxedEngine;
+ }
+
+ private static JexlEngine getLegacyEngine() {
+ if (legacyEngine == null) {
+ synchronized (JexlUtil.class) {
+ if (legacyEngine == null) {
+ legacyEngine = new JexlBuilder().silent(true).strict(true).create();
+ }
+ }
+ }
+ return legacyEngine;
+ }
+
+ private static JexlEngine engineFor(Configuration conf) {
+ if (conf != null && conf.getBoolean(DISABLE_SANDBOX_KEY, false)) {
+ LOG.warn("{}=true: JEXL sandbox is disabled; only use in fully trusted environments.",
+ DISABLE_SANDBOX_KEY);
+ return getLegacyEngine();
+ }
+ return getSandboxedEngine();
+ }
/**
- * Parses the given expression to a JEXL expression. This supports
- * date parsing.
+ * Parses a JEXL expression using the default (sandboxed) engine. Use
+ * {@link #parseExpression(Configuration, String)} when a {@link Configuration}
+ * is available so {@link #DISABLE_SANDBOX_KEY} can be honored.
*
* @param expr string JEXL expression
* @return parsed JEXL expression or null in case of parse error
*/
public static JexlScript parseExpression(String expr) {
- if (expr == null) return null;
-
+ return parseExpression(null, expr);
+ }
+
+ /**
+ * Parses a JEXL expression. Unless {@link #DISABLE_SANDBOX_KEY} is set to
+ * {@code true} in {@code conf}, the expression is parsed for execution under
+ * a restrictive sandbox.
+ *
+ * @param conf Hadoop configuration, or null to always use the sandbox
+ * @param expr string JEXL expression
+ * @return parsed JEXL expression or null in case of parse error
+ */
+ public static JexlScript parseExpression(Configuration conf, String expr) {
+ if (expr == null) {
+ return null;
+ }
+
try {
// Translate any date object into a long. Dates must be in the DATE_PATTERN
// format. For example: 2016-03-20T00:00:00Z
@@ -57,22 +183,21 @@ public static JexlScript parseExpression(String expr) {
if (matcher.find()) {
String date = matcher.group();
-
+
// parse the matched substring and get the epoch
- Date parsedDate = DateUtils.parseDateStrictly(date, new String[] {"yyyy-MM-dd'T'HH:mm:ss'Z'"});
+ Date parsedDate = DateUtils.parseDateStrictly(date,
+ new String[] { "yyyy-MM-dd'T'HH:mm:ss'Z'" });
long time = parsedDate.getTime();
-
+
// replace the original string date with the numeric value
expr = expr.replace(date, Long.toString(time));
}
- JexlEngine jexl = new JexlBuilder().silent(true).strict(true).create();
-
- return jexl.createScript(expr);
+ return engineFor(conf).createScript(expr);
} catch (Exception e) {
LOG.error(e.getMessage());
}
-
+
return null;
}
}
diff --git a/src/plugin/exchange-jexl/src/java/org/apache/nutch/exchange/jexl/JexlExchange.java b/src/plugin/exchange-jexl/src/java/org/apache/nutch/exchange/jexl/JexlExchange.java
index a55557595d..be6bf0dbe8 100644
--- a/src/plugin/exchange-jexl/src/java/org/apache/nutch/exchange/jexl/JexlExchange.java
+++ b/src/plugin/exchange-jexl/src/java/org/apache/nutch/exchange/jexl/JexlExchange.java
@@ -41,7 +41,8 @@ public class JexlExchange implements Exchange {
*/
@Override
public void open(Map parameters) {
- expression = JexlUtil.parseExpression(parameters.get(EXPRESSION_KEY));
+ expression = JexlUtil.parseExpression(getConf(),
+ parameters.get(EXPRESSION_KEY));
}
/**
diff --git a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
index e1fa792951..a89be63826 100644
--- a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
+++ b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
@@ -114,7 +114,7 @@ public void setConf(Configuration conf) {
"The property index.jexl.filter must have a value when index-jexl-filter is used. You can use 'true' or 'false' to index all/none");
}
- expr = JexlUtil.parseExpression(strExpr);
+ expr = JexlUtil.parseExpression(conf, strExpr);
if (expr == null) {
LOG.error("Failed parsing JEXL from index.jexl.filter: {}", strExpr);
diff --git a/src/test/org/apache/nutch/util/TestJexlUtil.java b/src/test/org/apache/nutch/util/TestJexlUtil.java
new file mode 100644
index 0000000000..221fffea22
--- /dev/null
+++ b/src/test/org/apache/nutch/util/TestJexlUtil.java
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import org.apache.commons.jexl3.JexlScript;
+import org.apache.commons.jexl3.MapContext;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Unit tests for {@link JexlUtil} sandboxing.
+ */
+public class TestJexlUtil {
+
+ @Test
+ public void testSandboxAllowsDocFieldCompare() throws Exception {
+ JexlScript script = JexlUtil.parseExpression("doc.lang == 'en'");
+ assertNotNull(script);
+ MapContext doc = new MapContext();
+ doc.set("lang", "en");
+ MapContext root = new MapContext();
+ root.set("doc", doc);
+ assertTrue(Boolean.TRUE.equals(script.execute(root)));
+ }
+
+ @Test
+ public void testSandboxAllowsScoreCompare() throws Exception {
+ JexlScript script = JexlUtil.parseExpression("score > 0.5");
+ assertNotNull(script);
+ MapContext root = new MapContext();
+ root.set("score", 0.9f);
+ assertTrue(Boolean.TRUE.equals(script.execute(root)));
+ }
+
+ @Test
+ public void testNewInstanceIoBlocked() {
+ assertNull(JexlUtil.parseExpression("new java.io.File('/')"));
+ }
+
+ @Test
+ public void testNewInstanceFileOutputStreamBlocked() {
+ assertNull(JexlUtil.parseExpression(
+ "new java.io.FileOutputStream('/tmp/nutch-jexl-poc')"));
+ }
+
+ @Test
+ public void testDisableSandboxAllowsNewExpressionParse() {
+ Configuration conf = new Configuration();
+ conf.setBoolean(JexlUtil.DISABLE_SANDBOX_KEY, true);
+ JexlScript script = JexlUtil.parseExpression(conf,
+ "new java.io.File('/')");
+ assertNotNull(script);
+ }
+
+ @Test
+ public void testArithmeticAllowed() throws Exception {
+ JexlScript script = JexlUtil.parseExpression("2 * 3 + 1 == 7");
+ assertNotNull(script);
+ assertTrue(Boolean.TRUE.equals(script.execute(new MapContext())));
+ }
+
+ @Test
+ public void testStringMethodsAllowed() throws Exception {
+ JexlScript script = JexlUtil.parseExpression(
+ "url.startsWith('http://')");
+ assertNotNull(script);
+ MapContext root = new MapContext();
+ root.set("url", "http://example.org/");
+ assertTrue(Boolean.TRUE.equals(script.execute(root)));
+ }
+
+ @Test
+ public void testDateRewriteStillParses() {
+ JexlScript script = JexlUtil.parseExpression(
+ "fetchTime > 2016-03-20T00:00:00Z");
+ assertNotNull(script);
+ }
+
+ @Test
+ public void testNullExpression() {
+ assertNull(JexlUtil.parseExpression(null));
+ assertNull(JexlUtil.parseExpression(new Configuration(), null));
+ }
+
+ @Test
+ public void testInvalidSyntaxReturnsNull() {
+ assertNull(JexlUtil.parseExpression("doc.lang=<>:='en'"));
+ }
+
+ @Test
+ public void testListSize() throws Exception {
+ JexlScript script = JexlUtil.parseExpression("doc.tags.size() == 2");
+ assertNotNull(script);
+ MapContext doc = new MapContext();
+ java.util.List tags = new java.util.ArrayList<>();
+ tags.add("a");
+ tags.add("b");
+ doc.set("tags", tags);
+ MapContext root = new MapContext();
+ root.set("doc", doc);
+ assertTrue(Boolean.TRUE.equals(script.execute(root)));
+ }
+
+ @Test
+ public void testGeneratorStyleMetadata() throws Exception {
+ JexlScript script = JexlUtil.parseExpression(
+ "warc_import_time > 0 && score > 0");
+ assertNotNull(script);
+ MapContext root = new MapContext();
+ root.set("warc_import_time", 1);
+ root.set("score", 1.0f);
+ assertTrue(Boolean.TRUE.equals(script.execute(root)));
+ }
+
+ @Test
+ public void testEqualsIgnoreCase() throws Exception {
+ JexlScript script = JexlUtil.parseExpression(
+ "status.equalsIgnoreCase('FETCHED')");
+ assertNotNull(script);
+ MapContext root = new MapContext();
+ root.set("status", "fetched");
+ assertTrue(Boolean.TRUE.equals(script.execute(root)));
+ }
+
+ @Test
+ public void testRegex() throws Exception {
+ JexlScript script = JexlUtil.parseExpression(
+ "url =~ 'https?://.*\\.example\\.org/.*'");
+ assertNotNull(script);
+ MapContext root = new MapContext();
+ root.set("url", "http://foo.example.org/bar");
+ assertTrue(Boolean.TRUE.equals(script.execute(root)));
+ }
+
+ @Test
+ public void testTernary() throws Exception {
+ JexlScript script = JexlUtil.parseExpression("true ? 1 : 0");
+ assertNotNull(script);
+ assertEquals(1, script.execute(new MapContext()));
+ }
+}
From 2b72674a2d90b5b927e7b284ea6f8c38329ee577 Mon Sep 17 00:00:00 2001
From: Sai Asish Y
Date: Thu, 16 Apr 2026 15:29:05 -0700
Subject: [PATCH 4/8] parse: fix 'occured' -> 'occurred' typo in ParseStatus
FAILED javadoc (#910)
Javadoc on the FAILED ParseStatus constant in src/java/org/apache/nutch/parse/ParseStatus.java read 'Parsing failed. An Exception occured'. Doc-only change.
Signed-off-by: SAY-5
Co-authored-by: SAY-5
---
src/java/org/apache/nutch/parse/ParseStatus.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/java/org/apache/nutch/parse/ParseStatus.java b/src/java/org/apache/nutch/parse/ParseStatus.java
index 052a342247..25b8ae1b47 100644
--- a/src/java/org/apache/nutch/parse/ParseStatus.java
+++ b/src/java/org/apache/nutch/parse/ParseStatus.java
@@ -56,7 +56,7 @@ public class ParseStatus implements Writable {
// Secondary failure codes go here:
/**
- * Parsing failed. An Exception occured (which may be retrieved from the
+ * Parsing failed. An Exception occurred (which may be retrieved from the
* arguments).
*/
public static final short FAILED_EXCEPTION = 200;
From f8e8583d298813589e37a8627fb234bc78b7c0ae Mon Sep 17 00:00:00 2001
From: lewismc
Date: Thu, 7 May 2026 21:18:26 -0700
Subject: [PATCH 5/8] NUTCH-3175 Implement integration testing framework for
Nutch Protocol plugins using Testcontainers
---
.github/workflows/master-build.yml | 6 +
build.xml | 4 +
conf/log4j2.xml | 10 +-
ivy/ivy.xml | 6 +-
src/plugin/build-plugin.xml | 28 +++
src/plugin/build.xml | 13 ++
src/plugin/protocol-ftp/ivy.xml | 3 +-
.../org/apache/nutch/protocol/ftp/Ftp.java | 1 +
.../nutch/protocol/ftp/FtpResponse.java | 13 +-
.../nutch/protocol/ftp/FtpProtocolIT.java | 159 ++++++++++++++++++
.../protocol/htmlunit/HtmlUnitProtocolIT.java | 79 +++++++++
.../nutch/protocol/http/HttpProtocolIT.java | 83 +++++++++
src/plugin/protocol-httpclient/ivy.xml | 1 +
.../httpclient/HttpClientProtocolIT.java | 150 +++++++++++++++++
.../protocol/okhttp/OkHttpProtocolIT.java | 79 +++++++++
.../protocol/selenium/SeleniumProtocolIT.java | 86 ++++++++++
.../protocol/AbstractProtocolPluginIT.java | 97 +++++++++++
.../ProtocolPluginIntegrationTest.java | 50 ++++++
18 files changed, 860 insertions(+), 8 deletions(-)
create mode 100644 src/plugin/protocol-ftp/src/test/org/apache/nutch/protocol/ftp/FtpProtocolIT.java
create mode 100644 src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java
create mode 100644 src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java
create mode 100644 src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/HttpClientProtocolIT.java
create mode 100644 src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java
create mode 100644 src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java
create mode 100644 src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java
create mode 100644 src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java
diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml
index a8675bf22a..4a1604d928 100644
--- a/.github/workflows/master-build.yml
+++ b/.github/workflows/master-build.yml
@@ -106,6 +106,8 @@ jobs:
- 'src/plugin/**'
indexer_plugins:
- 'src/plugin/indexer-*/**'
+ protocol_plugins:
+ - 'src/plugin/protocol-*/**'
buildconf:
- 'build.xml'
- 'ivy/ivy.xml'
@@ -126,6 +128,10 @@ jobs:
- name: test indexer integration
if: ${{ steps.filter.outputs.indexer_plugins == 'true' && matrix.os == 'ubuntu-latest' }}
run: ant clean test-indexer-integration -buildfile build.xml
+ # run protocol integration tests when protocol plugin files change (Docker required, ubuntu-latest only)
+ - name: test protocol integration
+ if: ${{ steps.filter.outputs.protocol_plugins == 'true' && matrix.os == 'ubuntu-latest' }}
+ run: ant clean test-protocol-integration -buildfile build.xml
- name: Check for test results
id: check_tests
if: always() && matrix.os == 'ubuntu-latest'
diff --git a/build.xml b/build.xml
index 37b8f4cd2a..8e68ebd9a0 100644
--- a/build.xml
+++ b/build.xml
@@ -539,6 +539,10 @@
+
+
+
+
diff --git a/conf/log4j2.xml b/conf/log4j2.xml
index 713bfdc7fe..6faf4329fa 100644
--- a/conf/log4j2.xml
+++ b/conf/log4j2.xml
@@ -19,16 +19,16 @@
- ${sys:hadoop.log.dir:-./logs}
- ${sys:hadoop.log.file:-hadoop.log}
+ ${sys:hadoop.log.dir:-./logs}
+ ${sys:hadoop.log.file:-hadoop.log}
-
+
-
+
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index dfdb81c0fb..5ed19206ca 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -151,9 +151,13 @@
-
+
+
+
+
+
diff --git a/src/plugin/build-plugin.xml b/src/plugin/build-plugin.xml
index 6ea8d86201..3ba9e1b2fc 100755
--- a/src/plugin/build-plugin.xml
+++ b/src/plugin/build-plugin.xml
@@ -84,6 +84,7 @@
+
@@ -269,6 +270,33 @@
Indexer integration tests failed!
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Protocol integration tests failed!
+
+
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 160febb6cf..24edd14639 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -185,6 +185,19 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/plugin/protocol-ftp/ivy.xml b/src/plugin/protocol-ftp/ivy.xml
index 7749a873ff..5e6a0d8c72 100644
--- a/src/plugin/protocol-ftp/ivy.xml
+++ b/src/plugin/protocol-ftp/ivy.xml
@@ -37,7 +37,8 @@
-
+
+
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
index 8cf58f75e7..3570d91188 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -183,6 +183,7 @@ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
} catch (Exception e) {
LOG.error("Could not get protocol output for {}: {}", url,
e.getMessage());
+ datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, new Text("500"));
return new ProtocolOutput(null, new ProtocolStatus(e));
}
}
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
index d6f7fd64a4..8796cfc0b3 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
@@ -164,7 +164,8 @@ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf)
Ftp.LOG.info("connect to {}", addr);
}
- ftp.client.connect(addr);
+ int port = url.getPort();
+ ftp.client.connect(addr, port > 0 ? port : FTP.DEFAULT_PORT);
if (!FTPReply.isPositiveCompletion(ftp.client.getReplyCode())) {
ftp.client.disconnect();
Ftp.LOG.warn("ftp.client.connect() failed: {} {}", addr,
@@ -206,6 +207,11 @@ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf)
try {
ftp.parser = null;
String parserKey = ftp.client.getSystemName();
+ // strip surrounding quotes that some servers include in SYST reply
+ if (parserKey.length() > 2 && parserKey.charAt(0) == '"'
+ && parserKey.charAt(parserKey.length() - 1) == '"') {
+ parserKey = parserKey.substring(1, parserKey.length() - 1);
+ }
// some server reports as UNKNOWN Type: L8, but in fact UNIX Type: L8
if (parserKey.startsWith("UNKNOWN Type: L8"))
parserKey = "UNIX Type: L8";
@@ -302,6 +308,11 @@ private void getFileAsHttpResponse(String path, long lastModified)
list = new LinkedList();
ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser);
+ if (list.isEmpty()) {
+ this.code = 404; // file not found (server returned empty listing)
+ return;
+ }
+
FTPFile ftpFile = (FTPFile) list.get(0);
this.headers.set(Response.CONTENT_LENGTH,
Long.valueOf(ftpFile.getSize()).toString());
diff --git a/src/plugin/protocol-ftp/src/test/org/apache/nutch/protocol/ftp/FtpProtocolIT.java b/src/plugin/protocol-ftp/src/test/org/apache/nutch/protocol/ftp/FtpProtocolIT.java
new file mode 100644
index 0000000000..ccd3cd1ccb
--- /dev/null
+++ b/src/plugin/protocol-ftp/src/test/org/apache/nutch/protocol/ftp/FtpProtocolIT.java
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.ftp;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.nio.charset.StandardCharsets;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolPluginIntegrationTest;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.mockftpserver.fake.FakeFtpServer;
+import org.mockftpserver.fake.UserAccount;
+import org.mockftpserver.fake.filesystem.DirectoryEntry;
+import org.mockftpserver.fake.filesystem.FileEntry;
+import org.mockftpserver.fake.filesystem.UnixFakeFileSystem;
+
+/**
+ * Integration tests for protocol-ftp using an in-process FakeFtpServer.
+ *
+ * FTP passive mode with Testcontainers requires that the PASV response IP
+ * matches the host-visible address of the container, which is not reliable
+ * across Docker Desktop (macOS/Windows) and Linux Docker environments. An
+ * in-process {@link FakeFtpServer} from MockFtpServer avoids this constraint
+ * while still testing the Nutch FTP client against a real FTP protocol
+ * implementation.
+ */
+public class FtpProtocolIT implements ProtocolPluginIntegrationTest {
+
+ private static final String FTP_USER = "testuser";
+ private static final String FTP_PASS = "testpass";
+ private static final String FTP_HOME = "/home/testuser";
+ private static final String TEST_FILE = "test.txt";
+ private static final String TEST_CONTENT = "FTP integration test content";
+
+ private static FakeFtpServer fakeFtpServer;
+ private Ftp protocol;
+
+ @BeforeAll
+ static void startFtpServer() {
+ fakeFtpServer = new FakeFtpServer();
+ fakeFtpServer.setServerControlPort(0); // bind to a random free port
+
+ UserAccount userAccount = new UserAccount(FTP_USER, FTP_PASS, FTP_HOME);
+ fakeFtpServer.addUserAccount(userAccount);
+
+ UnixFakeFileSystem fileSystem = new UnixFakeFileSystem();
+ fileSystem.add(new DirectoryEntry(FTP_HOME));
+ fileSystem.add(new FileEntry(FTP_HOME + "/" + TEST_FILE, TEST_CONTENT));
+ fakeFtpServer.setFileSystem(fileSystem);
+
+ fakeFtpServer.start();
+ }
+
+ @AfterAll
+ static void stopFtpServer() {
+ if (fakeFtpServer != null) {
+ fakeFtpServer.stop();
+ }
+ }
+
+ @BeforeEach
+ @Override
+ public void setUpProtocol() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+ conf.set("plugin.includes", "protocol-ftp|nutch-extensionpoints");
+ conf.set("http.agent.name", "NutchFtpProtocolIT");
+ conf.set("ftp.username", FTP_USER);
+ conf.set("ftp.password", FTP_PASS);
+ conf.setInt("ftp.timeout", 10000);
+ protocol = new Ftp();
+ protocol.setConf(conf);
+ }
+
+ @AfterEach
+ @Override
+ public void tearDownProtocol() {
+ protocol = null;
+ }
+
+ @Override
+ public Protocol getProtocol() {
+ return protocol;
+ }
+
+ @Override
+ public String getTestUrl() {
+ return "ftp://localhost:" + fakeFtpServer.getServerControlPort()
+ + FTP_HOME + "/" + TEST_FILE;
+ }
+
+ @Test
+ void testFtpFileDownload() throws Exception {
+ CrawlDatum datum = new CrawlDatum();
+ ProtocolOutput output = protocol.getProtocolOutput(new Text(getTestUrl()), datum);
+
+ assertNotNull(output, "ProtocolOutput must not be null");
+ int code = Integer.parseInt(
+ datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString());
+ assertEquals(200, code, "Expected FTP 200 for file download");
+
+ assertNotNull(output.getContent(), "Content must not be null");
+ String body = new String(output.getContent().getContent(), StandardCharsets.UTF_8);
+ assertTrue(body.contains(TEST_CONTENT),
+ "Downloaded content must match the file on the FTP server");
+ }
+
+ @Test
+ void testFtpDirectoryListing() throws Exception {
+ String dirUrl = "ftp://localhost:" + fakeFtpServer.getServerControlPort()
+ + FTP_HOME + "/";
+ CrawlDatum datum = new CrawlDatum();
+ ProtocolOutput output = protocol.getProtocolOutput(new Text(dirUrl), datum);
+
+ assertNotNull(output, "ProtocolOutput for directory listing must not be null");
+ int code = Integer.parseInt(
+ datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString());
+ assertEquals(200, code, "Expected FTP 200 for directory listing");
+ }
+
+ @Test
+ void testFtpMissingFileReturnsError() throws Exception {
+ String missingUrl = "ftp://localhost:" + fakeFtpServer.getServerControlPort()
+ + FTP_HOME + "/nonexistent.txt";
+ CrawlDatum datum = new CrawlDatum();
+ ProtocolOutput output = protocol.getProtocolOutput(new Text(missingUrl), datum);
+ assertNotNull(output, "ProtocolOutput must not be null even for missing files");
+ // FTP 550 "No such file" maps to a non-200 Nutch status
+ int code = Integer.parseInt(
+ datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString());
+ assertTrue(code != 200, "Expected non-200 code for missing FTP file, got: " + code);
+ }
+}
diff --git a/src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java b/src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java
new file mode 100644
index 0000000000..42c551b72f
--- /dev/null
+++ b/src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.AbstractProtocolPluginIT;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.containers.GenericContainer;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+/**
+ * Integration tests for protocol-htmlunit using a real nginx container.
+ */
+@Testcontainers(disabledWithoutDocker = true)
+public class HtmlUnitProtocolIT extends AbstractProtocolPluginIT {
+
+ @Container
+ private static final GenericContainer> nginx =
+ new GenericContainer<>("nginx:alpine").withExposedPorts(80);
+
+ private Http protocol;
+
+ @Override
+ public void setUpProtocol() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+ conf.set("plugin.includes",
+ "protocol-htmlunit|lib-htmlunit|lib-http|nutch-extensionpoints");
+ conf.set("http.agent.name", "Nutch-Test");
+ conf.setInt("http.timeout", 10000);
+ conf.setBoolean("store.http.headers", true);
+ protocol = new Http();
+ protocol.setConf(conf);
+ }
+
+ @Override
+ public void tearDownProtocol() {
+ protocol = null;
+ }
+
+ @Override
+ public Protocol getProtocol() {
+ return protocol;
+ }
+
+ @Override
+ public String getTestUrl() {
+ return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/";
+ }
+
+ @Test
+ void testFetchReturnsContent() throws Exception {
+ CrawlDatum datum = new CrawlDatum();
+ ProtocolOutput output = protocol.getProtocolOutput(
+ new org.apache.hadoop.io.Text(getTestUrl()), datum);
+ assertNotNull(output.getContent(),
+ "protocol-htmlunit must return non-null content for a live nginx page");
+ }
+}
diff --git a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java
new file mode 100644
index 0000000000..87db32335b
--- /dev/null
+++ b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.http;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.AbstractProtocolPluginIT;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.containers.GenericContainer;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+/**
+ * Integration tests for protocol-http using a real nginx container.
+ */
+@Testcontainers(disabledWithoutDocker = true)
+public class HttpProtocolIT extends AbstractProtocolPluginIT {
+
+ @Container
+ private static final GenericContainer> nginx =
+ new GenericContainer<>("nginx:alpine").withExposedPorts(80);
+
+ private Http protocol;
+
+ @Override
+ public void setUpProtocol() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+ conf.set("plugin.includes", "protocol-http|lib-http|nutch-extensionpoints");
+ conf.set("http.agent.name", "Nutch-Test");
+ conf.setInt("http.timeout", 10000);
+ conf.setBoolean("store.http.headers", true);
+ protocol = new Http();
+ protocol.setConf(conf);
+ }
+
+ @Override
+ public void tearDownProtocol() {
+ protocol = null;
+ }
+
+ @Override
+ public Protocol getProtocol() {
+ return protocol;
+ }
+
+ @Override
+ public String getTestUrl() {
+ return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/";
+ }
+
+ @Test
+ void testFetchRedirect301() throws Exception {
+ // nginx returns 301 for directory URLs without trailing slash when autoindex
+ // is off; test a manual redirect via the default nginx welcome page path
+ String redirectUrl =
+ "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/index.html";
+ CrawlDatum datum = new CrawlDatum();
+ protocol.getProtocolOutput(new Text(redirectUrl), datum);
+ int code = getHttpStatusCode(datum);
+ // nginx serves index.html directly with 200; the base test covers 200/404
+ assertEquals(200, code, "Expected 200 for index.html from nginx");
+ }
+}
diff --git a/src/plugin/protocol-httpclient/ivy.xml b/src/plugin/protocol-httpclient/ivy.xml
index 0b3ce0af73..e5987074b8 100644
--- a/src/plugin/protocol-httpclient/ivy.xml
+++ b/src/plugin/protocol-httpclient/ivy.xml
@@ -38,6 +38,7 @@
+
diff --git a/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/HttpClientProtocolIT.java b/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/HttpClientProtocolIT.java
new file mode 100644
index 0000000000..7345e4b029
--- /dev/null
+++ b/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/HttpClientProtocolIT.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+import static com.github.tomakehurst.wiremock.client.WireMock.aResponse;
+import static com.github.tomakehurst.wiremock.client.WireMock.get;
+import static com.github.tomakehurst.wiremock.client.WireMock.urlEqualTo;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import com.github.tomakehurst.wiremock.WireMockServer;
+import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolPluginIntegrationTest;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Integration tests for protocol-httpclient using an in-process WireMock
+ * server.
+ *
+ *
WireMock runs in the test JVM so no Docker container is required. The
+ * Nutch httpclient plugin connects to it over a real TCP socket, exercising
+ * the full HTTP client stack including header handling and Basic-auth
+ * challenge/response.
+ */
+public class HttpClientProtocolIT implements ProtocolPluginIntegrationTest {
+
+ private static WireMockServer wireMock;
+ private Http protocol;
+
+ @BeforeAll
+ static void startWireMock() {
+ wireMock = new WireMockServer(WireMockConfiguration.options().dynamicPort());
+ wireMock.start();
+
+ wireMock.stubFor(get(urlEqualTo("/"))
+ .willReturn(aResponse()
+ .withStatus(200)
+ .withHeader("Content-Type", "text/html")
+ .withBody("
Integration test")));
+
+ wireMock.stubFor(get(urlEqualTo("/notfound"))
+ .willReturn(aResponse().withStatus(404)));
+
+ wireMock.stubFor(get(urlEqualTo("/secure"))
+ .withBasicAuth("testuser", "testpass")
+ .willReturn(aResponse()
+ .withStatus(200)
+ .withHeader("Content-Type", "text/html")
+ .withBody("Authenticated")));
+
+ wireMock.stubFor(get(urlEqualTo("/secure"))
+ .willReturn(aResponse()
+ .withStatus(401)
+ .withHeader("WWW-Authenticate", "Basic realm=\"Test\"")
+ .withBody("Unauthorized")));
+ }
+
+ @AfterAll
+ static void stopWireMock() {
+ if (wireMock != null) {
+ wireMock.stop();
+ }
+ }
+
+ @BeforeEach
+ @Override
+ public void setUpProtocol() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+ conf.set("plugin.includes",
+ "protocol-httpclient|lib-http|nutch-extensionpoints");
+ conf.set("http.agent.name", "Nutch-Test");
+ conf.setInt("http.timeout", 10000);
+ conf.setBoolean("store.http.headers", true);
+ protocol = new Http();
+ protocol.setConf(conf);
+ }
+
+ @AfterEach
+ @Override
+ public void tearDownProtocol() {
+ protocol = null;
+ }
+
+ @Override
+ public Protocol getProtocol() {
+ return protocol;
+ }
+
+ @Override
+ public String getTestUrl() {
+ return "http://localhost:" + wireMock.port() + "/";
+ }
+
+ @Test
+ void testFetch200() throws Exception {
+ CrawlDatum datum = new CrawlDatum();
+ ProtocolOutput output = protocol.getProtocolOutput(new Text(getTestUrl()), datum);
+ assertNotNull(output, "ProtocolOutput must not be null");
+ int code = Integer.parseInt(
+ datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString());
+ assertEquals(200, code, "Expected HTTP 200 from WireMock stub");
+ }
+
+ @Test
+ void testFetch404() throws Exception {
+ String url = "http://localhost:" + wireMock.port() + "/notfound";
+ CrawlDatum datum = new CrawlDatum();
+ protocol.getProtocolOutput(new Text(url), datum);
+ int code = Integer.parseInt(
+ datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString());
+ assertEquals(404, code, "Expected HTTP 404 for /notfound stub");
+ }
+
+ @Test
+ void testUnauthenticatedRequestReturns401() throws Exception {
+ String secureUrl = "http://localhost:" + wireMock.port() + "/secure";
+ CrawlDatum datum = new CrawlDatum();
+ protocol.getProtocolOutput(new Text(secureUrl), datum);
+ int code = Integer.parseInt(
+ datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString());
+ assertEquals(401, code,
+ "Unauthenticated request to /secure should return 401");
+ }
+}
diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java
new file mode 100644
index 0000000000..d5342d8309
--- /dev/null
+++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.okhttp;
+
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.AbstractProtocolPluginIT;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.containers.GenericContainer;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+/**
+ * Integration tests for protocol-okhttp using a real nginx container.
+ */
+@Testcontainers(disabledWithoutDocker = true)
+public class OkHttpProtocolIT extends AbstractProtocolPluginIT {
+
+ @Container
+ private static final GenericContainer> nginx =
+ new GenericContainer<>("nginx:alpine").withExposedPorts(80);
+
+ private OkHttp protocol;
+
+ @Override
+ public void setUpProtocol() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+ conf.set("plugin.includes", "protocol-okhttp|lib-http|nutch-extensionpoints");
+ conf.set("http.agent.name", "Nutch-Test");
+ conf.setInt("http.timeout", 10000);
+ conf.setBoolean("store.http.headers", true);
+ protocol = new OkHttp();
+ protocol.setConf(conf);
+ }
+
+ @Override
+ public void tearDownProtocol() {
+ protocol = null;
+ }
+
+ @Override
+ public Protocol getProtocol() {
+ return protocol;
+ }
+
+ @Override
+ public String getTestUrl() {
+ return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/";
+ }
+
+ /** OkHttp transparently decompresses gzip; verify content is returned. */
+ @Test
+ void testFetchWithAcceptEncoding() throws Exception {
+ CrawlDatum datum = new CrawlDatum();
+ ProtocolOutput output = protocol.getProtocolOutput(
+ new org.apache.hadoop.io.Text(getTestUrl()), datum);
+ assertNotNull(output.getContent(),
+ "Content must be present even when server uses compression");
+ }
+}
diff --git a/src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java b/src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java
new file mode 100644
index 0000000000..ec928df64f
--- /dev/null
+++ b/src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.AbstractProtocolPluginIT;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.containers.GenericContainer;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+/**
+ * Integration tests for protocol-selenium using a real nginx container.
+ *
+ * Note: protocol-selenium uses raw HTTP sockets (the same underlying
+ * transport as protocol-http) rather than a Selenium WebDriver. Tests here
+ * validate that the plugin connects to and fetches content from a live HTTP
+ * server. Browser-based rendering is covered by protocol-interactiveselenium
+ * which is excluded from automated integration tests due to its stateful
+ * handler requirements.
+ */
+@Testcontainers(disabledWithoutDocker = true)
+public class SeleniumProtocolIT extends AbstractProtocolPluginIT {
+
+ @Container
+ private static final GenericContainer> nginx =
+ new GenericContainer<>("nginx:alpine").withExposedPorts(80);
+
+ private Http protocol;
+
+ @Override
+ public void setUpProtocol() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+ conf.set("plugin.includes",
+ "protocol-selenium|lib-http|lib-selenium|nutch-extensionpoints");
+ conf.set("http.agent.name", "Nutch-Test");
+ conf.setInt("http.timeout", 10000);
+ conf.setBoolean("store.http.headers", true);
+ protocol = new Http();
+ protocol.setConf(conf);
+ }
+
+ @Override
+ public void tearDownProtocol() {
+ protocol = null;
+ }
+
+ @Override
+ public Protocol getProtocol() {
+ return protocol;
+ }
+
+ @Override
+ public String getTestUrl() {
+ return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/";
+ }
+
+ @Test
+ void testFetchReturnsContent() throws Exception {
+ CrawlDatum datum = new CrawlDatum();
+ ProtocolOutput output = protocol.getProtocolOutput(
+ new org.apache.hadoop.io.Text(getTestUrl()), datum);
+ assertNotNull(output.getContent(),
+ "protocol-selenium must return non-null content for a live nginx page");
+ }
+}
diff --git a/src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java b/src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java
new file mode 100644
index 0000000000..9469b168fb
--- /dev/null
+++ b/src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+/**
+ * Abstract base for Protocol plugin integration tests using Testcontainers.
+ * Provides common test logic for fetching URLs and verifying status codes.
+ *
+ *
Subclasses declare a static {@code @Container} field for the server
+ * container, implement {@link ProtocolPluginIntegrationTest}, and may add
+ * protocol-specific tests (e.g., redirect handling, authentication).
+ */
+@Testcontainers(disabledWithoutDocker = true)
+public abstract class AbstractProtocolPluginIT implements ProtocolPluginIntegrationTest {
+
+ @BeforeEach
+ void setUp() throws Exception {
+ setUpProtocol();
+ }
+
+ @AfterEach
+ void tearDown() throws Exception {
+ tearDownProtocol();
+ }
+
+ /** Fetch the test URL and assert an HTTP 200 response. */
+ @Test
+ void testFetch200() throws Exception {
+ CrawlDatum datum = new CrawlDatum();
+ ProtocolOutput output = getProtocol()
+ .getProtocolOutput(new Text(getTestUrl()), datum);
+ assertNotNull(output, "ProtocolOutput must not be null");
+ assertEquals(200, getHttpStatusCode(datum),
+ "Expected HTTP 200 for " + getTestUrl());
+ verifyFetchedContent(output, datum);
+ }
+
+ /** Fetch a non-existent path and assert an HTTP 404 response. */
+ @Test
+ void testFetch404() throws Exception {
+ String url = get404Url();
+ CrawlDatum datum = new CrawlDatum();
+ getProtocol().getProtocolOutput(new Text(url), datum);
+ assertEquals(404, getHttpStatusCode(datum),
+ "Expected HTTP 404 for " + url);
+ }
+
+ /**
+ * Returns a URL expected to produce a 404. Default appends a random path
+ * segment to {@link #getTestUrl()}; override if the server needs a specific
+ * path.
+ */
+ protected String get404Url() {
+ String base = getTestUrl();
+ if (base.endsWith("/")) {
+ return base + "nonexistent-path-xyz";
+ }
+ return base + "/nonexistent-path-xyz";
+ }
+
+ /**
+ * Reads the HTTP status code stored in the CrawlDatum metadata by Nutch
+ * protocol plugins. Returns -1 if no status code was stored.
+ */
+ protected static int getHttpStatusCode(CrawlDatum datum) {
+ if (datum.getMetaData().containsKey(Nutch.PROTOCOL_STATUS_CODE_KEY)) {
+ return Integer.parseInt(
+ datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString());
+ }
+ return -1;
+ }
+}
diff --git a/src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java b/src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java
new file mode 100644
index 0000000000..b3778077d9
--- /dev/null
+++ b/src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol;
+
+import org.apache.nutch.crawl.CrawlDatum;
+
+/**
+ * Contract for Protocol plugin integration tests. Implementations run against
+ * real server backends (via Testcontainers or embedded servers).
+ */
+public interface ProtocolPluginIntegrationTest {
+
+ /** Set up the protocol plugin and its backing server before tests. */
+ void setUpProtocol() throws Exception;
+
+ /** Shut down the protocol plugin after tests. */
+ void tearDownProtocol() throws Exception;
+
+ /** The Protocol under test. */
+ Protocol getProtocol();
+
+ /**
+ * A URL that the backing server will serve with a 200/success response.
+ * Must point into the container or embedded server started by this test.
+ */
+ String getTestUrl();
+
+ /**
+ * Optional extra verification after a successful fetch.
+ * Default is a no-op; override to inspect content, headers, etc.
+ */
+ default void verifyFetchedContent(ProtocolOutput output, CrawlDatum datum)
+ throws Exception {
+ // no-op
+ }
+}
From 5cc412c38c48dda065ce7a77833dfc92cf300a92 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 15 May 2026 19:13:12 +0200
Subject: [PATCH 6/8] Integrate NUTCH-3175 into build of CCF fork
---
.github/workflows/cc-build.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/cc-build.yml b/.github/workflows/cc-build.yml
index 1e8f23a691..2ef2fb6fca 100644
--- a/.github/workflows/cc-build.yml
+++ b/.github/workflows/cc-build.yml
@@ -61,4 +61,4 @@ jobs:
restore-keys: |
${{ runner.os }}-ivy-
- name: Test
- run: ant clean test -buildfile build.xml
+ run: ant clean test test-protocol-integration -buildfile build.xml
From 98704c5c8a301d2e700b1d706e81de708522bdaf Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Sun, 17 May 2026 14:49:22 +0200
Subject: [PATCH 7/8] NUTCH-3176 URLUtil and urlnormalizer-basic: add support
for IDNA2008
- URLUtil:
- make IDNA2008 the default for the methods toASCII and toUNICODE
- provide methods to convert host names both for IDNA2003 and IDNA2008
- also convert host to lowercase it (if not already lowercased)
- urlnormalizer-basic:
- convert host names using IDNA2008 if the property
urlnormalizer.basic.host.idna2008 is true
- refactor to share methods between URLUtil and urlnormalizer-basic
- refactor calls of URLDecoder and pass Charset instead of String
(since Java 10)
---
conf/nutch-default.xml | 11 ++
.../apache/nutch/crawl/DeduplicationJob.java | 9 +-
.../nutch/plugin/PluginManifestParser.java | 7 +-
src/java/org/apache/nutch/util/URLUtil.java | 169 +++++++++++++++---
.../nutch/protocol/file/FileResponse.java | 17 +-
.../nutch/protocol/ftp/FtpResponse.java | 3 +-
.../urlnormalizer/ajax/AjaxURLNormalizer.java | 17 +-
.../basic/BasicURLNormalizer.java | 59 +++---
.../basic/TestBasicURLNormalizer.java | 66 +++++++
.../org/apache/nutch/util/TestURLUtil.java | 126 ++++++++++++-
10 files changed, 397 insertions(+), 87 deletions(-)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 7a8a73f81b..8ae225735d 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1657,6 +1657,17 @@
+
+ urlnormalizer.basic.host.idna2008
+ false
+ If true, let urlnormalizer-basic
+ normalize Internationalized Domain Names (IDNs) using the
+ standard IDNA2008 (RFC 5890). If false, use IDNA2003 (RFC 3490).
+ Note that urlnormalizer.basic.host.idn must be set, otherwise
+ this property has no effect.
+
+
+
urlnormalizer.basic.host.trim-trailing-dot
false
diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
index 50aa4cd7bd..52bf422308 100644
--- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
+++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -17,9 +17,9 @@
package org.apache.nutch.crawl;
import java.io.IOException;
-import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;
import java.net.URLDecoder;
+import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
@@ -34,7 +34,6 @@
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Counter;
-import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
@@ -70,7 +69,7 @@ public class DeduplicationJob extends NutchTool implements Tool {
protected final static Text urlKey = new Text("_URLTEMPKEY_");
protected final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
protected final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order";
- protected final static String UTF_8 = StandardCharsets.UTF_8.toString();
+ protected final static Charset UTF_8 = StandardCharsets.UTF_8;
public static class DBFilter extends
Mapper {
@@ -224,13 +223,13 @@ protected CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc) {
String urlnewDoc = newDoc.getMetaData().get(urlKey).toString();
try {
urlExisting = URLDecoder.decode(urlExisting, UTF_8);
- } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+ } catch (IllegalArgumentException e) {
LOG.error("Error decoding: {}", urlExisting, e);
// use the encoded URL
}
try {
urlnewDoc = URLDecoder.decode(urlnewDoc, UTF_8);
- } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+ } catch (IllegalArgumentException e) {
LOG.error("Error decoding: {}", urlnewDoc, e);
// use the encoded URL
}
diff --git a/src/java/org/apache/nutch/plugin/PluginManifestParser.java b/src/java/org/apache/nutch/plugin/PluginManifestParser.java
index 10ce4fdb7b..95208fa433 100644
--- a/src/java/org/apache/nutch/plugin/PluginManifestParser.java
+++ b/src/java/org/apache/nutch/plugin/PluginManifestParser.java
@@ -18,11 +18,11 @@
import java.io.File;
import java.io.IOException;
-import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
+import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
@@ -124,10 +124,7 @@ public File getPluginFolder(String name) {
String path = url.getPath();
if (WINDOWS && path.startsWith("/")) // patch a windows bug
path = path.substring(1);
- try {
- path = URLDecoder.decode(path, "UTF-8"); // decode the url path
- } catch (UnsupportedEncodingException e) {
- }
+ path = URLDecoder.decode(path, StandardCharsets.UTF_8); // decode the url path
directory = new File(path);
} else if (!directory.exists()) {
LOG.warn("Plugins: directory not found: {}", name);
diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java
index 44c6309d2a..fd036480a6 100644
--- a/src/java/org/apache/nutch/util/URLUtil.java
+++ b/src/java/org/apache/nutch/util/URLUtil.java
@@ -16,6 +16,7 @@
*/
package org.apache.nutch.util;
+import java.lang.invoke.MethodHandles;
import java.net.IDN;
import java.net.MalformedURLException;
import java.net.URI;
@@ -23,11 +24,22 @@
import java.util.Locale;
import java.util.regex.Pattern;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.ibm.icu.text.IDNA;
+
import crawlercommons.domains.EffectiveTldFinder;
/** Utility class for URL analysis */
public class URLUtil {
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
+ private static final IDNA idna = IDNA.getUTS46Instance(
+ IDNA.NONTRANSITIONAL_TO_ASCII | IDNA.NONTRANSITIONAL_TO_UNICODE);
+
/**
* Resolve relative URL-s and fix a java.net.URL error in handling of URLs
* with pure query targets.
@@ -520,17 +532,39 @@ public static String getProtocol(URL url) {
return url.getProtocol();
}
+ public static boolean isAscii(String str) {
+ char[] chars = str.toCharArray();
+ for (char c : chars) {
+ if (c > 127) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Convert URL with IDN host/domain name into the ASCII representation.
+ *
+ * @param url
+ * URL string to convert
+ * @return URL string with ASCII host/domain name or null if conversion fails.
+ */
public static String toASCII(String url) {
try {
URL u = new URL(url);
String host = u.getHost();
- if (host == null || host.isEmpty()) {
- // no host name => no punycoded domain name
- // also do not add additional slashes for file: URLs (NUTCH-1880)
+ String hostLowerCase = host.toLowerCase(Locale.ROOT);
+ if (host == null || host.isEmpty()
+ || (isAscii(host) && host.equals(hostLowerCase))) {
+ // - no host name => no punycoded domain name
+ // - also do not add additional slashes for file: URLs (NUTCH-1880)
+ // - do nothing if host is already ASCII-only
+ // - not already in lowercase => conversion also lowercases host name
return url;
}
- URI p = new URI(u.getProtocol(), u.getUserInfo(), IDN.toASCII(host),
- u.getPort(), u.getPath(), u.getQuery(), u.getRef());
+ URI p = new URI(u.getProtocol(), u.getUserInfo(),
+ convertIDNA2008(hostLowerCase, true), u.getPort(), u.getPath(),
+ u.getQuery(), u.getRef());
return p.toString();
} catch (Exception e) {
@@ -538,13 +572,25 @@ public static String toASCII(String url) {
}
}
+ /**
+ * Convert URL with IDN host/domain name to the Unicode representation.
+ *
+ * @param url
+ * URL string to convert
+ * @return URL string with Unicode host/domain name or null if conversion
+ * fails.
+ */
public static String toUNICODE(String url) {
try {
URL u = new URL(url);
String host = u.getHost();
- if (host == null || host.isEmpty()) {
- // no host name => no punycoded domain name
- // also do not add additional slashes for file: URLs (NUTCH-1880)
+ String hostLowerCase = host.toLowerCase(Locale.ROOT);
+ if (host == null || host.isEmpty()
+ || (!hostLowerCase.contains("xn--") && host.equals(hostLowerCase))) {
+ // - no host name => no punycoded domain name
+ // - also do not add additional slashes for file: URLs (NUTCH-1880)
+ // - contains 'xn--' => needs conversion
+ // - not already in lowercase => conversion also lowercases host name
return url;
}
StringBuilder sb = new StringBuilder();
@@ -554,7 +600,7 @@ public static String toUNICODE(String url) {
sb.append(u.getUserInfo());
sb.append('@');
}
- sb.append(IDN.toUnicode(host));
+ sb.append(convertIDNA2008(hostLowerCase, false));
if (u.getPort() != -1) {
sb.append(':');
sb.append(u.getPort());
@@ -572,22 +618,83 @@ public static String toUNICODE(String url) {
}
/**
- * For testing
- * @param args print with no args to get help
+ * Convert IDN host to ASCII or Unicode using Java's built-in {@link IDN}
+ * class.
+ *
+ * The conversion supports only IDNA2003, it does not support IDNA2008.
+ * However, unless the parameter strictIDNA2003 is true, the
+ * methods {@link IDN#toASCII(String, int)} resp.
+ * {@link IDN#toUnicode(String, int)} are called passing the flag
+ * {@link IDN#ALLOW_UNASSIGNED} to avoid that the conversion fails on
+ * characters not in the repertoire of Unicode 3.2.
+ *
+ * @param host
+ * host name to be converted (lowercase expected)
+ * @param toAscii
+ * if true convert to ASCII, otherwise to Unicode
+ * @param strictIDNA2003
+ * if true, do
+ * @return converted host name
+ * @throws MalformedURLException
+ * if the conversion fails
*/
- public static void main(String[] args) {
-
- if (args.length != 1) {
- System.err.println("Usage : URLUtil ");
- return;
+ public static String convertIDNA2003(String host, boolean toAscii,
+ boolean strictIDNA2003) throws MalformedURLException {
+ try {
+ if (toAscii) {
+ return IDN.toASCII(host, strictIDNA2003 ? 0 : IDN.ALLOW_UNASSIGNED);
+ } else {
+ return IDN.toUnicode(host, strictIDNA2003 ? 0 : IDN.ALLOW_UNASSIGNED);
+ }
+ } catch (IllegalArgumentException | IndexOutOfBoundsException e) {
+ // IllegalArgumentException: thrown if the input string contains
+ // non-convertible Unicode codepoints
+ // IndexOutOfBoundsException: thrown (undocumented) if one "label"
+ // (non-ASCII dot-separated segment) is longer than 256 characters,
+ // cf. https://bugs.openjdk.java.net/browse/JDK-6806873
+ LOG.debug("Failed to convert IDN host {}: ", host, e);
+ throw (MalformedURLException) new MalformedURLException(
+ "Invalid IDN " + host + ": " + e.getMessage()).initCause(e);
}
+ }
- String url = args[0];
- try {
- System.out.println(URLUtil.getDomainName(new URL(url)));
- } catch (MalformedURLException ex) {
- ex.printStackTrace();
+ /**
+ * Convert IDN host to ASCII or Unicode using ICU's {@link IDNA} class.
+ *
+ * The conversion supports IDNA2008 names.
+ *
+ * @param host
+ * host name to be converted (lowercase expected)
+ * @param toAscii
+ * if true convert to ASCII, otherwise to Unicode
+ * @return converted host name
+ * @throws MalformedURLException
+ * if the conversion fails
+ */
+ public static String convertIDNA2008(String host, boolean toAscii)
+ throws MalformedURLException {
+ final IDNA.Info idnaInfo = new IDNA.Info();
+ final StringBuilder hostConverted = new StringBuilder();
+ if (toAscii) {
+ idna.nameToASCII(host, hostConverted, idnaInfo);
+ } else {
+ idna.nameToUnicode(host, hostConverted, idnaInfo);
}
+ if (idnaInfo.hasErrors()) {
+ StringBuilder msg = new StringBuilder();
+ for (IDNA.Error error : idnaInfo.getErrors()) {
+ if (msg.length() == 0) {
+ msg.append("Invalid IDNA2008 host").append(host).append(": ");
+ } else {
+ msg.append(", ");
+ }
+ msg.append(error.name());
+ }
+ String errorMsg = msg.toString();
+ LOG.debug("Failed to convert IDN host {}: {}", host, errorMsg);
+ throw new MalformedURLException(errorMsg);
+ }
+ return hostConverted.toString();
}
/**
@@ -610,4 +717,24 @@ public static boolean isHomePageOf(URL url, String hostName) {
&& url.getRef() == null //
&& url.getUserInfo() == null;
}
+
+ /**
+ * For testing
+ * @param args print with no args to get help
+ */
+ public static void main(String[] args) {
+
+ if (args.length != 1) {
+ System.err.println("Usage : URLUtil ");
+ System.err.println("\nExtract and print pay-level domain names for the input URL");
+ return;
+ }
+
+ String url = args[0];
+ try {
+ System.out.println(URLUtil.getDomainName(new URL(url)));
+ } catch (MalformedURLException ex) {
+ ex.printStackTrace();
+ }
+ }
}
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
index c50988c2dd..64186b9035 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
@@ -16,20 +16,18 @@
*/
package org.apache.nutch.protocol.file;
-import java.net.URL;
import java.io.IOException;
-import java.io.UnsupportedEncodingException;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.protocol.Content;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
-
+import org.apache.nutch.protocol.Content;
import org.apache.tika.Tika;
-import org.apache.hadoop.conf.Configuration;
-
/**
* FileResponse.java mimics file replies as http response. It tries its best to
* follow http's way for headers, response codes as well as exceptions.
@@ -125,11 +123,8 @@ public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
String path = url.getPath().isEmpty() ? "/" : url.getPath();
- try {
- // specify the encoding via the config later?
- path = java.net.URLDecoder.decode(path, "UTF-8");
- } catch (UnsupportedEncodingException ex) {
- }
+ // specify the encoding via the config later?
+ path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8);
try {
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
index 8796cfc0b3..0d7ad1b289 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
@@ -30,6 +30,7 @@
import java.net.InetAddress;
import java.net.URL;
+import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.LinkedList;
import java.io.ByteArrayOutputStream;
@@ -245,7 +246,7 @@ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf)
this.content = null;
- path = java.net.URLDecoder.decode(path, "UTF-8");
+ path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8);
if (path.endsWith("/")) {
getDirAsHttpResponse(path, datum.getModifiedTime());
diff --git a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
index 2342ced68f..5518e39544 100644
--- a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
+++ b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
@@ -16,19 +16,18 @@
*/
package org.apache.nutch.net.urlnormalizer.ajax;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
-import java.net.MalformedURLException;
-import java.nio.charset.Charset;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLNormalizer;
import org.apache.nutch.net.URLNormalizers;
-import org.apache.hadoop.conf.Configuration;
-
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* URLNormalizer capable of dealing with AJAX URL's.
*
@@ -43,13 +42,11 @@ public class AjaxURLNormalizer implements URLNormalizer {
public static String ESCAPED_URL_PART = "_escaped_fragment_=";
private Configuration conf;
- private Charset utf8;
/**
* Default constructor.
*/
public AjaxURLNormalizer() {
- utf8 = Charset.forName("UTF-8");
}
/**
@@ -195,7 +192,7 @@ protected String escape(String fragmentPart) {
String hex = null;
StringBuilder sb = new StringBuilder(fragmentPart.length());
- for (byte b : fragmentPart.getBytes(utf8)) {
+ for (byte b : fragmentPart.getBytes(UTF_8)) {
if (b < 33) {
sb.append('%');
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index 2123d8fa9f..4ff9fc64a6 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -16,12 +16,10 @@
*/
package org.apache.nutch.net.urlnormalizer.basic;
-import java.lang.invoke.MethodHandles;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
-import java.io.UnsupportedEncodingException;
-import java.net.IDN;
+import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
@@ -36,6 +34,7 @@
import org.apache.nutch.net.URLNormalizer;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -47,6 +46,12 @@
* normalize
* percent-encoding in URL paths
+ * normalize the host name if it is an Internationalized Domain Name (IDN)
+ * to ASCII or Unicode, depending on the configuration properties
+ * urlnormalizer.basic.host.idn and
+ * urlnormalizer.basic.host.idna2008
+ * remove a trailing dot in the host name (if the property
+ * urlnormalizer.basic.host.trim-trailing-dot is true)
*
*/
public class BasicURLNormalizer implements URLNormalizer {
@@ -54,6 +59,7 @@ public class BasicURLNormalizer implements URLNormalizer {
.getLogger(MethodHandles.lookup().lookupClass());
public final static String NORM_HOST_IDN = "urlnormalizer.basic.host.idn";
+ public final static String NORM_HOST_IDNA_2008 = "urlnormalizer.basic.host.idna2008";
public final static String NORM_HOST_TRIM_TRAILING_DOT = "urlnormalizer.basic.host.trim-trailing-dot";
/**
@@ -70,7 +76,7 @@ public class BasicURLNormalizer implements URLNormalizer {
.compile("%([0-9A-Fa-f]{2})");
// charset used for encoding URLs before escaping
- private final static Charset utf8 = StandardCharsets.UTF_8;
+ private final static Charset UTF_8 = StandardCharsets.UTF_8;
/** look-up table for characters which should not be escaped in URL paths */
private final static boolean[] unescapedCharacters = new boolean[128];
@@ -132,20 +138,11 @@ private static boolean isHexCharacter(int c) {
|| (0x30 <= c && c <= 0x39);
}
- private static boolean isAscii(String str) {
- char[] chars = str.toCharArray();
- for (char c : chars) {
- if (c > 127) {
- return false;
- }
- }
- return true;
- }
-
private Configuration conf;
private boolean hostIDNtoASCII;
private boolean hostASCIItoIDN;
+ private boolean hostIDNA2008;
private boolean hostTrimTrailingDot;
@Override
@@ -159,9 +156,12 @@ public void setConf(Configuration conf) {
String normIdn = conf.get(NORM_HOST_IDN, "");
if (normIdn.equalsIgnoreCase("toAscii")) {
hostIDNtoASCII = true;
+ hostASCIItoIDN = false;
} else if (normIdn.equalsIgnoreCase("toUnicode")) {
+ hostIDNtoASCII = false;
hostASCIItoIDN = true;
}
+ hostIDNA2008 = conf.getBoolean(NORM_HOST_IDNA_2008, false);
hostTrimTrailingDot = conf.getBoolean(NORM_HOST_TRIM_TRAILING_DOT, false);
}
@@ -364,7 +364,7 @@ private String escapePath(String path) {
StringBuilder sb = new StringBuilder(path.length());
// Traverse over all bytes in this URL
- byte[] bytes = path.getBytes(utf8);
+ byte[] bytes = path.getBytes(UTF_8);
for (int i = 0; i < bytes.length; i++) {
byte b = bytes[i];
// Is this a control character?
@@ -415,8 +415,8 @@ private String normalizeHostName(String host) throws MalformedURLException {
// 1. unescape percent-encoded characters in host name
if (host.indexOf('%') != -1) {
try {
- host = URLDecoder.decode(host, StandardCharsets.UTF_8.toString());
- } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+ host = URLDecoder.decode(host, UTF_8);
+ } catch (IllegalArgumentException e) {
LOG.debug("Failed to convert percent-encoded host name {}: ", host, e);
throw (MalformedURLException) new MalformedURLException(
"Invalid percent-encoded host name " + host + ": " + e.getMessage())
@@ -429,21 +429,18 @@ private String normalizeHostName(String host) throws MalformedURLException {
// 3. if configured: convert between Unicode and ASCII forms
// for Internationalized Domain Names (IDNs)
- if (hostIDNtoASCII && !isAscii(host)) {
- try {
- host = IDN.toASCII(host);
- } catch (IllegalArgumentException | IndexOutOfBoundsException e) {
- // IllegalArgumentException: thrown if the input string contains
- // non-convertible Unicode codepoints
- // IndexOutOfBoundsException: thrown (undocumented) if one "label"
- // (non-ASCII dot-separated segment) is longer than 256 characters,
- // cf. https://bugs.openjdk.java.net/browse/JDK-6806873
- LOG.debug("Failed to convert IDN host {}: ", host, e);
- throw (MalformedURLException) new MalformedURLException(
- "Invalid IDN " + host + ": " + e.getMessage()).initCause(e);
+ if (hostIDNtoASCII && !URLUtil.isAscii(host)) {
+ if (hostIDNA2008) {
+ host = URLUtil.convertIDNA2008(host, true);
+ } else {
+ host = URLUtil.convertIDNA2003(host, true, false);
}
} else if (hostASCIItoIDN && host.contains("xn--")) {
- host = IDN.toUnicode(host);
+ if (hostIDNA2008) {
+ host = URLUtil.convertIDNA2008(host, false);
+ } else {
+ host = URLUtil.convertIDNA2003(host, false, false);
+ }
}
// 4. optionally trim a trailing dot
@@ -466,7 +463,7 @@ public static void main(String args[]) throws IOException {
}
String line, normUrl;
BufferedReader in = new BufferedReader(
- new InputStreamReader(System.in, utf8));
+ new InputStreamReader(System.in, UTF_8));
while ((line = in.readLine()) != null) {
try {
normUrl = normalizer.normalize(line, scope);
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index a6bad41f2e..090c25f2da 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -258,15 +258,24 @@ public void testHostName() throws Exception {
// test Internationalized Domain Names
BasicURLNormalizer norm = new BasicURLNormalizer();
conf = NutchConfiguration.create();
+
+ // to ASCII normalization
conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii");
norm.setConf(conf);
normalizeTest(norm, "https://нэб.рф/", "https://xn--90ax2c.xn--p1ai/");
// verify escaping of percent-encoded characters in IDNs (NUTCH-2824)
normalizeTest(norm, "https://www.0251-sachverst%c3%a4ndiger.de/",
"https://www.xn--0251-sachverstndiger-ozb.de/");
+ // verify that host names with uppercase characters are normalized
+ normalizeTest(norm, "https://нЭб.РФ/", "https://xn--90ax2c.xn--p1ai/");
+
+ // to Unicode normalization
conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toUnicode");
norm.setConf(conf);
normalizeTest(norm, "https://xn--90ax2c.xn--p1ai/", "https://нэб.рф/");
+ // verify that host names with uppercase characters are normalized
+ normalizeTest(norm, "https://Xn--90Ax2c.xN--P1ai/", "https://нэб.рф/");
+
// test removal of trailing dot
conf.setBoolean(BasicURLNormalizer.NORM_HOST_TRIM_TRAILING_DOT, true);
norm.setConf(conf);
@@ -274,6 +283,63 @@ public void testHostName() throws Exception {
"https://www.example.org/");
}
+ /**
+ * Test for IDNA2008 and IDNA2003 compatibility.
+ */
+ @Test
+ public void testHostNameIDNA2008() throws Exception {
+ // IDNA2008 (https://www.rfc-editor.org/rfc/rfc5890.html#section-1.1)
+ BasicURLNormalizer norm = new BasicURLNormalizer();
+ conf = NutchConfiguration.create();
+ conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii");
+ norm.setConf(conf);
+
+ // IDNA2003 / RFC 3490
+ // Note: IDNA2008 and IDNA2003 deviate for this example
+ normalizeTest(norm, "https://straße.de/", "https://strasse.de/");
+
+ // Verify that characters not in Unicode 3.2 do not fail the normalization
+ normalizeTest(norm, "https://example.ᬩᬮᬶ.id/", "https://example.xn--9tfky.id/");
+
+ // IDNA2008 / RFC 5890
+ conf.setBoolean(BasicURLNormalizer.NORM_HOST_IDNA_2008, true);
+ norm.setConf(conf);
+ // Note: this is different from IDNA2003
+ normalizeTest(norm, "https://straße.de/", "https://xn--strae-oqa.de/");
+
+ // Verify that characters not in Unicode 3.2 do not fail the normalization
+ normalizeTest(norm, "https://example.ᬩᬮᬶ.id/", "https://example.xn--9tfky.id/");
+
+ // mixed encodings (Unicode, Punycode, percent encoding)
+ normalizeTest(norm, "https://xn--p1ai.%D1%80%D1%84/",
+ "https://xn--p1ai.xn--p1ai/");
+ normalizeTest(norm, "https://xn--p1ai.рф/", "https://xn--p1ai.xn--p1ai/");
+
+ // test conversion to Unicode (IDNA2008)
+ conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toUnicode");
+ norm.setConf(conf);
+ normalizeTest(norm, "https://xn--strae-oqa.de/", "https://straße.de/");
+ normalizeTest(norm, "https://example.xn--9tfky.id/", "https://example.ᬩᬮᬶ.id/");
+
+ // mixed encodings (Unicode, Punycode, percent encoding), mixed case
+ normalizeTest(norm, "https://xN--p1aI.Xn--P1ai/", "https://рф.рф/");
+ normalizeTest(norm, "https://xN--p1Ai.%D1%80%d1%84/", "https://рф.рф/");
+ normalizeTest(norm, "https://булГаков.xN--p1Ai.%D1%80%d1%84/", "https://булгаков.рф.рф/");
+ normalizeTest(norm, "https://гоГоль.%d1%80%D1%84.Рф/", "https://гоголь.рф.рф/");
+
+ // test conversion to Unicode (IDNA2003)
+ conf.setBoolean(BasicURLNormalizer.NORM_HOST_IDNA_2008, false);
+ norm.setConf(conf);
+ normalizeTest(norm, "https://xn--strae-oqa.de/", "https://xn--strae-oqa.de/");
+ normalizeTest(norm, "https://example.xn--9tfky.id/", "https://example.ᬩᬮᬶ.id/");
+
+ // mixed encodings (Unicode, Punycode, percent encoding), mixed case
+ normalizeTest(norm, "https://xN--p1aI.Xn--P1ai/", "https://рф.рф/");
+ normalizeTest(norm, "https://xN--p1Ai.%D1%80%d1%84/", "https://рф.рф/");
+ normalizeTest(norm, "https://булГаков.xN--p1Ai.%D1%80%d1%84/", "https://булгаков.рф.рф/");
+ normalizeTest(norm, "https://гоГоль.%d1%80%D1%84.Рф/", "https://гоголь.рф.рф/");
+ }
+
/**
* Test that normalizer throws MalformedURLException for invalid URLs
*/
diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java
index 4d8ae07971..573af66430 100644
--- a/src/test/org/apache/nutch/util/TestURLUtil.java
+++ b/src/test/org/apache/nutch/util/TestURLUtil.java
@@ -16,11 +16,18 @@
*/
package org.apache.nutch.util;
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import java.net.MalformedURLException;
import java.net.URL;
import org.junit.jupiter.api.Test;
-
-import static org.junit.jupiter.api.Assertions.*;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
/** Test class for URLUtil */
public class TestURLUtil {
@@ -312,7 +319,20 @@ public void testToUNICODE() throws Exception {
assertEquals("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1",
URLUtil
.toUNICODE("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1"));
-
+ // do not fail on characters not in Unicode 3.2
+ assertEquals("https://example.ᬩᬮᬶ.id/",
+ URLUtil.toUNICODE("https://example.xn--9tfky.id/"));
+ // IDNA2008
+ assertEquals("http://straße.de/",
+ URLUtil.toUNICODE("http://xn--strae-oqa.de/"));
+ // host names with uppercase characters
+ assertEquals("https://googie.com/",
+ URLUtil.toUNICODE("https://googIe.com/"));
+ assertEquals("https://googie.com/", URLUtil.toASCII("https://googIe.com/"));
+ assertEquals("https://xn--90ax2c.xn--p1ai/",
+ URLUtil.toASCII("https://нЭб.РФ/"));
+ assertEquals("https://нэб.рф/",
+ URLUtil.toUNICODE("https://Xn--90Ax2c.xN--P1ai/"));
}
@Test
@@ -324,6 +344,106 @@ public void testToASCII() throws Exception {
assertEquals("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1",
URLUtil
.toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1"));
+ // IDNA2003
+ // assertEquals("http://strasse.de/",
+ // URLUtil.toASCII("http://straße.de/"));
+ // do not fail on characters not in Unicode 3.2
+ assertEquals("https://example.xn--9tfky.id/",
+ URLUtil.toASCII("https://example.ᬩᬮᬶ.id/"));
+ // IDNA2008
+ assertEquals("http://xn--strae-oqa.de/",
+ URLUtil.toASCII("http://straße.de/"));
+ }
+
+ @ParameterizedTest
+ @CsvSource({ //
+ "www.xn--evir-zoa.com,www.çevir.com,IDNA2003,true", //
+ "xn--uni-tbingen-xhb.de,uni-tübingen.de,IDNA2003,true", //
+ "example.xn--9tfky.id,example.ᬩᬮᬶ.id,IDNA2008,true", //
+ // Test examples from whatwg-url
+ "xn--53h.example,☕.example,IDNA2008,true", //
+ "xn--0ca.xn--ssa73l,à.א̈,IDNA2008,true", //
+ "xn--mgba3gch31f060k.com,\u0646\u0627\u0645\u0647\u200c\u0627\u06cc.com,IDNA2008,true", //
+ /* Note: IDNA2008 and IDNA2003 deviate for the following examples,
+ * cf. https://www.unicode.org/reports/tr46/#IDNA2003-Section */
+ "xn--strae-oqa.de,straße.de,IDNA2008,true", //
+ "strasse.de,straße.de,IDNA2003,false", //
+ "strasse.de,strasse.de,IDNA2003,true", //
+ "xn--fa-hia.de,faß.de,IDNA2008,true", //
+ "fass.de,faß.de,IDNA2003,false", //
+ "fass.de,fass.de,IDNA2003,true", //
+ "xn--nxasmm1c.com,βόλος.com,IDNA2008,true", //
+ "xn--nxasmq6b.com,βόλος.com,IDNA2003,false", //
+ "xn--nxasmq6b.com,βόλοσ.com,IDNA2003,true", //
+ "xn--10cl1a0b660p.com,ශ්රී.com,IDNA2008,true", //
+ "xn--10cl1a0b.com,ශ්රී.com,IDNA2003,false", //
+ "xn--10cl1a0b.com,ශ්රී.com,IDNA2003,true", //
+ "xn--mgba3gch31f060k.com,نامهای.com,IDNA2008,true", //
+ "xn--mgba3gch31f.com,نامهای.com,IDNA2003,false", //
+ "xn--mgba3gch31f.com,نامهای.com,IDNA2003,true", //
+ // mixed lowercase/uppercase: no round trip conversion
+ "xn--bb-eka.at,ÖBB.at,IDNA2003,false", //
+ "xn--bb-eka.at,öbb.at,IDNA2003,true", //
+ // mixed encoding (Punycode and Unicode)
+ "xn--p1ai.xn--p1ai,рф.xn--p1ai,IDNA2003,false", //
+ "xn--p1ai.xn--p1ai,xn--p1ai.рф,IDNA2003,false", //
+ // percent-encoding is not supported
+ // "xn--p1ai.xn--p1ai,xn--p1ai.%D1%80%D1%84,IDNA2003,false", //
+ })
+ public final void testConvertHost(String ascii, String unicode, String type,
+ boolean roundTrip) throws Exception {
+ System.out.println(ascii + " <> " + unicode);
+ if ("IDNA2008".equals(type)) {
+ assertEquals(ascii, URLUtil.convertIDNA2008(unicode, true));
+ assertEquals(unicode, URLUtil.convertIDNA2008(ascii, false));
+ try {
+ assertNotNull(URLUtil.convertIDNA2003(unicode, true, false));
+ } catch (MalformedURLException e) {
+ /*
+ * Ok. A IDNA2008 input may raise an exception when using the IDNA2003
+ * method
+ */
+ }
+ } else if ("IDNA2003".equals(type)) {
+ assertEquals(ascii, URLUtil.convertIDNA2003(unicode, true, true));
+ assertEquals(ascii, URLUtil.convertIDNA2003(unicode, true, false));
+ if (roundTrip) {
+ assertEquals(unicode, URLUtil.convertIDNA2003(ascii, false, true));
+ assertEquals(unicode, URLUtil.convertIDNA2003(ascii, false, false));
+ }
+ }
+ }
+
+ @Test
+ public final void testConvertHostInvalid() {
+ // broken Punycode
+ assertDoesNotThrow(() -> assertEquals("xn--xn--bss-7z6ccid.com",
+ URLUtil.convertIDNA2003("xn--xn--bss-7z6ccid.com", false, true)));
+
+ // invalid Punycode
+ assertThrows(MalformedURLException.class,
+ () -> URLUtil.convertIDNA2008("xn--0.pt", false));
+
+ // IDNA2003 not allowing characters not in Unicode 3.2
+ assertThrows(MalformedURLException.class,
+ () -> URLUtil.convertIDNA2003("☕.example", true, true));
+ assertDoesNotThrow(() -> assertEquals("xn--53h.example",
+ URLUtil.convertIDNA2003("xn--53h.example", false, true)));
+
+ // IDNA2008 invalid,
+ // cf. https://www.unicode.org/reports/tr46/#Implementation_Notes
+ // cf. https://www.unicode.org/Public/17.0.0/idna/IdnaTestV2.txt
+ // disallowed character: ⒈ (U+2488 - DIGIT ONE FULL STOP)
+ assertThrows(MalformedURLException.class,
+ () -> URLUtil.convertIDNA2008("\u2488com", true));
+ assertThrows(MalformedURLException.class,
+ () -> URLUtil.convertIDNA2008("xn--acom-0w1b", false));
+ assertThrows(MalformedURLException.class,
+ () -> URLUtil.convertIDNA2008("xn--xn--a--gua.pt", false));
+ assertThrows(MalformedURLException.class,
+ () -> URLUtil.convertIDNA2008("xn--a-ä.pt", false));
+ assertThrows(MalformedURLException.class,
+ () -> URLUtil.convertIDNA2008("xn--a-ä.pt", true));
}
@Test
From 514760bca90337afaedcc036265a6d69739cd312 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Wed, 13 May 2026 22:56:43 +0200
Subject: [PATCH 8/8] NUTCH-3177 Fetcher to report idle threads not as hung
threads
- reset reprUrl in FetcherThread after fetch is finished
- report idle threads properly
---
src/java/org/apache/nutch/fetcher/Fetcher.java | 13 +++++++++----
.../org/apache/nutch/fetcher/FetcherThread.java | 3 +++
2 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java
index 0a08e9da2e..12d1b88bae 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -452,16 +452,21 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) {
* fetcher.threads.timeout.divisor.
*/
if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) {
- LOG.warn("Timeout reached with no new requests since {} seconds.",
+ LOG.warn(
+ "Timeout reached with no new requests since {} milliseconds.",
timeout);
- LOG.warn("Aborting with {} hung threads{}.", activeThreads,
+ LOG.warn("Aborting with {} hung or idle threads{}.", activeThreads,
feeder.isAlive() ? " (queue feeder still alive)" : "");
hungThreadsCounter.increment(activeThreads.get());
for (int i = 0; i < fetcherThreads.size(); i++) {
FetcherThread thread = fetcherThreads.get(i);
if (thread.isAlive()) {
- LOG.warn("Thread #{} hung while processing {}", i,
- thread.getReprUrl());
+ if (thread.getReprUrl() != null) {
+ LOG.warn("Thread #{} hung while processing {}", i,
+ thread.getReprUrl());
+ } else {
+ LOG.warn("Thread #{} idle", i);
+ }
StackTraceElement[] stack = thread.getStackTrace();
StringBuilder sb = new StringBuilder();
sb.append("Stack of thread #").append(i).append(":\n");
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 23c2e23542..ba07b8250f 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -623,6 +623,9 @@ public void run() {
output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED,
CrawlDatum.STATUS_FETCH_RETRY);
}
+
+ // done: unset reprUrl for reporting
+ setReprUrl(null);
}
} catch (Throwable e) {