From 3bf55416a8fe8cf2394175157d68f7fbc1cb9ab7 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sat, 21 Feb 2026 08:42:16 -0800 Subject: [PATCH 1/8] NUTCH-3154 Implement integration testing framework for Nutch IndexWriter plugins using Testcontainers (#895) --- .github/workflows/master-build.yml | 6 + build.xml | 4 + ivy/ivy.xml | 4 + src/plugin/build-plugin.xml | 31 +++- src/plugin/build.xml | 11 ++ src/plugin/indexer-elastic/ivy.xml | 5 +- .../elastic/ElasticIndexWriterIT.java | 113 +++++++++++++++ src/plugin/indexer-kafka/ivy.xml | 1 + .../indexwriter/kafka/KafkaIndexWriterIT.java | 96 ++++++++++++ src/plugin/indexer-rabbit/ivy.xml | 6 +- .../rabbit/RabbitIndexWriterIT.java | 90 ++++++++++++ src/plugin/indexer-solr/ivy.xml | 1 + .../indexwriter/solr/SolrIndexWriterIT.java | 137 ++++++++++++++++++ .../nutch/indexer/AbstractIndexWriterIT.java | 90 ++++++++++++ .../indexer/IndexWriterIntegrationTest.java | 53 +++++++ 15 files changed, 644 insertions(+), 4 deletions(-) create mode 100644 src/plugin/indexer-elastic/src/test/org/apache/nutch/indexwriter/elastic/ElasticIndexWriterIT.java create mode 100644 src/plugin/indexer-kafka/src/test/org/apache/nutch/indexwriter/kafka/KafkaIndexWriterIT.java create mode 100644 src/plugin/indexer-rabbit/src/test/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriterIT.java create mode 100644 src/plugin/indexer-solr/src/test/org/apache/nutch/indexwriter/solr/SolrIndexWriterIT.java create mode 100644 src/test/org/apache/nutch/indexer/AbstractIndexWriterIT.java create mode 100644 src/test/org/apache/nutch/indexer/IndexWriterIntegrationTest.java diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index d73bb3a693..a8675bf22a 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -104,6 +104,8 @@ jobs: - 'src/testresources/**' plugins: - 'src/plugin/**' + indexer_plugins: + - 'src/plugin/indexer-*/**' buildconf: - 'build.xml' - 'ivy/ivy.xml' @@ -120,6 +122,10 @@ jobs: - name: test plugins if: ${{ steps.filter.outputs.plugins == 'true' && steps.filter.outputs.core == 'false' && steps.filter.outputs.buildconf == 'false' }} run: ant clean test-plugins -buildfile build.xml + # run indexer integration tests when indexer plugin files change (Docker required, ubuntu-latest only) + - name: test indexer integration + if: ${{ steps.filter.outputs.indexer_plugins == 'true' && matrix.os == 'ubuntu-latest' }} + run: ant clean test-indexer-integration -buildfile build.xml - name: Check for test results id: check_tests if: always() && matrix.os == 'ubuntu-latest' diff --git a/build.xml b/build.xml index d8ee908824..37b8f4cd2a 100644 --- a/build.xml +++ b/build.xml @@ -535,6 +535,10 @@ + + + + diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 9d396ee7b1..dfdb81c0fb 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -151,6 +151,10 @@ + + + + diff --git a/src/plugin/build-plugin.xml b/src/plugin/build-plugin.xml index b0aca71038..6ea8d86201 100755 --- a/src/plugin/build-plugin.xml +++ b/src/plugin/build-plugin.xml @@ -189,7 +189,7 @@ - + Tests failed! - + + + + + + + + + + + + + + + + + + + + + + + + + + + Indexer integration tests failed! + diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 81d3ece682..160febb6cf 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -174,6 +174,17 @@ + + + + + + + + + + + diff --git a/src/plugin/indexer-elastic/ivy.xml b/src/plugin/indexer-elastic/ivy.xml index ee812a225c..04c1a071d0 100644 --- a/src/plugin/indexer-elastic/ivy.xml +++ b/src/plugin/indexer-elastic/ivy.xml @@ -36,7 +36,10 @@ - + + + + diff --git a/src/plugin/indexer-elastic/src/test/org/apache/nutch/indexwriter/elastic/ElasticIndexWriterIT.java b/src/plugin/indexer-elastic/src/test/org/apache/nutch/indexwriter/elastic/ElasticIndexWriterIT.java new file mode 100644 index 0000000000..0479213c3f --- /dev/null +++ b/src/plugin/indexer-elastic/src/test/org/apache/nutch/indexwriter/elastic/ElasticIndexWriterIT.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexwriter.elastic; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.http.HttpHost; +import org.apache.nutch.indexer.AbstractIndexWriterIT; +import org.apache.nutch.indexer.IndexWriter; +import org.apache.nutch.indexer.IndexWriterParams; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.util.NutchConfiguration; +import org.elasticsearch.action.get.GetRequest; +import org.elasticsearch.action.get.GetResponse; +import org.elasticsearch.client.RequestOptions; +import org.elasticsearch.client.RestClient; +import org.elasticsearch.client.RestHighLevelClient; +import org.junit.jupiter.api.Test; +import org.testcontainers.elasticsearch.ElasticsearchContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Integration tests for ElasticIndexWriter using Testcontainers. + */ +@Testcontainers(disabledWithoutDocker = true) +public class ElasticIndexWriterIT extends AbstractIndexWriterIT { + + private static final String ELASTICSEARCH_IMAGE = + "docker.elastic.co/elasticsearch/elasticsearch:7.10.2"; + + @Container + private static final ElasticsearchContainer elasticsearchContainer = + new ElasticsearchContainer(ELASTICSEARCH_IMAGE) + .withEnv("discovery.type", "single-node") + .withEnv("xpack.security.enabled", "false"); + + private ElasticIndexWriter indexWriter; + private Configuration conf; + + @Override + public void setUpIndexWriter() throws Exception { + conf = NutchConfiguration.create(); + indexWriter = new ElasticIndexWriter(); + indexWriter.setConf(conf); + + Map params = new HashMap<>(); + params.put(ElasticConstants.HOSTS, elasticsearchContainer.getHost()); + params.put(ElasticConstants.PORT, String.valueOf(elasticsearchContainer.getMappedPort(9200))); + params.put(ElasticConstants.INDEX, "test-index"); + params.put(ElasticConstants.SCHEME, "http"); + + IndexWriterParams writerParams = new IndexWriterParams(params); + indexWriter.open(writerParams); + } + + @Override + public void tearDownIndexWriter() throws Exception { + if (indexWriter != null) { + try { + indexWriter.close(); + } catch (Exception e) { + // Ignore if open() failed and close state is invalid + } + indexWriter = null; + } + } + + @Override + public IndexWriter getIndexWriter() { + return indexWriter; + } + + @Override + public boolean supportsDelete() { + return true; + } + + @Override + public void verifyDocumentWritten(String docId, String expectedTitle) throws Exception { + try (RestHighLevelClient client = new RestHighLevelClient( + RestClient.builder( + new HttpHost(elasticsearchContainer.getHost(), + elasticsearchContainer.getMappedPort(9200), + "http")))) { + GetRequest getRequest = new GetRequest("test-index", docId); + GetResponse getResponse = client.get(getRequest, RequestOptions.DEFAULT); + assertTrue(getResponse.isExists(), "Document should exist in index"); + assertNotNull(getResponse.getSource()); + assertEquals(expectedTitle, getResponse.getSource().get("title")); + } + } +} diff --git a/src/plugin/indexer-kafka/ivy.xml b/src/plugin/indexer-kafka/ivy.xml index d6157d953e..ffba6746d1 100644 --- a/src/plugin/indexer-kafka/ivy.xml +++ b/src/plugin/indexer-kafka/ivy.xml @@ -37,6 +37,7 @@ + diff --git a/src/plugin/indexer-kafka/src/test/org/apache/nutch/indexwriter/kafka/KafkaIndexWriterIT.java b/src/plugin/indexer-kafka/src/test/org/apache/nutch/indexwriter/kafka/KafkaIndexWriterIT.java new file mode 100644 index 0000000000..4f6a306d46 --- /dev/null +++ b/src/plugin/indexer-kafka/src/test/org/apache/nutch/indexwriter/kafka/KafkaIndexWriterIT.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexwriter.kafka; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.indexer.AbstractIndexWriterIT; +import org.apache.nutch.indexer.IndexWriter; +import org.apache.nutch.indexer.IndexWriterParams; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.util.NutchConfiguration; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; +import org.testcontainers.kafka.KafkaContainer; + +/** + * Integration tests for KafkaIndexWriter using Testcontainers. + */ +@Testcontainers(disabledWithoutDocker = true) +public class KafkaIndexWriterIT extends AbstractIndexWriterIT { + + private static final String KAFKA_IMAGE = "apache/kafka-native:3.8.0"; + private static final String TEST_TOPIC = "nutch-indexer-test"; + + @Container + private static final KafkaContainer kafkaContainer = + new KafkaContainer(KAFKA_IMAGE); + + private KafkaIndexWriter indexWriter; + private Configuration conf; + + @Override + public void setUpIndexWriter() throws Exception { + conf = NutchConfiguration.create(); + indexWriter = new KafkaIndexWriter(); + indexWriter.setConf(conf); + + String bootstrapServers = kafkaContainer.getBootstrapServers(); + String hostPort = bootstrapServers.contains("://") + ? bootstrapServers.substring(bootstrapServers.indexOf("://") + 3) + : bootstrapServers; + String[] parts = hostPort.split(":"); + String host = parts[0]; + int port = Integer.parseInt(parts[1]); + + Map params = new HashMap<>(); + params.put(KafkaConstants.HOST, host); + params.put(KafkaConstants.PORT, String.valueOf(port)); + params.put(KafkaConstants.TOPIC, TEST_TOPIC); + params.put(KafkaConstants.VALUE_SERIALIZER, + "org.apache.kafka.connect.json.JsonSerializer"); + params.put(KafkaConstants.KEY_SERIALIZER, + "org.apache.kafka.common.serialization.StringSerializer"); + + IndexWriterParams writerParams = new IndexWriterParams(params); + indexWriter.open(writerParams); + } + + @Override + public void tearDownIndexWriter() throws Exception { + if (indexWriter != null) { + try { + indexWriter.close(); + } catch (Exception e) { + // Ignore if open() failed and close state is invalid + } + indexWriter = null; + } + } + + @Override + public IndexWriter getIndexWriter() { + return indexWriter; + } + + @Override + public boolean supportsDelete() { + return false; + } +} diff --git a/src/plugin/indexer-rabbit/ivy.xml b/src/plugin/indexer-rabbit/ivy.xml index 81822a0fb7..54930331cc 100644 --- a/src/plugin/indexer-rabbit/ivy.xml +++ b/src/plugin/indexer-rabbit/ivy.xml @@ -35,5 +35,9 @@ - + + + + + diff --git a/src/plugin/indexer-rabbit/src/test/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriterIT.java b/src/plugin/indexer-rabbit/src/test/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriterIT.java new file mode 100644 index 0000000000..ed7d055350 --- /dev/null +++ b/src/plugin/indexer-rabbit/src/test/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriterIT.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexwriter.rabbit; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.indexer.AbstractIndexWriterIT; +import org.apache.nutch.indexer.IndexWriter; +import org.apache.nutch.indexer.IndexWriterParams; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.util.NutchConfiguration; +import org.testcontainers.containers.RabbitMQContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Integration tests for RabbitIndexWriter using Testcontainers. + */ +@Testcontainers(disabledWithoutDocker = true) +public class RabbitIndexWriterIT extends AbstractIndexWriterIT { + + private static final String RABBITMQ_IMAGE = "rabbitmq:3.13-management"; + + @Container + private static final RabbitMQContainer rabbitContainer = + new RabbitMQContainer(RABBITMQ_IMAGE); + + private RabbitIndexWriter indexWriter; + private Configuration conf; + + @Override + public void setUpIndexWriter() throws Exception { + conf = NutchConfiguration.create(); + indexWriter = new RabbitIndexWriter(); + indexWriter.setConf(conf); + + Map params = new HashMap<>(); + params.put(RabbitMQConstants.SERVER_URI, rabbitContainer.getAmqpUrl()); + params.put(RabbitMQConstants.EXCHANGE_NAME, "nutch-indexer-test"); + params.put(RabbitMQConstants.ROUTING_KEY, "indexer"); + params.put(RabbitMQConstants.COMMIT_MODE, "single"); + params.put(RabbitMQConstants.COMMIT_SIZE, "10"); + params.put(RabbitMQConstants.BINDING, "true"); + params.put(RabbitMQConstants.QUEUE_NAME, "nutch-indexer-queue"); + params.put(RabbitMQConstants.EXCHANGE_OPTIONS, "type=direct,durable=true"); + params.put(RabbitMQConstants.QUEUE_OPTIONS, + "durable=true,exclusive=false,auto-delete=false"); + + IndexWriterParams writerParams = new IndexWriterParams(params); + indexWriter.open(writerParams); + } + + @Override + public void tearDownIndexWriter() throws Exception { + if (indexWriter != null) { + try { + indexWriter.close(); + } catch (Exception e) { + // Ignore if open() failed and close state is invalid + } + indexWriter = null; + } + } + + @Override + public IndexWriter getIndexWriter() { + return indexWriter; + } + + @Override + public boolean supportsDelete() { + return true; + } +} diff --git a/src/plugin/indexer-solr/ivy.xml b/src/plugin/indexer-solr/ivy.xml index 99a713c18b..4d2120955c 100644 --- a/src/plugin/indexer-solr/ivy.xml +++ b/src/plugin/indexer-solr/ivy.xml @@ -38,6 +38,7 @@ + diff --git a/src/plugin/indexer-solr/src/test/org/apache/nutch/indexwriter/solr/SolrIndexWriterIT.java b/src/plugin/indexer-solr/src/test/org/apache/nutch/indexwriter/solr/SolrIndexWriterIT.java new file mode 100644 index 0000000000..dcd88bdacb --- /dev/null +++ b/src/plugin/indexer-solr/src/test/org/apache/nutch/indexwriter/solr/SolrIndexWriterIT.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexwriter.solr; + +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.indexer.IndexerMapReduce; +import org.apache.nutch.indexer.AbstractIndexWriterIT; +import org.apache.nutch.indexer.IndexWriter; +import org.apache.nutch.indexer.IndexWriterParams; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.impl.Http2SolrClient; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; +import org.testcontainers.solr.SolrContainer; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Integration tests for SolrIndexWriter using Testcontainers. + */ +@Testcontainers(disabledWithoutDocker = true) +public class SolrIndexWriterIT extends AbstractIndexWriterIT { + + private static final String SOLR_IMAGE = "solr:8.11.2"; + private static final String COLLECTION = "nutch-test"; + + @Container + private static final SolrContainer solrContainer = + new SolrContainer(SOLR_IMAGE).withCollection(COLLECTION); + + private SolrIndexWriter indexWriter; + private Configuration conf; + + @Override + public void setUpIndexWriter() throws Exception { + conf = NutchConfiguration.create(); + conf.setBoolean(IndexerMapReduce.INDEXER_DELETE, false); + + indexWriter = new SolrIndexWriter(); + indexWriter.setConf(conf); + + String solrUrl = "http://" + solrContainer.getHost() + ":" + + solrContainer.getSolrPort() + "/solr/" + COLLECTION; + + Map params = new HashMap<>(); + params.put(SolrConstants.SERVER_TYPE, "http"); + params.put(SolrConstants.SERVER_URLS, solrUrl); + params.put(SolrConstants.COLLECTION, COLLECTION); + params.put(SolrConstants.COMMIT_SIZE, "100"); + + IndexWriterParams writerParams = new IndexWriterParams(params); + indexWriter.open(writerParams); + } + + @Override + public void tearDownIndexWriter() throws Exception { + if (indexWriter != null) { + try { + indexWriter.close(); + } catch (Exception e) { + // Ignore if open() failed and close state is invalid + } + indexWriter = null; + } + } + + @Override + public IndexWriter getIndexWriter() { + return indexWriter; + } + + @Override + public boolean supportsDelete() { + return true; + } + + @Override + public void verifyDocumentWritten(String docId, String expectedTitle) throws Exception { + try (SolrClient client = new Http2SolrClient.Builder( + "http://" + solrContainer.getHost() + ":" + + solrContainer.getSolrPort() + "/solr/" + COLLECTION).build()) { + ModifiableSolrParams queryParams = new ModifiableSolrParams(); + queryParams.set("q", "id:" + docId); + QueryResponse response = client.query(queryParams); + assertTrue(response.getResults().getNumFound() >= 1, + "Document should exist in Solr"); + Object titleValue = response.getResults().get(0).getFieldValue("title"); + String title = titleValue instanceof Collection + ? ((Collection) titleValue).iterator().next().toString() + : titleValue.toString(); + assertEquals(expectedTitle, title); + } + } + + @Override + public IndexWriter prepareWriterForDeleteTest() throws Exception { + tearDownIndexWriter(); + + Configuration deleteConf = NutchConfiguration.create(); + deleteConf.setBoolean(IndexerMapReduce.INDEXER_DELETE, true); + SolrIndexWriter deleteWriter = new SolrIndexWriter(); + deleteWriter.setConf(deleteConf); + + String solrUrl = "http://" + solrContainer.getHost() + ":" + + solrContainer.getSolrPort() + "/solr/" + COLLECTION; + Map params = new HashMap<>(); + params.put(SolrConstants.SERVER_TYPE, "http"); + params.put(SolrConstants.SERVER_URLS, solrUrl); + params.put(SolrConstants.COLLECTION, COLLECTION); + deleteWriter.open(new IndexWriterParams(params)); + + return deleteWriter; + } +} diff --git a/src/test/org/apache/nutch/indexer/AbstractIndexWriterIT.java b/src/test/org/apache/nutch/indexer/AbstractIndexWriterIT.java new file mode 100644 index 0000000000..b0bf6e0239 --- /dev/null +++ b/src/test/org/apache/nutch/indexer/AbstractIndexWriterIT.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; + +import org.apache.nutch.indexer.NutchDocument; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Abstract base for IndexWriter integration tests. Provides common test logic + * for write/commit and delete operations. + */ +@Testcontainers(disabledWithoutDocker = true) +public abstract class AbstractIndexWriterIT implements IndexWriterIntegrationTest { + + @BeforeEach + void setUp() throws Exception { + setUpIndexWriter(); + } + + @AfterEach + void tearDown() throws Exception { + tearDownIndexWriter(); + } + + @Test + void testWriteAndCommitDocument() throws Exception { + NutchDocument doc = createTestDocument("test-doc-1", "Test Document", + "This is a test document for integration testing."); + assertDoesNotThrow(() -> getIndexWriter().write(doc)); + assertDoesNotThrow(() -> getIndexWriter().commit()); + tearDownIndexWriter(); + verifyDocumentWritten("test-doc-1", "Test Document"); + } + + @Test + void testDeleteDocument() throws Exception { + if (!supportsDelete()) { + return; + } + String docId = "test-doc-to-delete"; + NutchDocument doc = createTestDocument(docId, "Document to Delete", ""); + + IndexWriter writer = getIndexWriter(); + writer.write(doc); + writer.commit(); + + IndexWriter deleteWriter = prepareWriterForDeleteTest(); + if (deleteWriter == null) { + deleteWriter = writer; + } + final IndexWriter writerForDelete = deleteWriter; + assertDoesNotThrow(() -> writerForDelete.delete(docId)); + assertDoesNotThrow(() -> writerForDelete.commit()); + if (deleteWriter != writer) { + try { + deleteWriter.close(); + } catch (Exception e) { + // Ignore + } + } + } + + /** Create a NutchDocument with id, title, and content. */ + protected NutchDocument createTestDocument(String id, String title, String content) { + NutchDocument doc = new NutchDocument(); + doc.add("id", id); + doc.add("title", title); + doc.add("content", content); + return doc; + } +} diff --git a/src/test/org/apache/nutch/indexer/IndexWriterIntegrationTest.java b/src/test/org/apache/nutch/indexer/IndexWriterIntegrationTest.java new file mode 100644 index 0000000000..c6f1027da9 --- /dev/null +++ b/src/test/org/apache/nutch/indexer/IndexWriterIntegrationTest.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer; + +/** + * Contract for IndexWriter integration tests. Implementations run against + * real backends via Testcontainers. + */ +public interface IndexWriterIntegrationTest { + + /** Open the index writer before tests. */ + void setUpIndexWriter() throws Exception; + + /** Close the index writer after tests. */ + void tearDownIndexWriter() throws Exception; + + /** The IndexWriter under test. */ + IndexWriter getIndexWriter(); + + /** Whether this writer supports document deletion (e.g. Kafka does not). */ + boolean supportsDelete(); + + /** + * Optional verification that a document was indexed. + * Default no-op; override for Elastic, Solr. + */ + default void verifyDocumentWritten(String docId, String expectedTitle) throws Exception { + // no-op + } + + /** + * Optional writer configured for delete operations. Used when the main + * writer has delete disabled (e.g. Solr requires INDEXER_DELETE=true). + * Default returns null to use {@link #getIndexWriter()}. + */ + default IndexWriter prepareWriterForDeleteTest() throws Exception { + return null; + } +} From d0099c025b0c799d55d9437310af711f58a07a57 Mon Sep 17 00:00:00 2001 From: Luca <15426+lfoppiano@users.noreply.github.com> Date: Fri, 27 Feb 2026 00:25:16 +0000 Subject: [PATCH 2/8] [NUTCH-3160] Remove System.exit(..) from reusable code (#903) --- .../apache/nutch/metrics/NutchMetrics.java | 3 +++ .../nutch/tools/CommonCrawlDataDumper.java | 8 ++++++- .../tools/TestCommonCrawlDataDumper.java | 21 +++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/java/org/apache/nutch/metrics/NutchMetrics.java b/src/java/org/apache/nutch/metrics/NutchMetrics.java index ccb2d70ed3..14979803a1 100644 --- a/src/java/org/apache/nutch/metrics/NutchMetrics.java +++ b/src/java/org/apache/nutch/metrics/NutchMetrics.java @@ -86,6 +86,9 @@ private NutchMetrics() { /** Counter group for WARC export operations. */ public static final String GROUP_WARC_EXPORTER = "nutch_warc_exporter"; + /** Counter group for Common Crawl data dumper tool. */ + public static final String GROUP_COMMONCRAWL_DUMPER = "nutch_commoncrawl_dumper"; + /** Counter group for domain statistics operations. */ public static final String GROUP_DOMAIN_STATS = "nutch_domain_stats"; diff --git a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java index d5d5035e89..8e37c21fcf 100644 --- a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java +++ b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java @@ -67,6 +67,8 @@ import org.apache.nutch.crawl.LinkDbReader; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.ErrorTracker; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.protocol.Content; import org.apache.nutch.util.DumpFileUtil; import org.apache.nutch.util.NutchConfiguration; @@ -188,6 +190,7 @@ public class CommonCrawlDataDumper extends NutchTool implements Tool { private GzipCompressorOutputStream gzipOutput = null; private TarArchiveOutputStream tarOutput = null; private ArrayList fileList = null; + private ErrorTracker errorTracker; /** * Main method for invoking this tool @@ -210,6 +213,7 @@ public static void main(String[] args) throws Exception { * @param config A populated {@link CommonCrawlConfig} */ public CommonCrawlDataDumper(CommonCrawlConfig config) { + this(); this.config = config; } @@ -217,6 +221,7 @@ public CommonCrawlDataDumper(CommonCrawlConfig config) { * Constructor */ public CommonCrawlDataDumper() { + this.errorTracker = new ErrorTracker(NutchMetrics.GROUP_COMMONCRAWL_DUMPER); } /** @@ -274,7 +279,8 @@ public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip, if (parts == null || parts.size() == 0) { LOG.error( "No segment directories found in {} ", segmentRootDir.getAbsolutePath()); - System.exit(1); + this.errorTracker.recordError(ErrorTracker.ErrorType.OTHER); + return; } LOG.info("Found {} segment parts", parts.size()); if (gzip && !warc) { diff --git a/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java b/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java index fee72b65a5..8124fe20b1 100644 --- a/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java +++ b/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java @@ -17,14 +17,19 @@ package org.apache.nutch.tools; import java.io.File; +import java.lang.reflect.Field; import java.nio.file.Files; import java.util.Collection; import org.apache.commons.io.FileUtils; import org.apache.commons.io.filefilter.FileFilterUtils; +import org.apache.nutch.metrics.ErrorTracker; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; /** * @@ -106,6 +111,22 @@ public void testDump() throws Exception { } + @Test + public void testDumpWithNoSegmentDirectoriesRecordsOtherError() throws Exception { + File emptySegmentDir = Files.createTempDirectory("empty-segments").toFile(); + File outputDir = Files.createTempDirectory("dump-output").toFile(); + + ErrorTracker mockErrorTracker = mock(ErrorTracker.class); + CommonCrawlDataDumper dumper = new CommonCrawlDataDumper(); + Field errorTrackerField = CommonCrawlDataDumper.class.getDeclaredField("errorTracker"); + errorTrackerField.setAccessible(true); + errorTrackerField.set(dumper, mockErrorTracker); + + dumper.dump(outputDir, emptySegmentDir, null, false, null, false, "", false); + + verify(mockErrorTracker, times(1)).recordError(ErrorTracker.ErrorType.OTHER); + } + private boolean hasFile(String fileName, Collection files) { for (File f : files) { if (f.getName().equals(fileName)) { From 0ea90c3ec9cf16110bf1b473311ebcb64aa04a6d Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Mon, 13 Apr 2026 19:17:22 -0700 Subject: [PATCH 3/8] NUTCH-3168 Sandbox Commons JEXL usage in crawl and index pipelines (#909) --- conf/nutch-default.xml | 18 +- docker/README.md | 6 + .../org/apache/nutch/crawl/CrawlDbReader.java | 2 +- .../org/apache/nutch/crawl/Generator.java | 14 +- .../org/apache/nutch/hostdb/ReadHostDb.java | 9 +- src/java/org/apache/nutch/util/JexlUtil.java | 151 +++++++++++++++-- .../nutch/exchange/jexl/JexlExchange.java | 3 +- .../indexer/jexl/JexlIndexingFilter.java | 2 +- .../org/apache/nutch/util/TestJexlUtil.java | 160 ++++++++++++++++++ 9 files changed, 334 insertions(+), 31 deletions(-) create mode 100644 src/test/org/apache/nutch/util/TestJexlUtil.java diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index cdac434830..7a8a73f81b 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1707,7 +1707,10 @@ plugins Directories where Nutch plugins are located. Each element may be a relative or absolute path. If absolute, it is used - as is. If relative, it is searched for on the classpath. + as is. If relative, it is searched for on the classpath. + For secure deployments, treat these directories as trusted code: use + read-only filesystem permissions or immutable images so untrusted + parties cannot add or replace plugin JARs or plugin.xml files. @@ -2367,6 +2370,19 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this each property value is always an array of Strings (so if you expect one value, use [0]) * doc - contains all the NutchFields from the NutchDocument. each property value is always an array of Objects. + Expressions are evaluated in a sandboxed JEXL engine (see also + nutch.jexl.disable.sandbox). + + + + + nutch.jexl.disable.sandbox + false + If true, disables the Commons JEXL sandbox and the restriction + on the JEXL "new" operator for all Nutch JEXL expressions (index filter, + generator, hostdb filter, crawl_db_reader, exchange-jexl, etc.). This is + unsafe and should only be used in fully trusted environments when a + legitimate expression cannot be expressed under the default sandbox. diff --git a/docker/README.md b/docker/README.md index 80e1a1d6d9..720fdf8165 100644 --- a/docker/README.md +++ b/docker/README.md @@ -56,6 +56,12 @@ $(boot2docker shellinit | grep export) #may not be necessary docker build -t apache/nutch . --build-arg BUILD_MODE=2 --build-arg SERVER_PORT=8081 --build-arg SERVER_HOST=0.0.0.0 --build-arg WEBAPP_PORT=8080 ``` +## Security and plugin directories + +Nutch loads executable code from the directories configured as `plugin.folders` (see `nutch-default.xml`). For production and shared images, treat those paths as **trusted**: mount them read-only where possible, rebuild images to change plugins, and run the crawl process under a dedicated low-privilege user so the filesystem cannot be abused to drop unexpected JARs or `plugin.xml` files into that tree. + +User-defined JEXL in configuration (for example `index.jexl.filter`, generator expressions, and `hostdb.filter.expression`) is evaluated in a **sandboxed** engine by default. The property `nutch.jexl.disable.sandbox` disables that protection and must not be set in untrusted environments. + ## Usage If not already running, start docker diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java index 03cf0fbd39..57e684374c 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java @@ -906,7 +906,7 @@ public void setup( retry = config.getInt("retry", -1); if (config.get("expr", null) != null) { - expr = JexlUtil.parseExpression(config.get("expr", null)); + expr = JexlUtil.parseExpression(config, config.get("expr", null)); } sample = config.getFloat("sample", 1); } diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index 102ce39b94..aa8cfcbbfa 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -227,7 +227,7 @@ public void setup( if (!restrictStatusString.isEmpty()) { restrictStatus = CrawlDatum.getStatusByName(restrictStatusString); } - expr = JexlUtil.parseExpression(conf.get(GENERATOR_EXPR, null)); + expr = JexlUtil.parseExpression(conf, conf.get(GENERATOR_EXPR, null)); // Initialize error tracker with cached counters errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context); // Initialize cached counter references @@ -453,10 +453,10 @@ public void setup(Context context) throws IOException { URLNormalizers.SCOPE_GENERATE_HOST_COUNT); if (conf.get(GENERATOR_HOSTDB) != null) { - maxCountExpr = JexlUtil - .parseExpression(conf.get(GENERATOR_MAX_COUNT_EXPR, null)); - fetchDelayExpr = JexlUtil - .parseExpression(conf.get(GENERATOR_FETCH_DELAY_EXPR, null)); + maxCountExpr = JexlUtil.parseExpression(conf, + conf.get(GENERATOR_MAX_COUNT_EXPR, null)); + fetchDelayExpr = JexlUtil.parseExpression(conf, + conf.get(GENERATOR_FETCH_DELAY_EXPR, null)); } // Initialize error tracker with cached counters errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context); @@ -871,7 +871,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, * maximum number of segments to generate * @param expr * a Jexl expression to use in the Generator job. - * @see JexlUtil#parseExpression(String) + * @see JexlUtil#parseExpression(Configuration, String) * @throws IOException * if an I/O exception occurs. * @see LockUtil#createLockFile(Configuration, Path, boolean) @@ -922,7 +922,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, * @param hostdb * name of a hostdb from which to execute Jexl expressions in a bid * to determine the maximum URL count and/or fetch delay per host. - * @see JexlUtil#parseExpression(String) + * @see JexlUtil#parseExpression(Configuration, String) * @throws IOException * if an I/O exception occurs. * @see LockUtil#createLockFile(Configuration, Path, boolean) diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java index 9f2e4a384e..23d94bc881 100644 --- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java +++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java @@ -44,11 +44,10 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.SegmentReaderUtil; -import org.apache.commons.jexl3.JexlBuilder; import org.apache.commons.jexl3.JexlContext; import org.apache.commons.jexl3.JexlScript; -import org.apache.commons.jexl3.JexlEngine; import org.apache.commons.jexl3.MapContext; +import org.apache.nutch.util.JexlUtil; /** * @see Commons @@ -77,11 +76,7 @@ public void setup(Context context) { fieldHeader = context.getConfiguration().getBoolean(HOSTDB_DUMP_HEADER, true); String expr = context.getConfiguration().get(HOSTDB_FILTER_EXPRESSION); if (expr != null) { - // Create or retrieve a JexlEngine - JexlEngine jexl = new JexlBuilder().silent(true).strict(true).create(); - - // Create an expression object - this.expr = jexl.createScript(expr); + this.expr = JexlUtil.parseExpression(context.getConfiguration(), expr); } } diff --git a/src/java/org/apache/nutch/util/JexlUtil.java b/src/java/org/apache/nutch/util/JexlUtil.java index 549aebc419..29e8a4f204 100644 --- a/src/java/org/apache/nutch/util/JexlUtil.java +++ b/src/java/org/apache/nutch/util/JexlUtil.java @@ -23,33 +23,159 @@ import org.apache.commons.jexl3.JexlBuilder; import org.apache.commons.jexl3.JexlEngine; +import org.apache.commons.jexl3.JexlFeatures; import org.apache.commons.jexl3.JexlScript; +import org.apache.commons.jexl3.introspection.JexlSandbox; import org.apache.commons.lang3.time.DateUtils; +import org.apache.hadoop.conf.Configuration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * Utility methods for handling JEXL expressions + * Utility methods for handling JEXL expressions used in crawl and index + * pipelines. Expressions are evaluated under a {@link JexlSandbox} with + * {@link JexlFeatures#newInstance(boolean)} disabled so arbitrary classes cannot + * be instantiated from user-supplied configuration. */ public class JexlUtil { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); + /** + * When {@code true}, JEXL parsing skips the sandbox (unsafe). For trusted + * environments only; not recommended. + */ + public static final String DISABLE_SANDBOX_KEY = "nutch.jexl.disable.sandbox"; + /** Supported format for date parsing yyyy-MM-ddTHH:mm:ssZ */ - private static final Pattern DATE_PATTERN = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z"); + private static final Pattern DATE_PATTERN = Pattern + .compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z"); + + /** + * Classes and interfaces that may be introspected when evaluating Nutch JEXL + * scripts. Default-deny sandbox: anything not listed is blocked. + */ + private static final String[] SANDBOX_ALLOW_CLASSES = { + "java.lang.String", + "java.lang.Boolean", + "java.lang.Byte", + "java.lang.Character", + "java.lang.Short", + "java.lang.Integer", + "java.lang.Long", + "java.lang.Float", + "java.lang.Double", + "java.lang.Number", + "java.lang.Math", + "java.lang.Comparable", + "java.lang.CharSequence", + "java.util.Map", + "java.util.List", + "java.util.Collection", + "java.util.Set", + "java.util.SortedMap", + "java.util.SortedSet", + "java.util.Iterator", + "java.lang.Iterable", + "java.util.AbstractList", + "java.util.AbstractCollection", + "java.util.AbstractMap", + "java.util.AbstractSet", + "java.util.ArrayList", + "java.util.LinkedList", + "java.util.HashMap", + "java.util.LinkedHashMap", + "java.util.HashSet", + "java.util.LinkedHashSet", + "java.util.TreeMap", + "java.util.TreeSet", + "java.util.Collections", + "java.util.Arrays", + "java.util.regex.Pattern", + "java.util.regex.Matcher", + "org.apache.commons.jexl3.MapContext", + "org.apache.nutch.indexer.NutchDocument", + "org.apache.nutch.indexer.NutchField", + }; + + private static volatile JexlEngine sandboxedEngine; + private static volatile JexlEngine legacyEngine; + + private JexlUtil() { + } + + private static JexlSandbox createSandbox() { + JexlSandbox sandbox = new JexlSandbox(false); + for (String name : SANDBOX_ALLOW_CLASSES) { + sandbox.allow(name); + } + return sandbox; + } + + private static JexlFeatures createFeatures() { + return new JexlFeatures(JexlFeatures.createDefault()).newInstance(false); + } + + private static JexlEngine getSandboxedEngine() { + if (sandboxedEngine == null) { + synchronized (JexlUtil.class) { + if (sandboxedEngine == null) { + sandboxedEngine = new JexlBuilder().silent(true).strict(true) + .sandbox(createSandbox()).features(createFeatures()).create(); + } + } + } + return sandboxedEngine; + } + + private static JexlEngine getLegacyEngine() { + if (legacyEngine == null) { + synchronized (JexlUtil.class) { + if (legacyEngine == null) { + legacyEngine = new JexlBuilder().silent(true).strict(true).create(); + } + } + } + return legacyEngine; + } + + private static JexlEngine engineFor(Configuration conf) { + if (conf != null && conf.getBoolean(DISABLE_SANDBOX_KEY, false)) { + LOG.warn("{}=true: JEXL sandbox is disabled; only use in fully trusted environments.", + DISABLE_SANDBOX_KEY); + return getLegacyEngine(); + } + return getSandboxedEngine(); + } /** - * Parses the given expression to a JEXL expression. This supports - * date parsing. + * Parses a JEXL expression using the default (sandboxed) engine. Use + * {@link #parseExpression(Configuration, String)} when a {@link Configuration} + * is available so {@link #DISABLE_SANDBOX_KEY} can be honored. * * @param expr string JEXL expression * @return parsed JEXL expression or null in case of parse error */ public static JexlScript parseExpression(String expr) { - if (expr == null) return null; - + return parseExpression(null, expr); + } + + /** + * Parses a JEXL expression. Unless {@link #DISABLE_SANDBOX_KEY} is set to + * {@code true} in {@code conf}, the expression is parsed for execution under + * a restrictive sandbox. + * + * @param conf Hadoop configuration, or null to always use the sandbox + * @param expr string JEXL expression + * @return parsed JEXL expression or null in case of parse error + */ + public static JexlScript parseExpression(Configuration conf, String expr) { + if (expr == null) { + return null; + } + try { // Translate any date object into a long. Dates must be in the DATE_PATTERN // format. For example: 2016-03-20T00:00:00Z @@ -57,22 +183,21 @@ public static JexlScript parseExpression(String expr) { if (matcher.find()) { String date = matcher.group(); - + // parse the matched substring and get the epoch - Date parsedDate = DateUtils.parseDateStrictly(date, new String[] {"yyyy-MM-dd'T'HH:mm:ss'Z'"}); + Date parsedDate = DateUtils.parseDateStrictly(date, + new String[] { "yyyy-MM-dd'T'HH:mm:ss'Z'" }); long time = parsedDate.getTime(); - + // replace the original string date with the numeric value expr = expr.replace(date, Long.toString(time)); } - JexlEngine jexl = new JexlBuilder().silent(true).strict(true).create(); - - return jexl.createScript(expr); + return engineFor(conf).createScript(expr); } catch (Exception e) { LOG.error(e.getMessage()); } - + return null; } } diff --git a/src/plugin/exchange-jexl/src/java/org/apache/nutch/exchange/jexl/JexlExchange.java b/src/plugin/exchange-jexl/src/java/org/apache/nutch/exchange/jexl/JexlExchange.java index a55557595d..be6bf0dbe8 100644 --- a/src/plugin/exchange-jexl/src/java/org/apache/nutch/exchange/jexl/JexlExchange.java +++ b/src/plugin/exchange-jexl/src/java/org/apache/nutch/exchange/jexl/JexlExchange.java @@ -41,7 +41,8 @@ public class JexlExchange implements Exchange { */ @Override public void open(Map parameters) { - expression = JexlUtil.parseExpression(parameters.get(EXPRESSION_KEY)); + expression = JexlUtil.parseExpression(getConf(), + parameters.get(EXPRESSION_KEY)); } /** diff --git a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java index e1fa792951..a89be63826 100644 --- a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java +++ b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java @@ -114,7 +114,7 @@ public void setConf(Configuration conf) { "The property index.jexl.filter must have a value when index-jexl-filter is used. You can use 'true' or 'false' to index all/none"); } - expr = JexlUtil.parseExpression(strExpr); + expr = JexlUtil.parseExpression(conf, strExpr); if (expr == null) { LOG.error("Failed parsing JEXL from index.jexl.filter: {}", strExpr); diff --git a/src/test/org/apache/nutch/util/TestJexlUtil.java b/src/test/org/apache/nutch/util/TestJexlUtil.java new file mode 100644 index 0000000000..221fffea22 --- /dev/null +++ b/src/test/org/apache/nutch/util/TestJexlUtil.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.util; + +import org.apache.commons.jexl3.JexlScript; +import org.apache.commons.jexl3.MapContext; +import org.apache.hadoop.conf.Configuration; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Unit tests for {@link JexlUtil} sandboxing. + */ +public class TestJexlUtil { + + @Test + public void testSandboxAllowsDocFieldCompare() throws Exception { + JexlScript script = JexlUtil.parseExpression("doc.lang == 'en'"); + assertNotNull(script); + MapContext doc = new MapContext(); + doc.set("lang", "en"); + MapContext root = new MapContext(); + root.set("doc", doc); + assertTrue(Boolean.TRUE.equals(script.execute(root))); + } + + @Test + public void testSandboxAllowsScoreCompare() throws Exception { + JexlScript script = JexlUtil.parseExpression("score > 0.5"); + assertNotNull(script); + MapContext root = new MapContext(); + root.set("score", 0.9f); + assertTrue(Boolean.TRUE.equals(script.execute(root))); + } + + @Test + public void testNewInstanceIoBlocked() { + assertNull(JexlUtil.parseExpression("new java.io.File('/')")); + } + + @Test + public void testNewInstanceFileOutputStreamBlocked() { + assertNull(JexlUtil.parseExpression( + "new java.io.FileOutputStream('/tmp/nutch-jexl-poc')")); + } + + @Test + public void testDisableSandboxAllowsNewExpressionParse() { + Configuration conf = new Configuration(); + conf.setBoolean(JexlUtil.DISABLE_SANDBOX_KEY, true); + JexlScript script = JexlUtil.parseExpression(conf, + "new java.io.File('/')"); + assertNotNull(script); + } + + @Test + public void testArithmeticAllowed() throws Exception { + JexlScript script = JexlUtil.parseExpression("2 * 3 + 1 == 7"); + assertNotNull(script); + assertTrue(Boolean.TRUE.equals(script.execute(new MapContext()))); + } + + @Test + public void testStringMethodsAllowed() throws Exception { + JexlScript script = JexlUtil.parseExpression( + "url.startsWith('http://')"); + assertNotNull(script); + MapContext root = new MapContext(); + root.set("url", "http://example.org/"); + assertTrue(Boolean.TRUE.equals(script.execute(root))); + } + + @Test + public void testDateRewriteStillParses() { + JexlScript script = JexlUtil.parseExpression( + "fetchTime > 2016-03-20T00:00:00Z"); + assertNotNull(script); + } + + @Test + public void testNullExpression() { + assertNull(JexlUtil.parseExpression(null)); + assertNull(JexlUtil.parseExpression(new Configuration(), null)); + } + + @Test + public void testInvalidSyntaxReturnsNull() { + assertNull(JexlUtil.parseExpression("doc.lang=<>:='en'")); + } + + @Test + public void testListSize() throws Exception { + JexlScript script = JexlUtil.parseExpression("doc.tags.size() == 2"); + assertNotNull(script); + MapContext doc = new MapContext(); + java.util.List tags = new java.util.ArrayList<>(); + tags.add("a"); + tags.add("b"); + doc.set("tags", tags); + MapContext root = new MapContext(); + root.set("doc", doc); + assertTrue(Boolean.TRUE.equals(script.execute(root))); + } + + @Test + public void testGeneratorStyleMetadata() throws Exception { + JexlScript script = JexlUtil.parseExpression( + "warc_import_time > 0 && score > 0"); + assertNotNull(script); + MapContext root = new MapContext(); + root.set("warc_import_time", 1); + root.set("score", 1.0f); + assertTrue(Boolean.TRUE.equals(script.execute(root))); + } + + @Test + public void testEqualsIgnoreCase() throws Exception { + JexlScript script = JexlUtil.parseExpression( + "status.equalsIgnoreCase('FETCHED')"); + assertNotNull(script); + MapContext root = new MapContext(); + root.set("status", "fetched"); + assertTrue(Boolean.TRUE.equals(script.execute(root))); + } + + @Test + public void testRegex() throws Exception { + JexlScript script = JexlUtil.parseExpression( + "url =~ 'https?://.*\\.example\\.org/.*'"); + assertNotNull(script); + MapContext root = new MapContext(); + root.set("url", "http://foo.example.org/bar"); + assertTrue(Boolean.TRUE.equals(script.execute(root))); + } + + @Test + public void testTernary() throws Exception { + JexlScript script = JexlUtil.parseExpression("true ? 1 : 0"); + assertNotNull(script); + assertEquals(1, script.execute(new MapContext())); + } +} From 2b72674a2d90b5b927e7b284ea6f8c38329ee577 Mon Sep 17 00:00:00 2001 From: Sai Asish Y Date: Thu, 16 Apr 2026 15:29:05 -0700 Subject: [PATCH 4/8] parse: fix 'occured' -> 'occurred' typo in ParseStatus FAILED javadoc (#910) Javadoc on the FAILED ParseStatus constant in src/java/org/apache/nutch/parse/ParseStatus.java read 'Parsing failed. An Exception occured'. Doc-only change. Signed-off-by: SAY-5 Co-authored-by: SAY-5 --- src/java/org/apache/nutch/parse/ParseStatus.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/org/apache/nutch/parse/ParseStatus.java b/src/java/org/apache/nutch/parse/ParseStatus.java index 052a342247..25b8ae1b47 100644 --- a/src/java/org/apache/nutch/parse/ParseStatus.java +++ b/src/java/org/apache/nutch/parse/ParseStatus.java @@ -56,7 +56,7 @@ public class ParseStatus implements Writable { // Secondary failure codes go here: /** - * Parsing failed. An Exception occured (which may be retrieved from the + * Parsing failed. An Exception occurred (which may be retrieved from the * arguments). */ public static final short FAILED_EXCEPTION = 200; From f8e8583d298813589e37a8627fb234bc78b7c0ae Mon Sep 17 00:00:00 2001 From: lewismc Date: Thu, 7 May 2026 21:18:26 -0700 Subject: [PATCH 5/8] NUTCH-3175 Implement integration testing framework for Nutch Protocol plugins using Testcontainers --- .github/workflows/master-build.yml | 6 + build.xml | 4 + conf/log4j2.xml | 10 +- ivy/ivy.xml | 6 +- src/plugin/build-plugin.xml | 28 +++ src/plugin/build.xml | 13 ++ src/plugin/protocol-ftp/ivy.xml | 3 +- .../org/apache/nutch/protocol/ftp/Ftp.java | 1 + .../nutch/protocol/ftp/FtpResponse.java | 13 +- .../nutch/protocol/ftp/FtpProtocolIT.java | 159 ++++++++++++++++++ .../protocol/htmlunit/HtmlUnitProtocolIT.java | 79 +++++++++ .../nutch/protocol/http/HttpProtocolIT.java | 83 +++++++++ src/plugin/protocol-httpclient/ivy.xml | 1 + .../httpclient/HttpClientProtocolIT.java | 150 +++++++++++++++++ .../protocol/okhttp/OkHttpProtocolIT.java | 79 +++++++++ .../protocol/selenium/SeleniumProtocolIT.java | 86 ++++++++++ .../protocol/AbstractProtocolPluginIT.java | 97 +++++++++++ .../ProtocolPluginIntegrationTest.java | 50 ++++++ 18 files changed, 860 insertions(+), 8 deletions(-) create mode 100644 src/plugin/protocol-ftp/src/test/org/apache/nutch/protocol/ftp/FtpProtocolIT.java create mode 100644 src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java create mode 100644 src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java create mode 100644 src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/HttpClientProtocolIT.java create mode 100644 src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java create mode 100644 src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java create mode 100644 src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java create mode 100644 src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index a8675bf22a..4a1604d928 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -106,6 +106,8 @@ jobs: - 'src/plugin/**' indexer_plugins: - 'src/plugin/indexer-*/**' + protocol_plugins: + - 'src/plugin/protocol-*/**' buildconf: - 'build.xml' - 'ivy/ivy.xml' @@ -126,6 +128,10 @@ jobs: - name: test indexer integration if: ${{ steps.filter.outputs.indexer_plugins == 'true' && matrix.os == 'ubuntu-latest' }} run: ant clean test-indexer-integration -buildfile build.xml + # run protocol integration tests when protocol plugin files change (Docker required, ubuntu-latest only) + - name: test protocol integration + if: ${{ steps.filter.outputs.protocol_plugins == 'true' && matrix.os == 'ubuntu-latest' }} + run: ant clean test-protocol-integration -buildfile build.xml - name: Check for test results id: check_tests if: always() && matrix.os == 'ubuntu-latest' diff --git a/build.xml b/build.xml index 37b8f4cd2a..8e68ebd9a0 100644 --- a/build.xml +++ b/build.xml @@ -539,6 +539,10 @@ + + + + diff --git a/conf/log4j2.xml b/conf/log4j2.xml index 713bfdc7fe..6faf4329fa 100644 --- a/conf/log4j2.xml +++ b/conf/log4j2.xml @@ -19,16 +19,16 @@ - ${sys:hadoop.log.dir:-./logs} - ${sys:hadoop.log.file:-hadoop.log} + ${sys:hadoop.log.dir:-./logs} + ${sys:hadoop.log.file:-hadoop.log} - + - + diff --git a/ivy/ivy.xml b/ivy/ivy.xml index dfdb81c0fb..5ed19206ca 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -151,9 +151,13 @@ - + + + + + diff --git a/src/plugin/build-plugin.xml b/src/plugin/build-plugin.xml index 6ea8d86201..3ba9e1b2fc 100755 --- a/src/plugin/build-plugin.xml +++ b/src/plugin/build-plugin.xml @@ -84,6 +84,7 @@ + @@ -269,6 +270,33 @@ Indexer integration tests failed! + + + + + + + + + + + + + + + + + + + + + + + + + Protocol integration tests failed! + + diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 160febb6cf..24edd14639 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -185,6 +185,19 @@ + + + + + + + + + + + + + diff --git a/src/plugin/protocol-ftp/ivy.xml b/src/plugin/protocol-ftp/ivy.xml index 7749a873ff..5e6a0d8c72 100644 --- a/src/plugin/protocol-ftp/ivy.xml +++ b/src/plugin/protocol-ftp/ivy.xml @@ -37,7 +37,8 @@ - + + diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java index 8cf58f75e7..3570d91188 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java @@ -183,6 +183,7 @@ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { } catch (Exception e) { LOG.error("Could not get protocol output for {}: {}", url, e.getMessage()); + datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, new Text("500")); return new ProtocolOutput(null, new ProtocolStatus(e)); } } diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java index d6f7fd64a4..8796cfc0b3 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java @@ -164,7 +164,8 @@ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf) Ftp.LOG.info("connect to {}", addr); } - ftp.client.connect(addr); + int port = url.getPort(); + ftp.client.connect(addr, port > 0 ? port : FTP.DEFAULT_PORT); if (!FTPReply.isPositiveCompletion(ftp.client.getReplyCode())) { ftp.client.disconnect(); Ftp.LOG.warn("ftp.client.connect() failed: {} {}", addr, @@ -206,6 +207,11 @@ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf) try { ftp.parser = null; String parserKey = ftp.client.getSystemName(); + // strip surrounding quotes that some servers include in SYST reply + if (parserKey.length() > 2 && parserKey.charAt(0) == '"' + && parserKey.charAt(parserKey.length() - 1) == '"') { + parserKey = parserKey.substring(1, parserKey.length() - 1); + } // some server reports as UNKNOWN Type: L8, but in fact UNIX Type: L8 if (parserKey.startsWith("UNKNOWN Type: L8")) parserKey = "UNIX Type: L8"; @@ -302,6 +308,11 @@ private void getFileAsHttpResponse(String path, long lastModified) list = new LinkedList(); ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser); + if (list.isEmpty()) { + this.code = 404; // file not found (server returned empty listing) + return; + } + FTPFile ftpFile = (FTPFile) list.get(0); this.headers.set(Response.CONTENT_LENGTH, Long.valueOf(ftpFile.getSize()).toString()); diff --git a/src/plugin/protocol-ftp/src/test/org/apache/nutch/protocol/ftp/FtpProtocolIT.java b/src/plugin/protocol-ftp/src/test/org/apache/nutch/protocol/ftp/FtpProtocolIT.java new file mode 100644 index 0000000000..ccd3cd1ccb --- /dev/null +++ b/src/plugin/protocol-ftp/src/test/org/apache/nutch/protocol/ftp/FtpProtocolIT.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.ftp; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.nio.charset.StandardCharsets; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolPluginIntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockftpserver.fake.FakeFtpServer; +import org.mockftpserver.fake.UserAccount; +import org.mockftpserver.fake.filesystem.DirectoryEntry; +import org.mockftpserver.fake.filesystem.FileEntry; +import org.mockftpserver.fake.filesystem.UnixFakeFileSystem; + +/** + * Integration tests for protocol-ftp using an in-process FakeFtpServer. + * + *

FTP passive mode with Testcontainers requires that the PASV response IP + * matches the host-visible address of the container, which is not reliable + * across Docker Desktop (macOS/Windows) and Linux Docker environments. An + * in-process {@link FakeFtpServer} from MockFtpServer avoids this constraint + * while still testing the Nutch FTP client against a real FTP protocol + * implementation. + */ +public class FtpProtocolIT implements ProtocolPluginIntegrationTest { + + private static final String FTP_USER = "testuser"; + private static final String FTP_PASS = "testpass"; + private static final String FTP_HOME = "/home/testuser"; + private static final String TEST_FILE = "test.txt"; + private static final String TEST_CONTENT = "FTP integration test content"; + + private static FakeFtpServer fakeFtpServer; + private Ftp protocol; + + @BeforeAll + static void startFtpServer() { + fakeFtpServer = new FakeFtpServer(); + fakeFtpServer.setServerControlPort(0); // bind to a random free port + + UserAccount userAccount = new UserAccount(FTP_USER, FTP_PASS, FTP_HOME); + fakeFtpServer.addUserAccount(userAccount); + + UnixFakeFileSystem fileSystem = new UnixFakeFileSystem(); + fileSystem.add(new DirectoryEntry(FTP_HOME)); + fileSystem.add(new FileEntry(FTP_HOME + "/" + TEST_FILE, TEST_CONTENT)); + fakeFtpServer.setFileSystem(fileSystem); + + fakeFtpServer.start(); + } + + @AfterAll + static void stopFtpServer() { + if (fakeFtpServer != null) { + fakeFtpServer.stop(); + } + } + + @BeforeEach + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", "protocol-ftp|nutch-extensionpoints"); + conf.set("http.agent.name", "NutchFtpProtocolIT"); + conf.set("ftp.username", FTP_USER); + conf.set("ftp.password", FTP_PASS); + conf.setInt("ftp.timeout", 10000); + protocol = new Ftp(); + protocol.setConf(conf); + } + + @AfterEach + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "ftp://localhost:" + fakeFtpServer.getServerControlPort() + + FTP_HOME + "/" + TEST_FILE; + } + + @Test + void testFtpFileDownload() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput(new Text(getTestUrl()), datum); + + assertNotNull(output, "ProtocolOutput must not be null"); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(200, code, "Expected FTP 200 for file download"); + + assertNotNull(output.getContent(), "Content must not be null"); + String body = new String(output.getContent().getContent(), StandardCharsets.UTF_8); + assertTrue(body.contains(TEST_CONTENT), + "Downloaded content must match the file on the FTP server"); + } + + @Test + void testFtpDirectoryListing() throws Exception { + String dirUrl = "ftp://localhost:" + fakeFtpServer.getServerControlPort() + + FTP_HOME + "/"; + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput(new Text(dirUrl), datum); + + assertNotNull(output, "ProtocolOutput for directory listing must not be null"); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(200, code, "Expected FTP 200 for directory listing"); + } + + @Test + void testFtpMissingFileReturnsError() throws Exception { + String missingUrl = "ftp://localhost:" + fakeFtpServer.getServerControlPort() + + FTP_HOME + "/nonexistent.txt"; + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput(new Text(missingUrl), datum); + assertNotNull(output, "ProtocolOutput must not be null even for missing files"); + // FTP 550 "No such file" maps to a non-200 Nutch status + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertTrue(code != 200, "Expected non-200 code for missing FTP file, got: " + code); + } +} diff --git a/src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java b/src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java new file mode 100644 index 0000000000..42c551b72f --- /dev/null +++ b/src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.htmlunit; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.AbstractProtocolPluginIT; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Integration tests for protocol-htmlunit using a real nginx container. + */ +@Testcontainers(disabledWithoutDocker = true) +public class HtmlUnitProtocolIT extends AbstractProtocolPluginIT { + + @Container + private static final GenericContainer nginx = + new GenericContainer<>("nginx:alpine").withExposedPorts(80); + + private Http protocol; + + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", + "protocol-htmlunit|lib-htmlunit|lib-http|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new Http(); + protocol.setConf(conf); + } + + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/"; + } + + @Test + void testFetchReturnsContent() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput( + new org.apache.hadoop.io.Text(getTestUrl()), datum); + assertNotNull(output.getContent(), + "protocol-htmlunit must return non-null content for a live nginx page"); + } +} diff --git a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java new file mode 100644 index 0000000000..87db32335b --- /dev/null +++ b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.http; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.AbstractProtocolPluginIT; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Integration tests for protocol-http using a real nginx container. + */ +@Testcontainers(disabledWithoutDocker = true) +public class HttpProtocolIT extends AbstractProtocolPluginIT { + + @Container + private static final GenericContainer nginx = + new GenericContainer<>("nginx:alpine").withExposedPorts(80); + + private Http protocol; + + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", "protocol-http|lib-http|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new Http(); + protocol.setConf(conf); + } + + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/"; + } + + @Test + void testFetchRedirect301() throws Exception { + // nginx returns 301 for directory URLs without trailing slash when autoindex + // is off; test a manual redirect via the default nginx welcome page path + String redirectUrl = + "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/index.html"; + CrawlDatum datum = new CrawlDatum(); + protocol.getProtocolOutput(new Text(redirectUrl), datum); + int code = getHttpStatusCode(datum); + // nginx serves index.html directly with 200; the base test covers 200/404 + assertEquals(200, code, "Expected 200 for index.html from nginx"); + } +} diff --git a/src/plugin/protocol-httpclient/ivy.xml b/src/plugin/protocol-httpclient/ivy.xml index 0b3ce0af73..e5987074b8 100644 --- a/src/plugin/protocol-httpclient/ivy.xml +++ b/src/plugin/protocol-httpclient/ivy.xml @@ -38,6 +38,7 @@ + diff --git a/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/HttpClientProtocolIT.java b/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/HttpClientProtocolIT.java new file mode 100644 index 0000000000..7345e4b029 --- /dev/null +++ b/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/HttpClientProtocolIT.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.httpclient; + +import static com.github.tomakehurst.wiremock.client.WireMock.aResponse; +import static com.github.tomakehurst.wiremock.client.WireMock.get; +import static com.github.tomakehurst.wiremock.client.WireMock.urlEqualTo; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import com.github.tomakehurst.wiremock.WireMockServer; +import com.github.tomakehurst.wiremock.core.WireMockConfiguration; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolPluginIntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +/** + * Integration tests for protocol-httpclient using an in-process WireMock + * server. + * + *

WireMock runs in the test JVM so no Docker container is required. The + * Nutch httpclient plugin connects to it over a real TCP socket, exercising + * the full HTTP client stack including header handling and Basic-auth + * challenge/response. + */ +public class HttpClientProtocolIT implements ProtocolPluginIntegrationTest { + + private static WireMockServer wireMock; + private Http protocol; + + @BeforeAll + static void startWireMock() { + wireMock = new WireMockServer(WireMockConfiguration.options().dynamicPort()); + wireMock.start(); + + wireMock.stubFor(get(urlEqualTo("/")) + .willReturn(aResponse() + .withStatus(200) + .withHeader("Content-Type", "text/html") + .withBody("Integration test"))); + + wireMock.stubFor(get(urlEqualTo("/notfound")) + .willReturn(aResponse().withStatus(404))); + + wireMock.stubFor(get(urlEqualTo("/secure")) + .withBasicAuth("testuser", "testpass") + .willReturn(aResponse() + .withStatus(200) + .withHeader("Content-Type", "text/html") + .withBody("Authenticated"))); + + wireMock.stubFor(get(urlEqualTo("/secure")) + .willReturn(aResponse() + .withStatus(401) + .withHeader("WWW-Authenticate", "Basic realm=\"Test\"") + .withBody("Unauthorized"))); + } + + @AfterAll + static void stopWireMock() { + if (wireMock != null) { + wireMock.stop(); + } + } + + @BeforeEach + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", + "protocol-httpclient|lib-http|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new Http(); + protocol.setConf(conf); + } + + @AfterEach + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://localhost:" + wireMock.port() + "/"; + } + + @Test + void testFetch200() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput(new Text(getTestUrl()), datum); + assertNotNull(output, "ProtocolOutput must not be null"); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(200, code, "Expected HTTP 200 from WireMock stub"); + } + + @Test + void testFetch404() throws Exception { + String url = "http://localhost:" + wireMock.port() + "/notfound"; + CrawlDatum datum = new CrawlDatum(); + protocol.getProtocolOutput(new Text(url), datum); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(404, code, "Expected HTTP 404 for /notfound stub"); + } + + @Test + void testUnauthenticatedRequestReturns401() throws Exception { + String secureUrl = "http://localhost:" + wireMock.port() + "/secure"; + CrawlDatum datum = new CrawlDatum(); + protocol.getProtocolOutput(new Text(secureUrl), datum); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(401, code, + "Unauthenticated request to /secure should return 401"); + } +} diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java new file mode 100644 index 0000000000..d5342d8309 --- /dev/null +++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.okhttp; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.AbstractProtocolPluginIT; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Integration tests for protocol-okhttp using a real nginx container. + */ +@Testcontainers(disabledWithoutDocker = true) +public class OkHttpProtocolIT extends AbstractProtocolPluginIT { + + @Container + private static final GenericContainer nginx = + new GenericContainer<>("nginx:alpine").withExposedPorts(80); + + private OkHttp protocol; + + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", "protocol-okhttp|lib-http|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new OkHttp(); + protocol.setConf(conf); + } + + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/"; + } + + /** OkHttp transparently decompresses gzip; verify content is returned. */ + @Test + void testFetchWithAcceptEncoding() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput( + new org.apache.hadoop.io.Text(getTestUrl()), datum); + assertNotNull(output.getContent(), + "Content must be present even when server uses compression"); + } +} diff --git a/src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java b/src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java new file mode 100644 index 0000000000..ec928df64f --- /dev/null +++ b/src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.selenium; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.AbstractProtocolPluginIT; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Integration tests for protocol-selenium using a real nginx container. + * + *

Note: protocol-selenium uses raw HTTP sockets (the same underlying + * transport as protocol-http) rather than a Selenium WebDriver. Tests here + * validate that the plugin connects to and fetches content from a live HTTP + * server. Browser-based rendering is covered by protocol-interactiveselenium + * which is excluded from automated integration tests due to its stateful + * handler requirements. + */ +@Testcontainers(disabledWithoutDocker = true) +public class SeleniumProtocolIT extends AbstractProtocolPluginIT { + + @Container + private static final GenericContainer nginx = + new GenericContainer<>("nginx:alpine").withExposedPorts(80); + + private Http protocol; + + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", + "protocol-selenium|lib-http|lib-selenium|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new Http(); + protocol.setConf(conf); + } + + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/"; + } + + @Test + void testFetchReturnsContent() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput( + new org.apache.hadoop.io.Text(getTestUrl()), datum); + assertNotNull(output.getContent(), + "protocol-selenium must return non-null content for a live nginx page"); + } +} diff --git a/src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java b/src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java new file mode 100644 index 0000000000..9469b168fb --- /dev/null +++ b/src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Abstract base for Protocol plugin integration tests using Testcontainers. + * Provides common test logic for fetching URLs and verifying status codes. + * + *

Subclasses declare a static {@code @Container} field for the server + * container, implement {@link ProtocolPluginIntegrationTest}, and may add + * protocol-specific tests (e.g., redirect handling, authentication). + */ +@Testcontainers(disabledWithoutDocker = true) +public abstract class AbstractProtocolPluginIT implements ProtocolPluginIntegrationTest { + + @BeforeEach + void setUp() throws Exception { + setUpProtocol(); + } + + @AfterEach + void tearDown() throws Exception { + tearDownProtocol(); + } + + /** Fetch the test URL and assert an HTTP 200 response. */ + @Test + void testFetch200() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = getProtocol() + .getProtocolOutput(new Text(getTestUrl()), datum); + assertNotNull(output, "ProtocolOutput must not be null"); + assertEquals(200, getHttpStatusCode(datum), + "Expected HTTP 200 for " + getTestUrl()); + verifyFetchedContent(output, datum); + } + + /** Fetch a non-existent path and assert an HTTP 404 response. */ + @Test + void testFetch404() throws Exception { + String url = get404Url(); + CrawlDatum datum = new CrawlDatum(); + getProtocol().getProtocolOutput(new Text(url), datum); + assertEquals(404, getHttpStatusCode(datum), + "Expected HTTP 404 for " + url); + } + + /** + * Returns a URL expected to produce a 404. Default appends a random path + * segment to {@link #getTestUrl()}; override if the server needs a specific + * path. + */ + protected String get404Url() { + String base = getTestUrl(); + if (base.endsWith("/")) { + return base + "nonexistent-path-xyz"; + } + return base + "/nonexistent-path-xyz"; + } + + /** + * Reads the HTTP status code stored in the CrawlDatum metadata by Nutch + * protocol plugins. Returns -1 if no status code was stored. + */ + protected static int getHttpStatusCode(CrawlDatum datum) { + if (datum.getMetaData().containsKey(Nutch.PROTOCOL_STATUS_CODE_KEY)) { + return Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + } + return -1; + } +} diff --git a/src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java b/src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java new file mode 100644 index 0000000000..b3778077d9 --- /dev/null +++ b/src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol; + +import org.apache.nutch.crawl.CrawlDatum; + +/** + * Contract for Protocol plugin integration tests. Implementations run against + * real server backends (via Testcontainers or embedded servers). + */ +public interface ProtocolPluginIntegrationTest { + + /** Set up the protocol plugin and its backing server before tests. */ + void setUpProtocol() throws Exception; + + /** Shut down the protocol plugin after tests. */ + void tearDownProtocol() throws Exception; + + /** The Protocol under test. */ + Protocol getProtocol(); + + /** + * A URL that the backing server will serve with a 200/success response. + * Must point into the container or embedded server started by this test. + */ + String getTestUrl(); + + /** + * Optional extra verification after a successful fetch. + * Default is a no-op; override to inspect content, headers, etc. + */ + default void verifyFetchedContent(ProtocolOutput output, CrawlDatum datum) + throws Exception { + // no-op + } +} From 5cc412c38c48dda065ce7a77833dfc92cf300a92 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 15 May 2026 19:13:12 +0200 Subject: [PATCH 6/8] Integrate NUTCH-3175 into build of CCF fork --- .github/workflows/cc-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cc-build.yml b/.github/workflows/cc-build.yml index 1e8f23a691..2ef2fb6fca 100644 --- a/.github/workflows/cc-build.yml +++ b/.github/workflows/cc-build.yml @@ -61,4 +61,4 @@ jobs: restore-keys: | ${{ runner.os }}-ivy- - name: Test - run: ant clean test -buildfile build.xml + run: ant clean test test-protocol-integration -buildfile build.xml From 98704c5c8a301d2e700b1d706e81de708522bdaf Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sun, 17 May 2026 14:49:22 +0200 Subject: [PATCH 7/8] NUTCH-3176 URLUtil and urlnormalizer-basic: add support for IDNA2008 - URLUtil: - make IDNA2008 the default for the methods toASCII and toUNICODE - provide methods to convert host names both for IDNA2003 and IDNA2008 - also convert host to lowercase it (if not already lowercased) - urlnormalizer-basic: - convert host names using IDNA2008 if the property urlnormalizer.basic.host.idna2008 is true - refactor to share methods between URLUtil and urlnormalizer-basic - refactor calls of URLDecoder and pass Charset instead of String (since Java 10) --- conf/nutch-default.xml | 11 ++ .../apache/nutch/crawl/DeduplicationJob.java | 9 +- .../nutch/plugin/PluginManifestParser.java | 7 +- src/java/org/apache/nutch/util/URLUtil.java | 169 +++++++++++++++--- .../nutch/protocol/file/FileResponse.java | 17 +- .../nutch/protocol/ftp/FtpResponse.java | 3 +- .../urlnormalizer/ajax/AjaxURLNormalizer.java | 17 +- .../basic/BasicURLNormalizer.java | 59 +++--- .../basic/TestBasicURLNormalizer.java | 66 +++++++ .../org/apache/nutch/util/TestURLUtil.java | 126 ++++++++++++- 10 files changed, 397 insertions(+), 87 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 7a8a73f81b..8ae225735d 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1657,6 +1657,17 @@ + + urlnormalizer.basic.host.idna2008 + false + If true, let urlnormalizer-basic + normalize Internationalized Domain Names (IDNs) using the + standard IDNA2008 (RFC 5890). If false, use IDNA2003 (RFC 3490). + Note that urlnormalizer.basic.host.idn must be set, otherwise + this property has no effect. + + + urlnormalizer.basic.host.trim-trailing-dot false diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java index 50aa4cd7bd..52bf422308 100644 --- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java +++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java @@ -17,9 +17,9 @@ package org.apache.nutch.crawl; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.lang.invoke.MethodHandles; import java.net.URLDecoder; +import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; @@ -34,7 +34,6 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.Counter; -import org.apache.hadoop.mapreduce.CounterGroup; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; @@ -70,7 +69,7 @@ public class DeduplicationJob extends NutchTool implements Tool { protected final static Text urlKey = new Text("_URLTEMPKEY_"); protected final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode"; protected final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order"; - protected final static String UTF_8 = StandardCharsets.UTF_8.toString(); + protected final static Charset UTF_8 = StandardCharsets.UTF_8; public static class DBFilter extends Mapper { @@ -224,13 +223,13 @@ protected CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc) { String urlnewDoc = newDoc.getMetaData().get(urlKey).toString(); try { urlExisting = URLDecoder.decode(urlExisting, UTF_8); - } catch (UnsupportedEncodingException | IllegalArgumentException e) { + } catch (IllegalArgumentException e) { LOG.error("Error decoding: {}", urlExisting, e); // use the encoded URL } try { urlnewDoc = URLDecoder.decode(urlnewDoc, UTF_8); - } catch (UnsupportedEncodingException | IllegalArgumentException e) { + } catch (IllegalArgumentException e) { LOG.error("Error decoding: {}", urlnewDoc, e); // use the encoded URL } diff --git a/src/java/org/apache/nutch/plugin/PluginManifestParser.java b/src/java/org/apache/nutch/plugin/PluginManifestParser.java index 10ce4fdb7b..95208fa433 100644 --- a/src/java/org/apache/nutch/plugin/PluginManifestParser.java +++ b/src/java/org/apache/nutch/plugin/PluginManifestParser.java @@ -18,11 +18,11 @@ import java.io.File; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URL; import java.net.URLDecoder; +import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; @@ -124,10 +124,7 @@ public File getPluginFolder(String name) { String path = url.getPath(); if (WINDOWS && path.startsWith("/")) // patch a windows bug path = path.substring(1); - try { - path = URLDecoder.decode(path, "UTF-8"); // decode the url path - } catch (UnsupportedEncodingException e) { - } + path = URLDecoder.decode(path, StandardCharsets.UTF_8); // decode the url path directory = new File(path); } else if (!directory.exists()) { LOG.warn("Plugins: directory not found: {}", name); diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java index 44c6309d2a..fd036480a6 100644 --- a/src/java/org/apache/nutch/util/URLUtil.java +++ b/src/java/org/apache/nutch/util/URLUtil.java @@ -16,6 +16,7 @@ */ package org.apache.nutch.util; +import java.lang.invoke.MethodHandles; import java.net.IDN; import java.net.MalformedURLException; import java.net.URI; @@ -23,11 +24,22 @@ import java.util.Locale; import java.util.regex.Pattern; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.ibm.icu.text.IDNA; + import crawlercommons.domains.EffectiveTldFinder; /** Utility class for URL analysis */ public class URLUtil { + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + private static final IDNA idna = IDNA.getUTS46Instance( + IDNA.NONTRANSITIONAL_TO_ASCII | IDNA.NONTRANSITIONAL_TO_UNICODE); + /** * Resolve relative URL-s and fix a java.net.URL error in handling of URLs * with pure query targets. @@ -520,17 +532,39 @@ public static String getProtocol(URL url) { return url.getProtocol(); } + public static boolean isAscii(String str) { + char[] chars = str.toCharArray(); + for (char c : chars) { + if (c > 127) { + return false; + } + } + return true; + } + + /** + * Convert URL with IDN host/domain name into the ASCII representation. + * + * @param url + * URL string to convert + * @return URL string with ASCII host/domain name or null if conversion fails. + */ public static String toASCII(String url) { try { URL u = new URL(url); String host = u.getHost(); - if (host == null || host.isEmpty()) { - // no host name => no punycoded domain name - // also do not add additional slashes for file: URLs (NUTCH-1880) + String hostLowerCase = host.toLowerCase(Locale.ROOT); + if (host == null || host.isEmpty() + || (isAscii(host) && host.equals(hostLowerCase))) { + // - no host name => no punycoded domain name + // - also do not add additional slashes for file: URLs (NUTCH-1880) + // - do nothing if host is already ASCII-only + // - not already in lowercase => conversion also lowercases host name return url; } - URI p = new URI(u.getProtocol(), u.getUserInfo(), IDN.toASCII(host), - u.getPort(), u.getPath(), u.getQuery(), u.getRef()); + URI p = new URI(u.getProtocol(), u.getUserInfo(), + convertIDNA2008(hostLowerCase, true), u.getPort(), u.getPath(), + u.getQuery(), u.getRef()); return p.toString(); } catch (Exception e) { @@ -538,13 +572,25 @@ public static String toASCII(String url) { } } + /** + * Convert URL with IDN host/domain name to the Unicode representation. + * + * @param url + * URL string to convert + * @return URL string with Unicode host/domain name or null if conversion + * fails. + */ public static String toUNICODE(String url) { try { URL u = new URL(url); String host = u.getHost(); - if (host == null || host.isEmpty()) { - // no host name => no punycoded domain name - // also do not add additional slashes for file: URLs (NUTCH-1880) + String hostLowerCase = host.toLowerCase(Locale.ROOT); + if (host == null || host.isEmpty() + || (!hostLowerCase.contains("xn--") && host.equals(hostLowerCase))) { + // - no host name => no punycoded domain name + // - also do not add additional slashes for file: URLs (NUTCH-1880) + // - contains 'xn--' => needs conversion + // - not already in lowercase => conversion also lowercases host name return url; } StringBuilder sb = new StringBuilder(); @@ -554,7 +600,7 @@ public static String toUNICODE(String url) { sb.append(u.getUserInfo()); sb.append('@'); } - sb.append(IDN.toUnicode(host)); + sb.append(convertIDNA2008(hostLowerCase, false)); if (u.getPort() != -1) { sb.append(':'); sb.append(u.getPort()); @@ -572,22 +618,83 @@ public static String toUNICODE(String url) { } /** - * For testing - * @param args print with no args to get help + * Convert IDN host to ASCII or Unicode using Java's built-in {@link IDN} + * class. + * + * The conversion supports only IDNA2003, it does not support IDNA2008. + * However, unless the parameter strictIDNA2003 is true, the + * methods {@link IDN#toASCII(String, int)} resp. + * {@link IDN#toUnicode(String, int)} are called passing the flag + * {@link IDN#ALLOW_UNASSIGNED} to avoid that the conversion fails on + * characters not in the repertoire of Unicode 3.2. + * + * @param host + * host name to be converted (lowercase expected) + * @param toAscii + * if true convert to ASCII, otherwise to Unicode + * @param strictIDNA2003 + * if true, do + * @return converted host name + * @throws MalformedURLException + * if the conversion fails */ - public static void main(String[] args) { - - if (args.length != 1) { - System.err.println("Usage : URLUtil "); - return; + public static String convertIDNA2003(String host, boolean toAscii, + boolean strictIDNA2003) throws MalformedURLException { + try { + if (toAscii) { + return IDN.toASCII(host, strictIDNA2003 ? 0 : IDN.ALLOW_UNASSIGNED); + } else { + return IDN.toUnicode(host, strictIDNA2003 ? 0 : IDN.ALLOW_UNASSIGNED); + } + } catch (IllegalArgumentException | IndexOutOfBoundsException e) { + // IllegalArgumentException: thrown if the input string contains + // non-convertible Unicode codepoints + // IndexOutOfBoundsException: thrown (undocumented) if one "label" + // (non-ASCII dot-separated segment) is longer than 256 characters, + // cf. https://bugs.openjdk.java.net/browse/JDK-6806873 + LOG.debug("Failed to convert IDN host {}: ", host, e); + throw (MalformedURLException) new MalformedURLException( + "Invalid IDN " + host + ": " + e.getMessage()).initCause(e); } + } - String url = args[0]; - try { - System.out.println(URLUtil.getDomainName(new URL(url))); - } catch (MalformedURLException ex) { - ex.printStackTrace(); + /** + * Convert IDN host to ASCII or Unicode using ICU's {@link IDNA} class. + * + * The conversion supports IDNA2008 names. + * + * @param host + * host name to be converted (lowercase expected) + * @param toAscii + * if true convert to ASCII, otherwise to Unicode + * @return converted host name + * @throws MalformedURLException + * if the conversion fails + */ + public static String convertIDNA2008(String host, boolean toAscii) + throws MalformedURLException { + final IDNA.Info idnaInfo = new IDNA.Info(); + final StringBuilder hostConverted = new StringBuilder(); + if (toAscii) { + idna.nameToASCII(host, hostConverted, idnaInfo); + } else { + idna.nameToUnicode(host, hostConverted, idnaInfo); } + if (idnaInfo.hasErrors()) { + StringBuilder msg = new StringBuilder(); + for (IDNA.Error error : idnaInfo.getErrors()) { + if (msg.length() == 0) { + msg.append("Invalid IDNA2008 host").append(host).append(": "); + } else { + msg.append(", "); + } + msg.append(error.name()); + } + String errorMsg = msg.toString(); + LOG.debug("Failed to convert IDN host {}: {}", host, errorMsg); + throw new MalformedURLException(errorMsg); + } + return hostConverted.toString(); } /** @@ -610,4 +717,24 @@ public static boolean isHomePageOf(URL url, String hostName) { && url.getRef() == null // && url.getUserInfo() == null; } + + /** + * For testing + * @param args print with no args to get help + */ + public static void main(String[] args) { + + if (args.length != 1) { + System.err.println("Usage : URLUtil "); + System.err.println("\nExtract and print pay-level domain names for the input URL"); + return; + } + + String url = args[0]; + try { + System.out.println(URLUtil.getDomainName(new URL(url))); + } catch (MalformedURLException ex) { + ex.printStackTrace(); + } + } } diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java index c50988c2dd..64186b9035 100644 --- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java +++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java @@ -16,20 +16,18 @@ */ package org.apache.nutch.protocol.file; -import java.net.URL; import java.io.IOException; -import java.io.UnsupportedEncodingException; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import org.apache.hadoop.conf.Configuration; import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.protocol.Content; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.HttpDateFormat; import org.apache.nutch.net.protocols.Response; - +import org.apache.nutch.protocol.Content; import org.apache.tika.Tika; -import org.apache.hadoop.conf.Configuration; - /** * FileResponse.java mimics file replies as http response. It tries its best to * follow http's way for headers, response codes as well as exceptions. @@ -125,11 +123,8 @@ public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf) String path = url.getPath().isEmpty() ? "/" : url.getPath(); - try { - // specify the encoding via the config later? - path = java.net.URLDecoder.decode(path, "UTF-8"); - } catch (UnsupportedEncodingException ex) { - } + // specify the encoding via the config later? + path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8); try { diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java index 8796cfc0b3..0d7ad1b289 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java @@ -30,6 +30,7 @@ import java.net.InetAddress; import java.net.URL; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.LinkedList; import java.io.ByteArrayOutputStream; @@ -245,7 +246,7 @@ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf) this.content = null; - path = java.net.URLDecoder.decode(path, "UTF-8"); + path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8); if (path.endsWith("/")) { getDirAsHttpResponse(path, datum.getModifiedTime()); diff --git a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java index 2342ced68f..5518e39544 100644 --- a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java +++ b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java @@ -16,19 +16,18 @@ */ package org.apache.nutch.net.urlnormalizer.ajax; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; import java.net.URL; import java.net.URLDecoder; -import java.net.MalformedURLException; -import java.nio.charset.Charset; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; import org.apache.nutch.net.URLNormalizer; import org.apache.nutch.net.URLNormalizers; -import org.apache.hadoop.conf.Configuration; - +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * URLNormalizer capable of dealing with AJAX URL's. * @@ -43,13 +42,11 @@ public class AjaxURLNormalizer implements URLNormalizer { public static String ESCAPED_URL_PART = "_escaped_fragment_="; private Configuration conf; - private Charset utf8; /** * Default constructor. */ public AjaxURLNormalizer() { - utf8 = Charset.forName("UTF-8"); } /** @@ -195,7 +192,7 @@ protected String escape(String fragmentPart) { String hex = null; StringBuilder sb = new StringBuilder(fragmentPart.length()); - for (byte b : fragmentPart.getBytes(utf8)) { + for (byte b : fragmentPart.getBytes(UTF_8)) { if (b < 33) { sb.append('%'); diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java index 2123d8fa9f..4ff9fc64a6 100644 --- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java @@ -16,12 +16,10 @@ */ package org.apache.nutch.net.urlnormalizer.basic; -import java.lang.invoke.MethodHandles; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; -import java.io.UnsupportedEncodingException; -import java.net.IDN; +import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URISyntaxException; import java.net.URL; @@ -36,6 +34,7 @@ import org.apache.nutch.net.URLNormalizer; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.URLUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -47,6 +46,12 @@ *

normalize * percent-encoding in URL paths

+ *

normalize the host name if it is an Internationalized Domain Name (IDN) + * to ASCII or Unicode, depending on the configuration properties + * urlnormalizer.basic.host.idn and + * urlnormalizer.basic.host.idna2008

+ *

remove a trailing dot in the host name (if the property + * urlnormalizer.basic.host.trim-trailing-dot is true)

* */ public class BasicURLNormalizer implements URLNormalizer { @@ -54,6 +59,7 @@ public class BasicURLNormalizer implements URLNormalizer { .getLogger(MethodHandles.lookup().lookupClass()); public final static String NORM_HOST_IDN = "urlnormalizer.basic.host.idn"; + public final static String NORM_HOST_IDNA_2008 = "urlnormalizer.basic.host.idna2008"; public final static String NORM_HOST_TRIM_TRAILING_DOT = "urlnormalizer.basic.host.trim-trailing-dot"; /** @@ -70,7 +76,7 @@ public class BasicURLNormalizer implements URLNormalizer { .compile("%([0-9A-Fa-f]{2})"); // charset used for encoding URLs before escaping - private final static Charset utf8 = StandardCharsets.UTF_8; + private final static Charset UTF_8 = StandardCharsets.UTF_8; /** look-up table for characters which should not be escaped in URL paths */ private final static boolean[] unescapedCharacters = new boolean[128]; @@ -132,20 +138,11 @@ private static boolean isHexCharacter(int c) { || (0x30 <= c && c <= 0x39); } - private static boolean isAscii(String str) { - char[] chars = str.toCharArray(); - for (char c : chars) { - if (c > 127) { - return false; - } - } - return true; - } - private Configuration conf; private boolean hostIDNtoASCII; private boolean hostASCIItoIDN; + private boolean hostIDNA2008; private boolean hostTrimTrailingDot; @Override @@ -159,9 +156,12 @@ public void setConf(Configuration conf) { String normIdn = conf.get(NORM_HOST_IDN, ""); if (normIdn.equalsIgnoreCase("toAscii")) { hostIDNtoASCII = true; + hostASCIItoIDN = false; } else if (normIdn.equalsIgnoreCase("toUnicode")) { + hostIDNtoASCII = false; hostASCIItoIDN = true; } + hostIDNA2008 = conf.getBoolean(NORM_HOST_IDNA_2008, false); hostTrimTrailingDot = conf.getBoolean(NORM_HOST_TRIM_TRAILING_DOT, false); } @@ -364,7 +364,7 @@ private String escapePath(String path) { StringBuilder sb = new StringBuilder(path.length()); // Traverse over all bytes in this URL - byte[] bytes = path.getBytes(utf8); + byte[] bytes = path.getBytes(UTF_8); for (int i = 0; i < bytes.length; i++) { byte b = bytes[i]; // Is this a control character? @@ -415,8 +415,8 @@ private String normalizeHostName(String host) throws MalformedURLException { // 1. unescape percent-encoded characters in host name if (host.indexOf('%') != -1) { try { - host = URLDecoder.decode(host, StandardCharsets.UTF_8.toString()); - } catch (UnsupportedEncodingException | IllegalArgumentException e) { + host = URLDecoder.decode(host, UTF_8); + } catch (IllegalArgumentException e) { LOG.debug("Failed to convert percent-encoded host name {}: ", host, e); throw (MalformedURLException) new MalformedURLException( "Invalid percent-encoded host name " + host + ": " + e.getMessage()) @@ -429,21 +429,18 @@ private String normalizeHostName(String host) throws MalformedURLException { // 3. if configured: convert between Unicode and ASCII forms // for Internationalized Domain Names (IDNs) - if (hostIDNtoASCII && !isAscii(host)) { - try { - host = IDN.toASCII(host); - } catch (IllegalArgumentException | IndexOutOfBoundsException e) { - // IllegalArgumentException: thrown if the input string contains - // non-convertible Unicode codepoints - // IndexOutOfBoundsException: thrown (undocumented) if one "label" - // (non-ASCII dot-separated segment) is longer than 256 characters, - // cf. https://bugs.openjdk.java.net/browse/JDK-6806873 - LOG.debug("Failed to convert IDN host {}: ", host, e); - throw (MalformedURLException) new MalformedURLException( - "Invalid IDN " + host + ": " + e.getMessage()).initCause(e); + if (hostIDNtoASCII && !URLUtil.isAscii(host)) { + if (hostIDNA2008) { + host = URLUtil.convertIDNA2008(host, true); + } else { + host = URLUtil.convertIDNA2003(host, true, false); } } else if (hostASCIItoIDN && host.contains("xn--")) { - host = IDN.toUnicode(host); + if (hostIDNA2008) { + host = URLUtil.convertIDNA2008(host, false); + } else { + host = URLUtil.convertIDNA2003(host, false, false); + } } // 4. optionally trim a trailing dot @@ -466,7 +463,7 @@ public static void main(String args[]) throws IOException { } String line, normUrl; BufferedReader in = new BufferedReader( - new InputStreamReader(System.in, utf8)); + new InputStreamReader(System.in, UTF_8)); while ((line = in.readLine()) != null) { try { normUrl = normalizer.normalize(line, scope); diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java index a6bad41f2e..090c25f2da 100644 --- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java @@ -258,15 +258,24 @@ public void testHostName() throws Exception { // test Internationalized Domain Names BasicURLNormalizer norm = new BasicURLNormalizer(); conf = NutchConfiguration.create(); + + // to ASCII normalization conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii"); norm.setConf(conf); normalizeTest(norm, "https://нэб.рф/", "https://xn--90ax2c.xn--p1ai/"); // verify escaping of percent-encoded characters in IDNs (NUTCH-2824) normalizeTest(norm, "https://www.0251-sachverst%c3%a4ndiger.de/", "https://www.xn--0251-sachverstndiger-ozb.de/"); + // verify that host names with uppercase characters are normalized + normalizeTest(norm, "https://нЭб.РФ/", "https://xn--90ax2c.xn--p1ai/"); + + // to Unicode normalization conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toUnicode"); norm.setConf(conf); normalizeTest(norm, "https://xn--90ax2c.xn--p1ai/", "https://нэб.рф/"); + // verify that host names with uppercase characters are normalized + normalizeTest(norm, "https://Xn--90Ax2c.xN--P1ai/", "https://нэб.рф/"); + // test removal of trailing dot conf.setBoolean(BasicURLNormalizer.NORM_HOST_TRIM_TRAILING_DOT, true); norm.setConf(conf); @@ -274,6 +283,63 @@ public void testHostName() throws Exception { "https://www.example.org/"); } + /** + * Test for IDNA2008 and IDNA2003 compatibility. + */ + @Test + public void testHostNameIDNA2008() throws Exception { + // IDNA2008 (https://www.rfc-editor.org/rfc/rfc5890.html#section-1.1) + BasicURLNormalizer norm = new BasicURLNormalizer(); + conf = NutchConfiguration.create(); + conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii"); + norm.setConf(conf); + + // IDNA2003 / RFC 3490 + // Note: IDNA2008 and IDNA2003 deviate for this example + normalizeTest(norm, "https://straße.de/", "https://strasse.de/"); + + // Verify that characters not in Unicode 3.2 do not fail the normalization + normalizeTest(norm, "https://example.ᬩᬮᬶ.id/", "https://example.xn--9tfky.id/"); + + // IDNA2008 / RFC 5890 + conf.setBoolean(BasicURLNormalizer.NORM_HOST_IDNA_2008, true); + norm.setConf(conf); + // Note: this is different from IDNA2003 + normalizeTest(norm, "https://straße.de/", "https://xn--strae-oqa.de/"); + + // Verify that characters not in Unicode 3.2 do not fail the normalization + normalizeTest(norm, "https://example.ᬩᬮᬶ.id/", "https://example.xn--9tfky.id/"); + + // mixed encodings (Unicode, Punycode, percent encoding) + normalizeTest(norm, "https://xn--p1ai.%D1%80%D1%84/", + "https://xn--p1ai.xn--p1ai/"); + normalizeTest(norm, "https://xn--p1ai.рф/", "https://xn--p1ai.xn--p1ai/"); + + // test conversion to Unicode (IDNA2008) + conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toUnicode"); + norm.setConf(conf); + normalizeTest(norm, "https://xn--strae-oqa.de/", "https://straße.de/"); + normalizeTest(norm, "https://example.xn--9tfky.id/", "https://example.ᬩᬮᬶ.id/"); + + // mixed encodings (Unicode, Punycode, percent encoding), mixed case + normalizeTest(norm, "https://xN--p1aI.Xn--P1ai/", "https://рф.рф/"); + normalizeTest(norm, "https://xN--p1Ai.%D1%80%d1%84/", "https://рф.рф/"); + normalizeTest(norm, "https://булГаков.xN--p1Ai.%D1%80%d1%84/", "https://булгаков.рф.рф/"); + normalizeTest(norm, "https://гоГоль.%d1%80%D1%84.Рф/", "https://гоголь.рф.рф/"); + + // test conversion to Unicode (IDNA2003) + conf.setBoolean(BasicURLNormalizer.NORM_HOST_IDNA_2008, false); + norm.setConf(conf); + normalizeTest(norm, "https://xn--strae-oqa.de/", "https://xn--strae-oqa.de/"); + normalizeTest(norm, "https://example.xn--9tfky.id/", "https://example.ᬩᬮᬶ.id/"); + + // mixed encodings (Unicode, Punycode, percent encoding), mixed case + normalizeTest(norm, "https://xN--p1aI.Xn--P1ai/", "https://рф.рф/"); + normalizeTest(norm, "https://xN--p1Ai.%D1%80%d1%84/", "https://рф.рф/"); + normalizeTest(norm, "https://булГаков.xN--p1Ai.%D1%80%d1%84/", "https://булгаков.рф.рф/"); + normalizeTest(norm, "https://гоГоль.%d1%80%D1%84.Рф/", "https://гоголь.рф.рф/"); + } + /** * Test that normalizer throws MalformedURLException for invalid URLs */ diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java index 4d8ae07971..573af66430 100644 --- a/src/test/org/apache/nutch/util/TestURLUtil.java +++ b/src/test/org/apache/nutch/util/TestURLUtil.java @@ -16,11 +16,18 @@ */ package org.apache.nutch.util; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.net.MalformedURLException; import java.net.URL; import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.*; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; /** Test class for URLUtil */ public class TestURLUtil { @@ -312,7 +319,20 @@ public void testToUNICODE() throws Exception { assertEquals("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1", URLUtil .toUNICODE("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1")); - + // do not fail on characters not in Unicode 3.2 + assertEquals("https://example.ᬩᬮᬶ.id/", + URLUtil.toUNICODE("https://example.xn--9tfky.id/")); + // IDNA2008 + assertEquals("http://straße.de/", + URLUtil.toUNICODE("http://xn--strae-oqa.de/")); + // host names with uppercase characters + assertEquals("https://googie.com/", + URLUtil.toUNICODE("https://googIe.com/")); + assertEquals("https://googie.com/", URLUtil.toASCII("https://googIe.com/")); + assertEquals("https://xn--90ax2c.xn--p1ai/", + URLUtil.toASCII("https://нЭб.РФ/")); + assertEquals("https://нэб.рф/", + URLUtil.toUNICODE("https://Xn--90Ax2c.xN--P1ai/")); } @Test @@ -324,6 +344,106 @@ public void testToASCII() throws Exception { assertEquals("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1", URLUtil .toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1")); + // IDNA2003 + // assertEquals("http://strasse.de/", + // URLUtil.toASCII("http://straße.de/")); + // do not fail on characters not in Unicode 3.2 + assertEquals("https://example.xn--9tfky.id/", + URLUtil.toASCII("https://example.ᬩᬮᬶ.id/")); + // IDNA2008 + assertEquals("http://xn--strae-oqa.de/", + URLUtil.toASCII("http://straße.de/")); + } + + @ParameterizedTest + @CsvSource({ // + "www.xn--evir-zoa.com,www.çevir.com,IDNA2003,true", // + "xn--uni-tbingen-xhb.de,uni-tübingen.de,IDNA2003,true", // + "example.xn--9tfky.id,example.ᬩᬮᬶ.id,IDNA2008,true", // + // Test examples from whatwg-url + "xn--53h.example,☕.example,IDNA2008,true", // + "xn--0ca.xn--ssa73l,à.א̈,IDNA2008,true", // + "xn--mgba3gch31f060k.com,\u0646\u0627\u0645\u0647\u200c\u0627\u06cc.com,IDNA2008,true", // + /* Note: IDNA2008 and IDNA2003 deviate for the following examples, + * cf. https://www.unicode.org/reports/tr46/#IDNA2003-Section */ + "xn--strae-oqa.de,straße.de,IDNA2008,true", // + "strasse.de,straße.de,IDNA2003,false", // + "strasse.de,strasse.de,IDNA2003,true", // + "xn--fa-hia.de,faß.de,IDNA2008,true", // + "fass.de,faß.de,IDNA2003,false", // + "fass.de,fass.de,IDNA2003,true", // + "xn--nxasmm1c.com,βόλος.com,IDNA2008,true", // + "xn--nxasmq6b.com,βόλος.com,IDNA2003,false", // + "xn--nxasmq6b.com,βόλοσ.com,IDNA2003,true", // + "xn--10cl1a0b660p.com,ශ්‍රී.com,IDNA2008,true", // + "xn--10cl1a0b.com,ශ්‍රී.com,IDNA2003,false", // + "xn--10cl1a0b.com,ශ්රී.com,IDNA2003,true", // + "xn--mgba3gch31f060k.com,نامه‌ای.com,IDNA2008,true", // + "xn--mgba3gch31f.com,نامه‌ای.com,IDNA2003,false", // + "xn--mgba3gch31f.com,نامهای.com,IDNA2003,true", // + // mixed lowercase/uppercase: no round trip conversion + "xn--bb-eka.at,ÖBB.at,IDNA2003,false", // + "xn--bb-eka.at,öbb.at,IDNA2003,true", // + // mixed encoding (Punycode and Unicode) + "xn--p1ai.xn--p1ai,рф.xn--p1ai,IDNA2003,false", // + "xn--p1ai.xn--p1ai,xn--p1ai.рф,IDNA2003,false", // + // percent-encoding is not supported + // "xn--p1ai.xn--p1ai,xn--p1ai.%D1%80%D1%84,IDNA2003,false", // + }) + public final void testConvertHost(String ascii, String unicode, String type, + boolean roundTrip) throws Exception { + System.out.println(ascii + " <> " + unicode); + if ("IDNA2008".equals(type)) { + assertEquals(ascii, URLUtil.convertIDNA2008(unicode, true)); + assertEquals(unicode, URLUtil.convertIDNA2008(ascii, false)); + try { + assertNotNull(URLUtil.convertIDNA2003(unicode, true, false)); + } catch (MalformedURLException e) { + /* + * Ok. A IDNA2008 input may raise an exception when using the IDNA2003 + * method + */ + } + } else if ("IDNA2003".equals(type)) { + assertEquals(ascii, URLUtil.convertIDNA2003(unicode, true, true)); + assertEquals(ascii, URLUtil.convertIDNA2003(unicode, true, false)); + if (roundTrip) { + assertEquals(unicode, URLUtil.convertIDNA2003(ascii, false, true)); + assertEquals(unicode, URLUtil.convertIDNA2003(ascii, false, false)); + } + } + } + + @Test + public final void testConvertHostInvalid() { + // broken Punycode + assertDoesNotThrow(() -> assertEquals("xn--xn--bss-7z6ccid.com", + URLUtil.convertIDNA2003("xn--xn--bss-7z6ccid.com", false, true))); + + // invalid Punycode + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("xn--0.pt", false)); + + // IDNA2003 not allowing characters not in Unicode 3.2 + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2003("☕.example", true, true)); + assertDoesNotThrow(() -> assertEquals("xn--53h.example", + URLUtil.convertIDNA2003("xn--53h.example", false, true))); + + // IDNA2008 invalid, + // cf. https://www.unicode.org/reports/tr46/#Implementation_Notes + // cf. https://www.unicode.org/Public/17.0.0/idna/IdnaTestV2.txt + // disallowed character: ⒈ (U+2488 - DIGIT ONE FULL STOP) + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("\u2488com", true)); + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("xn--acom-0w1b", false)); + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("xn--xn--a--gua.pt", false)); + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("xn--a-ä.pt", false)); + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("xn--a-ä.pt", true)); } @Test From 514760bca90337afaedcc036265a6d69739cd312 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 13 May 2026 22:56:43 +0200 Subject: [PATCH 8/8] NUTCH-3177 Fetcher to report idle threads not as hung threads - reset reprUrl in FetcherThread after fetch is finished - report idle threads properly --- src/java/org/apache/nutch/fetcher/Fetcher.java | 13 +++++++++---- .../org/apache/nutch/fetcher/FetcherThread.java | 3 +++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java index 0a08e9da2e..12d1b88bae 100644 --- a/src/java/org/apache/nutch/fetcher/Fetcher.java +++ b/src/java/org/apache/nutch/fetcher/Fetcher.java @@ -452,16 +452,21 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) { * fetcher.threads.timeout.divisor. */ if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) { - LOG.warn("Timeout reached with no new requests since {} seconds.", + LOG.warn( + "Timeout reached with no new requests since {} milliseconds.", timeout); - LOG.warn("Aborting with {} hung threads{}.", activeThreads, + LOG.warn("Aborting with {} hung or idle threads{}.", activeThreads, feeder.isAlive() ? " (queue feeder still alive)" : ""); hungThreadsCounter.increment(activeThreads.get()); for (int i = 0; i < fetcherThreads.size(); i++) { FetcherThread thread = fetcherThreads.get(i); if (thread.isAlive()) { - LOG.warn("Thread #{} hung while processing {}", i, - thread.getReprUrl()); + if (thread.getReprUrl() != null) { + LOG.warn("Thread #{} hung while processing {}", i, + thread.getReprUrl()); + } else { + LOG.warn("Thread #{} idle", i); + } StackTraceElement[] stack = thread.getStackTrace(); StringBuilder sb = new StringBuilder(); sb.append("Stack of thread #").append(i).append(":\n"); diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index 23c2e23542..ba07b8250f 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -623,6 +623,9 @@ public void run() { output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY); } + + // done: unset reprUrl for reporting + setReprUrl(null); } } catch (Throwable e) {