Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/cc-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,4 +61,4 @@ jobs:
restore-keys: |
${{ runner.os }}-ivy-
- name: Test
run: ant clean test -buildfile build.xml
run: ant clean test test-protocol-integration -buildfile build.xml
12 changes: 12 additions & 0 deletions .github/workflows/master-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,10 @@ jobs:
- 'src/testresources/**'
plugins:
- 'src/plugin/**'
indexer_plugins:
- 'src/plugin/indexer-*/**'
protocol_plugins:
- 'src/plugin/protocol-*/**'
buildconf:
- 'build.xml'
- 'ivy/ivy.xml'
Expand All @@ -120,6 +124,14 @@ jobs:
- name: test plugins
if: ${{ steps.filter.outputs.plugins == 'true' && steps.filter.outputs.core == 'false' && steps.filter.outputs.buildconf == 'false' }}
run: ant clean test-plugins -buildfile build.xml
# run indexer integration tests when indexer plugin files change (Docker required, ubuntu-latest only)
- name: test indexer integration
if: ${{ steps.filter.outputs.indexer_plugins == 'true' && matrix.os == 'ubuntu-latest' }}
run: ant clean test-indexer-integration -buildfile build.xml
# run protocol integration tests when protocol plugin files change (Docker required, ubuntu-latest only)
- name: test protocol integration
if: ${{ steps.filter.outputs.protocol_plugins == 'true' && matrix.os == 'ubuntu-latest' }}
run: ant clean test-protocol-integration -buildfile build.xml
- name: Check for test results
id: check_tests
if: always() && matrix.os == 'ubuntu-latest'
Expand Down
8 changes: 8 additions & 0 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,14 @@
<ant dir="src/plugin" target="test-single" inheritAll="false"/>
</target>

<target name="test-indexer-integration" depends="resolve-test, compile, compile-core-test, job" description="--> run indexer plugin integration tests (Testcontainers)">
<ant dir="src/plugin" target="test-indexer-integration" inheritAll="false"/>
</target>

<target name="test-protocol-integration" depends="resolve-test, compile, compile-core-test, job" description="--> run protocol plugin integration tests (Testcontainers)">
<ant dir="src/plugin" target="test-protocol-integration" inheritAll="false"/>
</target>

<target name="nightly" depends="test, tar-src, zip-src" description="--> run the nightly target build">
</target>

Expand Down
10 changes: 5 additions & 5 deletions conf/log4j2.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,16 @@
<!-- default values that can be overridden by system properties:
Note: the script bin/nutch sets these properties from the environment variables
NUTCH_LOG_DIR and NUTCH_LOGFILE -->
<Property name="hadoop.log.dir">${sys:hadoop.log.dir:-./logs}</Property>
<Property name="hadoop.log.file">${sys:hadoop.log.file:-hadoop.log}</Property>
<Property name="nutch.log.dir">${sys:hadoop.log.dir:-./logs}</Property>
<Property name="nutch.log.file">${sys:hadoop.log.file:-hadoop.log}</Property>
</Properties>
<Appenders>
<RollingFile name="RollingFile" fileName="${hadoop.log.dir}/${hadoop.log.file}"
filePattern="${hadoop.log.dir}/$${date:yyyy-MM}/nutch-%d{yyyy-MM-dd}.log.gz">
<RollingFile name="RollingFile" fileName="${nutch.log.dir}/${nutch.log.file}"
filePattern="${nutch.log.dir}/$${date:yyyy-MM}/nutch-%d{yyyy-MM-dd}.log.gz">
<PatternLayout pattern="%d %p %c{1.} [%t] %m%n" />
<CronTriggeringPolicy schedule="0 0 0 * * ?" evaluateOnStartup="true" />
<DefaultRolloverStrategy>
<Delete basePath="${hadoop.log.dir}" maxDepth="2">
<Delete basePath="${nutch.log.dir}" maxDepth="2">
<IfFileName glob="*/nutch-*.log.gz" />
<IfLastModified age="60d" />
</Delete>
Expand Down
29 changes: 28 additions & 1 deletion conf/nutch-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1657,6 +1657,17 @@
</description>
</property>

<property>
<name>urlnormalizer.basic.host.idna2008</name>
<value>false</value>
<description>If true, let urlnormalizer-basic
normalize Internationalized Domain Names (IDNs) using the
standard IDNA2008 (RFC 5890). If false, use IDNA2003 (RFC 3490).
Note that urlnormalizer.basic.host.idn must be set; otherwise,
this property has no effect.
</description>
</property>

<property>
<name>urlnormalizer.basic.host.trim-trailing-dot</name>
<value>false</value>
Expand Down Expand Up @@ -1707,7 +1718,10 @@
<value>plugins</value>
<description>Directories where Nutch plugins are located. Each
element may be a relative or absolute path. If absolute, it is used
as is. If relative, it is searched for on the classpath.</description>
as is. If relative, it is searched for on the classpath.
For secure deployments, treat these directories as trusted code: use
read-only filesystem permissions or immutable images so untrusted
parties cannot add or replace plugin JARs or plugin.xml files.</description>
</property>

<property>
Expand Down Expand Up @@ -2367,6 +2381,19 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
each property value is always an array of Strings (so if you expect one value, use [0])
* doc - contains all the NutchFields from the NutchDocument.
each property value is always an array of Objects.
Expressions are evaluated in a sandboxed JEXL engine (see also
nutch.jexl.disable.sandbox).
</description>
</property>

<property>
<name>nutch.jexl.disable.sandbox</name>
<value>false</value>
<description>If true, disables the Commons JEXL sandbox and the restriction
on the JEXL "new" operator for all Nutch JEXL expressions (index filter,
generator, hostdb filter, crawl_db_reader, exchange-jexl, etc.). This is
unsafe and should only be used in fully trusted environments when a
legitimate expression cannot be written under the default sandbox.
</description>
</property>

Expand Down
6 changes: 6 additions & 0 deletions docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,12 @@ $(boot2docker shellinit | grep export) #may not be necessary
docker build -t apache/nutch . --build-arg BUILD_MODE=2 --build-arg SERVER_PORT=8081 --build-arg SERVER_HOST=0.0.0.0 --build-arg WEBAPP_PORT=8080
```

## Security and plugin directories

Nutch loads executable code from the directories configured as `plugin.folders` (see `nutch-default.xml`). For production and shared images, treat those paths as **trusted**: mount them read-only where possible, rebuild images to change plugins, and run the crawl process under a dedicated low-privilege user so the filesystem cannot be abused to drop unexpected JARs or `plugin.xml` files into that tree.

User-defined JEXL expressions in the configuration (for example, `index.jexl.filter`, generator expressions, and `hostdb.filter.expression`) are evaluated in a **sandboxed** engine by default. The property `nutch.jexl.disable.sandbox` disables that protection and must not be set in untrusted environments.

## Usage

If not already running, start docker
Expand Down
8 changes: 8 additions & 0 deletions ivy/ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,14 @@
<dependency org="org.mockito" name="mockito-core" rev="5.18.0" conf="test->default"/>
<dependency org="org.mockito" name="mockito-junit-jupiter" rev="5.18.0" conf="test->default"/>

<!-- Testcontainers for indexer and protocol plugin integration tests -->
<dependency org="org.testcontainers" name="testcontainers" rev="2.0.3" conf="test->default"/>
<dependency org="org.testcontainers" name="junit-jupiter" rev="1.21.4" conf="test->default"/>
<!-- WireMock for HTTP mock server in protocol-httpclient integration tests -->
<dependency org="com.github.tomakehurst" name="wiremock-standalone" rev="3.0.1" conf="test->default"/>
<!-- MockFtpServer for in-process FTP server in protocol-ftp integration tests -->
<dependency org="org.mockftpserver" name="MockFtpServer" rev="3.1.0" conf="test->default"/>

<!-- Jetty used to serve test pages for unit tests, but is also provided as dependency of Hadoop -->
<dependency org="org.eclipse.jetty" name="jetty-server" rev="10.0.25" conf="test->default">
<exclude org="ch.qos.reload4j" module="*" />
Expand Down
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/crawl/CrawlDbReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -906,7 +906,7 @@ public void setup(
retry = config.getInt("retry", -1);

if (config.get("expr", null) != null) {
expr = JexlUtil.parseExpression(config.get("expr", null));
expr = JexlUtil.parseExpression(config, config.get("expr", null));
}
sample = config.getFloat("sample", 1);
}
Expand Down
9 changes: 4 additions & 5 deletions src/java/org/apache/nutch/crawl/DeduplicationJob.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
package org.apache.nutch.crawl;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
Expand All @@ -34,7 +34,6 @@
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
Expand Down Expand Up @@ -70,7 +69,7 @@ public class DeduplicationJob extends NutchTool implements Tool {
protected final static Text urlKey = new Text("_URLTEMPKEY_");
protected final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
protected final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order";
protected final static String UTF_8 = StandardCharsets.UTF_8.toString();
protected final static Charset UTF_8 = StandardCharsets.UTF_8;

public static class DBFilter extends
Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> {
Expand Down Expand Up @@ -224,13 +223,13 @@ protected CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc) {
String urlnewDoc = newDoc.getMetaData().get(urlKey).toString();
try {
urlExisting = URLDecoder.decode(urlExisting, UTF_8);
} catch (UnsupportedEncodingException | IllegalArgumentException e) {
} catch (IllegalArgumentException e) {
LOG.error("Error decoding: {}", urlExisting, e);
// use the encoded URL
}
try {
urlnewDoc = URLDecoder.decode(urlnewDoc, UTF_8);
} catch (UnsupportedEncodingException | IllegalArgumentException e) {
} catch (IllegalArgumentException e) {
LOG.error("Error decoding: {}", urlnewDoc, e);
// use the encoded URL
}
Expand Down
14 changes: 7 additions & 7 deletions src/java/org/apache/nutch/crawl/Generator.java
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ public void setup(
if (!restrictStatusString.isEmpty()) {
restrictStatus = CrawlDatum.getStatusByName(restrictStatusString);
}
expr = JexlUtil.parseExpression(conf.get(GENERATOR_EXPR, null));
expr = JexlUtil.parseExpression(conf, conf.get(GENERATOR_EXPR, null));
// Initialize error tracker with cached counters
errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context);
// Initialize cached counter references
Expand Down Expand Up @@ -453,10 +453,10 @@ public void setup(Context context) throws IOException {
URLNormalizers.SCOPE_GENERATE_HOST_COUNT);

if (conf.get(GENERATOR_HOSTDB) != null) {
maxCountExpr = JexlUtil
.parseExpression(conf.get(GENERATOR_MAX_COUNT_EXPR, null));
fetchDelayExpr = JexlUtil
.parseExpression(conf.get(GENERATOR_FETCH_DELAY_EXPR, null));
maxCountExpr = JexlUtil.parseExpression(conf,
conf.get(GENERATOR_MAX_COUNT_EXPR, null));
fetchDelayExpr = JexlUtil.parseExpression(conf,
conf.get(GENERATOR_FETCH_DELAY_EXPR, null));
}
// Initialize error tracker with cached counters
errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context);
Expand Down Expand Up @@ -871,7 +871,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
* maximum number of segments to generate
* @param expr
* a Jexl expression to use in the Generator job.
* @see JexlUtil#parseExpression(String)
* @see JexlUtil#parseExpression(Configuration, String)
* @throws IOException
* if an I/O exception occurs.
* @see LockUtil#createLockFile(Configuration, Path, boolean)
Expand Down Expand Up @@ -922,7 +922,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
* @param hostdb
* name of a hostdb from which to execute Jexl expressions in a bid
* to determine the maximum URL count and/or fetch delay per host.
* @see JexlUtil#parseExpression(String)
* @see JexlUtil#parseExpression(Configuration, String)
* @throws IOException
* if an I/O exception occurs.
* @see LockUtil#createLockFile(Configuration, Path, boolean)
Expand Down
13 changes: 9 additions & 4 deletions src/java/org/apache/nutch/fetcher/Fetcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -452,16 +452,21 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) {
* fetcher.threads.timeout.divisor.
*/
if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) {
LOG.warn("Timeout reached with no new requests since {} seconds.",
LOG.warn(
"Timeout reached with no new requests since {} milliseconds.",
timeout);
LOG.warn("Aborting with {} hung threads{}.", activeThreads,
LOG.warn("Aborting with {} hung or idle threads{}.", activeThreads,
feeder.isAlive() ? " (queue feeder still alive)" : "");
hungThreadsCounter.increment(activeThreads.get());
for (int i = 0; i < fetcherThreads.size(); i++) {
FetcherThread thread = fetcherThreads.get(i);
if (thread.isAlive()) {
LOG.warn("Thread #{} hung while processing {}", i,
thread.getReprUrl());
if (thread.getReprUrl() != null) {
LOG.warn("Thread #{} hung while processing {}", i,
thread.getReprUrl());
} else {
LOG.warn("Thread #{} idle", i);
}
StackTraceElement[] stack = thread.getStackTrace();
StringBuilder sb = new StringBuilder();
sb.append("Stack of thread #").append(i).append(":\n");
Expand Down
3 changes: 3 additions & 0 deletions src/java/org/apache/nutch/fetcher/FetcherThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,9 @@ public void run() {
output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED,
CrawlDatum.STATUS_FETCH_RETRY);
}

// done: unset reprUrl for reporting
setReprUrl(null);
}

} catch (Throwable e) {
Expand Down
9 changes: 2 additions & 7 deletions src/java/org/apache/nutch/hostdb/ReadHostDb.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,10 @@
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.SegmentReaderUtil;

import org.apache.commons.jexl3.JexlBuilder;
import org.apache.commons.jexl3.JexlContext;
import org.apache.commons.jexl3.JexlScript;
import org.apache.commons.jexl3.JexlEngine;
import org.apache.commons.jexl3.MapContext;
import org.apache.nutch.util.JexlUtil;

/**
* @see <a href='https://commons.apache.org/proper/commons-jexl/reference/syntax.html'>Commons</a>
Expand Down Expand Up @@ -77,11 +76,7 @@ public void setup(Context context) {
fieldHeader = context.getConfiguration().getBoolean(HOSTDB_DUMP_HEADER, true);
String expr = context.getConfiguration().get(HOSTDB_FILTER_EXPRESSION);
if (expr != null) {
// Create or retrieve a JexlEngine
JexlEngine jexl = new JexlBuilder().silent(true).strict(true).create();

// Create an expression object
this.expr = jexl.createScript(expr);
this.expr = JexlUtil.parseExpression(context.getConfiguration(), expr);
}
}

Expand Down
3 changes: 3 additions & 0 deletions src/java/org/apache/nutch/metrics/NutchMetrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ private NutchMetrics() {
/** Counter group for WARC export operations. */
public static final String GROUP_WARC_EXPORTER = "nutch_warc_exporter";

/** Counter group for Common Crawl data dumper tool. */
public static final String GROUP_COMMONCRAWL_DUMPER = "nutch_commoncrawl_dumper";

/** Counter group for domain statistics operations. */
public static final String GROUP_DOMAIN_STATS = "nutch_domain_stats";

Expand Down
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/parse/ParseStatus.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ public class ParseStatus implements Writable {
// Secondary failure codes go here:

/**
* Parsing failed. An Exception occured (which may be retrieved from the
* Parsing failed. An Exception occurred (which may be retrieved from the
* arguments).
*/
public static final short FAILED_EXCEPTION = 200;
Expand Down
7 changes: 2 additions & 5 deletions src/java/org/apache/nutch/plugin/PluginManifestParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

Expand Down Expand Up @@ -124,10 +124,7 @@ public File getPluginFolder(String name) {
String path = url.getPath();
if (WINDOWS && path.startsWith("/")) // patch a windows bug
path = path.substring(1);
try {
path = URLDecoder.decode(path, "UTF-8"); // decode the url path
} catch (UnsupportedEncodingException e) {
}
path = URLDecoder.decode(path, StandardCharsets.UTF_8); // decode the url path
directory = new File(path);
} else if (!directory.exists()) {
LOG.warn("Plugins: directory not found: {}", name);
Expand Down
8 changes: 7 additions & 1 deletion src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
import org.apache.nutch.crawl.LinkDbReader;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.metrics.ErrorTracker;
import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.DumpFileUtil;
import org.apache.nutch.util.NutchConfiguration;
Expand Down Expand Up @@ -188,6 +190,7 @@ public class CommonCrawlDataDumper extends NutchTool implements Tool {
private GzipCompressorOutputStream gzipOutput = null;
private TarArchiveOutputStream tarOutput = null;
private ArrayList<String> fileList = null;
private ErrorTracker errorTracker;

/**
* Main method for invoking this tool
Expand All @@ -210,13 +213,15 @@ public static void main(String[] args) throws Exception {
* @param config A populated {@link CommonCrawlConfig}
*/
public CommonCrawlDataDumper(CommonCrawlConfig config) {
this();
this.config = config;
}

/**
* Constructor
*/
public CommonCrawlDataDumper() {
this.errorTracker = new ErrorTracker(NutchMetrics.GROUP_COMMONCRAWL_DUMPER);
}

/**
Expand Down Expand Up @@ -274,7 +279,8 @@ public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip,
if (parts == null || parts.size() == 0) {
LOG.error( "No segment directories found in {} ",
segmentRootDir.getAbsolutePath());
System.exit(1);
this.errorTracker.recordError(ErrorTracker.ErrorType.OTHER);
return;
}
LOG.info("Found {} segment parts", parts.size());
if (gzip && !warc) {
Expand Down
Loading
Loading