diff --git a/.github/workflows/cc-build.yml b/.github/workflows/cc-build.yml
index e382c8771a..1e8f23a691 100644
--- a/.github/workflows/cc-build.yml
+++ b/.github/workflows/cc-build.yml
@@ -29,9 +29,9 @@ jobs:
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v5
- name: Set up JDK ${{ matrix.java }}
- uses: actions/setup-java@v4
+ uses: actions/setup-java@v5
with:
java-version: ${{ matrix.java }}
distribution: 'temurin'
@@ -53,5 +53,12 @@ jobs:
- name: Install recent public suffix list
run: |
curl https://publicsuffix.org/list/public_suffix_list.dat -o conf/effective_tld_names.dat
+ - name: Cache Ivy dependencies
+ uses: actions/cache@v4
+ with:
+ path: ~/.ivy2/cache
+ key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }}
+ restore-keys: |
+ ${{ runner.os }}-ivy-
- name: Test
run: ant clean test -buildfile build.xml
diff --git a/.github/workflows/junit-report.yml b/.github/workflows/junit-report.yml
index ead3e5b325..e2359737ba 100644
--- a/.github/workflows/junit-report.yml
+++ b/.github/workflows/junit-report.yml
@@ -25,30 +25,41 @@ jobs:
checks:
runs-on: ubuntu-latest
steps:
- - name: Download Test Report
+ - name: Download Test Report (Ubuntu)
uses: dawidd6/action-download-artifact@v11
with:
- name: junit-test-results
+ name: junit-test-results-ubuntu-latest
workflow: master-build.yml
run_id: ${{ github.event.workflow_run.id }}
+ continue-on-error: true
- name: Publish Test Report
- uses: mikepenz/action-junit-report@v5
+ uses: mikepenz/action-junit-report@v6
with:
report_paths: |-
./test/TEST-*.xml
./**/test/TEST-*.xml
+ check_name: |-
+ JUnit Test Report
+ JUnit Test Report Plugins
commit: ${{ github.event.workflow_run.head_sha }}
- comment: true
- pr_id: ${{ github.event.workflow_run.pull_requests[0].number }}
- fail_on_failure: true
+ fail_on_failure: false
+ fail_on_parse_error: true
+ require_tests: true
+ require_passed_tests: true
+ include_passed: false
+ include_skipped: true
+ check_annotations: true
+ annotate_notice: true
job_summary: true
detailed_summary: true
- truncate_stack_traces: false
- fail_on_parse_error: false # temporary while debugging TestMimeUtil
- require_tests: true
+ flaky_summary: true
+ skip_success_summary: true
include_time_in_summary: true
- include_passed: true
+ group_suite: true
+ comment: true
+ updateComment: true
+ skip_comment_without_tests: true
job_name: tests
- check_name: |-
- JUnit Test Report Core
- JUnit Test Report Plugins
+ truncate_stack_traces: false
+ annotations_limit: 50
+ pr_id: ${{ github.event.workflow_run.pull_requests[0].number || '' }}
diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml
index f7265e5b52..d73bb3a693 100644
--- a/.github/workflows/master-build.yml
+++ b/.github/workflows/master-build.yml
@@ -24,7 +24,7 @@ jobs:
javadoc:
strategy:
matrix:
- java: ['11']
+ java: ['17']
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
steps:
@@ -34,12 +34,19 @@ jobs:
with:
java-version: ${{ matrix.java }}
distribution: 'temurin'
+ - name: Cache Ivy dependencies
+ uses: actions/cache@v4
+ with:
+ path: ~/.ivy2/cache
+ key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }}
+ restore-keys: |
+ ${{ runner.os }}-ivy-
- name: Javadoc
run: ant clean javadoc -buildfile build.xml
rat:
strategy:
matrix:
- java: ['11']
+ java: ['17']
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
steps:
@@ -49,6 +56,13 @@ jobs:
with:
java-version: ${{ matrix.java }}
distribution: 'temurin'
+ - name: Cache Ivy dependencies
+ uses: actions/cache@v4
+ with:
+ path: ~/.ivy2/cache
+ key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }}
+ restore-keys: |
+ ${{ runner.os }}-ivy-
- name: Run Apache Rat
run: ant clean run-rat -buildfile build.xml
- name: Cache unknown licenses
@@ -62,10 +76,10 @@ jobs:
tests:
strategy:
matrix:
- java: ['11']
+ java: ['17']
os: [ubuntu-latest, macos-latest]
runs-on: ${{ matrix.os }}
- timeout-minutes: 30
+ timeout-minutes: 45
steps:
- uses: actions/checkout@v5
- name: Set up JDK ${{ matrix.java }}
@@ -73,6 +87,13 @@ jobs:
with:
java-version: ${{ matrix.java }}
distribution: 'temurin'
+ - name: Cache Ivy dependencies
+ uses: actions/cache@v4
+ with:
+ path: ~/.ivy2/cache
+ key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }}
+ restore-keys: |
+ ${{ runner.os }}-ivy-
- uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36
id: filter
with:
@@ -99,13 +120,23 @@ jobs:
- name: test plugins
if: ${{ steps.filter.outputs.plugins == 'true' && steps.filter.outputs.core == 'false' && steps.filter.outputs.buildconf == 'false' }}
run: ant clean test-plugins -buildfile build.xml
+ - name: Check for test results
+ id: check_tests
+ if: always() && matrix.os == 'ubuntu-latest'
+ run: |
+ # nullglob makes unmatched patterns expand to nothing, so an empty array means "no reports".
+ shopt -s globstar nullglob
+ files=(./build/test/TEST-*.xml ./build/**/test/TEST-*.xml)
+ has_results=false
+ if [ "${#files[@]}" -gt 0 ]; then has_results=true; fi
+ # GITHUB_OUTPUT holds a file path; quote it so the redirect is safe against word splitting.
+ echo "has_results=${has_results}" >> "$GITHUB_OUTPUT"
- name: Upload Test Report
uses: actions/upload-artifact@v4
- if: always()
+ if: always() && matrix.os == 'ubuntu-latest' && steps.check_tests.outputs.has_results == 'true'
with:
- name: junit-test-results
+ name: junit-test-results-${{ matrix.os }}
path: |
./build/test/TEST-*.xml
./build/**/test/TEST-*.xml
- retention-days: 1
- overwrite: true
\ No newline at end of file
+ retention-days: 1
\ No newline at end of file
diff --git a/LICENSE-binary b/LICENSE-binary
index 538e3baf7c..addc4a2824 100644
--- a/LICENSE-binary
+++ b/LICENSE-binary
@@ -245,7 +245,6 @@ com.google.inject.extensions:guice-servlet
com.google.j2objc:j2objc-annotations
com.healthmarketscience.jackcess:jackcess
com.healthmarketscience.jackcess:jackcess-encrypt
-com.intellij:annotations
com.maxmind.db:maxmind-db
com.maxmind.geoip2:geoip2
com.nimbusds:nimbus-jose-jwt
@@ -257,7 +256,12 @@ com.rometools:rome-utils
com.shapesecurity:salvation2
com.squareup.okhttp3:okhttp
com.squareup.okhttp3:okhttp-brotli
+com.squareup.okhttp3:okhttp-jvm
+com.squareup.okhttp3:okhttp-zstd
com.squareup.okio:okio
+com.squareup.okio:okio-jvm
+com.squareup.zstd:zstd-kmp-jvm
+com.squareup.zstd:zstd-kmp-okio-jvm
com.tdunning:t-digest
com.typesafe.netty:netty-reactive-streams
com.typesafe.scala-logging:scala-logging_2.12
@@ -275,13 +279,14 @@ commons-lang:commons-lang
commons-logging:commons-logging
commons-net:commons-net
commons-validator:commons-validator
+de.l3s.boilerpipe:boilerpipe
de.vandermeer:ascii-utf-themes
de.vandermeer:asciitable
de.vandermeer:char-translation
de.vandermeer:skb-interfaces
dev.failsafe:failsafe
+info.picocli:picocli
io.dropwizard.metrics:metrics-core
-io.netty:netty
io.netty:netty-all
io.netty:netty-buffer
io.netty:netty-codec
@@ -378,7 +383,7 @@ org.apache.hadoop:hadoop-yarn-api
org.apache.hadoop:hadoop-yarn-client
org.apache.hadoop:hadoop-yarn-common
org.apache.hadoop.thirdparty:hadoop-shaded-guava
-org.apache.hadoop.thirdparty:hadoop-shaded-protobuf_3_7
+org.apache.hadoop.thirdparty:hadoop-shaded-protobuf_3_25
org.apache.httpcomponents:httpasyncclient
org.apache.httpcomponents:httpclient
org.apache.httpcomponents:httpcore
@@ -398,21 +403,13 @@ org.apache.kafka:kafka-storage
org.apache.kafka:kafka-storage-api
org.apache.kafka:kafka-tools-api
org.apache.kafka:kafka_2.12
-org.apache.kerby:kerb-admin
-org.apache.kerby:kerb-client
-org.apache.kerby:kerb-common
org.apache.kerby:kerb-core
org.apache.kerby:kerb-crypto
-org.apache.kerby:kerb-identity
-org.apache.kerby:kerb-server
-org.apache.kerby:kerb-simplekdc
org.apache.kerby:kerb-util
org.apache.kerby:kerby-asn1
org.apache.kerby:kerby-config
org.apache.kerby:kerby-pkix
org.apache.kerby:kerby-util
-org.apache.kerby:kerby-xdr
-org.apache.kerby:token-provider
org.apache.logging.log4j:log4j-api
org.apache.logging.log4j:log4j-core
org.apache.logging.log4j:log4j-slf4j2-impl
@@ -435,6 +432,7 @@ org.apache.pdfbox:fontbox
org.apache.pdfbox:jbig2-imageio
org.apache.pdfbox:jempbox
org.apache.pdfbox:pdfbox
+org.apache.pdfbox:pdfbox-io
org.apache.pdfbox:pdfbox-tools
org.apache.pdfbox:xmpbox
org.apache.poi:poi
@@ -443,6 +441,7 @@ org.apache.poi:poi-ooxml-lite
org.apache.poi:poi-scratchpad
org.apache.solr:solr-solrj
org.apache.tika:tika-core
+org.apache.tika:tika-handler-boilerpipe
org.apache.tika:tika-langdetect-optimaize
org.apache.tika:tika-parser-apple-module
org.apache.tika:tika-parser-audiovideo-module
@@ -476,8 +475,6 @@ org.asynchttpclient:async-http-client
org.asynchttpclient:async-http-client-netty-utils
org.bitbucket.b_c:jose4j
org.ccil.cowan.tagsoup:tagsoup
-org.codehaus.jackson:jackson-core-asl
-org.codehaus.jackson:jackson-mapper-asl
org.codehaus.jettison:jettison
org.eclipse.jetty:jetty-alpn-client
org.eclipse.jetty:jetty-alpn-java-client
@@ -515,9 +512,6 @@ org.gagravarr:vorbis-java-core
org.gagravarr:vorbis-java-tika
org.jetbrains:annotations
org.jetbrains.kotlin:kotlin-stdlib
-org.jetbrains.kotlin:kotlin-stdlib-common
-org.jetbrains.kotlin:kotlin-stdlib-jdk7
-org.jetbrains.kotlin:kotlin-stdlib-jdk8
org.jspecify:jspecify
org.littleshoot:littleproxy
org.locationtech.spatial4j:spatial4j
@@ -595,9 +589,7 @@ BSD 2-Clause
com.barchart.udt:barchart-udt-bundle
com.github.luben:zstd-jni
-com.google.protobuf:protobuf-java
dk.brics:automaton
-dnsjava:dnsjava
org.codehaus.woodstox:stax2-api
org.jline:jline
@@ -609,6 +601,7 @@ BSD 3-Clause
com.adobe.xmp:xmpcore
com.github.virtuald:curvesapi
+dnsjava:dnsjava
org.fusesource.leveldbjni:leveldbjni-all
org.ow2.asm:asm
@@ -633,7 +626,7 @@ Bouncy Castle Licence
(licenses-binary/LICENSE-bouncy-castle-licence.txt)
-org.bouncycastle:bcmail-jdk18on
+org.bouncycastle:bcjmail-jdk18on
org.bouncycastle:bcpkix-jdk18on
org.bouncycastle:bcprov-jdk18on
org.bouncycastle:bcutil-jdk18on
@@ -717,6 +710,8 @@ jakarta.jws:jakarta.jws-api
jakarta.xml.bind:jakarta.xml.bind-api
jakarta.xml.soap:jakarta.xml.soap-api
jakarta.xml.ws:jakarta.xml.ws-api
+org.eclipse.angus:angus-activation
+org.glassfish.jaxb:jaxb-core
org.glassfish.jaxb:jaxb-runtime
org.glassfish.jaxb:txw2
@@ -724,6 +719,8 @@ org.glassfish.jaxb:txw2
Eclipse Public License - Version 2.0
------------------------------------
+(licenses-binary/LICENSE-epl-2.0.txt)
+
org.eclipse.jetty:jetty-http
org.eclipse.jetty:jetty-io
org.eclipse.jetty:jetty-security
@@ -734,6 +731,8 @@ org.eclipse.jetty:jetty-util
MIT
---
+(licenses-binary/LICENSE-mit-license.txt)
+
net.sourceforge.argparse4j:argparse4j
org.slf4j:slf4j-api
@@ -781,7 +780,6 @@ Public Domain
(licenses-binary/LICENSE-public-domain.txt)
aopalliance:aopalliance
-org.tukaani:xz
Public Domain, per Creative Commons CC0
diff --git a/NOTICE-binary b/NOTICE-binary
index 99fea523a4..412ce7d38e 100644
--- a/NOTICE-binary
+++ b/NOTICE-binary
@@ -48,7 +48,7 @@ Apache projects
# org.apache.avro:avro
-Apache Avro (http://avro.apache.org)
+Apache Avro (https://avro.apache.org)
# org.apache.commons:commons-collections4
Apache Commons Collections (https://commons.apache.org/proper/commons-collections/)
@@ -60,6 +60,8 @@ Apache Commons Configuration (https://commons.apache.org/proper/commons-configur
Apache Commons CSV (https://commons.apache.org/proper/commons-csv/)
# org.apache.commons:commons-exec
Apache Commons Exec (http://commons.apache.org/proper/commons-exec/)
+# org.apache.commons:commons-exec
+Apache Commons Exec (https://commons.apache.org/proper/commons-exec/)
# org.apache.commons:commons-jexl3
Apache Commons JEXL (https://commons.apache.org/proper/commons-jexl/)
# org.apache.commons:commons-lang3
@@ -68,8 +70,6 @@ Apache Commons Lang (https://commons.apache.org/proper/commons-lang/)
Apache Commons Lang (http://commons.apache.org/proper/commons-lang/)
# org.apache.commons:commons-math3
Apache Commons Math (http://commons.apache.org/proper/commons-math/)
-# org.apache.commons:commons-math3
-Apache Commons Math (http://commons.apache.org/math/)
# org.apache.commons:commons-text
Apache Commons Text (https://commons.apache.org/proper/commons-text)
@@ -132,8 +132,8 @@ Apache Hadoop YARN Common
# org.apache.hadoop.thirdparty:hadoop-shaded-guava
Apache Hadoop shaded Guava
-# org.apache.hadoop.thirdparty:hadoop-shaded-protobuf_3_7
-Apache Hadoop shaded Protobuf 3.7
+# org.apache.hadoop.thirdparty:hadoop-shaded-protobuf_3_25
+Apache Hadoop shaded Protobuf
# org.apache.httpcomponents:httpasyncclient
Apache HttpAsyncClient (http://hc.apache.org/httpcomponents-asyncclient)
@@ -146,6 +146,8 @@ Apache HttpCore (http://hc.apache.org/httpcomponents-core-ga)
# org.apache.httpcomponents:httpcore-nio
Apache HttpCore NIO (http://hc.apache.org/httpcomponents-core-ga)
# org.apache.httpcomponents:httpmime
+Apache HttpClient Mime (http://hc.apache.org/httpcomponents-client-ga)
+# org.apache.httpcomponents:httpmime
Apache HttpClient Mime (http://hc.apache.org/httpcomponents-client)
# org.apache.james:apache-mime4j-core
@@ -178,22 +180,10 @@ Apache Kafka (https://kafka.apache.org)
# org.apache.kafka:kafka_2.12
Apache Kafka (https://kafka.apache.org)
-# org.apache.kerby:kerb-admin
-Apache Kerby-kerb Admin
-# org.apache.kerby:kerb-client
-Apache Kerby-kerb Client
-# org.apache.kerby:kerb-common
-Apache Kerby-kerb Common
# org.apache.kerby:kerb-core
Apache Kerby-kerb core
# org.apache.kerby:kerb-crypto
Apache Kerby-kerb Crypto
-# org.apache.kerby:kerb-identity
-Apache Kerby-kerb Identity
-# org.apache.kerby:kerb-server
-Apache Kerby-kerb Server
-# org.apache.kerby:kerb-simplekdc
-Apache Kerb Simple Kdc
# org.apache.kerby:kerb-util
Apache Kerby-kerb Util
# org.apache.kerby:kerby-asn1
@@ -204,10 +194,6 @@ Apache Kerby Config
Apache Kerby PKIX Project
# org.apache.kerby:kerby-util
Apache Kerby Util
-# org.apache.kerby:kerby-xdr
-Apache Kerby XDR Project
-# org.apache.kerby:token-provider
-Apache Token provider
# org.apache.logging.log4j:log4j-api
Apache Log4j API
@@ -258,6 +244,8 @@ Apache PDFBox JBIG2 ImageIO plugin
Apache JempBox
# org.apache.pdfbox:pdfbox
Apache PDFBox
+# org.apache.pdfbox:pdfbox-io
+Apache PDFBox io
# org.apache.pdfbox:pdfbox-tools
Apache PDFBox tools
# org.apache.pdfbox:xmpbox
@@ -277,6 +265,8 @@ Apache Solr Solrj
# org.apache.tika:tika-core
Apache Tika core (https://tika.apache.org/)
+# org.apache.tika:tika-handler-boilerpipe
+Apache Tika boilerpipe handler
# org.apache.tika:tika-langdetect-optimaize
Apache Tika Optimaize langdetect
# org.apache.tika:tika-parser-apple-module
@@ -391,10 +381,10 @@ Jackson-annotations (http://github.com/FasterXML/jackson)
Jackson-annotations (https://github.com/FasterXML/jackson)
- license: The Apache Software License, Version 2.0
# com.fasterxml.jackson.core:jackson-core
-Jackson-core (https://github.com/FasterXML/jackson)
+Jackson-core (https://github.com/FasterXML/jackson-core)
- license: The Apache Software License, Version 2.0
# com.fasterxml.jackson.core:jackson-core
-Jackson-core (https://github.com/FasterXML/jackson-core)
+Jackson-core (https://github.com/FasterXML/jackson)
- license: The Apache Software License, Version 2.0
# com.fasterxml.jackson.core:jackson-databind
jackson-databind (http://github.com/FasterXML/jackson)
@@ -519,10 +509,10 @@ error-prone annotations
# com.google.guava:failureaccess
Guava InternalFutureFailureAccess and InternalFutures
-- license: The Apache Software License, Version 2.0
+- license: Apache License, Version 2.0
# com.google.guava:failureaccess
Guava InternalFutureFailureAccess and InternalFutures
-- license: Apache License, Version 2.0
+- license: The Apache Software License, Version 2.0
# com.google.guava:guava
Guava: Google Core Libraries for Java (https://github.com/google/guava)
- license: Apache License, Version 2.0
@@ -548,14 +538,10 @@ J2ObjC Annotations (https://github.com/google/j2objc/)
J2ObjC Annotations (https://github.com/google/j2objc/)
- license: The Apache Software License, Version 2.0
-# com.google.protobuf:protobuf-java
-Protocol Buffer Java API (http://code.google.com/p/protobuf)
-- license: New BSD license
- (licenses-binary/LICENSE-bsd-2-clause.txt)
-
# com.google.re2j:re2j
re2j (http://github.com/google/re2j)
- license: The Go license
+ (licenses-binary/LICENSE-the-go-license.txt)
# com.googlecode.juniversalchardet:juniversalchardet
juniversalchardet (http://juniversalchardet.googlecode.com/)
@@ -577,10 +563,7 @@ Jackcess Encrypt (http://jackcessencrypt.sf.net)
# com.ibm.icu:icu4j
ICU4J (https://icu.unicode.org/)
- license: Unicode-3.0
-
-# com.intellij:annotations
-IntelliJ IDEA Annotations (http://www.jetbrains.org)
-- license: Apache License 2
+ (licenses-binary/LICENSE-unicode-icu-license.txt)
# com.jcraft:jsch
JSch (http://www.jcraft.com/jsch/)
@@ -633,14 +616,30 @@ salvation (http://cspvalidator.org)
- license: Apache License, Version 2.0
# com.squareup.okhttp3:okhttp
-OkHttp (https://square.github.io/okhttp/)
+okhttp (https://square.github.io/okhttp/)
- license: The Apache Software License, Version 2.0
# com.squareup.okhttp3:okhttp-brotli
okhttp-brotli (https://square.github.io/okhttp/)
- license: The Apache Software License, Version 2.0
+# com.squareup.okhttp3:okhttp-jvm
+okhttp (https://square.github.io/okhttp/)
+- license: The Apache Software License, Version 2.0
+# com.squareup.okhttp3:okhttp-zstd
+okhttp-zstd (https://square.github.io/okhttp/)
+- license: The Apache Software License, Version 2.0
# com.squareup.okio:okio
-Okio (https://github.com/square/okio/)
+okio (https://github.com/square/okio/)
+- license: The Apache Software License, Version 2.0
+# com.squareup.okio:okio-jvm
+okio (https://github.com/square/okio/)
+- license: The Apache Software License, Version 2.0
+
+# com.squareup.zstd:zstd-kmp-jvm
+zstd-kmp (https://github.com/square/okio-zstd/)
+- license: The Apache Software License, Version 2.0
+# com.squareup.zstd:zstd-kmp-okio-jvm
+zstd-kmp-okio (https://github.com/square/okio-zstd/)
- license: The Apache Software License, Version 2.0
# com.sun.activation:jakarta.activation
@@ -778,6 +777,10 @@ Apache Commons Net (https://commons.apache.org/proper/commons-net/)
Apache Commons Validator (http://commons.apache.org/proper/commons-validator/)
- license: Apache License, Version 2.0
+# de.l3s.boilerpipe:boilerpipe
+boilerpipe (http://code.google.com/p/boilerpipe/)
+- license: Apache License 2.0
+
# de.vandermeer:ascii-utf-themes
ASCII and UTF Themes (https://github.com/vdmeer/ascii-utf-themes)
- license: Apache 2
@@ -801,17 +804,18 @@ dk.brics.automaton (https://www.brics.dk/automaton)
(licenses-binary/LICENSE-bsd-2-clause.txt)
# dnsjava:dnsjava
-dnsjava (http://www.dnsjava.org)
-- license: BSD 2-Clause license
- (licenses-binary/LICENSE-bsd-2-clause.txt)
+dnsjava (https://github.com/dnsjava/dnsjava)
+- license: BSD-3-Clause
+ (licenses-binary/LICENSE-bsd-3-clause.txt)
+
+# info.picocli:picocli
+picocli (https://picocli.info)
+- license: The Apache Software License, version 2.0
# io.dropwizard.metrics:metrics-core
Metrics Core
- license: Apache License 2.0
-# io.netty:netty
-Netty (http://netty.io/)
-- license: Apache License, Version 2.0
# io.netty:netty-all
Netty/All-in-One (https://netty.io/netty-all/)
- license: Apache License, Version 2.0
@@ -969,6 +973,10 @@ Google S2 geometry library (https://github.com/sgr-io/s2-geometry-library-java)
# jakarta.activation:jakarta.activation-api
Jakarta Activation API jar
+- license: EDL 1.0
+ (licenses-binary/LICENSE-eclipse-distribution-license-v1.0.txt)
+# jakarta.activation:jakarta.activation-api
+Jakarta Activation API (https://github.com/jakartaee/jaf-api)
- license: EDL 1.0
(licenses-binary/LICENSE-eclipse-distribution-license-v1.0.txt)
@@ -1019,7 +1027,7 @@ javax.ws.rs-api (https://github.com/eclipse-ee4j/jaxrs-api)
(licenses-binary/LICENSE-epl-2.0.txt)
# javax.ws.rs:jsr311-api
jsr311-api (https://jsr311.dev.java.net)
-- license: CDDL License
+- license: CDDL License
(licenses-binary/LICENSE-cddl-license.txt)
# javax.xml.bind:jaxb-api
@@ -1060,6 +1068,7 @@ JOpt Simple (http://jopt-simple.github.io/jopt-simple)
# net.sourceforge.argparse4j:argparse4j
argparse4j (http://argparse4j.github.io)
- license: MIT
+ (licenses-binary/LICENSE-mit-license.txt)
# net.sourceforge.htmlunit:htmlunit
HtmlUnit (http://htmlunit.sourceforge.net)
@@ -1105,20 +1114,24 @@ Asynchronous Http Client Netty Utils
jose4j (https://bitbucket.org/b_c/jose4j/)
- license: The Apache Software License, Version 2.0
-# org.bouncycastle:bcmail-jdk18on
-Bouncy Castle S/MIME API (https://www.bouncycastle.org/java.html)
+# org.bouncycastle:bcjmail-jdk18on
+Bouncy Castle JavaMail Jakarta S/MIME APIs (https://www.bouncycastle.org/download/bouncy-castle-java/)
- license: Bouncy Castle Licence
(licenses-binary/LICENSE-bouncy-castle-licence.txt)
# org.bouncycastle:bcpkix-jdk18on
-Bouncy Castle PKIX, CMS, EAC, TSP, PKCS, OCSP, CMP, and CRMF APIs (https://www.bouncycastle.org/java.html)
+Bouncy Castle PKIX, CMS, EAC, TSP, PKCS, OCSP, CMP, and CRMF APIs (https://www.bouncycastle.org/download/bouncy-castle-java/)
- license: Bouncy Castle Licence
(licenses-binary/LICENSE-bouncy-castle-licence.txt)
# org.bouncycastle:bcprov-jdk18on
Bouncy Castle Provider (https://www.bouncycastle.org/java.html)
+- license: Bouncy Castle Licence
+ (licenses-binary/LICENSE-bouncy-castle-licence.txt)
+# org.bouncycastle:bcprov-jdk18on
+Bouncy Castle Provider (https://www.bouncycastle.org/download/bouncy-castle-java/)
- license: Bouncy Castle Licence
(licenses-binary/LICENSE-bouncy-castle-licence.txt)
# org.bouncycastle:bcutil-jdk18on
-Bouncy Castle ASN.1 Extension and Utility APIs (https://www.bouncycastle.org/java.html)
+Bouncy Castle ASN.1 Extension and Utility APIs (https://www.bouncycastle.org/download/bouncy-castle-java/)
- license: Bouncy Castle Licence
(licenses-binary/LICENSE-bouncy-castle-licence.txt)
@@ -1140,13 +1153,6 @@ Checker Qual (https://checkerframework.org/)
- license: The MIT License
(licenses-binary/LICENSE-mit-license.txt)
-# org.codehaus.jackson:jackson-core-asl
-Jackson (http://jackson.codehaus.org)
-- license: The Apache Software License, Version 2.0
-# org.codehaus.jackson:jackson-mapper-asl
-Data Mapper for Jackson (http://jackson.codehaus.org)
-- license: The Apache Software License, Version 2.0
-
# org.codehaus.jettison:jettison
Jettison (https://github.com/jettison-json/jettison)
- license: Apache License, Version 2.0
@@ -1163,7 +1169,12 @@ Stax2 API (http://github.com/FasterXML/stax2-api)
# org.codelibs:jhighlight
JHighlight (https://github.com/codelibs/jhighlight)
- license: CDDL, v1.0
- (licenses-binary/LICENSE-cddl-v1.0.txt)
+ (licenses-binary/LICENSE-cddl-1.0.txt)
+
+# org.eclipse.angus:angus-activation
+Angus Activation Registries
+- license: EDL 1.0
+ (licenses-binary/LICENSE-eclipse-distribution-license-v1.0.txt)
# org.eclipse.jetty:jetty-alpn-client
Jetty :: ALPN :: Client
@@ -1180,18 +1191,22 @@ Jetty :: Http Utility
# org.eclipse.jetty:jetty-http
Jetty :: Http Utility
- license: Eclipse Public License - Version 2.0
+ (licenses-binary/LICENSE-epl-2.0.txt)
# org.eclipse.jetty:jetty-io
Jetty :: IO Utility
- license: Apache Software License - Version 2.0
# org.eclipse.jetty:jetty-io
Jetty :: IO Utility
- license: Eclipse Public License - Version 2.0
+ (licenses-binary/LICENSE-epl-2.0.txt)
# org.eclipse.jetty:jetty-security
Jetty :: Security
- license: Eclipse Public License - Version 2.0
+ (licenses-binary/LICENSE-epl-2.0.txt)
# org.eclipse.jetty:jetty-server
Jetty :: Server Core
- license: Eclipse Public License - Version 2.0
+ (licenses-binary/LICENSE-epl-2.0.txt)
# org.eclipse.jetty:jetty-servlet
Jetty :: Servlet Handling
- license: Apache Software License - Version 2.0
@@ -1201,6 +1216,7 @@ Jetty :: Utilities
# org.eclipse.jetty:jetty-util
Jetty :: Utilities
- license: Eclipse Public License - Version 2.0
+ (licenses-binary/LICENSE-epl-2.0.txt)
# org.eclipse.jetty:jetty-util-ajax
Jetty :: Utilities :: Ajax(JSON)
- license: Apache Software License - Version 2.0
@@ -1295,6 +1311,10 @@ Ogg and Vorbis for Java, Core (https://github.com/Gagravarr/VorbisJava)
Apache Tika plugin for Ogg, Vorbis and FLAC (https://github.com/Gagravarr/VorbisJava)
- license: The Apache Software License, Version 2.0
+# org.glassfish.jaxb:jaxb-core
+JAXB Core (https://eclipse-ee4j.github.io/jaxb-ri/)
+- license: Eclipse Distribution License - v 1.0
+ (licenses-binary/LICENSE-eclipse-distribution-license-v1.0.txt)
# org.glassfish.jaxb:jaxb-runtime
JAXB Runtime (https://eclipse-ee4j.github.io/jaxb-ri/)
- license: Eclipse Distribution License - v 1.0
@@ -1326,22 +1346,16 @@ JDOM (http://www.jdom.org)
JDOM (http://www.jdom.org)
- license: Similar to Apache License but with the acknowledgment clause removed
+# org.jetbrains:annotations
+JetBrains Java Annotations (https://github.com/JetBrains/java-annotations)
+- license: The Apache Software License, Version 2.0
# org.jetbrains:annotations
IntelliJ IDEA Annotations (http://www.jetbrains.org)
- license: The Apache Software License, Version 2.0
# org.jetbrains.kotlin:kotlin-stdlib
-org.jetbrains.kotlin:kotlin-stdlib (https://kotlinlang.org/)
-- license: The Apache License, Version 2.0
-# org.jetbrains.kotlin:kotlin-stdlib-common
-org.jetbrains.kotlin:kotlin-stdlib-common (https://kotlinlang.org/)
-- license: The Apache License, Version 2.0
-# org.jetbrains.kotlin:kotlin-stdlib-jdk7
-org.jetbrains.kotlin:kotlin-stdlib-jdk7 (https://kotlinlang.org/)
-- license: The Apache License, Version 2.0
-# org.jetbrains.kotlin:kotlin-stdlib-jdk8
-org.jetbrains.kotlin:kotlin-stdlib-jdk8 (https://kotlinlang.org/)
-- license: The Apache License, Version 2.0
+Kotlin Stdlib (https://kotlinlang.org/)
+- license: Apache-2.0
# org.jline:jline
JLine Bundle
@@ -1349,6 +1363,10 @@ JLine Bundle
(licenses-binary/LICENSE-bsd-2-clause.txt)
# org.jsoup:jsoup
+jsoup Java HTML Parser (https://jsoup.org/)
+- license: The MIT License
+ (licenses-binary/LICENSE-mit-license.txt)
+# org.jsoup:jsoup
jsoup (http://jsoup.org/)
- license: The MIT License
(licenses-binary/LICENSE-mit-license.txt)
@@ -1517,6 +1535,9 @@ org.seleniumhq.selenium:selenium-support (https://selenium.dev/)
# org.slf4j:jcl-over-slf4j
JCL 1.2 implemented over SLF4J (http://www.slf4j.org)
- license: Apache License, Version 2.0
+# org.slf4j:jcl-over-slf4j
+JCL 1.2 implemented over SLF4J (http://www.slf4j.org)
+- license: Apache-2.0
# org.slf4j:slf4j-api
SLF4J API Module (http://www.slf4j.org)
- license: MIT License
@@ -1524,6 +1545,7 @@ SLF4J API Module (http://www.slf4j.org)
# org.slf4j:slf4j-api
SLF4J API Module (http://www.slf4j.org)
- license: MIT
+ (licenses-binary/LICENSE-mit-license.txt)
# org.tallison:jmatio
JMatIO (https://github.com/tballison/jmatio)
@@ -1532,8 +1554,7 @@ JMatIO (https://github.com/tballison/jmatio)
# org.tukaani:xz
XZ for Java (https://tukaani.org/xz/java.html)
-- license: Public Domain
- (licenses-binary/LICENSE-public-domain.txt)
+- license: Zero-Clause BSD (0BSD)
# org.xerial.snappy:snappy-java
Apache-2.0 (https://github.com/xerial/snappy-java)
diff --git a/build.xml b/build.xml
index a4530c40f1..d8ee908824 100644
--- a/build.xml
+++ b/build.xml
@@ -48,6 +48,8 @@
+
+
@@ -495,7 +497,7 @@
-
+
@@ -510,7 +512,7 @@
-
+
@@ -1110,19 +1112,6 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
@@ -1132,7 +1121,6 @@
-
@@ -1143,18 +1131,24 @@
-
+
+
+
+
+
+
+ dest="${ivy.dir}/ant-eclipse-1.0.bin.tar.bz2" usetimestamp="false" />
-
+
-
+
@@ -1169,7 +1163,7 @@
+ classpath="${ant-eclipse.jar}" />
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index a13894110c..9d396ee7b1 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -142,11 +142,14 @@
-
+
-
-
-
+
+
+
+
+
+
diff --git a/licenses-binary/LICENSE-bsd-licence.txt b/licenses-binary/LICENSE-bsd-licence.txt
new file mode 100644
index 0000000000..ce7787d52f
--- /dev/null
+++ b/licenses-binary/LICENSE-bsd-licence.txt
@@ -0,0 +1,39 @@
+(source: http://antlr.org/license.html)
+
+ANTLR v4 License
+
+ANTLR
+
+ANTLR 4 License
+[The BSD License]
+Copyright (c) 2012 Terence Parr and Sam Harwell
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+Neither the name of the author nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Developer's Certificate of Origin
+As of 4.10, ANTLR uses the Linux Foundation's Developer Certificate of Origin, DCO, version 1.1. See certificate
+of origin. To contribute:
+
+- fork the dev branch of the ANTLR v4 github repository
+- make your changes
+- commit your changes, signing your commits with git commit -s ....
+- send a pull request
diff --git a/licenses-binary/LICENSE-gnu-general-public-license-version-2-gpl2-with-the-classpath-exception.txt b/licenses-binary/LICENSE-gnu-general-public-license-version-2-gpl2-with-the-classpath-exception.txt
deleted file mode 100644
index a25e8c704e..0000000000
--- a/licenses-binary/LICENSE-gnu-general-public-license-version-2-gpl2-with-the-classpath-exception.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-(source: http://www.gnu.org/software/classpath/license.html)
-
-
-GNU Classpath License - GNU Project - Free Software Foundation (FSF)
-
-
-
-
-Classpath is distributed under the terms of the GNU General Public License with the following clarification and special exception.
-
- Linking this library statically or dynamically with other modules is making a combined work based on this library. Thus, the terms and conditions of the GNU General Public License cover the whole combination.
-
- As a special exception, the copyright holders of this library give you permission to link this library with independent modules to produce an executable, regardless of the license terms of these independent modules, and to copy and distribute the resulting executable under terms of your choice, provided that you also meet, for each linked independent module, the terms and conditions of the license of that module. An independent module is a module which is not derived from or based on this library. If you modify this library, you may extend this exception to your version of the library, but you are not obligated to do so. If you do not wish to do so, delete this exception statement from your version.
-
-As such, it can be used to run, create and distribute a large class of applications and applets. When GNU Classpath is used unmodified as the core class library for a virtual machine, compiler for the java languge, or for a program written in the java programming language it does not affect the licensing for distributing those programs directly.
diff --git a/licenses-binary/LICENSE-indiana-university-extreme-lab-software-license-vesion-1.1.1.txt b/licenses-binary/LICENSE-indiana-university-extreme-lab-software-license-vesion-1.1.1.txt
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
index 6575ccb886..68d65ba1ad 100644
--- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
+++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
@@ -35,6 +35,7 @@
import java.lang.invoke.MethodHandles;
import java.net.URI;
import java.net.URISyntaxException;
+import java.time.Duration;
/**
* This class implements an adaptive re-fetch algorithm. This works as follows:
@@ -219,15 +220,15 @@ private void setHostSpecificIntervals(String fileName,
// The custom intervals should respect the boundaries of the default values.
if (m < defaultMin) {
LOG.error(
- "Min. interval out of bounds on line {} in the config. file: `{}`",
- lineNo, line);
+ "Min. interval out of bounds ({}) on line {} in the config. file: `{}`",
+ defaultMin, lineNo, line);
continue;
}
if (M > defaultMax) {
LOG.error(
- "Max. interval out of bounds on line {} in the config. file: `{}`",
- lineNo, line);
+ "Max. interval out of bounds ({}) on line {} in the config. file: `{}`",
+ defaultMax, lineNo, line);
continue;
}
@@ -332,17 +333,30 @@ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
case FetchSchedule.STATUS_UNKNOWN:
break;
}
- if (SYNC_DELTA) {
- // try to synchronize with the time of change
- long delta = (fetchTime - modifiedTime) / 1000L;
- if (delta > interval)
- interval = delta;
- refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
- }
// Ensure the interval does not fall outside of bounds
float minInterval = (getCustomMinInterval(url) != null) ? getCustomMinInterval(url) : MIN_INTERVAL;
float maxInterval = (getCustomMaxInterval(url) != null) ? getCustomMaxInterval(url) : MAX_INTERVAL;
+
+ if (SYNC_DELTA) {
+ // try to synchronize with the time of change
+ long delta = (fetchTime - modifiedTime);
+ if (delta > (interval * 1000))
+ interval = delta / 1000L;
+ // offset: a fraction (sync_delta_rate) of the difference between the last modification time, and the last fetch time.
+ long offset = Math.round(delta * SYNC_DELTA_RATE);
+ long maxIntervalMillis = (long) maxInterval * 1000L;
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("delta (days): {}; offset (days): {}; maxInterval (days): {}",
+ Duration.ofMillis(delta).toDays(), Duration.ofMillis(offset).toDays(), Duration.ofMillis(maxIntervalMillis).toDays());
+ }
+ // convert the offset to a ratio of max interval: avoid next fetchTime in the past, and mimic fetches within max interval
+ if (delta > 0 && offset > maxIntervalMillis) {
+ offset = Math.round((double) offset / delta * maxIntervalMillis); // floating-point ratio (long division would truncate to 0), ex: 9/30*7 = 2.1
+ }
+ refTime = fetchTime - offset;
+ }
+
if (interval < minInterval) {
interval = minInterval;
} else if (interval > maxInterval) {
@@ -389,7 +403,8 @@ public static void main(String[] args) throws Exception {
(p.getFetchInterval() / SECONDS_PER_DAY), miss);
if (p.getFetchTime() <= curTime) {
fetchCnt++;
- fs.setFetchSchedule(new Text("http://www.example.com"), p, p
+ // Text (url) required by the API, but not relevant here.
+ fs.setFetchSchedule(new Text(), p, p
.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
changed ? FetchSchedule.STATUS_MODIFIED
: FetchSchedule.STATUS_NOTMODIFIED);
diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java
index 01598a5f18..32081e1d61 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDb.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDb.java
@@ -43,6 +43,7 @@
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
@@ -145,7 +146,7 @@ public void update(Path crawlDb, Path[] segments, boolean normalize,
if (filter) {
long urlsFiltered = job.getCounters()
- .findCounter("CrawlDB filter", "URLs filtered").getValue();
+ .findCounter(NutchMetrics.GROUP_CRAWLDB_FILTER, NutchMetrics.CRAWLDB_URLS_FILTERED_TOTAL).getValue();
LOG.info(
"CrawlDb update: Total number of existing URLs in CrawlDb rejected by URL filters: {}",
urlsFiltered);
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
index d9ab0d3cc0..912c6e4abf 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
@@ -22,8 +22,10 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
@@ -49,6 +51,11 @@ public class CrawlDbFilter extends
private String scope;
+ // Cached counter references for performance
+ private Counter goneRecordsRemovedCounter;
+ private Counter orphanRecordsRemovedCounter;
+ private Counter urlsFilteredCounter;
+
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
@@ -67,6 +74,21 @@ public void setup(Mapper.Context context) {
scope = conf.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_CRAWLDB);
normalizers = new URLNormalizers(conf, scope);
}
+
+ // Initialize cached counter references
+ initCounters(context);
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot paths.
+ *
+ * @param context mapper context used to resolve the three counters of group NutchMetrics.GROUP_CRAWLDB_FILTER
+ */
+ private void initCounters(Context context) {
+ goneRecordsRemovedCounter = context.getCounter(
+ NutchMetrics.GROUP_CRAWLDB_FILTER, NutchMetrics.CRAWLDB_GONE_RECORDS_REMOVED_TOTAL);
+ orphanRecordsRemovedCounter = context.getCounter(
+ NutchMetrics.GROUP_CRAWLDB_FILTER, NutchMetrics.CRAWLDB_ORPHAN_RECORDS_REMOVED_TOTAL);
+ urlsFilteredCounter = context.getCounter(
+ NutchMetrics.GROUP_CRAWLDB_FILTER, NutchMetrics.CRAWLDB_URLS_FILTERED_TOTAL);
}
private Text newKey = new Text();
@@ -80,15 +102,13 @@ public void map(Text key, CrawlDatum value,
// https://issues.apache.org/jira/browse/NUTCH-1101 check status first,
// cheaper than normalizing or filtering
if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) {
- context.getCounter("CrawlDB filter",
- "Gone records removed").increment(1);
+ goneRecordsRemovedCounter.increment(1);
return;
}
// Whether to remove orphaned pages
// https://issues.apache.org/jira/browse/NUTCH-1932
if (purgeOrphans && CrawlDatum.STATUS_DB_ORPHAN == value.getStatus()) {
- context.getCounter("CrawlDB filter",
- "Orphan records removed").increment(1);
+ orphanRecordsRemovedCounter.increment(1);
return;
}
if (url != null && urlNormalizers) {
@@ -108,7 +128,7 @@ public void map(Text key, CrawlDatum value,
}
}
if (url == null) {
- context.getCounter("CrawlDB filter", "URLs filtered").increment(1);
+ urlsFilteredCounter.increment(1);
} else {
// URL has passed filters
newKey.set(url); // collect it
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
index deb266af61..3454116575 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
@@ -18,19 +18,24 @@
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import java.util.Map.Entry;
import java.io.IOException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.PriorityQueue;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.ErrorTracker;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.StringUtil;
@@ -48,6 +53,10 @@ public class CrawlDbReducer extends
private boolean additionsAllowed;
private int maxInterval;
private FetchSchedule schedule;
+ private ErrorTracker errorTracker;
+
+ // Cached counter references for status-based metrics, keyed by CrawlDatum status byte
+ private Map<Byte, Counter> statusCounters = new HashMap<>();
@Override
public void setup(Reducer.Context context) {
@@ -59,6 +68,17 @@ public void setup(Reducer.Context context) {
schedule = FetchScheduleFactory.getFetchSchedule(conf);
int maxLinks = conf.getInt("db.update.max.inlinks", 10000);
linked = new InlinkPriorityQueue(maxLinks);
+ // Initialize error tracker with cached counters
+ errorTracker = new ErrorTracker(NutchMetrics.GROUP_CRAWLDB, context);
+ }
+
+ /**
+ * Get counter for status, caching for subsequent lookups.
+ *
+ * @param status CrawlDatum status byte; used as the cache key and resolved to the counter name via CrawlDatum.getStatusName
+ * @param context reducer context queried for the counter when the status is not cached yet
+ * @return the counter of group NutchMetrics.GROUP_CRAWLDB named after the status
+ */
+ private Counter getStatusCounter(byte status, Context context) {
+ return statusCounters.computeIfAbsent(status,
+ s -> context.getCounter(NutchMetrics.GROUP_CRAWLDB,
+ CrawlDatum.getStatusName(s)));
}
@Override
@@ -161,10 +181,11 @@ public void reduce(Text key, Iterable values,
scfilters.orphanedScore(key, old);
} catch (ScoringFilterException e) {
LOG.warn("Couldn't update orphaned score, key={}: {}", key, e);
+ errorTracker.incrementCounters(e);
}
context.write(key, old);
- context.getCounter("CrawlDB status",
- CrawlDatum.getStatusName(old.getStatus())).increment(1);
+ // Dynamic counter based on status name
+ getStatusCounter(old.getStatus(), context).increment(1);
} else {
LOG.warn("Missing fetch and old value, signature={}",
StringUtil.toHexString(signature));
@@ -206,6 +227,7 @@ public void reduce(Text key, Iterable values,
} catch (ScoringFilterException e) {
LOG.warn("Cannot filter init score for url {}, using default: {}",
key, e.getMessage());
+ errorTracker.incrementCounters(e);
result.setScore(0.0f);
}
}
@@ -315,12 +337,13 @@ public void reduce(Text key, Iterable values,
scfilters.updateDbScore(key, oldSet ? old : null, result, linkList);
} catch (Exception e) {
LOG.warn("Couldn't update score, key={}: {}", key, e);
+ errorTracker.incrementCounters(e);
}
// remove generation time, if any
result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
context.write(key, result);
- context.getCounter("CrawlDB status",
- CrawlDatum.getStatusName(result.getStatus())).increment(1);
+ // Dynamic counter based on status name
+ getStatusCounter(result.getStatus(), context).increment(1);
}
}
diff --git a/src/java/org/apache/nutch/crawl/DedupRedirectsJob.java b/src/java/org/apache/nutch/crawl/DedupRedirectsJob.java
index 5c82b6d6b2..3b77878211 100644
--- a/src/java/org/apache/nutch/crawl/DedupRedirectsJob.java
+++ b/src/java/org/apache/nutch/crawl/DedupRedirectsJob.java
@@ -36,6 +36,7 @@
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -154,12 +155,14 @@ public void map(Text key, CrawlDatum value, Context context)
//
value.getMetaData().put(urlKey, key);
Text redirKey = new Text(redirTarget);
- context.getCounter("DeduplicationJobStatus", "Redirects in CrawlDb")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_DEDUP,
+ NutchMetrics.DEDUP_REDIRECTS_IN_CRAWLDB_TOTAL).increment(1);
if (redirKey.equals(key)) {
// exclude self-referential redirects
- context.getCounter("DeduplicationJobStatus",
- "Self-referential redirects in CrawlDb").increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_DEDUP,
+ NutchMetrics.DEDUP_REDIRECTS_SELF_REFERENTIAL_TOTAL)
+ .increment(1);
} else {
context.write(redirKey, value);
}
@@ -219,16 +222,15 @@ public void reduce(Text key, Iterable values, Context context)
// duplicate!
unsetDuplicateStatus(existingDoc);
context.write(origURL, existingDoc);
- context.getCounter("DeduplicationJobStatus",
- "Redirects kept as non-duplicates").increment(1);
+ context.getCounter(NutchMetrics.GROUP_DEDUP,
+ NutchMetrics.DEDUP_REDIRECTS_NOT_DUPLICATES_TOTAL).increment(1);
} else {
// (c) it is a self-referential redirect
String targetURL = getTargetURL(existingDoc);
if (key.toString().equals(targetURL)) {
context.write(key, existingDoc);
- context
- .getCounter("DeduplicationJobStatus",
- "Self-referential redirects kept as non-duplicates")
+ context.getCounter(NutchMetrics.GROUP_DEDUP,
+ NutchMetrics.DEDUP_REDIRECTS_SELF_REFERENTIAL_NOT_DUPLICATES_TOTAL)
.increment(1);
}
// else: ignore redirects emitted under original URL because they are
@@ -306,9 +308,10 @@ public int run(String[] args) throws IOException {
fs.delete(tempDir, true);
throw new RuntimeException(message);
}
- CounterGroup g = job.getCounters().getGroup("DeduplicationJobStatus");
+ CounterGroup g = job.getCounters().getGroup(NutchMetrics.GROUP_DEDUP);
if (g != null) {
- Counter counter = g.findCounter("Documents marked as duplicate");
+ Counter counter = g
+ .findCounter(NutchMetrics.DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL);
numDuplicates = counter.getValue();
LOG.info("Deduplication: {} documents marked as duplicates",
numDuplicates);
diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
index 3e12d4598c..50aa4cd7bd 100644
--- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
+++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -45,6 +45,7 @@
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
@@ -127,11 +128,25 @@ public static class DedupReducer
protected String[] compareOrder;
+ // Cached counter reference for performance
+ private Counter documentsMarkedDuplicateCounter;
+
@Override
public void setup(
Reducer.Context context) {
Configuration conf = context.getConfiguration();
compareOrder = conf.get(DEDUPLICATION_COMPARE_ORDER).split(",");
+
+ // Initialize cached counter reference
+ initCounters(context);
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot paths.
+ *
+ * @param context reducer context used to resolve the "documents marked duplicate" counter of group NutchMetrics.GROUP_DEDUP
+ */
+ private void initCounters(Context context) {
+ documentsMarkedDuplicateCounter = context.getCounter(
+ NutchMetrics.GROUP_DEDUP, NutchMetrics.DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL);
}
protected void writeOutAsDuplicate(CrawlDatum datum,
@@ -139,8 +154,7 @@ protected void writeOutAsDuplicate(CrawlDatum datum,
throws IOException, InterruptedException {
datum.setStatus(CrawlDatum.STATUS_DB_DUPLICATE);
Text key = (Text) datum.getMetaData().remove(urlKey);
- context.getCounter("DeduplicationJobStatus",
- "Documents marked as duplicate").increment(1);
+ documentsMarkedDuplicateCounter.increment(1);
context.write(key, datum);
}
@@ -334,12 +348,10 @@ public int run(String[] args) throws IOException {
fs.delete(tempDir, true);
throw new RuntimeException(message);
}
- CounterGroup g = job.getCounters().getGroup("DeduplicationJobStatus");
- if (g != null) {
- Counter counter = g.findCounter("Documents marked as duplicate");
- long dups = counter.getValue();
- LOG.info("Deduplication: {} documents marked as duplicates", dups);
- }
+ long dups = job.getCounters()
+ .findCounter(NutchMetrics.GROUP_DEDUP, NutchMetrics.DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL)
+ .getValue();
+ LOG.info("Deduplication: {} documents marked as duplicates", dups);
} catch (IOException | InterruptedException | ClassNotFoundException e) {
LOG.error("DeduplicationJob:", e);
fs.delete(tempDir, true);
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 82475af5b8..102ce39b94 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -67,6 +67,8 @@
import org.apache.hadoop.io.WritableComparator;
import org.apache.nutch.hostdb.HostDatum;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.ErrorTracker;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
@@ -190,6 +192,18 @@ public static class SelectorMapper
private int intervalThreshold = -1;
private byte restrictStatus = -1;
private JexlScript expr = null;
+ private ErrorTracker errorTracker;
+
+ // Cached counter references for performance
+ private Counter urlFiltersRejectedCounter;
+ private Counter scheduleRejectedCounter;
+ private Counter waitForUpdateCounter;
+ private Counter exprRejectedCounter;
+ private Counter statusRejectedCounter;
+ private Counter scoreTooLowCounter;
+ private Counter intervalRejectedCounter;
+ private Counter hostsAffectedPerHostOverflowCounter;
+ private Counter urlsSkippedPerHostOverflowCounter;
@Override
public void setup(
@@ -214,6 +228,34 @@ public void setup(
restrictStatus = CrawlDatum.getStatusByName(restrictStatusString);
}
expr = JexlUtil.parseExpression(conf.get(GENERATOR_EXPR, null));
+ // Initialize error tracker with cached counters
+ errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context);
+ // Initialize cached counter references
+ initCounters(context);
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot paths.
+ *
+ * NOTE(review): in the visible changes the two per-host overflow counters are
+ * only incremented by SelectorReducer - confirm the mapper needs them.
+ *
+ * @param context mapper context used to resolve the counters of group NutchMetrics.GROUP_GENERATOR
+ */
+ private void initCounters(Context context) {
+ urlFiltersRejectedCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_URL_FILTERS_REJECTED_TOTAL);
+ scheduleRejectedCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_SCHEDULE_REJECTED_TOTAL);
+ waitForUpdateCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_WAIT_FOR_UPDATE_TOTAL);
+ exprRejectedCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_EXPR_REJECTED_TOTAL);
+ statusRejectedCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_STATUS_REJECTED_TOTAL);
+ scoreTooLowCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_SCORE_TOO_LOW_TOTAL);
+ intervalRejectedCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_INTERVAL_REJECTED_TOTAL);
+ hostsAffectedPerHostOverflowCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL);
+ urlsSkippedPerHostOverflowCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL);
}
@Override
@@ -225,11 +267,11 @@ public void map(Text key, CrawlDatum value, Context context)
// URLFilters
try {
if (filters.filter(url.toString()) == null) {
- context.getCounter("Generator", "URL_FILTERS_REJECTED").increment(1);
+ urlFiltersRejectedCounter.increment(1);
return;
}
} catch (URLFilterException e) {
- context.getCounter("Generator", "URL_FILTER_EXCEPTION").increment(1);
+ errorTracker.incrementCounters(e);
LOG.warn("Couldn't filter url: {} ({})", url, e.getMessage());
}
}
@@ -239,7 +281,7 @@ public void map(Text key, CrawlDatum value, Context context)
if (!schedule.shouldFetch(url, crawlDatum, curTime)) {
LOG.debug("-shouldFetch rejected '{}', fetchTime={}, curTime={}", url,
crawlDatum.getFetchTime(), curTime);
- context.getCounter("Generator", "SCHEDULE_REJECTED").increment(1);
+ scheduleRejectedCounter.increment(1);
return;
}
@@ -248,7 +290,7 @@ public void map(Text key, CrawlDatum value, Context context)
if (oldGenTime != null) { // awaiting fetch & update
if (oldGenTime.get() + genDelay > curTime) { // still wait for
// update
- context.getCounter("Generator", "WAIT_FOR_UPDATE").increment(1);
+ waitForUpdateCounter.increment(1);
return;
}
}
@@ -256,25 +298,26 @@ public void map(Text key, CrawlDatum value, Context context)
try {
sort = scfilters.generatorSortValue(key, crawlDatum, sort);
} catch (ScoringFilterException sfe) {
+ errorTracker.incrementCounters(sfe);
LOG.warn("Couldn't filter generatorSortValue for {}: {}", key, sfe);
}
// check expr
if (expr != null) {
if (!crawlDatum.execute(expr, key.toString())) {
- context.getCounter("Generator", "EXPR_REJECTED").increment(1);
+ exprRejectedCounter.increment(1);
return;
}
}
if (restrictStatus != -1 && restrictStatus != crawlDatum.getStatus()) {
- context.getCounter("Generator", "STATUS_REJECTED").increment(1);
+ statusRejectedCounter.increment(1);
return;
}
// consider only entries with a score superior to the threshold
if (!Float.isNaN(scoreThreshold) && sort < scoreThreshold) {
- context.getCounter("Generator", "SCORE_TOO_LOW").increment(1);
+ scoreTooLowCounter.increment(1);
return;
}
@@ -282,7 +325,7 @@ public void map(Text key, CrawlDatum value, Context context)
// threshold
if (intervalThreshold != -1
&& crawlDatum.getFetchInterval() > intervalThreshold) {
- context.getCounter("Generator", "INTERVAL_REJECTED").increment(1);
+ intervalRejectedCounter.increment(1);
return;
}
@@ -317,6 +360,11 @@ public static class SelectorReducer extends
private JexlScript maxCountExpr = null;
private JexlScript fetchDelayExpr = null;
private Map hostDatumCache = new HashMap<>();
+ private ErrorTracker errorTracker;
+
+ // Cached counter references for performance
+ private Counter hostsAffectedPerHostOverflowCounter;
+ private Counter urlsSkippedPerHostOverflowCounter;
public void readHostDb() throws IOException {
if (conf.get(GENERATOR_HOSTDB) == null) {
@@ -410,10 +458,24 @@ public void setup(Context context) throws IOException {
fetchDelayExpr = JexlUtil
.parseExpression(conf.get(GENERATOR_FETCH_DELAY_EXPR, null));
}
+ // Initialize error tracker with cached counters
+ errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context);
+ // Initialize cached counter references
+ initReducerCounters(context);
readHostDb();
}
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot paths.
+ *
+ * @param context reducer context used to resolve the two per-host overflow counters of group NutchMetrics.GROUP_GENERATOR
+ */
+ private void initReducerCounters(Context context) {
+ hostsAffectedPerHostOverflowCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL);
+ urlsSkippedPerHostOverflowCounter = context.getCounter(
+ NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL);
+ }
+
@Override
public void cleanup(Context context)
throws IOException, InterruptedException {
@@ -507,7 +569,7 @@ public void reduce(FloatWritable key, Iterable values,
} catch (MalformedURLException e) {
LOG.warn("Malformed URL: '{}', skipping ({})", urlString,
StringUtils.stringifyException(e));
- context.getCounter("Generator", "MALFORMED_URL").increment(1);
+ errorTracker.incrementCounters(e);
continue;
}
@@ -539,16 +601,13 @@ public void reduce(FloatWritable key, Iterable values,
hostCount[1] = 1;
} else {
if (hostCount[1] == (maxCount+1)) {
- context
- .getCounter("Generator", "HOSTS_AFFECTED_PER_HOST_OVERFLOW")
- .increment(1);
+ hostsAffectedPerHostOverflowCounter.increment(1);
LOG.info(
"Host or domain {} has more than {} URLs for all {} segments. Additional URLs won't be included in the fetchlist.",
hostordomain, maxCount, maxNumSegments);
}
// skip this entry
- context.getCounter("Generator", "URLS_SKIPPED_PER_HOST_OVERFLOW")
- .increment(1);
+ urlsSkippedPerHostOverflowCounter.increment(1);
continue;
}
}
@@ -959,10 +1018,13 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
}
LOG.info("Generator: number of items rejected during selection:");
- for (Counter counter : job.getCounters().getGroup("Generator")) {
- LOG.info("Generator: {} {}",
- String.format(Locale.ROOT, "%6d", counter.getValue()),
- counter.getName());
+ for (Counter counter : job.getCounters()
+ .getGroup(NutchMetrics.GROUP_GENERATOR)) {
+ long counterValue = counter.getValue();
+ if (counterValue > 0) {
+ LOG.info("Generator: {} {}",
+ String.format(Locale.ROOT, "%6d", counterValue), counter.getName());
+ }
}
if (!getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
/*
diff --git a/src/java/org/apache/nutch/crawl/Generator2.java b/src/java/org/apache/nutch/crawl/Generator2.java
index 6de2adab81..6b619445b7 100644
--- a/src/java/org/apache/nutch/crawl/Generator2.java
+++ b/src/java/org/apache/nutch/crawl/Generator2.java
@@ -65,6 +65,8 @@
import org.apache.hadoop.util.hash.MurmurHash;
import org.apache.nutch.crawl.Generator2.SelectorReducer.DomainLimits;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.ErrorTracker;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
@@ -335,6 +337,7 @@ public static class SelectorMapper
private int intervalThreshold = -1;
private String restrictStatus = null;
private DomainScorePair outputKey = new DomainScorePair();
+ private ErrorTracker errorTracker;
@Override
public void setup(
@@ -362,6 +365,9 @@ public void setup(
if (GENERATOR_COUNT_VALUE_DOMAIN.equals(conf.get(GENERATOR_COUNT_MODE))) {
byDomain = true;
}
+
+ // Initialize error tracker with cached counters
+ errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context);
}
/** Select & invert subset due for fetch. */
@@ -375,12 +381,15 @@ public void map(Text key, CrawlDatum value, Context context)
// URLFilters
try {
if (filters.filter(urlString) == null) {
- context.getCounter("Generator", "URL_FILTERS_REJECTED").increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_GENERATOR,
+ NutchMetrics.GENERATOR_URL_FILTERS_REJECTED_TOTAL)
+ .increment(1);
return;
}
} catch (URLFilterException e) {
LOG.warn("Couldn't filter url {}: {}", key, e.getMessage());
- context.getCounter("Generator", "URL_FILTER_EXCEPTION").increment(1);
+ errorTracker.incrementCounters(e);
}
}
@@ -388,7 +397,8 @@ public void map(Text key, CrawlDatum value, Context context)
if (!schedule.shouldFetch(key, value, curTime)) {
LOG.debug("-shouldFetch rejected '{}', fetchTime={}, curTime={}", key,
value.getFetchTime(), curTime);
- context.getCounter("Schedule rejected by status",
+ context.getCounter(
+ NutchMetrics.GROUP_GENERATOR_SCHEDULE_REJECTED_BY_STATUS,
CrawlDatum.getStatusName(value.getStatus())).increment(1);
return;
}
@@ -413,8 +423,10 @@ public void map(Text key, CrawlDatum value, Context context)
// consider only entries with a score superior to the threshold
if (!Float.isNaN(scoreThreshold) && sort < scoreThreshold) {
- context.getCounter("Score below threshold by status",
- CrawlDatum.getStatusName(value.getStatus())).increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_GENERATOR_SCORE_REJECTED_BY_STATUS,
+ CrawlDatum.getStatusName(value.getStatus()))
+ .increment(1);
return;
}
@@ -440,7 +452,7 @@ public void map(Text key, CrawlDatum value, Context context)
} catch (Exception e) {
LOG.warn("Malformed URL: '{}', skipping ({})", urlString,
e.getMessage());
- context.getCounter("Generator", "MALFORMED_URL").increment(1);
+ errorTracker.incrementCounters(e);
return;
}
@@ -738,7 +750,8 @@ public void reduce(DomainScorePair key, Iterable values,
LOG.info(
"Host or domain {} has more than {} URLs for all {} segments. Additional URLs won't be included in the fetchlist.",
key.getDomain(), maxCountTotal, maxNumSegments);
- context.getCounter("Generator", "SKIPPED_DOMAINS_OVERFLOW")
+ context.getCounter(NutchMetrics.GROUP_GENERATOR,
+ NutchMetrics.GENERATOR_DOMAINS_AFFECTED_PER_DOMAIN_OVERFLOW_TOTAL)
.increment(1);
maxUrlsOverflow = true;
break;
@@ -784,11 +797,14 @@ public void reduce(DomainScorePair key, Iterable values,
LOG.info(
"Host {}{} (domain: {}) has more than {} URLs for all {} segments. Additional URLs won't be included in the fetchlist.",
host, domain, domain, maxCountPerHostTotal, maxNumSegments);
- context.getCounter("Generator", "SKIPPED_HOSTS_NUM_URLS_OVERFLOW")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_GENERATOR,
+ NutchMetrics.GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL)
+ .increment(1);
}
- context.getCounter("Generator", "SKIPPED_URLS_HOST_OVERFLOW")
- .increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_GENERATOR,
+ NutchMetrics.GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL)
+ .increment(1);
maxUrlsPerHostOverflowCount++;
counts[0]++;
continue;
@@ -819,17 +835,19 @@ public void reduce(DomainScorePair key, Iterable values,
}
}
- context.getCounter("Selected by status",
+ context.getCounter(NutchMetrics.GROUP_GENERATOR_SELECTED_BY_STATUS,
CrawlDatum.getStatusName(entry.datum.getStatus())).increment(1);
context.write(key.getScore(), entry);
}
if (maxHostsOverflowCount > 0) {
- context.getCounter("Generator", "SKIPPED_DOMAINS_NUM_HOSTS_OVERFLOW")
+ context.getCounter(NutchMetrics.GROUP_GENERATOR,
+ NutchMetrics.GENERATOR_DOMAINS_AFFECTED_PER_MAX_NUM_HOSTS_OVERFLOW_TOTAL)
.increment(1);
- context.getCounter("Generator", "SKIPPED_URLS_NUM_HOSTS_OVERFLOW")
- .increment(maxHostsOverflowCount);
+ context.getCounter(NutchMetrics.GROUP_GENERATOR,
+ NutchMetrics.GENERATOR_URLS_SKIPPED_PER_MAX_NUM_HOSTS_OVERFLOW_TOTAL)
+ .increment(maxHostsOverflowCount);
LOG.info(
"Domain {} has more than {} hosts, skipped {} URLs from remaining hosts",
key.getDomain(), maxHosts, maxHostsOverflowCount);
@@ -1022,7 +1040,8 @@ public void reduce(SegmenterKey key, Iterable values,
if (count < maxPerSegment) {
mos.write("sequenceFiles", entry.url, entry, fileName);
} else {
- context.getCounter("Generator", "SKIPPED_RECORDS_SEGMENT_OVERFLOW")
+ context.getCounter(NutchMetrics.GROUP_GENERATOR,
+ NutchMetrics.GENERATOR_URLS_SKIPPED_PER_SEGMENT_OVERFLOW_TOTAL)
.increment(1);
if (count == maxPerSegment) {
LOG.info(
diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java
index 3e03f9ea8e..ae154350ef 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -24,6 +24,7 @@
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
@@ -36,6 +37,8 @@
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.ErrorTracker;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilterException;
@@ -126,6 +129,13 @@ public static class InjectMapper
private boolean url404Purging;
private String scope;
private boolean filterNormalizeAll = false;
+ private ErrorTracker errorTracker;
+
+ // Cached counter references for performance
+ private Counter urlsFilteredCounter;
+ private Counter urlsInjectedCounter;
+ private Counter urlsPurged404Counter;
+ private Counter urlsPurgedFilterCounter;
@Override
public void setup(Context context) {
@@ -146,6 +156,24 @@ public void setup(Context context) {
curTime = conf.getLong("injector.current.time",
System.currentTimeMillis());
url404Purging = conf.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false);
+ // Initialize error tracker with cached counters
+ errorTracker = new ErrorTracker(NutchMetrics.GROUP_INJECTOR, context);
+ // Initialize cached counter references
+ initCounters(context);
+ }
+
+ /**
+ * Initialize cached counter references to avoid repeated lookups in hot paths.
+ *
+ * @param context mapper context used to resolve the four counters of group NutchMetrics.GROUP_INJECTOR
+ */
+ private void initCounters(Context context) {
+ urlsFilteredCounter = context.getCounter(
+ NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_FILTERED_TOTAL);
+ urlsInjectedCounter = context.getCounter(
+ NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_INJECTED_TOTAL);
+ urlsPurged404Counter = context.getCounter(
+ NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_PURGED_404_TOTAL);
+ urlsPurgedFilterCounter = context.getCounter(
+ NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_PURGED_FILTER_TOTAL);
}
/* Filter and normalize the input url */
@@ -218,7 +246,7 @@ public void map(Text key, Writable value, Context context)
url = filterNormalize(url);
if (url == null) {
- context.getCounter("injector", "urls_filtered").increment(1);
+ urlsFilteredCounter.increment(1);
} else {
CrawlDatum datum = new CrawlDatum();
datum.setStatus(CrawlDatum.STATUS_INJECTED);
@@ -237,8 +265,9 @@ public void map(Text key, Writable value, Context context)
LOG.warn(
"Cannot filter injected score for url {}, using default ({})",
url, e.getMessage());
+ errorTracker.incrementCounters(e);
}
- context.getCounter("injector", "urls_injected").increment(1);
+ urlsInjectedCounter.increment(1);
context.write(key, datum);
}
} else if (value instanceof CrawlDatum) {
@@ -248,14 +277,14 @@ public void map(Text key, Writable value, Context context)
// remove 404 urls
if (url404Purging && CrawlDatum.STATUS_DB_GONE == datum.getStatus()) {
- context.getCounter("injector", "urls_purged_404").increment(1);
+ urlsPurged404Counter.increment(1);
return;
}
if (filterNormalizeAll) {
String url = filterNormalize(key.toString());
if (url == null) {
- context.getCounter("injector", "urls_purged_filter").increment(1);
+ urlsPurgedFilterCounter.increment(1);
} else {
key.set(url);
context.write(key, datum);
@@ -275,6 +304,10 @@ public static class InjectReducer
private CrawlDatum old = new CrawlDatum();
private CrawlDatum injected = new CrawlDatum();
+ // Cached counter references for performance
+ private Counter urlsInjectedUniqueCounter;
+ private Counter urlsMergedCounter;
+
@Override
public void setup(Context context) {
Configuration conf = context.getConfiguration();
@@ -282,6 +315,19 @@ public void setup(Context context) {
update = conf.getBoolean("db.injector.update", false);
LOG.info("Injector: overwrite: {}", overwrite);
LOG.info("Injector: update: {}", update);
+
+ // Initialize cached counter references
+ initCounters(context);
+ }
+
+ /**
+ * Look up the reducer-side injector counters once and keep the references
+ * in fields, so that {@code reduce()} does not call
+ * {@code context.getCounter(...)} for every key. Called at the end of
+ * {@code setup(Context)}.
+ *
+ * @param context
+ *          task context of this reducer; both counters are taken from the
+ *          group {@link NutchMetrics#GROUP_INJECTOR}
+ */
+ private void initCounters(Context context) {
+ // incremented in reduce() once per key for which injectedSet is true
+ urlsInjectedUniqueCounter = context.getCounter(
+ NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_INJECTED_UNIQUE_TOTAL);
+ // incremented in reduce() when injectedSet and oldSet are both true
+ urlsMergedCounter = context.getCounter(
+ NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_MERGED_TOTAL);
}
/**
@@ -341,9 +387,9 @@ public void reduce(Text key, Iterable values, Context context)
}
}
if (injectedSet) {
- context.getCounter("injector", "urls_injected_unique").increment(1);
+ urlsInjectedUniqueCounter.increment(1);
if (oldSet) {
- context.getCounter("injector", "urls_merged").increment(1);
+ urlsMergedCounter.increment(1);
}
}
context.write(key, result);
@@ -454,17 +500,23 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite,
if (LOG.isInfoEnabled()) {
long urlsInjected = job.getCounters()
- .findCounter("injector", "urls_injected").getValue();
+ .findCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_INJECTED_TOTAL).getValue();
long urlsInjectedUniq = job.getCounters()
- .findCounter("injector", "urls_injected_unique").getValue();
+ .findCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_INJECTED_UNIQUE_TOTAL).getValue();
long urlsFiltered = job.getCounters()
- .findCounter("injector", "urls_filtered").getValue();
+ .findCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_FILTERED_TOTAL).getValue();
long urlsMerged = job.getCounters()
- .findCounter("injector", "urls_merged").getValue();
+ .findCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_MERGED_TOTAL).getValue();
long urlsPurged404 = job.getCounters()
- .findCounter("injector", "urls_purged_404").getValue();
+ .findCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_PURGED_404_TOTAL).getValue();
long urlsPurgedFilter = job.getCounters()
- .findCounter("injector", "urls_purged_filter").getValue();
+ .findCounter(NutchMetrics.GROUP_INJECTOR,
+ NutchMetrics.INJECTOR_URLS_PURGED_FILTER_TOTAL).getValue();
LOG.info("Injector: Total urls rejected by filters: {}", urlsFiltered);
LOG.info(
"Injector: Total urls injected after normalization and filtering: {} (unique URLs: {})",
diff --git a/src/java/org/apache/nutch/crawl/SitemapInjector.java b/src/java/org/apache/nutch/crawl/SitemapInjector.java
index f8e108874a..b643e3368a 100644
--- a/src/java/org/apache/nutch/crawl/SitemapInjector.java
+++ b/src/java/org/apache/nutch/crawl/SitemapInjector.java
@@ -26,6 +26,7 @@
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Random;
@@ -53,6 +54,7 @@
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
@@ -343,10 +345,8 @@ public ProtocolOutput call() throws Exception {
BaseRobotRules rules = protocol.getRobotRules(turl, null, null);
if (!rules.isAllowed(url)) {
LOG.info("Fetch of sitemap forbidden by robots.txt: {}", url);
- context
- .getCounter("SitemapInjector",
- "failed to fetch sitemap content, robots.txt disallow")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_ROBOTSTXT_DISALLOW_TOTAL).increment(1);
return null;
}
}
@@ -444,15 +444,17 @@ public void process(String url) {
try {
sitemap = parseSitemap(content, url);
} catch (Exception e) {
- context.getCounter("SitemapInjector", "sitemaps failed to parse")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_FAILED_TO_PARSE_TOTAL).increment(1);
LOG.warn("failed to parse sitemap {}: {}", url,
StringUtils.stringifyException(e));
return;
}
LOG.info("parsed sitemap {} ({})", url, sitemap.getType());
context
- .getCounter("SitemapInjector", "sitemap type: " + sitemap.getType())
+ .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_TYPE_PREFIX
+ + sitemap.getType().toString().toLowerCase(Locale.ROOT))
.increment(1);
if (checkCrossSubmits) {
@@ -519,14 +521,16 @@ public void processSitemap(AbstractSiteMap sitemap,
return;
}
- context.getCounter("SitemapInjector", "sitemaps processed")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_PROCESSED_TOTAL).increment(1);
injectURLs((SiteMap) sitemap);
if (totalUrls >= maxUrls) {
- LOG.warn("URL limit reached, skipped remaining urls of {}",
+ LOG.warn(
+ "Sitemap index URL limit reached, skipped remaining urls of {}",
sitemap.getUrl());
context
- .getCounter("SitemapInjector", "sitemap index: URL limit reached")
+ .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_INDEX_AFFECTED_BY_URL_LIMIT_TOTAL)
.increment(1);
}
sitemap.setProcessed(true);
@@ -543,8 +547,10 @@ private void processSitemapIndex(SiteMapIndex sitemapIndex,
LOG.warn(
"Depth limit reached recursively processing sitemap index {}",
sitemapIndex.getUrl());
- context.getCounter("SitemapInjector",
- "sitemap index: depth limit reached").increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_INDEX_AFFECTED_BY_DEPTH_LIMIT_TOTAL)
+ .increment(1);
return;
}
@@ -557,10 +563,8 @@ private void processSitemapIndex(SiteMapIndex sitemapIndex,
double publishScore = 0.3;
if (s.getLastModified() != null) {
double elapsedMonthsSincePublished = (System.currentTimeMillis()
- - s.getLastModified().getTime())
- / (1000.0 * 60 * 60 * 24 * 30);
- publishScore = (1.0
- / Math.log(1.0 + elapsedMonthsSincePublished));
+ - s.getLastModified().getTime()) / (1000.0 * 60 * 60 * 24 * 30);
+ publishScore = (1.0 / Math.log(1.0 + elapsedMonthsSincePublished));
}
double score = (1.0 / subSitemaps) + publishScore + Math.random();
sitemaps.add(new ScoredSitemap(score, s));
@@ -574,18 +578,18 @@ private void processSitemapIndex(SiteMapIndex sitemapIndex,
LOG.warn(
"Max. processing time reached, skipped remaining sitemaps of sitemap index {}",
sitemapIndex.getUrl());
- context.getCounter("SitemapInjector",
- "sitemap index: time limit reached").increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_INDEX_AFFECTED_BY_TIME_LIMIT_TOTAL)
+ .increment(1);
return;
}
- if ((totalUrls == 0)
- && (elapsed > (maxSitemapProcessingTime / 2))) {
+ if ((totalUrls == 0) && (elapsed > (maxSitemapProcessingTime / 2))) {
LOG.warn(
"Half of processing time elapsed and no URLs injected, skipped remaining sitemaps of sitemap index {}",
sitemapIndex.getUrl());
- context
- .getCounter("SitemapInjector",
- "sitemap index: no URLs after 50% of time limit")
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_INDEX_NO_URLS_AFTER_50_PERCENT_OF_TIME_LIMIT_TOTAL)
.increment(1);
return;
}
@@ -594,29 +598,34 @@ private void processSitemapIndex(SiteMapIndex sitemapIndex,
LOG.warn(
"Too many failures, skipped remaining sitemaps of sitemap index {}",
sitemapIndex.getUrl());
- context.getCounter("SitemapInjector",
- "sitemap index: too many failures").increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_INDEX_TOO_MANY_FAILURES_TOTAL)
+ .increment(1);
return;
}
AbstractSiteMap nextSitemap = sitemaps.poll().sitemap;
- context.getCounter("SitemapInjector", "sitemap index: processed sitemaps")
+ context
+ .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_INDEX_PROCESSED_SITEMAPS_TOTAL)
.increment(1);
String url = nextSitemap.getUrl().toString();
if (processedSitemaps.contains(url)) {
LOG.warn("skipped duplicated or recursive sitemap URL {}", url);
- context.getCounter("SitemapInjector",
- "skipped duplicated or recursive sitemap URLs").increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_SKIPPED_DUPLICATE_OR_RECURSIVE_URL_TOTAL)
+ .increment(1);
nextSitemap.setProcessed(true);
continue;
}
if (processedSitemaps.size() > maxRecursiveSitemaps) {
- LOG.warn(
- "{} sitemaps processed for {}, skipped remaining sitemaps",
+ LOG.warn("{} sitemaps processed for {}, skipped remaining sitemaps",
processedSitemaps.size(), sitemapIndex.getUrl());
context
- .getCounter("SitemapInjector", "sitemap index limit reached")
+ .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_INDEX_MAX_SITEMAPS_LIMIT_TOTAL)
.increment(1);
return;
}
@@ -624,8 +633,10 @@ private void processSitemapIndex(SiteMapIndex sitemapIndex,
LOG.warn(
"URL limit reached, skipped remaining sitemaps of sitemap index {}",
sitemapIndex.getUrl());
- context.getCounter("SitemapInjector",
- "sitemap index: URL limit reached").increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_INDEX_AFFECTED_BY_URL_LIMIT_TOTAL)
+ .increment(1);
return;
}
@@ -634,21 +645,20 @@ private void processSitemapIndex(SiteMapIndex sitemapIndex,
Content content = getContent(url);
if (content == null) {
nextSitemap.setProcessed(true);
- context.getCounter("SitemapInjector", "sitemaps failed to fetch")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_FAILED_TO_FETCH_TOTAL).increment(1);
failedSubSitemaps++;
continue;
}
try {
- AbstractSiteMap parsedSitemap = parseSitemap(content,
- nextSitemap);
+ AbstractSiteMap parsedSitemap = parseSitemap(content, nextSitemap);
processSitemap(parsedSitemap, processedSitemaps, depth);
} catch (Exception e) {
LOG.warn("failed to parse sitemap {}: {}", nextSitemap.getUrl(),
StringUtils.stringifyException(e));
- context.getCounter("SitemapInjector", "sitemaps failed to parse")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_FAILED_TO_PARSE_TOTAL).increment(1);
failedSubSitemaps++;
}
nextSitemap.setProcessed(true);
@@ -661,8 +671,8 @@ private Content getContent(String url) {
LOG.warn(
"Not fetching sitemap with overlong URL: {} ... (truncated, length = {} characters)",
url.substring(0, maxUrlLength), url.length());
- context.getCounter("SitemapInjector", "sitemap overlong URL")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_SKIPPED_OVERLONG_URL_TOTAL).increment(1);
return null;
}
String origUrl = url;
@@ -670,7 +680,8 @@ private Content getContent(String url) {
if (url == null) {
LOG.warn("Sitemap rejected by URL filters: {}", origUrl);
context
- .getCounter("SitemapInjector", "sitemap rejected by URL filters")
+ .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_REJECTED_BY_URL_FILTERS_TOTAL)
.increment(1);
return null;
}
@@ -683,8 +694,10 @@ private Content getContent(String url) {
if (failuresPerHost.containsKey(hostName)
&& failuresPerHost.get(hostName) > maxFailuresPerHost) {
LOG.info("Skipped, too many failures per host: {}", url);
- context.getCounter("SitemapInjector",
- "skipped, too many failures per host").increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_SKIPPED_TOO_MANY_FAILURES_PER_HOST_TOTAL)
+ .increment(1);
return null;
}
Protocol protocol = null;
@@ -693,8 +706,8 @@ private Content getContent(String url) {
} catch (ProtocolNotFound e) {
LOG.error("Protocol not found: {}", url);
context
- .getCounter("SitemapInjector",
- "failed to fetch sitemap content, protocol not found")
+ .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_PROTOCOL_NOT_SUPPORTED_TOTAL)
.increment(1);
return null;
}
@@ -715,14 +728,16 @@ private Content getContent(String url) {
} catch (Exception e) {
if (e instanceof TimeoutException) {
LOG.error("fetch of sitemap {} timed out", url);
- context.getCounter("SitemapInjector",
- "failed to fetch sitemap content, timeout").increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_FAILED_TO_FETCH_TIMEOUT_TOTAL)
+ .increment(1);
} else {
LOG.error("fetch of sitemap {} failed with: {}", url,
StringUtils.stringifyException(e));
context
- .getCounter("SitemapInjector",
- "failed to fetch sitemap content, exception")
+ .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_FAILED_TO_FETCH_EXCEPTION_TOTAL)
.increment(1);
}
task.cancel(true);
@@ -737,17 +752,16 @@ private Content getContent(String url) {
}
if (protocolOutput.getStatus().isRedirect()) {
- context.getCounter("SitemapInjector", "sitemap redirect")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_REDIRECT_TOTAL).increment(1);
String redirUrl = protocolOutput.getStatus().getArgs()[0];
url = filterNormalize(redirUrl);
if (url == null) {
LOG.info(
"Redirect target of sitemap {} rejected by URL filters: {}",
origUrl, redirUrl);
- context
- .getCounter("SitemapInjector",
- "sitemap (redirect target) rejected by URL filters")
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_REDIRECT_TARGET_REJECTED_BY_URL_FILTERS_TOTAL)
.increment(1);
return null;
}
@@ -766,8 +780,10 @@ private Content getContent(String url) {
redirects++;
if (redirects >= maxRedirect) {
LOG.warn("sitemap redirect limit exceeded: {}", origUrl);
- context.getCounter("SitemapInjector",
- "sitemap redirect limit exceeded").increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_REDIRECT_LIMIT_EXCEEDED_TOTAL)
+ .increment(1);
// return to avoid that exceeded redirects are counted twice
// (also as non-success fetch status)
return null;
@@ -779,9 +795,8 @@ private Content getContent(String url) {
if (!protocolOutput.getStatus().isSuccess()) {
LOG.error("fetch of sitemap {} failed with status code {}", url,
protocolOutput.getStatus().getCode());
- context
- .getCounter("SitemapInjector",
- "failed to fetch sitemap content, HTTP status != 200")
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_FAILED_TO_FETCH_CONTENT_HTTP_STATUS_CODE_NOT_200_TOTAL)
.increment(1);
incrementFailuresPerHost(hostName);
return null;
@@ -791,10 +806,8 @@ private Content getContent(String url) {
if (content == null) {
LOG.error("No content for {}, status: {}", url,
protocolOutput.getStatus().getMessage());
- context
- .getCounter("SitemapInjector",
- "failed to fetch sitemap content, empty content")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_EMPTY_CONTENT_TOTAL).increment(1);
incrementFailuresPerHost(hostName);
return null;
}
@@ -826,7 +839,8 @@ public void injectURLs(SiteMap sitemap)
Collection sitemapURLs = sitemap.getSiteMapUrls();
if (sitemapURLs.size() == 0) {
LOG.info("No URLs in sitemap {}", sitemap.getUrl());
- context.getCounter("SitemapInjector", "empty sitemap").increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_EMPTY_TOTAL).increment(1);
return;
}
LOG.info("Found {} URLs in {}", sitemapURLs.size(), sitemap.getUrl());
@@ -852,8 +866,8 @@ public void injectURLs(SiteMap sitemap)
for (SiteMapURL siteMapURL : sitemapURLs) {
if (totalUrls >= maxUrls) {
- context.getCounter("SitemapInjector", "sitemap URL limit reached")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_URL_LIMIT_REACHED_TOTAL).increment(1);
LOG.info("URL limit ({}) reached for {}", maxUrls,
sitemap.getUrl());
break;
@@ -861,7 +875,8 @@ public void injectURLs(SiteMap sitemap)
if (random != null) {
if (randomSelect > random.nextFloat()) {
- context.getCounter("SitemapInjector", "random skip").increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_RANDOM_SKIP_TOTAL).increment(1);
continue;
}
}
@@ -889,8 +904,8 @@ public void injectURLs(SiteMap sitemap)
&& !injectedHosts.contains(host)) {
hostLimitRejected++;
context
- .getCounter("SitemapInjector",
- "urls from sitemaps rejected, host limit reached")
+ .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_URLS_SKIPPED_HOST_LIMIT_REACHED_TOTAL)
.increment(1);
continue;
}
@@ -905,8 +920,8 @@ public void injectURLs(SiteMap sitemap)
}
if (crossSubmit == null || !crossSubmits.contains(crossSubmit)) {
crossSubmitsRejected++;
- context.getCounter("SitemapInjector",
- "urls from sitemaps rejected, target not allowed by cross-submits")
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_URLS_SKIPPED_NOT_ALLOWED_BY_CROSS_SUBMITS_TOTAL)
.increment(1);
continue;
}
@@ -918,8 +933,10 @@ public void injectURLs(SiteMap sitemap)
url = null;
}
if (url == null) {
- context.getCounter("SitemapInjector",
- "urls from sitemaps rejected by URL filters").increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_URLS_FROM_REJECTED_BY_URL_FILTERS)
+ .increment(1);
} else {
// URL passed normalizers and filters
totalUrls++;
@@ -939,8 +956,8 @@ public void injectURLs(SiteMap sitemap)
url, e.getMessage());
}
- context.getCounter("SitemapInjector", "urls from sitemaps injected")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR,
+ NutchMetrics.SITEMAP_URLS_INJECTED).increment(1);
context.write(value, datum);
injectedHosts.add(host);
}
@@ -1089,7 +1106,7 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite,
}
for (Counter counter : sitemapJob.getCounters()
- .getGroup("SitemapInjector")) {
+ .getGroup(NutchMetrics.GROUP_SITEMAP_INJECTOR)) {
LOG.info(String.format("SitemapInjector: %8d %s", counter.getValue(),
counter.getName()));
}
@@ -1171,7 +1188,8 @@ public void usage() {
"Usage: SitemapInjector [-D...] [-threads ] [-overwrite|-update] [-noFilter] [-noNormalize] [-filterNormalizeAll]\n");
System.err.println("\nFor sitemap URLs listed in seed input files:");
System.err.println("\t- fetch and parse the sitemap (step 1)");
- System.err.println("\t- inject URLs from sitemaps into the CrawlDb (step 2)");
+ System.err
+ .println("\t- inject URLs from sitemaps into the CrawlDb (step 2)");
System.err.println(
"\t- using fetch intervals and scores from sitemaps if applicable");
System.err.println("Options and properties of SitemapInjector");
@@ -1206,25 +1224,25 @@ public int run(String[] args) throws Exception {
continue;
}
switch (args[i]) {
- case "-threads":
- i++;
- if (i == args.length) {
- usage("Argument -threads requires parameter");
- return -1;
- }
- threads = Integer.parseInt(args[i]);
- break;
- case "-keepTemp":
- keepTemp = true;
- break;
- case "-step1":
- runStepOneOnly = true;
- break;
- case "-step2":
- runStepTwoOnly = true;
- break;
- default:
- superArguments.add(args[i]);
+ case "-threads":
+ i++;
+ if (i == args.length) {
+ usage("Argument -threads requires parameter");
+ return -1;
+ }
+ threads = Integer.parseInt(args[i]);
+ break;
+ case "-keepTemp":
+ keepTemp = true;
+ break;
+ case "-step1":
+ runStepOneOnly = true;
+ break;
+ case "-step2":
+ runStepTwoOnly = true;
+ break;
+ default:
+ superArguments.add(args[i]);
}
}
if (runStepOneOnly && runStepTwoOnly) {
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java
index f6518be761..0a08e9da2e 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -34,6 +34,7 @@
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
@@ -48,6 +49,7 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -158,6 +160,13 @@ public static class FetcherRun extends
private boolean storingContent;
private boolean parsing;
+ // Cached counter references for performance
+ private Counter bytesDownloadedCounter;
+ private Counter hitByThroughputThresholdCounter;
+ private Counter hitByTimelimitCounter;
+ private Counter hungThreadsCounter;
+ private Counter hitByTimeoutCounter;
+
private AtomicInteger getActiveThreads() {
return activeThreads;
}
@@ -196,11 +205,28 @@ public void setup(Mapper.Context context)
parsing = isParsing(conf);
}
+ /**
+ * Look up the fetcher status counters once and keep the references in
+ * fields, so that the monitoring loop in {@code run(Context)} increments
+ * cached {@link Counter} objects instead of calling
+ * {@code getCounter(...)} on every pass. {@code run(Context)} calls this
+ * right after {@code setup(Context)}.
+ *
+ * @param context
+ *          task context of this fetcher run; all counters are taken from
+ *          the group {@link NutchMetrics#GROUP_FETCHER}
+ */
+ private void initCounters(Context context) {
+ // incremented in run() by bytesLastSec on each status report
+ bytesDownloadedCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_BYTES_DOWNLOADED_TOTAL);
+ // incremented by the result of fetchQueues.emptyQueues() in the throughput check
+ hitByThroughputThresholdCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_THROUGHPUT_THRESHOLD_TOTAL);
+ // incremented by the result of fetchQueues.checkTimelimit() once the feeder is not alive
+ hitByTimelimitCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL);
+ // incremented by activeThreads.get() when run() aborts with hung threads
+ hungThreadsCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HUNG_THREADS_TOTAL);
+ // incremented by the result of fetchQueues.emptyQueues() before run() returns on timeout
+ hitByTimeoutCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_TIMEOUT_TOTAL);
+ }
+
@Override
public void run(Context innerContext)
throws IOException, InterruptedException {
setup(innerContext);
+ initCounters(innerContext);
try {
Configuration conf = innerContext.getConfiguration();
LinkedList fetcherThreads = new LinkedList<>();
@@ -295,8 +321,7 @@ public void run(Context innerContext)
pagesLastSec = pages.get() - pagesLastSec;
bytesLastSec = (int) bytes.get() - bytesLastSec;
- innerContext.getCounter("FetcherStatus", "bytes_downloaded")
- .increment(bytesLastSec);
+ bytesDownloadedCounter.increment(bytesLastSec);
reportStatus(innerContext, fetchQueues, pagesLastSec, bytesLastSec);
@@ -334,9 +359,7 @@ public void run(Context innerContext)
int hitByThrougputThreshold = fetchQueues.emptyQueues();
if (hitByThrougputThreshold != 0)
- innerContext
- .getCounter("FetcherStatus", "hitByThrougputThreshold")
- .increment(hitByThrougputThreshold);
+ hitByThroughputThresholdCounter.increment(hitByThrougputThreshold);
}
}
}
@@ -417,8 +440,7 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) {
if (!feeder.isAlive()) {
int hitByTimeLimit = fetchQueues.checkTimelimit();
if (hitByTimeLimit != 0)
- innerContext.getCounter("FetcherStatus", "hitByTimeLimit")
- .increment(hitByTimeLimit);
+ hitByTimelimitCounter.increment(hitByTimeLimit);
}
/*
@@ -434,8 +456,7 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) {
timeout);
LOG.warn("Aborting with {} hung threads{}.", activeThreads,
feeder.isAlive() ? " (queue feeder still alive)" : "");
- innerContext.getCounter("FetcherStatus", "hungThreads")
- .increment(activeThreads.get());
+ hungThreadsCounter.increment(activeThreads.get());
for (int i = 0; i < fetcherThreads.size(); i++) {
FetcherThread thread = fetcherThreads.get(i);
if (thread.isAlive()) {
@@ -470,8 +491,7 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) {
fetchQueues.getTotalSize(), fetchQueues.getQueueCount(),
feeder.isAlive() ? " (queue feeder still alive)" : "");
int hitByTimeout = fetchQueues.emptyQueues();
- innerContext.getCounter("FetcherStatus", "hitByTimeout")
- .increment(hitByTimeout);
+ hitByTimeoutCounter.increment(hitByTimeout);
return;
}
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 75ae606cb4..23c2e23542 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -34,12 +34,16 @@
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.fetcher.Fetcher.FetcherRun;
import org.apache.nutch.fetcher.FetcherThreadEvent.PublishEventType;
+import org.apache.nutch.metrics.ErrorTracker;
+import org.apache.nutch.metrics.LatencyTracker;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLExemptionFilters;
@@ -172,6 +176,29 @@ public class FetcherThread extends Thread {
private ProtocolLogUtil logUtil = new ProtocolLogUtil();
+ // Cached counters for performance (avoid repeated lookups in hot paths)
+ private Counter robotsDeniedCounter;
+ private Counter robotsDeniedMaxCrawlDelayCounter;
+ private Counter robotsDeferVisitsDroppedCounter;
+ private Counter redirectCountExceededCounter;
+ private Counter redirectDeduplicatedCounter;
+ private Counter redirectNotCreatedCounter;
+ private Counter hitByTimeLimitCounter;
+ private Counter aboveExceptionThresholdCounter;
+ private Counter outlinksDetectedCounter;
+ private Counter outlinksFollowingCounter;
+ private Counter robotsTxtArchivingFilteredCounter;
+ private Counter ipv4Counter;
+ private Counter ipv6Counter;
+ private Counter robotsTxtArchivingFilteredMimeCounter;
+ private Counter robotsTxtArchivingRobotsDeniedCounter;
+
+ // Latency tracker for fetch timing metrics
+ private LatencyTracker fetchLatencyTracker;
+
+ // Error tracker for categorized error metrics
+ private ErrorTracker errorTracker;
+
public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQueues fetchQueues,
QueueFeeder feeder, AtomicInteger spinWaiting, AtomicLong lastRequestStart, FetcherRun.Context context,
AtomicInteger errors, String segmentName, boolean parsing, boolean storingContent,
@@ -279,6 +306,59 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ
getName(), Thread.currentThread().getId());
}
}
+
+ // Initialize cached counters for performance
+ initCounters();
+ }
+
+ /**
+ * Look up the counters used by this fetcher thread once and keep the
+ * references in fields, so that {@code run()} does not call
+ * {@code context.getCounter(...)} for every fetched item. Also creates the
+ * fetch latency tracker and the error tracker. Called as the last step of
+ * the constructor; reads the counters from the {@code context} field.
+ * <p>
+ * Counters whose name is only known at runtime (for example the per
+ * protocol status counter in {@code run()}) are not cached here.
+ */
+ private void initCounters() {
+ robotsDeniedCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_ROBOTS_DENIED_TOTAL);
+ robotsDeniedMaxCrawlDelayCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_ROBOTS_DENIED_MAXCRAWLDELAY_TOTAL);
+ robotsDeferVisitsDroppedCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_ROBOTS_DEFER_VISITS_DROPPED_TOTAL);
+ redirectCountExceededCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_REDIRECT_COUNT_EXCEEDED_TOTAL);
+ redirectDeduplicatedCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_REDIRECT_DEDUPLICATED_TOTAL);
+ redirectNotCreatedCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_REDIRECT_NOT_CREATED_TOTAL);
+ // same group and counter name as the time-limit counter cached in Fetcher.FetcherRun
+ hitByTimeLimitCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL);
+ aboveExceptionThresholdCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_ABOVE_EXCEPTION_THRESHOLD_TOTAL);
+ // the outlink counters live in their own group, not in GROUP_FETCHER
+ outlinksDetectedCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER_OUTLINKS, NutchMetrics.FETCHER_OUTLINKS_DETECTED_TOTAL);
+ outlinksFollowingCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER_OUTLINKS, NutchMetrics.FETCHER_OUTLINKS_FOLLOWING_TOTAL);
+
+ // Common Crawl specific counters (IP address version and robots.txt
+ // archiving), each kept in a dedicated counter group
+ ipv4Counter = context.getCounter(
+ NutchMetrics.FETCHER_IP_ADDRESS_VERSION_GROUP,
+ NutchMetrics.FETCHER_IPV4_TOTAL);
+ ipv6Counter = context.getCounter(
+ NutchMetrics.FETCHER_IP_ADDRESS_VERSION_GROUP,
+ NutchMetrics.FETCHER_IPV6_TOTAL);
+ robotsTxtArchivingFilteredCounter = context.getCounter(
+ NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_GROUP,
+ NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_FILTERED_TOTAL);
+ robotsTxtArchivingFilteredMimeCounter = context.getCounter(
+ NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_GROUP,
+ NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_FILTERED_MIME_TOTAL);
+ robotsTxtArchivingRobotsDeniedCounter = context.getCounter(
+ NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_GROUP,
+ NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_ROBOTS_DENIED_TOTAL);
+
+ // Initialize latency tracker for fetch timing; run() records the elapsed
+ // milliseconds of each protocol.getProtocolOutput() call into it
+ fetchLatencyTracker = new LatencyTracker(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_LATENCY);
+
+ // Initialize error tracker for categorized error metrics
+ // NOTE(review): created without the task context (the Injector passes one);
+ // how its counts reach the job counters is not visible here - confirm
+ errorTracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER);
}
@Override
@@ -353,7 +433,7 @@ public void run() {
LOG.debug("redirectCount={}", redirectCount);
redirecting = false;
Protocol protocol = this.protocolFactory.getProtocol(fit.u);
- BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum,
+ BaseRobotRules rules = protocol.getRobotRules(fit.u, fit.datum,
robotsTxtContent);
if (robotsTxtContent != null) {
outputRobotsTxt(robotsTxtContent);
@@ -372,20 +452,18 @@ public void run() {
fit.getQueueID(), this.robotsDeferVisitsRetries + 1,
this.robotsDeferVisitsDelay);
if (killedURLs != 0) {
- context
- .getCounter("FetcherStatus", "robots_defer_visits_dropped")
- .increment(killedURLs);
+ robotsDeferVisitsDroppedCounter.increment(killedURLs);
}
continue;
}
- if (!rules.isAllowed(fit.url.toString())) {
+ if (!rules.isAllowed(fit.u)) {
// unblock
fetchQueues.finishFetchItem(fit, true);
LOG.info("Denied by robots.txt: {}", fit.url);
output(fit.url, fit.datum, null,
ProtocolStatus.STATUS_ROBOTS_DENIED,
CrawlDatum.STATUS_FETCH_GONE);
- context.getCounter("FetcherStatus", "robots_denied").increment(1);
+ robotsDeniedCounter.increment(1);
continue;
}
if (rules.getCrawlDelay() > 0) {
@@ -397,8 +475,7 @@ public void run() {
output(fit.url, fit.datum, null,
ProtocolStatus.STATUS_ROBOTS_DENIED,
CrawlDatum.STATUS_FETCH_GONE);
- context.getCounter("FetcherStatus",
- "robots_denied_maxcrawldelay").increment(1);
+ robotsDeniedMaxCrawlDelayCounter.increment(1);
continue;
} else {
FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
@@ -415,8 +492,11 @@ public void run() {
fit.queueID, fiq.crawlDelay, fit.url);
}
}
+ // Track fetch latency
+ long fetchStart = System.currentTimeMillis();
ProtocolOutput output = protocol.getProtocolOutput(fit.url,
fit.datum);
+ fetchLatencyTracker.record(System.currentTimeMillis() - fetchStart);
ProtocolStatus status = output.getStatus();
Content content = output.getContent();
ParseStatus pstatus = null;
@@ -436,7 +516,8 @@ public void run() {
endEvent.addEventData("status", status.getName());
publisher.publish(endEvent, conf);
}
- context.getCounter("FetcherStatus", status.getName()).increment(1);
+ // Dynamic counter for protocol status - can't cache as status varies
+ context.getCounter(NutchMetrics.GROUP_FETCHER, status.getName()).increment(1);
if (storingProtocolVersions && content != null) {
countProtocolVersions(content.getMetadata());
@@ -489,8 +570,7 @@ public void run() {
int killedURLs = fetchQueues
.checkExceptionThreshold(fit.getQueueID());
if (killedURLs != 0)
- context.getCounter("FetcherStatus",
- "AboveExceptionThresholdInQueue").increment(killedURLs);
+ aboveExceptionThresholdCounter.increment(killedURLs);
/* FALLTHROUGH */
case ProtocolStatus.RETRY: // retry
@@ -520,8 +600,7 @@ public void run() {
if (redirecting && redirectCount > maxRedirect) {
fetchQueues.finishFetchItem(fit);
- context.getCounter("FetcherStatus", "redirect_count_exceeded")
- .increment(1);
+ redirectCountExceededCounter.increment(1);
LOG.info("{} {} - redirect count exceeded {} ({})", getName(),
Thread.currentThread().getId(), fit.url,
maxRedirectExceededSkip ? "skipped" : "linked");
@@ -540,15 +619,7 @@ public void run() {
} catch (Throwable t) { // unexpected exception
// unblock
fetchQueues.finishFetchItem(fit);
- String message;
- if (LOG.isDebugEnabled()) {
- message = StringUtils.stringifyException(t);
- } else if (logUtil.logShort(t)) {
- message = t.getClass().getName();
- } else {
- message = StringUtils.stringifyException(t);
- }
- logError(fit.url, message);
+ logError(fit.url, t);
output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED,
CrawlDatum.STATUS_FETCH_RETRY);
}
@@ -560,6 +631,10 @@ public void run() {
if (fit != null) {
fetchQueues.finishFetchItem(fit);
}
+ // Emit fetch latency metrics
+ fetchLatencyTracker.emitCounters(context);
+ // Emit error metrics
+ errorTracker.emitCounters(context);
activeThreads.decrementAndGet(); // count threads
LOG.info("{} {} -finishing thread {}, activeThreads={}", getName(),
Thread.currentThread().getId(), getName(), activeThreads);
@@ -655,13 +730,13 @@ private FetchItem queueRedirect(Text redirUrl, FetchItem fit)
throws ScoringFilterException {
if (fetchQueues.redirectIsQueuedRecently(redirUrl)) {
redirecting = false;
- context.getCounter("FetcherStatus", "redirect_deduplicated").increment(1);
+ redirectDeduplicatedCounter.increment(1);
LOG.debug(" - ignoring redirect from {} to {} as duplicate", fit.url,
redirUrl);
return null;
} else if (fetchQueues.timelimitExceeded()) {
redirecting = false;
- context.getCounter("FetcherStatus", "hitByTimeLimit").increment(1);
+ hitByTimeLimitCounter.increment(1);
LOG.debug(" - ignoring redirect from {} to {} - timelimit reached",
fit.url, redirUrl);
return null;
@@ -674,15 +749,24 @@ private FetchItem queueRedirect(Text redirUrl, FetchItem fit)
} else {
// stop redirecting
redirecting = false;
- context.getCounter("FetcherStatus", "FetchItem.notCreated.redirect").increment(1);
+ redirectNotCreatedCounter.increment(1);
}
return fit;
}
+ private void logError(Text url, Throwable t) {
+ String message = t.getClass().getName() + ": " + t.getMessage();
+ LOG.info("{} {} fetch of {} failed with: {}", getName(),
+ Thread.currentThread().getId(), url, message);
+ errors.incrementAndGet();
+ errorTracker.recordError(t);
+ }
+
private void logError(Text url, String message) {
LOG.info("{} {} fetch of {} failed with: {}", getName(),
Thread.currentThread().getId(), url, message);
errors.incrementAndGet();
+ errorTracker.recordError(ErrorTracker.ErrorType.OTHER);
}
private void countProtocolVersions(Metadata contentMetadata) {
@@ -693,21 +777,24 @@ private void countProtocolVersions(Metadata contentMetadata) {
if (versionStr != null) {
String[] versions = versionStr.split(",");
if (versions.length >= 1) {
- context.getCounter("HttpProtocolVersion", versions[0]).increment(1);
+ context.getCounter(NutchMetrics.FETCHER_HTTP_PROTOCOL_VERSION_GROUP,
+ versions[0]).increment(1);
} else {
- context.getCounter("HttpProtocolVersion", "unknown").increment(1);
+ context.getCounter(NutchMetrics.FETCHER_HTTP_PROTOCOL_VERSION_GROUP,
+ NutchMetrics.FETCHER_HTTP_PROTOCOL_UNKNOWN).increment(1);
}
for (int i = 1; i < versions.length; i++) {
- context.getCounter("TlsProtocolVersion", versions[i]).increment(1);
+ context.getCounter(NutchMetrics.FETCHER_TLS_PROTOCOL_VERSION_GROUP,
+ versions[i]).increment(1);
}
}
String ipaddress = contentMetadata.get(Response.IP_ADDRESS);
if (ipaddress == null) {
// IP address is not recorded
} else if (ipaddress.indexOf(':') != -1) {
- context.getCounter("IPaddressVersion", "IPv6").increment(1);
+ ipv6Counter.increment(1);
} else {
- context.getCounter("IPaddressVersion", "IPv4").increment(1);
+ ipv4Counter.increment(1);
}
}
@@ -885,8 +972,7 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
FetchItemQueue queue = fetchQueues.getFetchItemQueue(ft.queueID);
queue.alreadyFetched.add(url.toString().hashCode());
- context.getCounter("FetcherOutlinks", "outlinks_detected").increment(
- outlinks.size());
+ outlinksDetectedCounter.increment(outlinks.size());
// Counter to limit num outlinks to follow per page
int outlinkCounter = 0;
@@ -918,7 +1004,7 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
new CrawlDatum(CrawlDatum.STATUS_LINKED, interval),
queueMode, outlinkDepth + 1);
- context.getCounter("FetcherOutlinks", "outlinks_following").increment(1);
+ outlinksFollowingCounter.increment(1);
fetchQueues.addFetchItem(fit);
@@ -944,7 +1030,8 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
if (parseResult != null && !parseResult.isEmpty()) {
Parse p = parseResult.get(content.getUrl());
if (p != null) {
- context.getCounter("ParserStatus", ParseStatus.majorCodes[p
+ // Dynamic counter for parse status - can't cache as status varies
+ context.getCounter(NutchMetrics.GROUP_PARSER, ParseStatus.majorCodes[p
.getData().getStatus().getMajorCode()]).increment(1);
return p.getData().getStatus();
}
@@ -1012,7 +1099,7 @@ private boolean robotsTxtArchivingIsAllowed(Content robotsTxt) {
if (robotsTxtArchivingFilterUrlAlways
|| !u.getFile().equals("/robots.txt")) {
LOG.info("Archiving of robots.txt {} skipped by URL filters", url);
- context.getCounter("RobotsTxtArchiving", "filtered").increment(1);
+ robotsTxtArchivingFilteredCounter.increment(1);
return false;
}
@@ -1036,8 +1123,7 @@ private boolean robotsTxtArchivingIsAllowed(Content robotsTxt) {
if (!robotsTxtArchivingAcceptedMimeTypes.contains(contentType)) {
LOG.info("Archiving of robots.txt {} ({}) skipped by MIME filter",
url, contentType);
- context.getCounter("RobotsTxtArchiving", "filtered_mime")
- .increment(1);
+ robotsTxtArchivingFilteredMimeCounter.increment(1);
return false;
}
}
@@ -1057,8 +1143,7 @@ private boolean robotsTxtArchivingIsAllowed(Content robotsTxt) {
LOG.info(
"Archiving of redirected robots.txt {} ({}) not allowed by robots.txt",
url, robotsTxt.getContentType());
- context.getCounter("RobotsTxtArchiving", "robots_denied")
- .increment(1);
+ robotsTxtArchivingRobotsDeniedCounter.increment(1);
return false;
}
}
diff --git a/src/java/org/apache/nutch/fetcher/QueueFeeder.java b/src/java/org/apache/nutch/fetcher/QueueFeeder.java
index c48c4b8f31..5dfa24fd06 100644
--- a/src/java/org/apache/nutch/fetcher/QueueFeeder.java
+++ b/src/java/org/apache/nutch/fetcher/QueueFeeder.java
@@ -22,9 +22,11 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.fetcher.FetchItemQueues.QueuingStatus;
import org.apache.nutch.fetcher.Fetcher.FetcherRun;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
@@ -47,6 +49,12 @@ public class QueueFeeder extends Thread {
private URLNormalizers urlNormalizers = null;
private String urlNormalizerScope = URLNormalizers.SCOPE_DEFAULT;
+ // Cached counter references to avoid repeated lookups in hot paths
+ private Counter hitByTimeoutCounter;
+ private Counter hitByTimelimitCounter;
+ private Counter filteredCounter;
+ private Counter aboveExceptionThresholdCounter;
+
public QueueFeeder(FetcherRun.Context context,
FetchItemQueues queues, int size) {
this.context = context;
@@ -61,6 +69,21 @@ public QueueFeeder(FetcherRun.Context context,
if (conf.getBoolean("fetcher.normalize.urls", false)) {
urlNormalizers = new URLNormalizers(conf, urlNormalizerScope);
}
+ initCounters();
+ }
+
+ /**
+ * Look up the four feeder counters once; run() then increments these references instead of calling context.getCounter() per record.
+ */
+ private void initCounters() {
+ hitByTimeoutCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_TIMEOUT_TOTAL);
+ hitByTimelimitCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL);
+ filteredCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_FILTERED_TOTAL);
+ aboveExceptionThresholdCounter = context.getCounter(
+ NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_ABOVE_EXCEPTION_THRESHOLD_TOTAL);
}
/** Filter and normalize the url */
@@ -94,14 +117,14 @@ public void run() {
LOG.info("QueueFeeder stopping, timeout reached.");
}
queuingStatus[qstatus]++;
- context.getCounter("FetcherStatus", "hitByTimeout").increment(1);
+ hitByTimeoutCounter.increment(1);
} else {
int qstatus = QueuingStatus.HIT_BY_TIMELIMIT.ordinal();
if (queuingStatus[qstatus] == 0) {
LOG.info("QueueFeeder stopping, timelimit exceeded.");
}
queuingStatus[qstatus]++;
- context.getCounter("FetcherStatus", "hitByTimeLimit").increment(1);
+ hitByTimelimitCounter.increment(1);
}
try {
hasMore = context.nextKeyValue();
@@ -133,7 +156,7 @@ public void run() {
String u = filterNormalize(url.toString());
if (u == null) {
// filtered or failed to normalize
- context.getCounter("FetcherStatus", "filtered").increment(1);
+ filteredCounter.increment(1);
continue;
}
url = new Text(u);
@@ -150,9 +173,7 @@ public void run() {
QueuingStatus status = queues.addFetchItem(url, datum);
queuingStatus[status.ordinal()]++;
if (status == QueuingStatus.ABOVE_EXCEPTION_THRESHOLD) {
- context
- .getCounter("FetcherStatus", "AboveExceptionThresholdInQueue")
- .increment(1);
+ aboveExceptionThresholdCounter.increment(1);
}
cnt++;
feed--;
diff --git a/src/java/org/apache/nutch/hostdb/ResolverThread.java b/src/java/org/apache/nutch/hostdb/ResolverThread.java
index 2140ea52d1..05e4a940c8 100644
--- a/src/java/org/apache/nutch/hostdb/ResolverThread.java
+++ b/src/java/org/apache/nutch/hostdb/ResolverThread.java
@@ -21,9 +21,13 @@
import java.net.UnknownHostException;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.metrics.ErrorTracker;
+import org.apache.nutch.metrics.NutchMetrics;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -41,6 +45,17 @@ public class ResolverThread implements Runnable {
protected Context context;
protected int purgeFailedHostsThreshold;
+ // Cached counter references for performance
+ private Counter newKnownHostCounter;
+ private Counter rediscoveredHostCounter;
+ private Counter existingKnownHostCounter;
+ private Counter newUnknownHostCounter;
+ private Counter existingUnknownHostCounter;
+ private Counter purgedUnknownHostCounter;
+ private Counter checkedHostsCounter;
+ private Counter errorsCounter;
+ private Counter errorsNetworkCounter;
+
/**
* Overloaded constructor.
* @param host name of the host to lookup
@@ -58,6 +73,33 @@ public ResolverThread(String host, HostDatum datum,
this.datum = datum;
this.context = context;
this.purgeFailedHostsThreshold = purgeFailedHostsThreshold;
+
+ // Initialize cached counters for performance
+ initCounters();
+ }
+
+ /**
+ * Look up every fixed-name counter used by run(). NOTE(review): called from the constructor, i.e. once per ResolverThread (one host each) - confirm the up-front lookups pay off.
+ */
+ private void initCounters() {
+ newKnownHostCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_NEW_KNOWN_HOST_TOTAL);
+ rediscoveredHostCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_REDISCOVERED_HOST_TOTAL);
+ existingKnownHostCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_EXISTING_KNOWN_HOST_TOTAL);
+ newUnknownHostCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_NEW_UNKNOWN_HOST_TOTAL);
+ existingUnknownHostCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_EXISTING_UNKNOWN_HOST_TOTAL);
+ purgedUnknownHostCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_PURGED_UNKNOWN_HOST_TOTAL);
+ checkedHostsCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_CHECKED_HOSTS_TOTAL);
+ errorsCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.ERROR_TOTAL);
+ errorsNetworkCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.ERROR_NETWORK_TOTAL);
}
/**
@@ -72,16 +114,16 @@ public void run() {
InetAddress inetAddr = InetAddress.getByName(host);
if (datum.isEmpty()) {
- context.getCounter("UpdateHostDb", "new_known_host").increment(1);
+ newKnownHostCounter.increment(1);
datum.setLastCheck();
LOG.info("{}: new_known_host {}", host, datum);
} else if (datum.getDnsFailures() > 0) {
- context.getCounter("UpdateHostDb", "rediscovered_host").increment(1);
+ rediscoveredHostCounter.increment(1);
datum.setLastCheck();
datum.setDnsFailures(0l);
LOG.info("{}: rediscovered_host {}", host, datum);
} else {
- context.getCounter("UpdateHostDb", "existing_known_host").increment(1);
+ existingKnownHostCounter.increment(1);
datum.setLastCheck();
LOG.info("{}: existing_known_host {}", host, datum);
}
@@ -95,7 +137,7 @@ public void run() {
datum.setLastCheck();
datum.setDnsFailures(1l);
context.write(hostText, datum);
- context.getCounter("UpdateHostDb", "new_unknown_host").increment(1);
+ newUnknownHostCounter.increment(1);
LOG.info("{}: new_unknown_host {}", host, datum);
} else {
datum.setLastCheck();
@@ -106,23 +148,34 @@ public void run() {
purgeFailedHostsThreshold < datum.getDnsFailures()) {
context.write(hostText, datum);
- context.getCounter("UpdateHostDb", "existing_unknown_host").increment(1);
+ existingUnknownHostCounter.increment(1);
LOG.info("{}: existing_unknown_host {}", host, datum);
} else {
- context.getCounter("UpdateHostDb", "purged_unknown_host").increment(1);
+ purgedUnknownHostCounter.increment(1);
LOG.info("{}: purged_unknown_host {}", host, datum);
}
}
- context.getCounter("UpdateHostDb", createFailureCounterLabel(datum)).increment(1);
+ // Dynamic counter based on failure count - can't cache
+ context.getCounter(NutchMetrics.GROUP_HOSTDB, createFailureCounterLabel(datum)).increment(1);
+ // Common error counters for consistency
+ errorsCounter.increment(1);
+ errorsNetworkCounter.increment(1);
} catch (Exception ioe) {
LOG.warn(StringUtils.stringifyException(ioe));
+ // Same ERROR_TOTAL counter as before, via the cached reference (matches the outer catch)
+ errorsCounter.increment(1);
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
+ ErrorTracker.getCounterName(ioe)).increment(1);
}
} catch (Exception e) {
LOG.warn(StringUtils.stringifyException(e));
+ errorsCounter.increment(1);
+ context.getCounter(NutchMetrics.GROUP_HOSTDB,
+ ErrorTracker.getCounterName(e)).increment(1);
}
- context.getCounter("UpdateHostDb", "checked_hosts").increment(1);
+ checkedHostsCounter.increment(1);
}
private String createFailureCounterLabel(HostDatum datum) {
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
index ca6797ac0a..b1736348b8 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
@@ -24,12 +24,15 @@
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.ErrorTracker;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.protocol.ProtocolStatus;
@@ -60,6 +63,10 @@ public class UpdateHostDbMapper
protected URLFilters filters = null;
protected URLNormalizers normalizers = null;
+ // Cached counter references to avoid repeated lookups in hot paths
+ protected Counter filteredRecordsCounter;
+ protected ErrorTracker errorTracker;
+
@Override
public void setup(Mapper.Context context) {
Configuration conf = context.getConfiguration();
@@ -71,6 +78,19 @@ public void setup(Mapper.Context context) {
filters = new URLFilters(conf);
if (normalize)
normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
+
+ // Initialize cached counter references
+ initCounters(context);
+ // Initialize error tracker with cached counters
+ errorTracker = new ErrorTracker(NutchMetrics.GROUP_HOSTDB, context);
+ }
+
+ /**
+ * Look up the filtered-records counter once; map() increments it whenever a record is filtered out.
+ */
+ private void initCounters(Context context) {
+ filteredRecordsCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL);
}
/**
@@ -136,7 +156,7 @@ public void map(Text key, Writable value,
try {
url = new URL(keyStr);
} catch (MalformedURLException e) {
- context.getCounter("UpdateHostDb", "malformed_url").increment(1);
+ errorTracker.incrementCounters(e);
return;
}
String hostName = URLUtil.getHost(url);
@@ -146,7 +166,7 @@ public void map(Text key, Writable value,
// Filtered out?
if (buffer == null) {
- context.getCounter("UpdateHostDb", "filtered_records").increment(1);
+ filteredRecordsCounter.increment(1);
LOG.debug("UpdateHostDb: {} crawldatum has been filtered", hostName);
return;
}
@@ -219,7 +239,7 @@ public void map(Text key, Writable value,
// Filtered out?
if (buffer == null) {
- context.getCounter("UpdateHostDb", "filtered_records").increment(1);
+ filteredRecordsCounter.increment(1);
LOG.debug("UpdateHostDb: {} hostdatum has been filtered", keyStr);
return;
}
@@ -243,7 +263,7 @@ public void map(Text key, Writable value,
// Filtered out?
if (buffer == null) {
- context.getCounter("UpdateHostDb", "filtered_records").increment(1);
+ filteredRecordsCounter.increment(1);
LOG.debug("UpdateHostDb: {} score has been filtered", keyStr);
return;
}
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
index 1431b56365..878216b3c6 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
@@ -31,11 +31,13 @@
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
+import org.apache.nutch.metrics.NutchMetrics;
import com.tdunning.math.stats.TDigest;
@@ -72,6 +74,11 @@ public class UpdateHostDbReducer
protected BlockingQueue queue = new SynchronousQueue<>();
protected ThreadPoolExecutor executor = null;
+ // Cached counter references to avoid repeated lookups in hot paths
+ protected Counter urlLimitNotReachedCounter;
+ protected Counter totalHostsCounter;
+ protected Counter skippedNotEligibleCounter;
+
/**
* Configures the thread pool and prestarts all resolver threads.
*/
@@ -145,6 +152,21 @@ public void setup(Reducer.Context context)
// Run all threads in the pool
executor.prestartAllCoreThreads();
}
+
+ // Initialize cached counter references
+ initCounters(context);
+ }
+
+ /**
+ * Look up the three fixed-name HostDb counters once; reduce() increments the cached references per host.
+ */
+ private void initCounters(Reducer.Context context) {
+ urlLimitNotReachedCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_URL_LIMIT_NOT_REACHED_TOTAL);
+ totalHostsCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_TOTAL_HOSTS_TOTAL);
+ skippedNotEligibleCounter = context.getCounter(
+ NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_SKIPPED_NOT_ELIGIBLE_TOTAL);
}
/**
@@ -379,12 +401,12 @@ else if (value instanceof FloatWritable) {
// Impose limits on minimum number of URLs?
if (urlLimit > -1l) {
if (hostDatum.numRecords() < urlLimit) {
- context.getCounter("UpdateHostDb", "url_limit_not_reached").increment(1);
+ urlLimitNotReachedCounter.increment(1);
return;
}
}
- context.getCounter("UpdateHostDb", "total_hosts").increment(1);
+ totalHostsCounter.increment(1);
// See if this record is to be checked
if (shouldCheck(hostDatum)) {
@@ -401,7 +423,7 @@ else if (value instanceof FloatWritable) {
// Do not progress, the datum will be written in the resolver thread
return;
} else if (checkAny) {
- context.getCounter("UpdateHostDb", "skipped_not_eligible").increment(1);
+ skippedNotEligibleCounter.increment(1);
LOG.debug("UpdateHostDb: {}: skipped_not_eligible", key);
}
diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java b/src/java/org/apache/nutch/indexer/CleaningJob.java
index cedee8e34c..dc466dad06 100644
--- a/src/java/org/apache/nutch/indexer/CleaningJob.java
+++ b/src/java/org/apache/nutch/indexer/CleaningJob.java
@@ -26,6 +26,7 @@
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
@@ -36,6 +37,7 @@
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.slf4j.Logger;
@@ -88,6 +90,9 @@ public static class DeleterReducer extends
IndexWriters writers = null;
+ // Cached counter reference for performance
+ private Counter deletedDocumentsCounter;
+
@Override
public void setup(Reducer.Context context) {
Configuration conf = context.getConfiguration();
@@ -98,6 +103,17 @@ public void setup(Reducer.Context contex
throw new RuntimeException(e);
}
noCommit = conf.getBoolean("noCommit", false);
+
+ // Initialize cached counter reference
+ initCounters(context);
+ }
+
+ /**
+ * Look up the deleted-documents counter once; reduce() increments it for every document it deletes.
+ */
+ private void initCounters(Context context) {
+ deletedDocumentsCounter = context.getCounter(
+ NutchMetrics.GROUP_CLEANING, NutchMetrics.CLEANING_DELETED_DOCUMENTS_TOTAL);
}
@Override
@@ -118,7 +134,7 @@ public void reduce(ByteWritable key, Iterable values,
for (Text document : values) {
writers.delete(document.toString());
totalDeleted++;
- context.getCounter("CleaningJobStatus", "Deleted documents").increment(1);
+ deletedDocumentsCounter.increment(1);
}
}
}
diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index 9fb8007715..50da12b8a2 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -30,6 +30,7 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
@@ -40,6 +41,9 @@
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.LinkDb;
import org.apache.nutch.crawl.NutchWritable;
+import org.apache.nutch.metrics.ErrorTracker;
+import org.apache.nutch.metrics.LatencyTracker;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
@@ -214,6 +218,22 @@ public static class IndexerReducer extends
private URLNormalizers urlNormalizers;
private URLFilters urlFilters;
+ // Latency tracker for indexing timing metrics
+ private LatencyTracker indexLatencyTracker;
+
+ // Cached counter references to avoid repeated lookups in hot paths
+ private Counter deletedRobotsNoIndexCounter;
+ private Counter deletedGoneCounter;
+ private Counter deletedRedirectsCounter;
+ private Counter deletedDuplicatesCounter;
+ private Counter skippedNotModifiedCounter;
+ private Counter deletedByIndexingFilterCounter;
+ private Counter skippedByIndexingFilterCounter;
+ private Counter indexedCounter;
+
+ // Error tracker with cached counters
+ private ErrorTracker errorTracker;
+
@Override
public void setup(Reducer.Context context) {
Configuration conf = context.getConfiguration();
@@ -238,6 +258,44 @@ public void setup(Reducer.Context c
if (filter) {
urlFilters = new URLFilters(conf);
}
+
+ // Initialize latency tracker for indexing timing
+ indexLatencyTracker = new LatencyTracker(
+ NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_LATENCY);
+
+ // Initialize cached counter references
+ initCounters(context);
+ }
+
+ /**
+ * Look up the fixed-name indexer counters once and create the error tracker, so reduce() avoids per-record counter lookups.
+ */
+ private void initCounters(Reducer.Context context) {
+ deletedRobotsNoIndexCounter = context.getCounter(
+ NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_ROBOTS_NOINDEX_TOTAL);
+ deletedGoneCounter = context.getCounter(
+ NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_GONE_TOTAL);
+ deletedRedirectsCounter = context.getCounter(
+ NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_REDIRECTS_TOTAL);
+ deletedDuplicatesCounter = context.getCounter(
+ NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_DUPLICATES_TOTAL);
+ skippedNotModifiedCounter = context.getCounter(
+ NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_SKIPPED_NOT_MODIFIED_TOTAL);
+ deletedByIndexingFilterCounter = context.getCounter(
+ NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_BY_INDEXING_FILTER_TOTAL);
+ skippedByIndexingFilterCounter = context.getCounter(
+ NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_SKIPPED_BY_INDEXING_FILTER_TOTAL);
+ indexedCounter = context.getCounter(
+ NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_INDEXED_TOTAL);
+ // Error tracker for this reducer's group; reduce() reports filter exceptions through it
+ errorTracker = new ErrorTracker(NutchMetrics.GROUP_INDEXER, context);
+ }
+
+ @Override
+ public void cleanup(Reducer.Context context)
+ throws IOException, InterruptedException {
+ // Publish the latencies recorded in reduce() as counters when the reducer is cleaned up
+ indexLatencyTracker.emitCounters(context);
}
@Override
@@ -283,7 +341,7 @@ public void reduce(Text key, Iterable values,
.indexOf("noindex") != -1) {
// Delete it!
context.write(key, DELETE_ACTION);
- context.getCounter("IndexerStatus", "deleted (robots=noindex)").increment(1);
+ deletedRobotsNoIndexCounter.increment(1);
return;
}
}
@@ -300,7 +358,7 @@ public void reduce(Text key, Iterable values,
if (delete && fetchDatum != null) {
if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
|| dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
- context.getCounter("IndexerStatus", "deleted (gone)").increment(1);
+ deletedGoneCounter.increment(1);
context.write(key, DELETE_ACTION);
return;
}
@@ -309,7 +367,7 @@ public void reduce(Text key, Iterable values,
|| fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
|| dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
|| dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
- context.getCounter("IndexerStatus", "deleted (redirects)").increment(1);
+ deletedRedirectsCounter.increment(1);
context.write(key, DELETE_ACTION);
return;
}
@@ -321,14 +379,14 @@ public void reduce(Text key, Iterable values,
// Whether to delete pages marked as duplicates
if (delete && dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
- context.getCounter("IndexerStatus", "deleted (duplicates)").increment(1);
+ deletedDuplicatesCounter.increment(1);
context.write(key, DELETE_ACTION);
return;
}
// Whether to skip DB_NOTMODIFIED pages
if (skip && dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
- context.getCounter("IndexerStatus", "skipped (not modified)").increment(1);
+ skippedNotModifiedCounter.increment(1);
return;
}
@@ -337,6 +395,9 @@ public void reduce(Text key, Iterable values,
return;
}
+ // Start timing document indexing
+ long indexStart = System.currentTimeMillis();
+
NutchDocument doc = new NutchDocument();
doc.add("id", key.toString());
@@ -355,7 +416,7 @@ public void reduce(Text key, Iterable values,
boost = scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse,
inlinks, boost);
} catch (final ScoringFilterException e) {
- context.getCounter("IndexerStatus", "errors (ScoringFilter)").increment(1);
+ errorTracker.incrementCounters(e);
LOG.warn("Error calculating score {}: {}", key, e);
return;
}
@@ -390,7 +451,7 @@ public void reduce(Text key, Iterable values,
doc = filters.filter(doc, parse, key, fetchDatum, inlinks);
} catch (final IndexingException e) {
LOG.warn("Error indexing {}: ", key, e);
- context.getCounter("IndexerStatus", "errors (IndexingFilter)").increment(1);
+ errorTracker.incrementCounters(e);
return;
}
@@ -400,9 +461,9 @@ public void reduce(Text key, Iterable values,
if (deleteSkippedByIndexingFilter) {
NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
context.write(key, action);
- context.getCounter("IndexerStatus", "deleted (IndexingFilter)").increment(1);
+ deletedByIndexingFilterCounter.increment(1);
} else {
- context.getCounter("IndexerStatus", "skipped (IndexingFilter)").increment(1);
+ skippedByIndexingFilterCounter.increment(1);
}
return;
}
@@ -422,7 +483,10 @@ public void reduce(Text key, Iterable values,
doc.add("binaryContent", binary);
}
- context.getCounter("IndexerStatus", "indexed (add/update)").increment(1);
+ // Record indexing latency
+ indexLatencyTracker.record(System.currentTimeMillis() - indexStart);
+
+ indexedCounter.increment(1);
NutchIndexAction action = new NutchIndexAction(doc, NutchIndexAction.ADD);
context.write(key, action);
diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java b/src/java/org/apache/nutch/indexer/IndexingJob.java
index fc2c44a064..224b4118e6 100644
--- a/src/java/org/apache/nutch/indexer/IndexingJob.java
+++ b/src/java/org/apache/nutch/indexer/IndexingJob.java
@@ -30,6 +30,7 @@
import org.apache.commons.lang3.time.StopWatch;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.segment.SegmentChecker;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
@@ -155,10 +156,14 @@ public void index(Path crawlDb, Path linkDb, List segments,
throw e;
}
LOG.info("Indexer: number of documents indexed, deleted, or skipped:");
- for (Counter counter : job.getCounters().getGroup("IndexerStatus")) {
- LOG.info("Indexer: {} {}",
- String.format(Locale.ROOT, "%6d", counter.getValue()),
- counter.getName());
+ for (Counter counter : job.getCounters()
+ .getGroup(NutchMetrics.GROUP_INDEXER)) {
+ long counterValue = counter.getValue();
+ if (counterValue > 0) {
+ LOG.info("Indexer: {} {}",
+ String.format(Locale.ROOT, "%6d", counterValue),
+ counter.getName());
+ }
}
stopWatch.stop();
LOG.info("Indexer: finished, elapsed: {} ms", stopWatch.getTime(
diff --git a/src/java/org/apache/nutch/metrics/ErrorTracker.java b/src/java/org/apache/nutch/metrics/ErrorTracker.java
new file mode 100644
index 0000000000..1921071605
--- /dev/null
+++ b/src/java/org/apache/nutch/metrics/ErrorTracker.java
@@ -0,0 +1,407 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metrics;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.SocketException;
+import java.net.SocketTimeoutException;
+import java.net.UnknownHostException;
+import java.util.EnumMap;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.hadoop.mapreduce.Counter;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+
+/**
+ * A utility class for tracking errors by category with automatic classification.
+ *
+ * <p>Errors are sorted into a bounded set of categories ({@link ErrorType}) to
+ * stay within Hadoop's counter limits (~120 counters). Local counts are held in
+ * {@link AtomicLong}s, so concurrent {@code recordError} calls do not lose
+ * updates. The cached Hadoop counter references are plain fields: finish
+ * {@link #initCounters(TaskInputOutputContext)} before sharing the tracker.
+ *
+ * <p>Usage:
+ * <pre>
+ * // In mapper/reducer setup or thread initialization
+ * errorTracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER);
+ *
+ * // When catching exceptions
+ * try {
+ *   // ... operation ...
+ * } catch (Exception e) {
+ *   errorTracker.recordError(e); // Auto-categorizes
+ * }
+ *
+ * // Or with manual categorization
+ * errorTracker.recordError(ErrorTracker.ErrorType.NETWORK);
+ *
+ * // In cleanup - emit all error counters
+ * errorTracker.emitCounters(context);
+ * </pre>
+ *
+ * <p>Emits the following counters:
+ * <ul>
+ * <li>errors_total - total number of errors across all categories</li>
+ * <li>errors_network_total - network-related errors</li>
+ * <li>errors_protocol_total - protocol errors</li>
+ * <li>errors_parsing_total - parsing errors</li>
+ * <li>errors_url_total - URL-related errors</li>
+ * <li>errors_scoring_total - scoring filter errors</li>
+ * <li>errors_indexing_total - indexing filter errors</li>
+ * <li>errors_timeout_total - timeout errors</li>
+ * <li>errors_other_total - uncategorized errors</li>
+ * </ul>
+ *
+ * @since 1.22
+ */
+public class ErrorTracker {
+
+  /**
+   * Error type categories for classification.
+   * Uses a bounded set to stay within Hadoop's counter limits.
+   */
+  public enum ErrorType {
+    /** Network-related errors (IOException, SocketException, etc.) */
+    NETWORK,
+    /** Protocol errors (ProtocolException, ProtocolNotFound) */
+    PROTOCOL,
+    /** Parsing errors (ParseException, ParserNotFound) */
+    PARSING,
+    /** URL-related errors (MalformedURLException, URLFilterException) */
+    URL,
+    /** Scoring filter errors */
+    SCORING,
+    /** Indexing filter errors */
+    INDEXING,
+    /** Timeout errors (SocketTimeoutException) */
+    TIMEOUT,
+    /** Other uncategorized errors */
+    OTHER
+  }
+
+  /**
+   * Maximum number of throwables inspected along a cause chain; bounds the
+   * walk in {@link #categorize(Throwable)} even if the chain is cyclic.
+   */
+  private static final int MAX_CAUSE_DEPTH = 32;
+
+  private final String group;
+  private final Map<ErrorType, AtomicLong> counts;
+  private final AtomicLong totalCount;
+
+  // Cached counter references for performance (optional - set via initCounters)
+  private Counter cachedTotalCounter;
+  private final Map<ErrorType, Counter> cachedCounters;
+
+  /**
+   * Creates a new ErrorTracker for the specified counter group.
+   *
+   * <p>This constructor creates an ErrorTracker without cached counters.
+   * Call {@link #initCounters(TaskInputOutputContext)} in setup() to cache
+   * counter references for better performance.
+   *
+   * @param group the Hadoop counter group name (e.g., NutchMetrics.GROUP_FETCHER)
+   */
+  public ErrorTracker(String group) {
+    this.group = group;
+    this.counts = new EnumMap<>(ErrorType.class);
+    this.cachedCounters = new EnumMap<>(ErrorType.class);
+    this.totalCount = new AtomicLong(0);
+
+    // Pre-populate every category so later lookups never return null
+    for (ErrorType type : ErrorType.values()) {
+      counts.put(type, new AtomicLong(0));
+    }
+  }
+
+  /**
+   * Creates a new ErrorTracker with cached counter references.
+   *
+   * <p>This constructor caches all counter references at creation time,
+   * avoiding repeated counter lookups in hot paths.
+   *
+   * @param group the Hadoop counter group name
+   * @param context the Hadoop task context for caching counters
+   */
+  public ErrorTracker(String group, TaskInputOutputContext<?, ?, ?, ?> context) {
+    this(group);
+    initCounters(context);
+  }
+
+  /**
+   * Initializes cached counter references from the Hadoop context.
+   *
+   * <p>Call this method in the mapper/reducer setup() method to cache
+   * counter references and avoid repeated lookups during processing.
+   *
+   * @param context the Hadoop task context
+   */
+  public void initCounters(TaskInputOutputContext<?, ?, ?, ?> context) {
+    for (ErrorType type : ErrorType.values()) {
+      cachedCounters.put(type, context.getCounter(group, getCounterName(type)));
+    }
+    // Assigned last: a non-null total counter is what marks the cache as ready,
+    // so it is only set once every per-type counter is in place.
+    cachedTotalCounter = context.getCounter(group, NutchMetrics.ERROR_TOTAL);
+  }
+
+  /**
+   * Records an error with automatic categorization based on the throwable type.
+   *
+   * @param t the throwable to categorize and record
+   */
+  public void recordError(Throwable t) {
+    recordError(categorize(t));
+  }
+
+  /**
+   * Records an error with explicit category.
+   *
+   * <p>Only the local counts change; nothing reaches Hadoop until
+   * {@link #emitCounters(TaskInputOutputContext)} is called.
+   *
+   * @param type the error type category
+   */
+  public void recordError(ErrorType type) {
+    counts.get(type).incrementAndGet();
+    totalCount.incrementAndGet();
+  }
+
+  /**
+   * Returns the count for a specific error type.
+   *
+   * @param type the error type
+   * @return the count for that error type
+   */
+  public long getCount(ErrorType type) {
+    return counts.get(type).get();
+  }
+
+  /**
+   * Returns the total count of all errors.
+   *
+   * @return the total error count
+   */
+  public long getTotalCount() {
+    return totalCount.get();
+  }
+
+  /**
+   * Emits all error counters to the Hadoop context.
+   *
+   * <p>Should be called once during cleanup to emit aggregated metrics: the
+   * local counts are added to the Hadoop counters and are not reset, so a
+   * second call would add them again. Per-type counters are only touched for
+   * error types that have non-zero counts; the total counter is always
+   * incremented (possibly by zero).
+   *
+   * <p>If counters were cached via {@link #initCounters(TaskInputOutputContext)},
+   * uses the cached references for better performance.
+   *
+   * @param context the Hadoop task context
+   */
+  public void emitCounters(TaskInputOutputContext<?, ?, ?, ?> context) {
+    // Use cached counters if available, otherwise look them up by name
+    final boolean cached = cachedTotalCounter != null;
+    Counter total = cached ? cachedTotalCounter
+        : context.getCounter(group, NutchMetrics.ERROR_TOTAL);
+    total.increment(totalCount.get());
+
+    for (ErrorType type : ErrorType.values()) {
+      long count = counts.get(type).get();
+      if (count > 0) {
+        Counter counter = cached ? cachedCounters.get(type)
+            : context.getCounter(group, getCounterName(type));
+        counter.increment(count);
+      }
+    }
+  }
+
+  /**
+   * Directly increments cached error counters without local accumulation.
+   *
+   * <p>Use this method when you want to immediately update Hadoop counters
+   * rather than accumulating locally and emitting in cleanup.
+   * Requires {@link #initCounters(TaskInputOutputContext)} to have been called.
+   *
+   * @param t the throwable to categorize and count
+   * @throws IllegalStateException if counters have not been initialized
+   */
+  public void incrementCounters(Throwable t) {
+    incrementCounters(categorize(t));
+  }
+
+  /**
+   * Directly increments cached error counters without local accumulation.
+   *
+   * <p>Use this method when you want to immediately update Hadoop counters
+   * rather than accumulating locally and emitting in cleanup.
+   * Requires {@link #initCounters(TaskInputOutputContext)} to have been called.
+   *
+   * @param type the error type to count
+   * @throws IllegalStateException if counters have not been initialized
+   */
+  public void incrementCounters(ErrorType type) {
+    if (cachedTotalCounter == null) {
+      throw new IllegalStateException(
+          "Counters not initialized. Call initCounters() first.");
+    }
+    cachedTotalCounter.increment(1);
+    cachedCounters.get(type).increment(1);
+  }
+
+  /**
+   * Categorizes a throwable into an error type.
+   *
+   * <p>The throwable itself is classified first; if it does not match any
+   * specific category, its cause chain is followed and the first cause with a
+   * specific category wins. Only a bounded number of throwables is inspected,
+   * so a cyclic cause chain cannot loop forever.
+   *
+   * @param t the throwable to categorize, may be {@code null}
+   * @return the appropriate ErrorType for the throwable, {@link ErrorType#OTHER}
+   *         if {@code t} is {@code null} or nothing in the chain matches
+   */
+  public static ErrorType categorize(Throwable t) {
+    Throwable current = t;
+    for (int depth = 0; current != null && depth < MAX_CAUSE_DEPTH; depth++) {
+      ErrorType type = classify(current);
+      if (type != ErrorType.OTHER) {
+        return type;
+      }
+      current = current.getCause();
+    }
+    return ErrorType.OTHER;
+  }
+
+  /**
+   * Classifies a single throwable by type and class name, ignoring its cause.
+   *
+   * <p>The order of the checks matters: timeouts and URL errors are tested
+   * before the generic {@link IOException} check because
+   * {@link SocketTimeoutException} and {@link MalformedURLException} are
+   * themselves IOExceptions. Exception types not imported by this class are
+   * matched by substring of their fully qualified class name.
+   *
+   * @param t the throwable to classify, must not be {@code null}
+   * @return the matching category, or {@link ErrorType#OTHER} if none applies
+   */
+  private static ErrorType classify(Throwable t) {
+    String className = t.getClass().getName();
+
+    // Timeouts first (before general IOException). The name test also covers
+    // every "...TimeoutException" class, so no separate check is needed.
+    if (t instanceof SocketTimeoutException || className.contains("Timeout")) {
+      return ErrorType.TIMEOUT;
+    }
+
+    // Network errors
+    if (t instanceof SocketException
+        || t instanceof UnknownHostException
+        || className.contains("ConnectException")
+        || className.contains("NoRouteToHostException")
+        || className.contains("ConnectionRefusedException")) {
+      return ErrorType.NETWORK;
+    }
+
+    // URL errors (before general IOException: MalformedURLException extends it)
+    if (t instanceof MalformedURLException
+        || className.contains("URLFilterException")
+        || className.contains("URISyntaxException")) {
+      return ErrorType.URL;
+    }
+
+    // Any remaining IOException is counted as a network error
+    if (t instanceof IOException) {
+      return ErrorType.NETWORK;
+    }
+
+    // Protocol errors
+    if (className.contains("ProtocolException")
+        || className.contains("ProtocolNotFound")) {
+      return ErrorType.PROTOCOL;
+    }
+
+    // Parsing errors
+    if (className.contains("ParseException")
+        || className.contains("ParserNotFound")
+        || className.contains("SAXException")
+        || className.contains("ParserConfigurationException")) {
+      return ErrorType.PARSING;
+    }
+
+    // Scoring errors
+    if (className.contains("ScoringFilterException")) {
+      return ErrorType.SCORING;
+    }
+
+    // Indexing errors
+    if (className.contains("IndexingException")) {
+      return ErrorType.INDEXING;
+    }
+
+    return ErrorType.OTHER;
+  }
+
+  /**
+   * Gets the counter name constant for a given error type.
+   *
+   * @param type the error type
+   * @return the counter name constant from NutchMetrics
+   */
+  public static String getCounterName(ErrorType type) {
+    switch (type) {
+    case NETWORK:
+      return NutchMetrics.ERROR_NETWORK_TOTAL;
+    case PROTOCOL:
+      return NutchMetrics.ERROR_PROTOCOL_TOTAL;
+    case PARSING:
+      return NutchMetrics.ERROR_PARSING_TOTAL;
+    case URL:
+      return NutchMetrics.ERROR_URL_TOTAL;
+    case SCORING:
+      return NutchMetrics.ERROR_SCORING_TOTAL;
+    case INDEXING:
+      return NutchMetrics.ERROR_INDEXING_TOTAL;
+    case TIMEOUT:
+      return NutchMetrics.ERROR_TIMEOUT_TOTAL;
+    case OTHER:
+    default:
+      return NutchMetrics.ERROR_OTHER_TOTAL;
+    }
+  }
+
+  /**
+   * Gets the counter name for a throwable based on its categorization.
+   *
+   * <p>This is a convenience method for direct use in catch blocks:
+   * <pre>
+   * } catch (Exception e) {
+   *   context.getCounter(group, ErrorTracker.getCounterName(e)).increment(1);
+   * }
+   * </pre>
+   *
+   * @param t the throwable to get the counter name for
+   * @return the counter name constant from NutchMetrics
+   */
+  public static String getCounterName(Throwable t) {
+    return getCounterName(categorize(t));
+  }
+}
diff --git a/src/java/org/apache/nutch/metrics/LatencyTracker.java b/src/java/org/apache/nutch/metrics/LatencyTracker.java
new file mode 100644
index 0000000000..3777bb29e3
--- /dev/null
+++ b/src/java/org/apache/nutch/metrics/LatencyTracker.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metrics;
+
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+
+import com.tdunning.math.stats.TDigest;
+
+/**
+ * A utility class for tracking latency metrics using TDigest for percentile
+ * calculation.
+ *
+ * <p>This class wraps a TDigest data structure to collect latency samples and
+ * emit Hadoop counters with count, sum, and percentile values (p50, p95, p99).
+ * The sample count and sum are plain fields updated without synchronization,
+ * so an instance should not be shared between threads without external locking.
+ *
+ * <p>Usage:
+ * <pre>
+ * // In mapper/reducer setup
+ * latencyTracker = new LatencyTracker(NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_LATENCY);
+ *
+ * // During processing
+ * long start = System.currentTimeMillis();
+ * // ... operation ...
+ * latencyTracker.record(System.currentTimeMillis() - start);
+ *
+ * // In cleanup
+ * latencyTracker.emitCounters(context);
+ * </pre>
+ *
+ * <p>Emits the following counters:
+ * <ul>
+ * <li>{prefix}_count_total - total number of samples</li>
+ * <li>{prefix}_sum_ms - sum of all latencies in milliseconds</li>
+ * <li>{prefix}_p50_ms - 50th percentile (median) latency</li>
+ * <li>{prefix}_p95_ms - 95th percentile latency</li>
+ * <li>{prefix}_p99_ms - 99th percentile latency</li>
+ * </ul>
+ *
+ * @since 1.22
+ */
+public class LatencyTracker {
+
+  /** Default compression factor for TDigest (controls accuracy vs memory). */
+  private static final double DEFAULT_COMPRESSION = 100.0;
+
+  private final TDigest digest;
+  private final String group;
+  private final String prefix;
+  private long count = 0;
+  private long sum = 0;
+
+  /**
+   * Creates a new LatencyTracker.
+   *
+   * @param group the Hadoop counter group name
+   * @param prefix the prefix for counter names (e.g., "fetch_latency")
+   */
+  public LatencyTracker(String group, String prefix) {
+    this.digest = TDigest.createDigest(DEFAULT_COMPRESSION);
+    this.group = group;
+    this.prefix = prefix;
+  }
+
+  /**
+   * Records a latency sample.
+   *
+   * @param latencyMs the latency in milliseconds
+   */
+  public void record(long latencyMs) {
+    digest.add(latencyMs);
+    count++;
+    sum += latencyMs;
+  }
+
+  /**
+   * Returns the number of recorded samples.
+   *
+   * @return the count of recorded latency samples
+   */
+  public long getCount() {
+    return count;
+  }
+
+  /**
+   * Returns the sum of all recorded latencies.
+   *
+   * @return the sum of latencies in milliseconds
+   */
+  public long getSum() {
+    return sum;
+  }
+
+  /**
+   * Returns the percentile value for the given quantile.
+   *
+   * @param quantile the quantile (0.0 to 1.0)
+   * @return the percentile value in milliseconds, truncated to a whole
+   *         number; 0 if no samples have been recorded
+   */
+  public long getPercentile(double quantile) {
+    if (count == 0) {
+      return 0;
+    }
+    return (long) digest.quantile(quantile);
+  }
+
+  /**
+   * Emits all latency counters to the Hadoop context.
+   *
+   * <p>Should be called once during cleanup to emit aggregated metrics.
+   * Every counter is written with {@code setValue}, i.e. it replaces the
+   * counter's current value in this context instead of adding to it. All
+   * five counters are always written; the percentiles are 0 when no samples
+   * were recorded.
+   *
+   * @param context the Hadoop task context
+   */
+  public void emitCounters(TaskInputOutputContext<?, ?, ?, ?> context) {
+    context.getCounter(group, prefix + "_count_total").setValue(count);
+    context.getCounter(group, prefix + "_sum_ms").setValue(sum);
+
+    // getPercentile() already yields 0 for an empty tracker
+    context.getCounter(group, prefix + "_p50_ms").setValue(getPercentile(0.50));
+    context.getCounter(group, prefix + "_p95_ms").setValue(getPercentile(0.95));
+    context.getCounter(group, prefix + "_p99_ms").setValue(getPercentile(0.99));
+  }
+}
diff --git a/src/java/org/apache/nutch/metrics/NutchMetrics.java b/src/java/org/apache/nutch/metrics/NutchMetrics.java
new file mode 100644
index 0000000000..ccb2d70ed3
--- /dev/null
+++ b/src/java/org/apache/nutch/metrics/NutchMetrics.java
@@ -0,0 +1,717 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metrics;
+
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.FetchSchedule;
+
+/**
+ * Centralized constants for Hadoop metrics counter groups and names.
+ *
+ * <p>Follows Prometheus
+ * naming conventions:
+ * <ul>
+ * <li>Counter groups use the {@code nutch_} prefix namespace</li>
+ * <li>Counter names use snake_case</li>
+ * <li>Accumulating counters use {@code _total} suffix</li>
+ * <li>Units are included in counter names where applicable (e.g., {@code _bytes})</li>
+ * </ul>
+ *
+ * @since 1.22
+ */
+public final class NutchMetrics {
+
+ private NutchMetrics() {
+ // Utility class - prevent instantiation
+ }
+
+ // =========================================================================
+ // Counter Groups (Prometheus namespace style with nutch_ prefix)
+ // =========================================================================
+
+ /** Counter group for fetcher operations. */
+ public static final String GROUP_FETCHER = "nutch_fetcher";
+
+ /** Counter group for fetcher outlink processing. */
+ public static final String GROUP_FETCHER_OUTLINKS = "nutch_fetcher_outlinks";
+
+ /** Counter group for generator operations. */
+ public static final String GROUP_GENERATOR = "nutch_generator";
+
+ /** Counter group for indexer operations. */
+ public static final String GROUP_INDEXER = "nutch_indexer";
+
+ /** Counter group for CrawlDb operations. */
+ public static final String GROUP_CRAWLDB = "nutch_crawldb";
+
+ /** Counter group for CrawlDb filter operations. */
+ public static final String GROUP_CRAWLDB_FILTER = "nutch_crawldb_filter";
+
+ /** Counter group for injector operations. */
+ public static final String GROUP_INJECTOR = "nutch_injector";
+
+ /** Counter group for HostDb operations. */
+ public static final String GROUP_HOSTDB = "nutch_hostdb";
+
+ /** Counter group for parser operations. */
+ public static final String GROUP_PARSER = "nutch_parser";
+
+ /** Counter group for deduplication operations. */
+ public static final String GROUP_DEDUP = "nutch_dedup";
+
+ /** Counter group for cleaning job operations. */
+ public static final String GROUP_CLEANING = "nutch_cleaning";
+
+ /** Counter group for WebGraph operations. */
+ public static final String GROUP_WEBGRAPH = "nutch_webgraph";
+
+ /** Counter group for sitemap processing operations. */
+ public static final String GROUP_SITEMAP = "nutch_sitemap";
+
+ /** Counter group for WARC export operations. */
+ public static final String GROUP_WARC_EXPORTER = "nutch_warc_exporter";
+
+ /** Counter group for domain statistics operations. */
+ public static final String GROUP_DOMAIN_STATS = "nutch_domain_stats";
+
+ // =========================================================================
+ // Fetcher Counters
+ // =========================================================================
+
+ /** Total bytes downloaded by fetcher. */
+ public static final String FETCHER_BYTES_DOWNLOADED_TOTAL = "bytes_downloaded_total";
+
+ /** URLs denied by robots.txt. */
+ public static final String FETCHER_ROBOTS_DENIED_TOTAL = "robots_denied_total";
+
+ /** URLs denied due to crawl delay exceeding maximum. */
+ public static final String FETCHER_ROBOTS_DENIED_MAXCRAWLDELAY_TOTAL = "robots_denied_maxcrawldelay_total";
+
+ /** URLs dropped due to robots.txt deferred visits. */
+ public static final String FETCHER_ROBOTS_DEFER_VISITS_DROPPED_TOTAL = "robots_defer_visits_dropped_total";
+
+ /** Redirects that exceeded maximum redirect count. */
+ public static final String FETCHER_REDIRECT_COUNT_EXCEEDED_TOTAL = "redirect_count_exceeded_total";
+
+ /** Redirects deduplicated (already seen). */
+ public static final String FETCHER_REDIRECT_DEDUPLICATED_TOTAL = "redirect_deduplicated_total";
+
+ /** FetchItems not created for redirects. */
+ public static final String FETCHER_REDIRECT_NOT_CREATED_TOTAL = "redirect_not_created_total";
+
+ /** URLs hit by time limit. */
+ public static final String FETCHER_HIT_BY_TIMELIMIT_TOTAL = "hit_by_timelimit_total";
+
+ /** URLs hit by timeout. */
+ public static final String FETCHER_HIT_BY_TIMEOUT_TOTAL = "hit_by_timeout_total";
+
+ /** URLs hit by throughput threshold. */
+ public static final String FETCHER_HIT_BY_THROUGHPUT_THRESHOLD_TOTAL = "hit_by_throughput_threshold_total";
+
+ /** Threads that hung during fetching. */
+ public static final String FETCHER_HUNG_THREADS_TOTAL = "hung_threads_total";
+
+ /** URLs filtered during fetching. */
+ public static final String FETCHER_FILTERED_TOTAL = "filtered_total";
+
+ /** URLs dropped due to exception threshold in queue. */
+ public static final String FETCHER_ABOVE_EXCEPTION_THRESHOLD_TOTAL = "above_exception_threshold_total";
+
+ // =========================================================================
+ // Fetcher Outlinks Counters
+ // =========================================================================
+
+ /** Outlinks detected during parsing. */
+ public static final String FETCHER_OUTLINKS_DETECTED_TOTAL = "outlinks_detected_total";
+
+ /** Outlinks being followed. */
+ public static final String FETCHER_OUTLINKS_FOLLOWING_TOTAL = "outlinks_following_total";
+
+ // =========================================================================
+ // Fetcher Common Crawl extensions
+ // =========================================================================
+
+ /** HTTP protocol version group with dynamic counters. */
+ public static final String FETCHER_HTTP_PROTOCOL_VERSION_GROUP = "http_protocol_version";
+
+ public static final String FETCHER_HTTP_PROTOCOL_UNKNOWN = "unknown";
+
+ /** SSL/TLS protocol version group with dynamic counters. */
+ public static final String FETCHER_TLS_PROTOCOL_VERSION_GROUP = "tls_protocol_version";
+
+ /** IP address version group with two counters: ipv4 and ipv6. */
+ public static final String FETCHER_IP_ADDRESS_VERSION_GROUP = "ip_address_version";
+
+ /** Number of fetches over IPv4. */
+ public static final String FETCHER_IPV4_TOTAL = "ipv4";
+
+ /** Number of fetches over IPv6. */
+ public static final String FETCHER_IPV6_TOTAL = "ipv6";
+
+ /** Archiving of robots.txt captures. */
+ public static final String FETCHER_ROBOTSTXT_ARCHIVING_GROUP = "robotstxt_archiving";
+
+ /** Robots.txt not archived: URL rejected by URL filters. */
+ public static final String FETCHER_ROBOTSTXT_ARCHIVING_FILTERED_TOTAL = "filtered";
+
+ /** Robots.txt not archived: MIME type rejected. */
+ public static final String FETCHER_ROBOTSTXT_ARCHIVING_FILTERED_MIME_TOTAL = "filtered_mime";
+
+ /**
+ * Robots.txt not archived: URL path not /robots.txt and
+ * disallowed by robots.txt.
+ */
+ public static final String FETCHER_ROBOTSTXT_ARCHIVING_ROBOTS_DENIED_TOTAL = "robots_denied";
+
+ // =========================================================================
+ // Common Crawl's WarcWriter
+ // =========================================================================
+
+ /** Counter group for Common Crawl's WARC writer. */
+ public static final String GROUP_WARC_WRITER = "warc_writer";
+
+ /** Skipped records because no content (and protocol status) is available. */
+ public static final String WARC_WRITER_SKIPPED_NO_CONTENT_TOTAL = "skipped_no_content";
+
+ /** Fixed records: invalid URI normalized. */
+ public static final String WARC_WRITER_URI_NORMALIZED_TOTAL = "fixed_uri";
+
+ /** Skipped records because URL is not a valid URI (no WARC-Target-URI). */
+ public static final String WARC_WRITER_SKIPPED_INVALID_URI_TOTAL = "skipped_invalid_uri";
+
+ /** Skipped records by content type / MIME type. */
+ public static final String WARC_WRITER_SKIPPED_BY_CONTENT_TYPE_TOTAL = "skipped_by_content_type";
+
+ /** Skipped duplicate records. */
+ public static final String WARC_WRITER_SKIPPED_DUPLICATE_TOTAL = "skipped_duplicate";
+
+ /** Skipped records: no protocol status. */
+ public static final String WARC_WRITER_SKIPPED_NO_PROTOCOL_STATUS_TOTAL = "skipped_no_protocol_status";
+
+ /** Skipped records: unknown protocol status. */
+ public static final String WARC_WRITER_SKIPPED_UNKNOWN_PROTOCOL_STATUS_TOTAL = "skipped_unknown_protocol_status";
+
+ /** Prefix for error status of language identification (LID), returned by CLD2 Java bindings. */
+ public static final String WARC_WRITER_LID_ERROR_PREFIX = "lid_error: ";
+
+ /** Language identification (LID): no result. */
+ public static final String WARC_WRITER_LID_NO_RESULT_TOTAL = "lid_no_result";
+
+ /** Language identification (LID): result is reliable. */
+ public static final String WARC_WRITER_LID_RESULT_RELIABLE_TOTAL = "lid_reliable";
+
+ /** Language identification (LID): result is not reliable. */
+ public static final String WARC_WRITER_LID_RESULT_NOT_RELIABLE_TOTAL = "lid_not_reliable";
+
+ // =========================================================================
+ // Generator Counters
+ // =========================================================================
+
+ /** URLs rejected by URL filters. */
+ public static final String GENERATOR_URL_FILTERS_REJECTED_TOTAL = "url_filters_rejected_total";
+
+ /** URLs rejected by fetch schedule. */
+ public static final String GENERATOR_SCHEDULE_REJECTED_TOTAL = "schedule_rejected_total";
+
+ /** URLs waiting for CrawlDb update. */
+ public static final String GENERATOR_WAIT_FOR_UPDATE_TOTAL = "wait_for_update_total";
+
+ /** URLs rejected by JEXL expression. */
+ public static final String GENERATOR_EXPR_REJECTED_TOTAL = "expr_rejected_total";
+
+ /** URLs rejected due to status restriction. */
+ public static final String GENERATOR_STATUS_REJECTED_TOTAL = "status_rejected_total";
+
+ /** URLs rejected due to score below threshold. */
+ public static final String GENERATOR_SCORE_TOO_LOW_TOTAL = "score_too_low_total";
+
+ /** URLs rejected due to fetch interval exceeding threshold. */
+ public static final String GENERATOR_INTERVAL_REJECTED_TOTAL = "interval_rejected_total";
+
+ /** URLs skipped due to per-host overflow. */
+ public static final String GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL = "urls_skipped_per_host_overflow_total";
+
+ /** Hosts affected by per-host overflow. */
+ public static final String GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL = "hosts_affected_per_host_overflow_total";
+
+ // =========================================================================
+ // Generator2-specific Counters
+ // =========================================================================
+
+ /** Domains affected by per-domain overflow. All remaining URLs of this domain have been skipped, but were not counted. */
+ public static final String GENERATOR_DOMAINS_AFFECTED_PER_DOMAIN_OVERFLOW_TOTAL = "domains_affected_per_domain_overflow_total";
+
+ /** Domains affected by max. number of hosts per domain overflow. URLs from further hosts below this domain have been skipped. */
+ public static final String GENERATOR_DOMAINS_AFFECTED_PER_MAX_NUM_HOSTS_OVERFLOW_TOTAL = "domains_affected_num_hosts_overflow_total";
+
+ /** URLs skipped due to the max. number of hosts per domain overflow. */
+ public static final String GENERATOR_URLS_SKIPPED_PER_MAX_NUM_HOSTS_OVERFLOW_TOTAL = "urls_skipped_per_max_num_host_overflow_total";
+
+ /** URLs skipped due to per-segment overflow. */
+ public static final String GENERATOR_URLS_SKIPPED_PER_SEGMENT_OVERFLOW_TOTAL = "urls_skipped_per_segment_overflow_total";
+
+ /**
+ * Counter group for items by status, rejected by the fetch schedule. See
+ * {@link FetchSchedule#shouldFetch(Text, CrawlDatum, long)}.
+ */
+ public static final String GROUP_GENERATOR_SCHEDULE_REJECTED_BY_STATUS = "schedule_rejected_by_status";
+
+ /**
+ * Counter group for items by status, rejected because the generator score is
+ * lower than the minimum score defined per generate.min.score.
+ */
+ public static final String GROUP_GENERATOR_SCORE_REJECTED_BY_STATUS = "score_rejected_by_status";
+
+ /** Counter group for items by status, selected for fetch. */
+ public static final String GROUP_GENERATOR_SELECTED_BY_STATUS = "selected_by_status";
+
+ // =========================================================================
+ // Indexer Counters
+ // =========================================================================
+
+ /** Documents deleted due to robots noindex. */
+ public static final String INDEXER_DELETED_ROBOTS_NOINDEX_TOTAL = "deleted_robots_noindex_total";
+
+ /** Documents deleted because they are gone. */
+ public static final String INDEXER_DELETED_GONE_TOTAL = "deleted_gone_total";
+
+ /** Documents deleted due to redirects. */
+ public static final String INDEXER_DELETED_REDIRECTS_TOTAL = "deleted_redirects_total";
+
+ /** Documents deleted as duplicates. */
+ public static final String INDEXER_DELETED_DUPLICATES_TOTAL = "deleted_duplicates_total";
+
+ /** Documents deleted by indexing filter. */
+ public static final String INDEXER_DELETED_BY_INDEXING_FILTER_TOTAL = "deleted_by_indexing_filter_total";
+
+ /** Documents skipped (not modified). */
+ public static final String INDEXER_SKIPPED_NOT_MODIFIED_TOTAL = "skipped_not_modified_total";
+
+ /** Documents skipped by indexing filter. */
+ public static final String INDEXER_SKIPPED_BY_INDEXING_FILTER_TOTAL = "skipped_by_indexing_filter_total";
+
+ /** Documents indexed (added or updated). */
+ public static final String INDEXER_INDEXED_TOTAL = "indexed_total";
+
+ // =========================================================================
+ // CrawlDb Counters
+ // =========================================================================
+
+ /** URLs filtered during CrawlDb operations. */
+ public static final String CRAWLDB_URLS_FILTERED_TOTAL = "urls_filtered_total";
+
+ /** Gone (404) records removed during CrawlDb operations. */
+ public static final String CRAWLDB_GONE_RECORDS_REMOVED_TOTAL = "gone_records_removed_total";
+
+ /** Orphan records removed during CrawlDb operations. */
+ public static final String CRAWLDB_ORPHAN_RECORDS_REMOVED_TOTAL = "orphan_records_removed_total";
+
+ // =========================================================================
+ // Injector Counters
+ // =========================================================================
+
+ /** URLs filtered during injection. */
+ public static final String INJECTOR_URLS_FILTERED_TOTAL = "urls_filtered_total";
+
+ /** URLs injected. */
+ public static final String INJECTOR_URLS_INJECTED_TOTAL = "urls_injected_total";
+
+ /** Unique URLs injected. */
+ public static final String INJECTOR_URLS_INJECTED_UNIQUE_TOTAL = "urls_injected_unique_total";
+
+ /** URLs merged with existing CrawlDb entries. */
+ public static final String INJECTOR_URLS_MERGED_TOTAL = "urls_merged_total";
+
+ /** URLs purged due to 404 status. */
+ public static final String INJECTOR_URLS_PURGED_404_TOTAL = "urls_purged_404_total";
+
+ /** URLs purged by filter. */
+ public static final String INJECTOR_URLS_PURGED_FILTER_TOTAL = "urls_purged_filter_total";
+
+ // =========================================================================
+ // HostDb Counters
+ // =========================================================================
+
+ /** Records filtered in HostDb. */
+ public static final String HOSTDB_FILTERED_RECORDS_TOTAL = "filtered_records_total";
+
+ /** Total hosts processed. */
+ public static final String HOSTDB_TOTAL_HOSTS_TOTAL = "total_hosts_total";
+
+ /** Hosts skipped (not eligible). */
+ public static final String HOSTDB_SKIPPED_NOT_ELIGIBLE_TOTAL = "skipped_not_eligible_total";
+
+ /** Hosts where URL limit was not reached. */
+ public static final String HOSTDB_URL_LIMIT_NOT_REACHED_TOTAL = "url_limit_not_reached_total";
+
+ /** New known hosts discovered. */
+ public static final String HOSTDB_NEW_KNOWN_HOST_TOTAL = "new_known_host_total";
+
+ /** Rediscovered hosts. */
+ public static final String HOSTDB_REDISCOVERED_HOST_TOTAL = "rediscovered_host_total";
+
+ /** Existing known hosts. */
+ public static final String HOSTDB_EXISTING_KNOWN_HOST_TOTAL = "existing_known_host_total";
+
+ /** New unknown hosts. */
+ public static final String HOSTDB_NEW_UNKNOWN_HOST_TOTAL = "new_unknown_host_total";
+
+ /** Existing unknown hosts. */
+ public static final String HOSTDB_EXISTING_UNKNOWN_HOST_TOTAL = "existing_unknown_host_total";
+
+ /** Purged unknown hosts. */
+ public static final String HOSTDB_PURGED_UNKNOWN_HOST_TOTAL = "purged_unknown_host_total";
+
+ /** Hosts checked. */
+ public static final String HOSTDB_CHECKED_HOSTS_TOTAL = "checked_hosts_total";
+
+ // =========================================================================
+ // Deduplication Counters
+ // =========================================================================
+
+ /** Documents marked as duplicate. */
+ public static final String DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL = "documents_marked_duplicate_total";
+
+ // =========================================================================
+ // Redirect Deduplication Counters
+ // =========================================================================
+
+ /** Redirects kept as non-duplicates. */
+ public static final String DEDUP_REDIRECTS_NOT_DUPLICATES_TOTAL = "redirects_marked_not_duplicate_total";
+
+ /** Redirects in CrawlDb. */
+ public static final String DEDUP_REDIRECTS_IN_CRAWLDB_TOTAL = "redirects_in_crawldb_total";
+
+ /** Self-referential redirects in CrawlDb. */
+ public static final String DEDUP_REDIRECTS_SELF_REFERENTIAL_TOTAL = "redirects_self_referential_total";
+
+ /** Self-referential redirects kept as non-duplicates. */
+ public static final String DEDUP_REDIRECTS_SELF_REFERENTIAL_NOT_DUPLICATES_TOTAL = "redirects_self_referential_marked_not_duplicate_total";
+
+ // =========================================================================
+ // Cleaning Job Counters
+ // =========================================================================
+
+ /** Documents deleted during cleaning. */
+ public static final String CLEANING_DELETED_DOCUMENTS_TOTAL = "deleted_documents_total";
+
+ // =========================================================================
+ // WebGraph Counters
+ // =========================================================================
+
+ /** Links added to WebGraph. */
+ public static final String WEBGRAPH_ADDED_LINKS_TOTAL = "added_links_total";
+
+ /** Links removed from WebGraph. */
+ public static final String WEBGRAPH_REMOVED_LINKS_TOTAL = "removed_links_total";
+
+ // =========================================================================
+ // Sitemap Counters
+ // =========================================================================
+
+ /** Filtered records in sitemap processing. */
+ public static final String SITEMAP_FILTERED_RECORDS_TOTAL = "filtered_records_total";
+
+ /** Seeds extracted from sitemaps. */
+ public static final String SITEMAP_SEEDS_TOTAL = "sitemap_seeds_total";
+
+ /** Sitemaps discovered from hostname. */
+ public static final String SITEMAP_FROM_HOSTNAME_TOTAL = "sitemaps_from_hostname_total";
+
+ /** Sitemaps filtered from hostname. */
+ public static final String SITEMAP_FILTERED_FROM_HOSTNAME_TOTAL = "filtered_sitemaps_from_hostname_total";
+
+ /** Failed sitemap fetches. */
+ public static final String SITEMAP_FAILED_FETCHES_TOTAL = "failed_fetches_total";
+
+ /** Existing sitemap entries. */
+ public static final String SITEMAP_EXISTING_ENTRIES_TOTAL = "existing_sitemap_entries_total";
+
+ /** New sitemap entries. */
+ public static final String SITEMAP_NEW_ENTRIES_TOTAL = "new_sitemap_entries_total";
+
+ // =========================================================================
+ // SitemapInjector Counters
+ // =========================================================================
+
+ /** SitemapInjector counter group. */
+ public static final String GROUP_SITEMAP_INJECTOR = "sitemap_injector";
+
+ /** Failed to fetch sitemap content, disallowed per robots.txt. */
+ public static final String SITEMAP_ROBOTSTXT_DISALLOW_TOTAL = "sitemap_robotstxt_disallow";
+
+ /** Sitemap failed to parse. */
+ public static final String SITEMAP_FAILED_TO_PARSE_TOTAL = "sitemaps_failed_to_parse";
+
+ /** Prefix for sitemap type counter. */
+ public static final String SITEMAP_TYPE_PREFIX = "sitemap_type_";
+
+ /** Sitemaps processed total. */
+ public static final String SITEMAP_PROCESSED_TOTAL = "sitemaps_processed";
+
+ /** Sitemap index: affected by URL limit. */
+ public static final String SITEMAP_INDEX_AFFECTED_BY_URL_LIMIT_TOTAL = "sitemap_index_url_limit";
+
+ /** Sitemap index: affected by depth limit. */
+ public static final String SITEMAP_INDEX_AFFECTED_BY_DEPTH_LIMIT_TOTAL = "sitemap_index_depth_limit";
+
+ /** Sitemap index: affected by time limit. */
+ public static final String SITEMAP_INDEX_AFFECTED_BY_TIME_LIMIT_TOTAL = "sitemap_index_time_limit";
+
+ /** Sitemap index: skipped because no URLs found after 50% of time limit. */
+ public static final String SITEMAP_INDEX_NO_URLS_AFTER_50_PERCENT_OF_TIME_LIMIT_TOTAL = "sitemap_index_no_urls_after_50_percent_of_time_limit";
+
+ /** Sitemap index: skipped because of too many fetch failures. */
+ public static final String SITEMAP_INDEX_TOO_MANY_FAILURES_TOTAL = "sitemap_index_too_many_failures";
+
+ /** Sitemap index: processed sitemaps. */
+ public static final String SITEMAP_INDEX_PROCESSED_SITEMAPS_TOTAL = "sitemap_index_processed_sitemaps";
+
+ /** Skipped duplicated or recursive sitemap URLs. */
+ public static final String SITEMAP_SKIPPED_DUPLICATE_OR_RECURSIVE_URL_TOTAL = "sitemap_skipped_duplicate_or_recursive_sitemap_url";
+
+ /** Sitemap index: affected by max. number of sitemaps in index. */
+ public static final String SITEMAP_INDEX_MAX_SITEMAPS_LIMIT_TOTAL = "sitemap_index_max_sitemaps_limit";
+
+ /** Sitemap failed to fetch. */
+ public static final String SITEMAP_FAILED_TO_FETCH_TOTAL = "sitemap_failed_to_fetch";
+
+ /** Sitemap skipped because of overlong URL. */
+ public static final String SITEMAP_SKIPPED_OVERLONG_URL_TOTAL = "sitemap_skipped_overlong_url";
+
+ /** Sitemap rejected by URL filters. */
+ public static final String SITEMAP_REJECTED_BY_URL_FILTERS_TOTAL = "sitemap_rejected_by_url_filters";
+
+ /** Sitemap skipped, too many failures per host. */
+ public static final String SITEMAP_SKIPPED_TOO_MANY_FAILURES_PER_HOST_TOTAL = "sitemap_skipped_too_many_failures_per_host";
+
+ /** Could not fetch sitemap content, protocol not supported. */
+ public static final String SITEMAP_PROTOCOL_NOT_SUPPORTED_TOTAL = "sitemap_protocol_not_supported";
+
+ /** Failed to fetch sitemap content because of timeout. */
+ public static final String SITEMAP_FAILED_TO_FETCH_TIMEOUT_TOTAL = "sitemap_failed_to_fetch_timeout";
+
+ /** Failed to fetch sitemap content because of exception. */
+ public static final String SITEMAP_FAILED_TO_FETCH_EXCEPTION_TOTAL = "sitemap_failed_to_fetch_exception";
+
+ /** Sitemap redirect. */
+ public static final String SITEMAP_REDIRECT_TOTAL = "sitemap_redirect";
+
+ /** Sitemap redirect target rejected by URL filters. */
+ public static final String SITEMAP_REDIRECT_TARGET_REJECTED_BY_URL_FILTERS_TOTAL = "sitemap_redirect_target_rejected_by_url_filters";
+
+ /** Sitemap redirect limit exceeded (max. number of redirects followed). */
+ public static final String SITEMAP_REDIRECT_LIMIT_EXCEEDED_TOTAL = "sitemap_redirect_limit_exceeded";
+
+ /** Failed to fetch sitemap content, HTTP status != 200. */
+ public static final String SITEMAP_FAILED_TO_FETCH_CONTENT_HTTP_STATUS_CODE_NOT_200_TOTAL = "sitemap_failed_to_fetch_http_status_code_not_200";
+
+ /** Failed to fetch sitemap content, empty content. */
+ public static final String SITEMAP_EMPTY_CONTENT_TOTAL = "sitemap_empty_content";
+
+ /** Empty sitemap. */
+ public static final String SITEMAP_EMPTY_TOTAL = "sitemap_empty";
+
+ /** Sitemap URL limit reached. */
+ public static final String SITEMAP_URL_LIMIT_REACHED_TOTAL = "sitemap_url_limit_reached";
+
+ /** URLs randomly skipped. */
+ public static final String SITEMAP_RANDOM_SKIP_TOTAL = "urls_random_skip";
+
+ /** URLs from sitemaps rejected, host limit reached. */
+ public static final String SITEMAP_URLS_SKIPPED_HOST_LIMIT_REACHED_TOTAL = "urls_skipped_host_limit_reached";
+
+ /** URLs from sitemaps rejected, target not allowed by cross-submit. */
+ public static final String SITEMAP_URLS_SKIPPED_NOT_ALLOWED_BY_CROSS_SUBMITS_TOTAL = "urls_skipped_not_allowed_by_cross_submits";
+
+ /** URLs from sitemaps rejected by URL filters. */
+ public static final String SITEMAP_URLS_FROM_REJECTED_BY_URL_FILTERS = "urls_from_sitemaps_rejected_by_url_filters";
+
+ /** URLs from sitemaps injected. */
+ public static final String SITEMAP_URLS_INJECTED = "urls_from_sitemaps_injected";
+
+ // =========================================================================
+ // WARC Exporter Counters
+ // =========================================================================
+
+ /** Missing content in WARC export. */
+ public static final String WARC_MISSING_CONTENT_TOTAL = "missing_content_total";
+
+ /** Missing metadata in WARC export. */
+ public static final String WARC_MISSING_METADATA_TOTAL = "missing_metadata_total";
+
+ /** Omitted empty responses in WARC export. */
+ public static final String WARC_OMITTED_EMPTY_RESPONSE_TOTAL = "omitted_empty_response_total";
+
+ /** WARC records generated. */
+ public static final String WARC_RECORDS_GENERATED_TOTAL = "records_generated_total";
+
+ // =========================================================================
+ // Domain Statistics Counters (replace the former DomainStatistics.MyCounter enum)
+ // =========================================================================
+
+ /** Fetched URLs in domain statistics. */
+ public static final String DOMAIN_STATS_FETCHED_TOTAL = "fetched_total";
+
+ /** Not fetched URLs in domain statistics. */
+ public static final String DOMAIN_STATS_NOT_FETCHED_TOTAL = "not_fetched_total";
+
+ /** Empty results in domain statistics. */
+ public static final String DOMAIN_STATS_EMPTY_RESULT_TOTAL = "empty_result_total";
+
+ // =========================================================================
+ // UrlCleaner
+ // =========================================================================
+
+ public static final String GROUP_URLCLEANER = "urlcleaner";
+
+ public static final String URLCLEANER_REJECTED_TOTAL = "urls_rejected";
+
+ public static final String URLCLEANER_REJECTED_INVALID_DOMAIN_TOTAL = "urls_rejected_invalid_domain";
+
+ public static final String URLCLEANER_ACCEPTED_UNCHANGED_TOTAL = "urls_accepted_unchanged";
+
+ public static final String URLCLEANER_ACCEPTED_NORMALIZED_TOTAL = "urls_accepted_normalized";
+
+ // =========================================================================
+ // UrlSampler and UrlSamplerHost
+ // =========================================================================
+
+ public static final String GROUP_URLSAMPLER = "urlsampler";
+
+ public static final String GROUP_URLSAMPLER_HOST = "urlsamplerhost";
+
+ public static final String URLSAMPLER_MALFORMED_URL_TOTAL = "malformed_url";
+
+ public static final String URLSAMPLER_SKIPPED_MAX_URLS_TOTAL = "skipped_max_urls";
+
+ public static final String URLSAMPLER_SKIPPED_MAX_URLS_PER_HOST_TOTAL = "skipped_max_urls_per_host";
+
+ public static final String URLSAMPLER_SKIPPED_MAX_HOSTS_TOTAL = "skipped_max_hosts";
+
+ public static final String URLSAMPLER_HOSTS = "hosts";
+
+ public static final String URLSAMPLER_URLS = "urls";
+
+ public static final String URLSAMPLER_HOSTS_WITH_LIMIT = "hosts_with_limit";
+
+ public static final String URLSAMPLER_URLS_HOST_WITH_LIMIT = "urls_host_with_limit";
+
+ public static final String URLSAMPLER_HOSTS_WITHOUT_LIMIT = "hosts_without_limit";
+
+ public static final String URLSAMPLER_URLS_HOST_WITHOUT_LIMIT = "urls_host_without_limit";
+
+ public static final String URLSAMPLER_URLS_SAMPLED = "urls_sampled";
+
+ public static final String URLSAMPLER_HOSTS_SAMPLED = "hosts_sampled";
+
+ public static final String URLSAMPLER_HOSTS_WITH_LIMIT_SAMPLED = "hosts_with_limit_sampled";
+
+ public static final String URLSAMPLER_URLS_HOST_WITH_LIMIT_SAMPLED = "urls_host_with_limit_sampled";
+
+ public static final String URLSAMPLER_HOSTS_WITHOUT_LIMIT_SAMPLED = "hosts_without_limit_sampled";
+
+ public static final String URLSAMPLER_URLS_HOST_WITHOUT_LIMIT_SAMPLED = "urls_host_without_limit_sampled";
+
+ public static final String URLSAMPLER_SKIPPED_MAX_URLS_PER_HOST = "skipped_max_urls_per_host"; // NOTE(review): same value as URLSAMPLER_SKIPPED_MAX_URLS_PER_HOST_TOTAL - confirm both constants are needed
+
+ public static final String URLSAMPLER_SKIPPED_RANDOM = "skipped_random";
+
+ // =========================================================================
+ // Latency Metric Prefixes (used with LatencyTracker)
+ // =========================================================================
+
+ /**
+ * Prefix for fetch latency metrics.
+ * Used with {@link LatencyTracker} to emit fetch timing counters.
+ */
+ public static final String FETCHER_LATENCY = "fetch_latency";
+
+ /**
+ * Prefix for parse latency metrics.
+ * Used with {@link LatencyTracker} to emit parse timing counters.
+ */
+ public static final String PARSER_LATENCY = "parse_latency";
+
+ /**
+ * Prefix for indexer latency metrics.
+ * Used with {@link LatencyTracker} to emit indexing timing counters.
+ */
+ public static final String INDEXER_LATENCY = "index_latency";
+
+ // =========================================================================
+ // Common Error Counter Names (used with component-specific groups)
+ // These constants are shared across all components for consistent error
+ // categorization. Use with ErrorTracker for automatic classification.
+ // =========================================================================
+
+ /**
+ * Total errors across all categories.
+ * This is incremented alongside any category-specific error counter.
+ */
+ public static final String ERROR_TOTAL = "errors_total";
+
+ /**
+ * Network-related errors.
+ * Includes: IOException, SocketException, ConnectException, UnknownHostException
+ */
+ public static final String ERROR_NETWORK_TOTAL = "errors_network_total";
+
+ /**
+ * Protocol errors.
+ * Includes: ProtocolException, ProtocolNotFound
+ */
+ public static final String ERROR_PROTOCOL_TOTAL = "errors_protocol_total";
+
+ /**
+ * Parsing errors.
+ * Includes: ParseException, ParserNotFound
+ */
+ public static final String ERROR_PARSING_TOTAL = "errors_parsing_total";
+
+ /**
+ * URL-related errors.
+ * Includes: MalformedURLException, URLFilterException
+ */
+ public static final String ERROR_URL_TOTAL = "errors_url_total";
+
+ /**
+ * Scoring filter errors.
+ * Includes: ScoringFilterException
+ */
+ public static final String ERROR_SCORING_TOTAL = "errors_scoring_total";
+
+ /**
+ * Indexing filter errors.
+ * Includes: IndexingException
+ */
+ public static final String ERROR_INDEXING_TOTAL = "errors_indexing_total";
+
+ /**
+ * Timeout errors.
+ * Includes: SocketTimeoutException, connection timeouts
+ */
+ public static final String ERROR_TIMEOUT_TOTAL = "errors_timeout_total";
+
+ /**
+ * Other uncategorized errors.
+ * Used as fallback for exceptions not matching any specific category.
+ */
+ public static final String ERROR_OTHER_TOTAL = "errors_other_total";
+}
+
diff --git a/src/java/org/apache/nutch/metrics/package-info.java b/src/java/org/apache/nutch/metrics/package-info.java
new file mode 100644
index 0000000000..376605d043
--- /dev/null
+++ b/src/java/org/apache/nutch/metrics/package-info.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Metrics infrastructure for Apache Nutch.
+ *
+ * This package provides centralized constants and utilities for Hadoop
+ * MapReduce metrics/counters following
+ * Prometheus naming
+ * conventions.
+ *
+ * <p>The main class is {@link org.apache.nutch.metrics.NutchMetrics} which
+ * defines all counter group names and counter names as constants.
+ *
+ * @since 1.22
+ */
+package org.apache.nutch.metrics;
+
diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java
index 6b2fb5cee7..0b2a6f2290 100644
--- a/src/java/org/apache/nutch/parse/ParseSegment.java
+++ b/src/java/org/apache/nutch/parse/ParseSegment.java
@@ -37,6 +37,9 @@
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.ErrorTracker;
+import org.apache.nutch.metrics.LatencyTracker;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
@@ -80,12 +83,25 @@ public static class ParseSegmentMapper extends
private Text newKey = new Text();
private ScoringFilters scfilters;
private boolean skipTruncated;
+ private LatencyTracker parseLatencyTracker;
+ private ErrorTracker errorTracker;
@Override
public void setup(Mapper<WritableComparable<?>, Content, Text, ParseImpl>.Context context) {
Configuration conf = context.getConfiguration();
scfilters = new ScoringFilters(conf);
skipTruncated = conf.getBoolean(SKIP_TRUNCATED, true);
+ parseLatencyTracker = new LatencyTracker(
+ NutchMetrics.GROUP_PARSER, NutchMetrics.PARSER_LATENCY);
+ // Error tracker reports into the parser counter group of this task context
+ errorTracker = new ErrorTracker(NutchMetrics.GROUP_PARSER, context);
+ }
+
+ @Override
+ public void cleanup(Mapper<WritableComparable<?>, Content, Text, ParseImpl>.Context context)
+ throws IOException, InterruptedException {
+ // Emit the parse latencies recorded in map() as counters
+ parseLatencyTracker.emitCounters(context);
}
@Override
@@ -121,6 +137,7 @@ public void map(WritableComparable> key, Content content,
parseResult = parseUtil.parse(content);
} catch (Exception e) {
LOG.warn("Error parsing: {}: {}", key, StringUtils.stringifyException(e));
+ errorTracker.incrementCounters(e);
return;
}
@@ -129,7 +146,8 @@ public void map(WritableComparable> key, Content content,
Parse parse = entry.getValue();
ParseStatus parseStatus = parse.getData().getStatus();
- context.getCounter("ParserStatus",
+ // Dynamic counter based on parse status
+ context.getCounter(NutchMetrics.GROUP_PARSER,
ParseStatus.majorCodes[parseStatus.getMajorCode()]).increment(1);
if (!parseStatus.isSuccess()) {
@@ -151,10 +169,13 @@ public void map(WritableComparable> key, Content content,
scfilters.passScoreAfterParsing(url, content, parse);
} catch (ScoringFilterException e) {
LOG.warn("Error passing score: {}: {}", url, e.getMessage());
+ errorTracker.incrementCounters(ErrorTracker.ErrorType.SCORING);
}
long end = System.currentTimeMillis();
- LOG.info("Parsed ({}ms): {}", (end - start), url);
+ long parseTime = end - start;
+ parseLatencyTracker.record(parseTime);
+ LOG.info("Parsed ({}ms): {}", parseTime, url);
context.write(
url,
diff --git a/src/java/org/apache/nutch/protocol/Protocol.java b/src/java/org/apache/nutch/protocol/Protocol.java
index ab4162c87f..2514eae33e 100644
--- a/src/java/org/apache/nutch/protocol/Protocol.java
+++ b/src/java/org/apache/nutch/protocol/Protocol.java
@@ -16,6 +16,7 @@
*/
package org.apache.nutch.protocol;
+import java.net.URL;
import java.util.List;
import org.apache.hadoop.conf.Configurable;
@@ -57,4 +58,24 @@ public interface Protocol extends Pluggable, Configurable {
BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
List robotsTxtContent);
+ /**
+ * Retrieve robot rules applicable for this URL. The default implementation
+ * delegates to {@link #getRobotRules(Text, CrawlDatum, List)}.
+ * @param url
+ * URL to check, must not be null (converted via {@code url.toString()})
+ * @param datum
+ * page datum
+ * @param robotsTxtContent
+ * container to store responses when fetching the robots.txt file for
+ * debugging or archival purposes. Instead of a robots.txt file, it
+ * may include redirects or an error page (404, etc.). Response
+ * {@link Content} is appended to the passed list. If null is passed
+ * nothing is stored.
+ * @return robot rules (specific for this URL or default), never null
+ */
+ default BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
+ List robotsTxtContent) {
+ return getRobotRules(new Text(url.toString()), datum, robotsTxtContent);
+ }
+
}
diff --git a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
index 4daefcd8f3..fee0921d0a 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
@@ -48,6 +48,7 @@
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.mapreduce.Mapper;
@@ -58,6 +59,7 @@
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
@@ -327,6 +329,10 @@ public static class OutlinkDbReducer extends
// url normalizers, filters and job configuration
private Configuration conf;
+ // Cached counter references for performance
+ private Counter addedLinksCounter;
+ private Counter removedLinksCounter;
+
/**
* Configures the OutlinkDb job reducer. Sets up internal links and link limiting.
*/
@@ -339,6 +345,18 @@ public void setup(Reducer.Context context)
limitPages = conf.getBoolean("link.ignore.limit.page", true);
limitDomains = conf.getBoolean("link.ignore.limit.domain", true);
+ // Initialize cached counter references
+ initCounters(context);
+ }
+
+ /**
+ * Cache the added/removed link counters so reduce() avoids a lookup on every increment.
+ */
+ private void initCounters(Context context) {
+ addedLinksCounter = context.getCounter(
+ NutchMetrics.GROUP_WEBGRAPH, NutchMetrics.WEBGRAPH_ADDED_LINKS_TOTAL);
+ removedLinksCounter = context.getCounter(
+ NutchMetrics.GROUP_WEBGRAPH, NutchMetrics.WEBGRAPH_REMOVED_LINKS_TOTAL);
}
@Override
@@ -361,14 +379,14 @@ public void reduce(Text key, Iterable values,
mostRecent = timestamp;
}
outlinkList.add(WritableUtils.clone(next, conf));
- context.getCounter("WebGraph.outlinks", "added links").increment(1);
+ addedLinksCounter.increment(1);
} else if (value instanceof BooleanWritable) {
BooleanWritable delete = (BooleanWritable) value;
// Actually, delete is always true, otherwise we don't emit it in the
// mapper in the first place
if (delete.get() == true) {
// This page is gone, do not emit it's outlinks
- context.getCounter("WebGraph.outlinks", "removed links").increment(1);
+ removedLinksCounter.increment(1);
return;
}
}
diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
index bf824f9b3f..14b59ac85c 100644
--- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java
+++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
@@ -41,6 +41,7 @@
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
@@ -57,6 +58,8 @@
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.tools.WARCUtils;
+import org.apache.nutch.metrics.ErrorTracker;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -111,6 +114,35 @@ public static class WARCReducer
// Metadata to JSON
Gson gson = new Gson();
+ // Cached counter references to avoid repeated lookups in hot paths
+ private Counter missingContentCounter;
+ private Counter missingMetadataCounter;
+ private Counter omittedEmptyResponseCounter;
+ private Counter recordsGeneratedCounter;
+ private ErrorTracker errorTracker;
+
+ @Override
+ public void setup(Context context) {
+ // Initialize cached counter references
+ initCounters(context);
+ // Initialize error tracker with cached counters
+ errorTracker = new ErrorTracker(NutchMetrics.GROUP_WARC_EXPORTER, context);
+ }
+
+ /**
+ * Cache the WARC exporter counters that reduce() increments; called from setup().
+ */
+ private void initCounters(Context context) {
+ missingContentCounter = context.getCounter(
+ NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_MISSING_CONTENT_TOTAL);
+ missingMetadataCounter = context.getCounter(
+ NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_MISSING_METADATA_TOTAL);
+ omittedEmptyResponseCounter = context.getCounter(
+ NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_OMITTED_EMPTY_RESPONSE_TOTAL);
+ recordsGeneratedCounter = context.getCounter(
+ NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_RECORDS_GENERATED_TOTAL);
+ }
+
@Override
public void reduce(Text key, Iterable values,
Context context) throws IOException, InterruptedException {
@@ -147,13 +179,13 @@ public void reduce(Text key, Iterable values,
// check that we have everything we need
if (content == null) {
LOG.info("Missing content for {}", key);
- context.getCounter("WARCExporter", "missing content").increment(1);
+ missingContentCounter.increment(1);
return;
}
if (cd == null) {
LOG.info("Missing fetch datum for {}", key);
- context.getCounter("WARCExporter", "missing metadata").increment(1);
+ missingMetadataCounter.increment(1);
return;
}
@@ -161,8 +193,7 @@ public void reduce(Text key, Iterable values,
// Empty responses is everything that was not a regular response
if (!(cd.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS
|| cd.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED)) {
- context.getCounter("WARCExporter", "omitted empty response")
- .increment(1);
+ omittedEmptyResponseCounter.increment(1);
return;
}
}
@@ -237,7 +268,7 @@ public void reduce(Text key, Iterable values,
.append(uri.toASCIIString()).append(CRLF);
} catch (Exception e) {
LOG.error("Invalid URI {} ", key);
- context.getCounter("WARCExporter", "invalid URI").increment(1);
+ errorTracker.incrementCounters(e);
return;
}
@@ -269,12 +300,12 @@ public void reduce(Text key, Iterable values,
new ByteArrayInputStream(bos.toByteArray()));
WARCRecord record = new WARCRecord(in);
context.write(NullWritable.get(), new WARCWritable(record));
- context.getCounter("WARCExporter", "records generated").increment(1);
+ recordsGeneratedCounter.increment(1);
} catch (IOException | IllegalStateException exception) {
LOG.error(
"Exception when generating WARC resource record for {} : {}", key,
exception.getMessage());
- context.getCounter("WARCExporter", "exception").increment(1);
+ errorTracker.incrementCounters(exception);
}
// Do we need to emit a metadata record too?
@@ -316,7 +347,7 @@ public void reduce(Text key, Iterable values,
.append(uri.toASCIIString()).append(CRLF);
} catch (Exception e) {
LOG.error("Invalid URI {} ", key);
- context.getCounter("WARCExporter", "invalid URI").increment(1);
+ errorTracker.incrementCounters(e);
return;
}
@@ -332,13 +363,12 @@ public void reduce(Text key, Iterable values,
new ByteArrayInputStream(bos.toByteArray()));
WARCRecord record = new WARCRecord(in);
context.write(NullWritable.get(), new WARCWritable(record));
- context.getCounter("WARCExporter", "records generated")
- .increment(1);
+ recordsGeneratedCounter.increment(1);
} catch (IOException | IllegalStateException exception) {
LOG.error(
"Exception when generating WARC metadata record for {} : {}",
key, exception.getMessage(), exception);
- context.getCounter("WARCExporter", "exception").increment(1);
+ errorTracker.incrementCounters(exception);
}
}
@@ -376,7 +406,7 @@ public void reduce(Text key, Iterable values,
.append(uri.toASCIIString()).append(CRLF);
} catch (Exception e) {
LOG.error("Invalid URI {} ", key);
- context.getCounter("WARCExporter", "invalid URI").increment(1);
+ errorTracker.incrementCounters(e);
return;
}
@@ -392,13 +422,12 @@ public void reduce(Text key, Iterable values,
new ByteArrayInputStream(bos.toByteArray()));
WARCRecord record = new WARCRecord(in);
context.write(NullWritable.get(), new WARCWritable(record));
- context.getCounter("WARCExporter", "records generated")
- .increment(1);
+ recordsGeneratedCounter.increment(1);
} catch (IOException | IllegalStateException exception) {
LOG.error(
"Exception when generating WARC metadata record for {} : {}",
key, exception.getMessage(), exception);
- context.getCounter("WARCExporter", "exception").increment(1);
+ errorTracker.incrementCounters(exception);
}
}
}
diff --git a/src/java/org/apache/nutch/util/DomainStatistics.java b/src/java/org/apache/nutch/util/DomainStatistics.java
index 5ee09c846a..4057795d52 100644
--- a/src/java/org/apache/nutch/util/DomainStatistics.java
+++ b/src/java/org/apache/nutch/util/DomainStatistics.java
@@ -28,6 +28,7 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
@@ -38,6 +39,7 @@
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metrics.NutchMetrics;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -52,10 +54,6 @@ public class DomainStatistics extends Configured implements Tool {
private static final Text FETCHED_TEXT = new Text("FETCHED");
private static final Text NOT_FETCHED_TEXT = new Text("NOT_FETCHED");
- public static enum MyCounter {
- FETCHED, NOT_FETCHED, EMPTY_RESULT
- };
-
private static final int MODE_HOST = 1;
private static final int MODE_DOMAIN = 2;
private static final int MODE_SUFFIX = 3;
@@ -158,10 +156,29 @@ static class DomainStatisticsMapper extends
Mapper {
int mode = 0;
+ // Cached counter references for performance
+ private Counter fetchedCounter;
+ private Counter notFetchedCounter;
+ private Counter emptyResultCounter;
+
@Override
public void setup(Context context) {
mode = context.getConfiguration().getInt("domain.statistics.mode",
MODE_DOMAIN);
+ // Initialize cached counter references
+ initCounters(context);
+ }
+
+ /**
+ * Look up the "fetched", "not fetched" and "empty result" counters of the
+ * {@code NutchMetrics.GROUP_DOMAIN_STATS} group once and keep them in the
+ * fields {@code fetchedCounter}, {@code notFetchedCounter} and
+ * {@code emptyResultCounter}, to avoid repeated lookups in hot paths.
+ *
+ * @param context task context the counters are obtained from
+ */
+ private void initCounters(Context context) {
+ fetchedCounter = context.getCounter(
+ NutchMetrics.GROUP_DOMAIN_STATS, NutchMetrics.DOMAIN_STATS_FETCHED_TOTAL);
+ notFetchedCounter = context.getCounter(
+ NutchMetrics.GROUP_DOMAIN_STATS, NutchMetrics.DOMAIN_STATS_NOT_FETCHED_TOTAL);
+ emptyResultCounter = context.getCounter(
+ NutchMetrics.GROUP_DOMAIN_STATS, NutchMetrics.DOMAIN_STATS_EMPTY_RESULT_TOTAL);
}
@Override
@@ -197,17 +214,17 @@ public void map(Text urlText, CrawlDatum datum, Context context)
}
if (out.trim().equals("")) {
LOG.info("url : {}", url);
- context.getCounter(MyCounter.EMPTY_RESULT).increment(1);
+ emptyResultCounter.increment(1);
}
context.write(new Text(out), new LongWritable(1));
} catch (Exception ex) {
}
- context.getCounter(MyCounter.FETCHED).increment(1);
+ fetchedCounter.increment(1);
context.write(FETCHED_TEXT, new LongWritable(1));
} else {
- context.getCounter(MyCounter.NOT_FETCHED).increment(1);
+ notFetchedCounter.increment(1);
context.write(NOT_FETCHED_TEXT, new LongWritable(1));
}
}
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java
index d83a6e358c..21362223cd 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -31,6 +31,7 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
@@ -45,6 +46,8 @@
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.hostdb.HostDatum;
+import org.apache.nutch.metrics.ErrorTracker;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.protocol.Content;
@@ -113,6 +116,14 @@ private static class SitemapMapper extends Mapper values, Context context)
originalDatum.setModifiedTime(sitemapDatum.getModifiedTime());
}
- context.getCounter("Sitemap", "existing_sitemap_entries").increment(1);
+ existingEntriesCounter.increment(1);
context.write(key, originalDatum);
}
else if(sitemapDatum != null) {
// For the newly discovered links via sitemap, set the status as unfetched and emit
- context.getCounter("Sitemap", "new_sitemap_entries").increment(1);
+ newEntriesCounter.increment(1);
sitemapDatum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
context.write(key, sitemapDatum);
}
@@ -457,11 +507,11 @@ public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean stric
FSUtils.replace(fs, current, tempCrawlDb, true);
LockUtil.removeLockFile(fs, lock);
- long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue();
- long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue();
- long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue();
- long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue();
- long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue();
+ long filteredRecords = job.getCounters().findCounter(NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_FILTERED_RECORDS_TOTAL).getValue();
+ long fromHostname = job.getCounters().findCounter(NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_FROM_HOSTNAME_TOTAL).getValue();
+ long fromSeeds = job.getCounters().findCounter(NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_SEEDS_TOTAL).getValue();
+ long failedFetches = job.getCounters().findCounter(NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_FAILED_FETCHES_TOTAL).getValue();
+ long newSitemapEntries = job.getCounters().findCounter(NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_NEW_ENTRIES_TOTAL).getValue();
LOG.info("SitemapProcessor: Total records rejected by filters: {}", filteredRecords);
LOG.info("SitemapProcessor: Total sitemaps from host name: {}", fromHostname);
diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java
index 158125999e..44c6309d2a 100644
--- a/src/java/org/apache/nutch/util/URLUtil.java
+++ b/src/java/org/apache/nutch/util/URLUtil.java
@@ -103,7 +103,7 @@ static URL fixPureQueryTargets(URL base, String target)
* https://publicsuffix.org/list/public_suffix_list.dat and are compared
* using <a href=
+ * "https://crawler-commons.github.io/crawler-commons/1.6/crawlercommons/domains/EffectiveTldFinder.html">
* crawler-commons' EffectiveTldFinder</a>. Only ICANN domain suffixes are
* used. Because EffectiveTldFinder loads the public suffix list as file
* "effective_tld_names.dat" from the Java classpath, it's possible to use the
diff --git a/src/java/org/commoncrawl/tools/UrlCleaner.java b/src/java/org/commoncrawl/tools/UrlCleaner.java
index a3f26b126b..c4d92ca669 100644
--- a/src/java/org/commoncrawl/tools/UrlCleaner.java
+++ b/src/java/org/commoncrawl/tools/UrlCleaner.java
@@ -40,6 +40,7 @@
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
@@ -134,18 +135,21 @@ public void map(Text key, Text value, Context context)
try {
url = urlNormalizers.normalize(url, scope);
} catch (MalformedURLException e) {
- context.getCounter("urlcleaner", "urls_rejected").increment(1);
- return;
+ context.getCounter(NutchMetrics.GROUP_URLCLEANER,
+ NutchMetrics.URLCLEANER_REJECTED_TOTAL).increment(1);
+ return;
}
try {
url = filters.filter(url);
} catch (URLFilterException e) {
- context.getCounter("urlcleaner", "urls_rejected").increment(1);
+ context.getCounter(NutchMetrics.GROUP_URLCLEANER,
+ NutchMetrics.URLCLEANER_REJECTED_TOTAL).increment(1);
return;
}
if (url == null) {
- context.getCounter("urlcleaner", "urls_rejected").increment(1);
+ context.getCounter(NutchMetrics.GROUP_URLCLEANER,
+ NutchMetrics.URLCLEANER_REJECTED_TOTAL).increment(1);
return;
}
@@ -157,21 +161,26 @@ public void map(Text key, Text value, Context context)
if (needDomain) {
domain = EffectiveTldFinder.getAssignedDomain(host, true, true);
if (checkDomain && domain == null) {
- context.getCounter("urlcleaner", "urls_rejected_invalid_domain")
+ context
+ .getCounter(NutchMetrics.GROUP_URLCLEANER,
+ NutchMetrics.URLCLEANER_REJECTED_INVALID_DOMAIN_TOTAL)
.increment(1);
return;
}
}
} catch (MalformedURLException e) {
- context.getCounter("urlcleaner", "urls_rejected").increment(1);
+ context.getCounter(NutchMetrics.GROUP_URLCLEANER,
+ NutchMetrics.URLCLEANER_REJECTED_TOTAL).increment(1);
return;
}
}
if (url.equals(urlOrig)) {
- context.getCounter("urlcleaner", "urls_accepted_unchanged").increment(1);
+ context.getCounter(NutchMetrics.GROUP_URLCLEANER,
+ NutchMetrics.URLCLEANER_ACCEPTED_UNCHANGED_TOTAL).increment(1);
} else {
- context.getCounter("urlcleaner", "urls_accepted_normalized").increment(1);
+ context.getCounter(NutchMetrics.GROUP_URLCLEANER,
+ NutchMetrics.URLCLEANER_ACCEPTED_NORMALIZED_TOTAL).increment(1);
key.set(url);
}
diff --git a/src/java/org/commoncrawl/tools/UrlSampler.java b/src/java/org/commoncrawl/tools/UrlSampler.java
index f28447a4cf..e2060e1f47 100644
--- a/src/java/org/commoncrawl/tools/UrlSampler.java
+++ b/src/java/org/commoncrawl/tools/UrlSampler.java
@@ -48,6 +48,7 @@
import org.apache.nutch.crawl.Generator2;
import org.apache.nutch.crawl.Generator2.DomainScorePair;
import org.apache.nutch.crawl.URLPartitioner;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.util.NutchConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -150,7 +151,8 @@ public void map(Text key, Text value, Context context)
domain = URLPartitioner.getDomainName(u.getHost());
} catch (Exception e) {
LOG.warn("Malformed URL: '{}', skipping ({})", url, e.getMessage());
- context.getCounter("UrlSampler", "MALFORMED_URL").increment(1);
+ context.getCounter(NutchMetrics.GROUP_URLSAMPLER,
+ NutchMetrics.URLSAMPLER_MALFORMED_URL_TOTAL).increment(1);
return;
}
@@ -242,7 +244,8 @@ public void reduce(DomainScorePair key, Iterable values,
domain);
}
} catch (MalformedURLException e) {
- context.getCounter("UrlSampler", "MALFORMED_URL").increment(1);
+ context.getCounter(NutchMetrics.GROUP_URLSAMPLER,
+ NutchMetrics.URLSAMPLER_MALFORMED_URL_TOTAL).increment(1);
continue;
}
nUrls++;
@@ -271,12 +274,14 @@ public void reduce(DomainScorePair key, Iterable values,
}
if (nUrls == 0)
return;
- context.getCounter("UrlSampler", "SKIPPED_MAX_URLS")
- .increment(skippedMaxUrls);
- context.getCounter("UrlSampler", "SKIPPED_MAX_URLS_PER_HOST")
+ context.getCounter(NutchMetrics.GROUP_URLSAMPLER,
+ NutchMetrics.URLSAMPLER_SKIPPED_MAX_URLS_TOTAL).increment(skippedMaxUrls);
+ context
+ .getCounter(NutchMetrics.GROUP_URLSAMPLER,
+ NutchMetrics.URLSAMPLER_SKIPPED_MAX_URLS_PER_HOST_TOTAL)
.increment(skippedMaxUrlsPerHost);
- context.getCounter("UrlSampler", "SKIPPED_MAX_HOSTS")
- .increment(skippedMaxHosts);
+ context.getCounter(NutchMetrics.GROUP_URLSAMPLER,
+ NutchMetrics.URLSAMPLER_SKIPPED_MAX_HOSTS_TOTAL).increment(skippedMaxHosts);
LOG.info(
"Sampled for domain {} : {} hosts, {} URLs ({} skipped: {} max. URLs, {} max. per host, {} max. hosts), sum of scores = {}",
domain, hosts.size(), nUrlsSampled, (nUrls - nUrlsSampled),
@@ -336,8 +341,8 @@ private void sample(Path[] inputs, Path output) throws Exception {
}
public void usage() {
- System.err
- .println("Usage: UrlSampler [-D...] ... \n");
+ System.err.println(
+ "Usage: UrlSampler [-D...] ... \n");
}
@Override
diff --git a/src/java/org/commoncrawl/tools/UrlSamplerHost.java b/src/java/org/commoncrawl/tools/UrlSamplerHost.java
index e296ffa90b..bce68ad50f 100644
--- a/src/java/org/commoncrawl/tools/UrlSamplerHost.java
+++ b/src/java/org/commoncrawl/tools/UrlSamplerHost.java
@@ -44,6 +44,7 @@
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.Generator2;
import org.apache.nutch.crawl.Generator2.DomainScorePair;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.util.NutchConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -60,7 +61,8 @@
*
*
*
- * host name (leading www. may be stripped), limits and default score
+ * host name (leading www. may be stripped), limits and default
+ * score
*
*
* <host_name> \t <rank> \t <max_urls> \t <default_score>
@@ -180,7 +182,8 @@ public void map(Text key, Text value, Context context)
}
} catch (Exception e) {
LOG.warn("Malformed URL: '{}', skipping ({})", url, e.getMessage());
- context.getCounter("UrlSampler", "MALFORMED_URL").increment(1);
+ context.getCounter(NutchMetrics.GROUP_URLSAMPLER,
+ NutchMetrics.URLSAMPLER_MALFORMED_URL_TOTAL).increment(1);
return;
}
@@ -270,40 +273,59 @@ public void reduce(DomainScorePair key, Iterable values,
context.write(text, meta);
}
// hosts == reduce input groups
- context.getCounter("UrlSamplerHost", "HOSTS").increment(1);
+ context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
+ NutchMetrics.URLSAMPLER_HOSTS).increment(1);
// URLs == map output records, reduce input records
- context.getCounter("UrlSamplerHost", "URLS").increment(nUrls);
+ context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
+ NutchMetrics.URLSAMPLER_URLS).increment(nUrls);
if (nUrls > 0) {
if (maxUrls > -1) {
- context.getCounter("UrlSamplerHost", "HOSTS_WITH_LIMIT").increment(1);
- context.getCounter("UrlSamplerHost", "URLS_HOST_WITH_LIMIT")
+ context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
+ NutchMetrics.URLSAMPLER_HOSTS_WITH_LIMIT).increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
+ NutchMetrics.URLSAMPLER_URLS_HOST_WITH_LIMIT)
.increment(nUrls);
} else {
- context.getCounter("UrlSamplerHost", "HOSTS_WITHOUT_LIMIT")
- .increment(1);
- context.getCounter("UrlSamplerHost", "URLS_HOST_WITHOUT_LIMIT")
+ context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
+ NutchMetrics.URLSAMPLER_HOSTS_WITHOUT_LIMIT).increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
+ NutchMetrics.URLSAMPLER_URLS_HOST_WITHOUT_LIMIT)
.increment(nUrls);
}
if (nUrlsSampled > 0) {
- context.getCounter("UrlSamplerHost", "URLS_SAMPLED")
- .increment(nUrlsSampled);
- context.getCounter("UrlSamplerHost", "HOSTS_SAMPLED").increment(1);
+ context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
+ NutchMetrics.URLSAMPLER_URLS_SAMPLED).increment(nUrlsSampled);
+ context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
+ NutchMetrics.URLSAMPLER_HOSTS_SAMPLED).increment(1);
if (maxUrls > -1) {
- context.getCounter("UrlSamplerHost", "HOSTS_WITH_LIMIT_SAMPLED")
+ context
+ .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
+ NutchMetrics.URLSAMPLER_HOSTS_WITH_LIMIT_SAMPLED)
.increment(1);
- context.getCounter("UrlSamplerHost", "URLS_HOST_WITH_LIMIT_SAMPLED")
+ context
+ .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
+ NutchMetrics.URLSAMPLER_URLS_HOST_WITH_LIMIT_SAMPLED)
.increment(nUrlsSampled);
} else {
- context.getCounter("UrlSamplerHost", "HOSTS_WITHOUT_LIMIT_SAMPLED")
+ context
+ .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
+ NutchMetrics.URLSAMPLER_HOSTS_WITHOUT_LIMIT_SAMPLED)
.increment(1);
context
- .getCounter("UrlSamplerHost", "URLS_HOST_WITHOUT_LIMIT_SAMPLED")
+ .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
+ NutchMetrics.URLSAMPLER_URLS_HOST_WITHOUT_LIMIT_SAMPLED)
.increment(nUrlsSampled);
}
}
- context.getCounter("UrlSamplerHost", "SKIPPED_MAX_URLS_PER_HOST")
+ context
+ .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
+ NutchMetrics.URLSAMPLER_SKIPPED_MAX_URLS_PER_HOST)
.increment(skippedMaxUrlsPerHost);
- context.getCounter("UrlSamplerHost", "SKIPPED_RANDOM")
+ context
+ .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
+ NutchMetrics.URLSAMPLER_SKIPPED_RANDOM)
.increment(skippedRandom);
LOG.info(
"Sampled for host {} : {} URLs ({} skipped: {} max. per host, {} random), sum of scores = {}",
@@ -365,8 +387,8 @@ private void sample(Path[] inputs, Path output) throws Exception {
}
public void usage() {
- System.err
- .println("Usage: UrlSamplerHost [-D...] ... \n");
+ System.err.println(
+ "Usage: UrlSamplerHost [-D...] ... \n");
System.err.println(
"\nThe host_limits file defines the maximum number of URLs to sample per host.");
System.err.println("\nProperties:");
@@ -374,11 +396,12 @@ public void usage() {
"\t-Durlsample.host.strip.www=(true|false)\tstrip leading www. from host names");
System.err.println(
"\t\t\t(depending on whether the limits file uses stripped host names)");
- System.err.println("Properties to configure defaults, if host is not in the limits file:");
System.err.println(
- "\t-Durlsample.urls.per.host\tmax. number of URLs to sample per host");
+ "Properties to configure defaults, if host is not in the limits file:");
System.err.println(
- "\t\t\t-1 : sample randomly with low probability (default)");
+ "\t-Durlsample.urls.per.host\tmax. number of URLs to sample per host");
+ System.err
+ .println("\t\t\t-1 : sample randomly with low probability (default)");
System.err.println(
"\t-Durlsample.default.score\tdefault score for sampled URLs (default: 0.001)");
}
diff --git a/src/java/org/commoncrawl/util/WarcRecordWriter.java b/src/java/org/commoncrawl/util/WarcRecordWriter.java
index 5656c2b3a3..05f2a304f6 100644
--- a/src/java/org/commoncrawl/util/WarcRecordWriter.java
+++ b/src/java/org/commoncrawl/util/WarcRecordWriter.java
@@ -52,6 +52,7 @@
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.metrics.NutchMetrics;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
@@ -76,7 +77,6 @@ class WarcRecordWriter extends RecordWriter {
protected static final Pattern PROBLEMATIC_HEADERS = Pattern
.compile("(?i)(?:Content-(?:Encoding|Length)|Transfer-Encoding)");
protected static final String X_HIDE_HEADER = "X-Crawler-";
- public static final String WARC_WRITER_COUNTER_GROUP = "WARC-Writer";
protected static final Pattern STATUS_LINE_PATTERN = Pattern
.compile("^HTTP/1\\.[01] [0-9]{3}(?: .*)?$");
@@ -117,6 +117,8 @@ class WarcRecordWriter extends RecordWriter {
private URLNormalizers urlNormalizers;
private URLNormalizers urlNormalizersRedirect;
+ private SimpleDateFormat isoDate;
+
public WarcRecordWriter(Configuration conf, Path outputPath, int partition,
TaskAttemptContext context) throws IOException {
@@ -128,6 +130,9 @@ public WarcRecordWriter(Configuration conf, Path outputPath, int partition,
Locale.ROOT);
fileDate.setTimeZone(TimeZone.getTimeZone("UTC"));
+ isoDate = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.ROOT);
+ isoDate.setTimeZone(TimeZone.getTimeZone("UTC"));
+
String prefix = conf.get("warc.export.prefix", "NUTCH-CRAWL");
/*
@@ -527,18 +532,22 @@ public synchronized void write(Text key, WarcCapture value)
throws IOException {
if (value.content == null) {
- String reason = "";
+ ProtocolStatus pstatus = null;
if (value.datum != null) {
- ProtocolStatus pstatus = (ProtocolStatus) value.datum.getMetaData()
+ pstatus = (ProtocolStatus) value.datum.getMetaData()
.get(Nutch.WRITABLE_PROTO_STATUS_KEY);
- if (pstatus != null) {
- reason = ": " + pstatus.getName() + " - " + pstatus.getMessage();
- }
}
- LOG.warn("Cannot write WARC record, no content for {}{}", value.url,
- reason);
- context.getCounter(WARC_WRITER_COUNTER_GROUP,
- "skipped records (no content)").increment(1);
+ if (pstatus != null) {
+ LOG.warn(
+ "Cannot write WARC record, no content for {}, protocol status: {} - {}",
+ value.url, pstatus.getName(), pstatus.getMessage());
+ } else {
+ LOG.warn(
+ "Cannot write WARC record, no content and protocol status for {}",
+ value.url);
+ }
+ context.getCounter(NutchMetrics.GROUP_WARC_WRITER,
+ NutchMetrics.WARC_WRITER_SKIPPED_NO_CONTENT_TOTAL).increment(1);
return;
}
@@ -560,10 +569,8 @@ public synchronized void write(Text key, WarcCapture value)
try {
targetUri = new URI(urlNorm);
LOG.info("Normalized URL to valid URI: {} -> {}", url, urlNorm);
- context
- .getCounter(WARC_WRITER_COUNTER_GROUP,
- "fixed records (invalid URI successfully normalized)")
- .increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_WRITER,
+ NutchMetrics.WARC_WRITER_URI_NORMALIZED_TOTAL).increment(1);
} catch (URISyntaxException ee) {
// ignore, log exception observed on original URL
}
@@ -571,8 +578,10 @@ public synchronized void write(Text key, WarcCapture value)
}
if (targetUri == null) {
LOG.error("Cannot write WARC record, invalid URI: {}", url);
- context.getCounter(WARC_WRITER_COUNTER_GROUP,
- "skipped records (invalid URI)").increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_WARC_WRITER,
+ NutchMetrics.WARC_WRITER_SKIPPED_INVALID_URI_TOTAL)
+ .increment(1);
return;
}
}
@@ -594,8 +603,10 @@ public synchronized void write(Text key, WarcCapture value)
(truncated != null ? truncated : "-"),
value.content.getContentType(), value.content.getContent().length,
value.url);
- context.getCounter(WARC_WRITER_COUNTER_GROUP,
- "skipped records (by content)").increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_WARC_WRITER,
+ NutchMetrics.WARC_WRITER_SKIPPED_BY_CONTENT_TYPE_TOTAL)
+ .increment(1);
return;
}
}
@@ -637,8 +648,8 @@ public synchronized void write(Text key, WarcCapture value)
} catch (Throwable t) {
LOG.error(t.getMessage());
}
- context.getCounter(WARC_WRITER_COUNTER_GROUP,
- "skipped records (duplicate)").increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_WRITER,
+ NutchMetrics.WARC_WRITER_SKIPPED_DUPLICATE_TOTAL).increment(1);
return;
}
precedingURL = url;
@@ -668,8 +679,10 @@ public synchronized void write(Text key, WarcCapture value)
if (pstatus == null) {
LOG.warn("Cannot write WARC record, no protocol status for {}",
value.url);
- context.getCounter(WARC_WRITER_COUNTER_GROUP,
- "skipped records (no protocol status)").increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_WARC_WRITER,
+ NutchMetrics.WARC_WRITER_SKIPPED_NO_PROTOCOL_STATUS_TOTAL)
+ .increment(1);
return;
}
switch (pstatus.getCode()) {
@@ -698,8 +711,9 @@ public synchronized void write(Text key, WarcCapture value)
if (value.content.getMetadata()
.get(Response.RESPONSE_HEADERS) == null) {
LOG.warn("Unknown or ambiguous protocol status: {}", pstatus);
- context.getCounter(WARC_WRITER_COUNTER_GROUP,
- "skipped records (unknown protocol status)").increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_WRITER,
+ NutchMetrics.WARC_WRITER_SKIPPED_UNKNOWN_PROTOCOL_STATUS_TOTAL)
+ .increment(1);
return;
}
}
@@ -839,8 +853,8 @@ public synchronized void write(Text key, WarcCapture value)
}
LOG.info("WARC {} record {} ({}, status: {}, size: {})",
- (notModified ? "revisit" : "response"), targetUri, date, httpStatusCode,
- value.content.getContent().length);
+ (notModified ? "revisit" : "response"), targetUri, isoDate.format(date),
+ httpStatusCode, value.content.getContent().length);
URI requestId = null;
if (verbatimRequestHeaders != null) {
@@ -860,17 +874,22 @@ public synchronized void write(Text key, WarcCapture value)
// detect language only for successfully fetched primary documents
ldres = langDetect.detectLanguage(targetUri, value.content);
if (ldres.errorReason != null) {
- context.getCounter(WARC_WRITER_COUNTER_GROUP,
- "language detection: " + ldres.errorStatus.name).increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_WRITER,
+ NutchMetrics.WARC_WRITER_LID_ERROR_PREFIX + ldres.errorStatus.name)
+ .increment(1);
} else if (ldres.languages == null) {
- context.getCounter(WARC_WRITER_COUNTER_GROUP,
- "language detection: no result").increment(1);
+ context.getCounter(NutchMetrics.GROUP_WARC_WRITER,
+ NutchMetrics.WARC_WRITER_LID_NO_RESULT_TOTAL).increment(1);
} else if (ldres.languages.isReliable()) {
- context.getCounter(WARC_WRITER_COUNTER_GROUP,
- "language detection: reliable").increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_WARC_WRITER,
+ NutchMetrics.WARC_WRITER_LID_RESULT_RELIABLE_TOTAL)
+ .increment(1);
} else {
- context.getCounter(WARC_WRITER_COUNTER_GROUP,
- "language detection: not reliable").increment(1);
+ context
+ .getCounter(NutchMetrics.GROUP_WARC_WRITER,
+ NutchMetrics.WARC_WRITER_LID_RESULT_NOT_RELIABLE_TOTAL)
+ .increment(1);
}
if (generateCdx) {
if (ldres.charset != null) {
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 79b45882eb..caa3f861ea 100755
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -721,6 +721,12 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
return this.robots.getRobotRulesSet(this, url, robotsTxtContent);
}
+ /**
+ * Get the robots rules for the given URL by delegating to
+ * {@code robots.getRobotRulesSet(this, url, robotsTxtContent)}.
+ *
+ * @param url
+ *          URL to get the robots rules for
+ * @param datum
+ *          not used by this implementation
+ * @param robotsTxtContent
+ *          passed through unchanged to {@code getRobotRulesSet}
+ * @return the rules returned by {@code getRobotRulesSet}
+ */
+ @Override
+ public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
+ List robotsTxtContent) {
+ return this.robots.getRobotRulesSet(this, url, robotsTxtContent);
+ }
+
/**
* Transforming a String[] into a HashMap for faster searching
*
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
index e4d2010696..877873b64b 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
@@ -232,4 +232,14 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
return RobotRulesParser.EMPTY_RULES;
}
+ /**
+ * No robots.txt parsing is done for the file protocol: this always returns
+ * {@code RobotRulesParser.EMPTY_RULES}, an empty set of rules which allows
+ * every URL. None of the arguments is used.
+ *
+ * @param url
+ *          ignored
+ * @param datum
+ *          ignored
+ * @param robotsTxtContent
+ *          ignored
+ * @return {@code RobotRulesParser.EMPTY_RULES}
+ */
+ @Override
+ public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
+ List robotsTxtContent) {
+ return RobotRulesParser.EMPTY_RULES;
+ }
+
}
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
index 2a47b63d61..8cf58f75e7 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -304,6 +304,15 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
return robots.getRobotRulesSet(this, url, robotsTxtContent);
}
+ /**
+ * Get the robots rules for a given URL by delegating to
+ * {@code robots.getRobotRulesSet(this, url, robotsTxtContent)}.
+ *
+ * @param url
+ *          URL to get the robots rules for
+ * @param datum
+ *          not used by this implementation
+ * @param robotsTxtContent
+ *          passed through unchanged to {@code getRobotRulesSet}
+ * @return the rules returned by {@code getRobotRulesSet}
+ */
+ @Override
+ public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
+ List robotsTxtContent) {
+ return robots.getRobotRulesSet(this, url, robotsTxtContent);
+ }
+
public int getBufferSize() {
return BUFFER_SIZE;
}
diff --git a/src/plugin/protocol-okhttp/ivy.xml b/src/plugin/protocol-okhttp/ivy.xml
index 0768def785..28f355d7b9 100644
--- a/src/plugin/protocol-okhttp/ivy.xml
+++ b/src/plugin/protocol-okhttp/ivy.xml
@@ -37,8 +37,9 @@
-
-
+
+
+
-
+
diff --git a/src/plugin/protocol-okhttp/plugin.xml b/src/plugin/protocol-okhttp/plugin.xml
index e2183d2b50..51f65f5d25 100755
--- a/src/plugin/protocol-okhttp/plugin.xml
+++ b/src/plugin/protocol-okhttp/plugin.xml
@@ -28,13 +28,15 @@
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
index 954c3f6df1..a9d2b14d42 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
@@ -52,15 +52,19 @@
import org.slf4j.LoggerFactory;
import okhttp3.Authenticator;
+import okhttp3.CompressionInterceptor;
import okhttp3.Connection;
import okhttp3.ConnectionPool;
+import okhttp3.Gzip;
import okhttp3.Handshake;
import okhttp3.Headers;
import okhttp3.Interceptor;
import okhttp3.OkHttpClient;
import okhttp3.Protocol;
import okhttp3.Request;
-import okhttp3.brotli.BrotliInterceptor;
+import okhttp3.brotli.Brotli;
+import okhttp3.zstd.Zstd;
+
public class OkHttp extends HttpBase {
@@ -156,13 +160,11 @@ public boolean verify(String hostname, SSLSession session) {
String proxyUsername = conf.get("http.proxy.username");
if (proxyUsername == null) {
ProxySelector selector = new ProxySelector() {
- @SuppressWarnings("serial")
private final List noProxyList = new ArrayList() {
{
add(Proxy.NO_PROXY);
}
};
- @SuppressWarnings("serial")
private final List proxyList = new ArrayList() {
{
add(proxy);
@@ -224,8 +226,9 @@ public Request authenticate(okhttp3.Route route,
builder.addNetworkInterceptor(new HTTPHeadersInterceptor());
}
- // enable support for Brotli compression (Content-Encoding)
- builder.addInterceptor(BrotliInterceptor.INSTANCE);
+ // enable support for Zstd, Brotli, Gzip Content-Encoding
+ builder.addInterceptor(new CompressionInterceptor(Zstd.INSTANCE,
+ Brotli.INSTANCE, Gzip.INSTANCE));
// instantiate connection pool(s), cf.
// https://square.github.io/okhttp/3.x/okhttp/okhttp3/ConnectionPool.html
diff --git a/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java b/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
index 377d49ec81..c06ae30076 100644
--- a/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
+++ b/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
@@ -24,6 +24,12 @@
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.MatcherAssert.assertThat;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.time.Duration;
+import java.time.Instant;
+import java.util.Date;
+import java.util.Properties;
/**
* Test cases for AdaptiveFetchSchedule.
@@ -117,5 +123,91 @@ private void validateFetchInterval(int changed, int getInterval) {
}
}
+
+ /**
+ * Test https://issues.apache.org/jira/browse/NUTCH-1564
+ *
+ * Scenario: page last modified 30 days ago, previous fetch 3 days ago,
+ * adaptive interval limited to 1..7 days. The helper asserts that the next
+ * fetch time is after the current fetch time and that the distance between
+ * both lies within the adaptive min. and max. interval.
+ */
+ @Test
+ public void testSetFetchSchedule1() {
+ // Helper arguments, in order (intervals are passed in days):
+ // db.fetch.schedule.adaptive.sync_delta_rate = 0.3 (default)
+ // db.fetch.interval.default = 172800 (2 days)
+ // db.fetch.schedule.adaptive.min_interval = 86400 (1 day)
+ // db.fetch.schedule.adaptive.max_interval = 604800 (7 days)
+ // db.fetch.interval.max = 604800 (7 days)
+ // 3-days cycle (previous fetch was 3 days ago)
+ // 30 days since last modified
+ doTestSetFetchSchedule(0.3, 2, 1, 7, 7, 3, 30);
+ }
+
+ /**
+ * Scenario: page last modified 10 days ago, previous fetch 1 day ago,
+ * adaptive interval limited to 1..2 days while db.fetch.interval.max is
+ * 7 days. The helper asserts that the next fetch time is after the current
+ * fetch time and that the distance between both lies within the adaptive
+ * min. and max. interval.
+ */
+ @Test
+ public void testSetFetchSchedule2() {
+ // Helper arguments, in order (intervals are passed in days):
+ // db.fetch.schedule.adaptive.sync_delta_rate = 0.3 (default)
+ // db.fetch.interval.default = 86400 (1 day)
+ // db.fetch.schedule.adaptive.min_interval = 86400 (1 day)
+ // db.fetch.schedule.adaptive.max_interval = 172800 (2 days)
+ // db.fetch.interval.max = 604800 (7 days)
+ // 1-day cycle (previous fetch was 1 day ago)
+ // 10 days since last modified
+ doTestSetFetchSchedule(0.3, 1, 1, 2, 7, 1, 10);
+ }
+
+ /**
+ * Scenario: page last modified 180 days ago, previous fetch 3 days ago,
+ * adaptive interval limited to 1..10 days. The helper asserts that the next
+ * fetch time is after the current fetch time and that the distance between
+ * both lies within the adaptive min. and max. interval.
+ */
+ @Test
+ public void testSetFetchSchedule3() {
+ // Helper arguments, in order (intervals are passed in days):
+ // db.fetch.schedule.adaptive.sync_delta_rate = 0.3 (default)
+ // db.fetch.interval.default = 172800 (2 days)
+ // db.fetch.schedule.adaptive.min_interval = 86400 (1 day)
+ // db.fetch.schedule.adaptive.max_interval = 864000 (10 days)
+ // db.fetch.interval.max = 864000 (10 days)
+ // 3-days cycle (previous fetch was 3 days ago)
+ // 180 days since last modified
+ doTestSetFetchSchedule(0.3, 2, 1, 10, 10, 3, 180);
+ }
+
+  /**
+   * Call {@code AdaptiveFetchSchedule.setFetchSchedule} for a successfully
+   * fetched, not modified page and verify the next fetch time written into
+   * the CrawlDatum: it must be after the current fetch time, and the distance
+   * to the current fetch time must lie within the adaptive min. and max.
+   * interval.
+   *
+   * All intervals are passed in days and converted to seconds using
+   * {@code FetchSchedule.SECONDS_PER_DAY} when the configuration is built.
+   *
+   * @param deltaRate
+   *          value of db.fetch.schedule.adaptive.sync_delta_rate
+   * @param intervalDefaultDays
+   *          value of db.fetch.interval.default, in days
+   * @param minIntervalDays
+   *          value of db.fetch.schedule.adaptive.min_interval, in days
+   * @param maxIntervalDays
+   *          value of db.fetch.schedule.adaptive.max_interval, in days
+   * @param intervalMaxDays
+   *          value of db.fetch.interval.max, in days
+   * @param previousFetchTimeDays
+   *          number of days between the previous and the current fetch
+   * @param modifiedTimeDays
+   *          number of days between the last modification and the current
+   *          fetch
+   */
+  private void doTestSetFetchSchedule(double deltaRate, int intervalDefaultDays,
+      int minIntervalDays, int maxIntervalDays, int intervalMaxDays,
+      int previousFetchTimeDays, int modifiedTimeDays) {
+    // need to properly override defaults
+    Properties props = new Properties();
+    props.setProperty("db.fetch.schedule.class",
+        "org.apache.nutch.crawl.AdaptiveFetchSchedule");
+    props.setProperty("db.fetch.schedule.adaptive.sync_delta", "true"); // default
+    props.setProperty("db.fetch.schedule.adaptive.sync_delta_rate",
+        String.valueOf(deltaRate));
+    props.setProperty("db.fetch.interval.default",
+        String.valueOf(FetchSchedule.SECONDS_PER_DAY * intervalDefaultDays));
+    props.setProperty("db.fetch.schedule.adaptive.min_interval",
+        String.valueOf(FetchSchedule.SECONDS_PER_DAY * minIntervalDays));
+    props.setProperty("db.fetch.schedule.adaptive.max_interval",
+        String.valueOf(FetchSchedule.SECONDS_PER_DAY * maxIntervalDays));
+    props.setProperty("db.fetch.interval.max",
+        String.valueOf(FetchSchedule.SECONDS_PER_DAY * intervalMaxDays));
+
+    conf = NutchConfiguration.create(true, props);
+    inc_rate = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f); // default
+    dec_rate = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f); // default
+
+    // ignore adaptive-host-specific-intervals.txt
+    Text url = new Text("http://www.example2.com");
+
+    AdaptiveFetchSchedule fs = new AdaptiveFetchSchedule();
+    fs.setConf(conf);
+
+    CrawlDatum datum = prepareCrawlDatum();
+    // read the clock only once, so that all three time stamps are exact
+    // day offsets of the same instant
+    Instant now = Instant.now();
+    Date fetchTime = Date.from(now);
+    Date previousFetchTime = Date
+        .from(now.minus(Duration.ofDays(previousFetchTimeDays)));
+    Date modifiedTime = Date.from(now.minus(Duration.ofDays(modifiedTimeDays)));
+    datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
+    datum.setRetriesSinceFetch(0);
+    datum.setModifiedTime(modifiedTime.getTime());
+    datum.setFetchTime(fetchTime.getTime());
+
+    System.out.println("CrawlDatum fetchTime: " + fetchTime + "; modifiedTime: " + modifiedTime);
+
+    // setFetchSchedule is expected to overwrite the fetch time of the datum
+    // with the next fetch time (read back below)
+    fs.setFetchSchedule(url, datum, previousFetchTime.getTime(),
+        modifiedTime.getTime(), fetchTime.getTime(), modifiedTime.getTime(),
+        CrawlDatum.STATUS_DB_NOTMODIFIED);
+
+    Date nextFetchTime = new Date(datum.getFetchTime());
+    System.out.println("CrawlDatum next fetchTime: " + nextFetchTime);
+
+    assertTrue(nextFetchTime.after(fetchTime), "Next fetch time ("
+        + nextFetchTime + ") must be after fetch time (" + fetchTime + ")");
+    // adapt milliseconds to seconds
+    long fetchTimeDiff = (nextFetchTime.getTime() - fetchTime.getTime()) / 1000L;
+    assertTrue(
+        fetchTimeDiff >= FetchSchedule.SECONDS_PER_DAY * minIntervalDays,
+        "Time until next fetch (" + fetchTimeDiff
+            + " sec.) is shorter than the adaptive min. interval ("
+            + minIntervalDays + " days)");
+    assertTrue(
+        fetchTimeDiff <= FetchSchedule.SECONDS_PER_DAY * maxIntervalDays,
+        "Time until next fetch (" + fetchTimeDiff
+            + " sec.) is longer than the adaptive max. interval ("
+            + maxIntervalDays + " days)");
+  }
}
diff --git a/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java b/src/test/org/apache/nutch/crawl/TestCrawlDbStatesExtended.java
similarity index 99%
rename from src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
rename to src/test/org/apache/nutch/crawl/TestCrawlDbStatesExtended.java
index dfad393512..2e6ea55af1 100644
--- a/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
+++ b/src/test/org/apache/nutch/crawl/TestCrawlDbStatesExtended.java
@@ -29,7 +29,7 @@
import static org.apache.nutch.crawl.CrawlDatum.*;
import static org.junit.jupiter.api.Assertions.fail;
-public class TODOTestCrawlDbStates extends TestCrawlDbStates {
+public class TestCrawlDbStatesExtended extends TestCrawlDbStates {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
diff --git a/src/test/org/apache/nutch/metrics/TestErrorTracker.java b/src/test/org/apache/nutch/metrics/TestErrorTracker.java
new file mode 100644
index 0000000000..5caa3e3a71
--- /dev/null
+++ b/src/test/org/apache/nutch/metrics/TestErrorTracker.java
@@ -0,0 +1,514 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metrics;
+
+import java.io.IOException;
+import java.net.ConnectException;
+import java.net.MalformedURLException;
+import java.net.SocketException;
+import java.net.SocketTimeoutException;
+import java.net.URISyntaxException;
+import java.net.UnknownHostException;
+
+import org.apache.hadoop.mapreduce.Counter;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.net.URLFilterException;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParserNotFound;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolNotFound;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.Mock;
+import org.mockito.junit.jupiter.MockitoExtension;
+import org.xml.sax.SAXException;
+
+import static org.junit.jupiter.api.Assertions.*;
+import static org.mockito.ArgumentMatchers.anyString;
+import static org.mockito.Mockito.*;
+
+import org.apache.nutch.metrics.ErrorTracker.ErrorType;
+
+/**
+ * Unit tests for {@link ErrorTracker} categorization, counting, and Hadoop
+ * counter integration.
+ */
+@ExtendWith(MockitoExtension.class)
+public class TestErrorTracker {
+
+ @Mock // Hadoop task context passed to ErrorTracker; getCounter() is stubbed in setUp()
+ private TaskInputOutputContext<?, ?, ?, ?> mockContext;
+
+ @Mock // the one Counter instance handed back for every stubbed getCounter() call
+ private Counter mockCounter;
+
+ @BeforeEach
+ void setUp() {
+   // Any (group, name) lookup returns the shared mock counter; lenient, as not every test uses it
+   lenient().doReturn(mockCounter).when(mockContext).getCounter(anyString(), anyString());
+ }
+
+ // =========================================================================
+ // Network Error Categorization Tests
+ // =========================================================================
+
+ @Test
+ public void testCategorizeNetworkErrors() {
+ // Test IOException
+ assertEquals(ErrorType.NETWORK,
+ ErrorTracker.categorize(new IOException("Connection failed")));
+
+ // Test SocketException
+ assertEquals(ErrorType.NETWORK,
+ ErrorTracker.categorize(new SocketException("Socket closed")));
+
+ // Test UnknownHostException
+ assertEquals(ErrorType.NETWORK,
+ ErrorTracker.categorize(new UnknownHostException("example.com")));
+
+ // Test ConnectException
+ assertEquals(ErrorType.NETWORK,
+ ErrorTracker.categorize(new ConnectException("Connection refused")));
+ }
+
+ // =========================================================================
+ // Timeout Error Categorization Tests
+ // =========================================================================
+
+ @Test
+ public void testCategorizeTimeoutErrors() {
+ // Test SocketTimeoutException
+ assertEquals(ErrorType.TIMEOUT,
+ ErrorTracker.categorize(new SocketTimeoutException("Read timed out")));
+ }
+
+ @Test
+ public void testCategorizeTimeoutByClassName() {
+   // CustomTimeoutException is unrelated to the java.net timeout types; only its
+   // class name carries the word "Timeout", which is expected to be sufficient.
+   assertEquals(ErrorType.TIMEOUT,
+       ErrorTracker.categorize(new CustomTimeoutException("Custom timeout")));
+ }
+
+ // Minimal checked exception whose only timeout-related trait is its class name
+ private static class CustomTimeoutException extends Exception {
+   CustomTimeoutException(String detail) {
+     super(detail);
+   }
+ }
+
+ // =========================================================================
+ // URL Error Categorization Tests
+ // =========================================================================
+
+ @Test
+ public void testCategorizeUrlErrors() {
+ // Test MalformedURLException
+ assertEquals(ErrorType.URL,
+ ErrorTracker.categorize(new MalformedURLException("Invalid URL")));
+
+ // Test URISyntaxException
+ assertEquals(ErrorType.URL,
+ ErrorTracker.categorize(new URISyntaxException("bad uri", "Invalid syntax")));
+ }
+
+ @Test
+ public void testCategorizeUrlFilterException() {
+ // Test URLFilterException (Nutch-specific)
+ assertEquals(ErrorType.URL,
+ ErrorTracker.categorize(new URLFilterException("URL filtered")));
+ }
+
+ // =========================================================================
+ // Protocol Error Categorization Tests
+ // =========================================================================
+
+ @Test
+ public void testCategorizeProtocolErrors() {
+ // Test ProtocolException (Nutch-specific)
+ assertEquals(ErrorType.PROTOCOL,
+ ErrorTracker.categorize(new ProtocolException("Protocol error")));
+
+ // Test ProtocolNotFound (Nutch-specific)
+ assertEquals(ErrorType.PROTOCOL,
+ ErrorTracker.categorize(new ProtocolNotFound("ftp")));
+ }
+
+ // =========================================================================
+ // Parsing Error Categorization Tests
+ // =========================================================================
+
+ @Test
+ public void testCategorizeParsingErrors() {
+ // Test ParseException (Nutch-specific)
+ assertEquals(ErrorType.PARSING,
+ ErrorTracker.categorize(new ParseException("Parse failed")));
+
+ // Test ParserNotFound (Nutch-specific)
+ assertEquals(ErrorType.PARSING,
+ ErrorTracker.categorize(new ParserNotFound("text/unknown")));
+
+ // Test SAXException
+ assertEquals(ErrorType.PARSING,
+ ErrorTracker.categorize(new SAXException("XML parse error")));
+ }
+
+ // =========================================================================
+ // Scoring Error Categorization Tests
+ // =========================================================================
+
+ @Test
+ public void testCategorizeScoringErrors() {
+ // Test ScoringFilterException (Nutch-specific)
+ assertEquals(ErrorType.SCORING,
+ ErrorTracker.categorize(new ScoringFilterException("Scoring failed")));
+ }
+
+ // =========================================================================
+ // Indexing Error Categorization Tests
+ // =========================================================================
+
+ @Test
+ public void testCategorizeIndexingErrors() {
+ // Test IndexingException (Nutch-specific)
+ assertEquals(ErrorType.INDEXING,
+ ErrorTracker.categorize(new IndexingException("Indexing failed")));
+ }
+
+ // =========================================================================
+ // Other/Fallback Categorization Tests
+ // =========================================================================
+
+ @Test
+ public void testCategorizeNullThrowable() {
+ // Null should return OTHER
+ assertEquals(ErrorType.OTHER, ErrorTracker.categorize(null));
+ }
+
+ @Test
+ public void testCategorizeGenericException() {
+ // Generic Exception should return OTHER
+ assertEquals(ErrorType.OTHER,
+ ErrorTracker.categorize(new Exception("Generic error")));
+
+ // RuntimeException should return OTHER
+ assertEquals(ErrorType.OTHER,
+ ErrorTracker.categorize(new RuntimeException("Runtime error")));
+ }
+
+ // =========================================================================
+ // Cause Chain Categorization Tests
+ // =========================================================================
+
+ @Test
+ public void testCategorizeCauseChain() {
+   // The outer exception is a plain Exception, so the expected category has to
+   // come from its direct cause: an IOException here ...
+   Exception wrapsIo = new Exception("Wrapper", new IOException("Root cause"));
+   assertEquals(ErrorType.NETWORK, ErrorTracker.categorize(wrapsIo));
+
+   // ... and a SocketTimeoutException here.
+   Exception wrapsTimeout =
+       new Exception("Wrapper", new SocketTimeoutException("Timeout"));
+   assertEquals(ErrorType.TIMEOUT, ErrorTracker.categorize(wrapsTimeout));
+ }
+
+ @Test
+ public void testCategorizeNestedCauseChain() {
+   // Two levels of wrapping around a network failure:
+   // RuntimeException -> Exception -> IOException
+   RuntimeException twoDeepIo = new RuntimeException("Outer",
+       new Exception("Middle", new IOException("Root cause")));
+   assertEquals(ErrorType.NETWORK, ErrorTracker.categorize(twoDeepIo));
+
+   // Same depth, with a Nutch ScoringFilterException at the bottom:
+   // Exception -> Exception -> ScoringFilterException
+   Exception twoDeepScoring = new Exception("Wrapper 2",
+       new Exception("Wrapper 1", new ScoringFilterException("Scoring error")));
+   assertEquals(ErrorType.SCORING, ErrorTracker.categorize(twoDeepScoring));
+ }
+
+ // =========================================================================
+ // Record Error Tests (Local Accumulation)
+ // =========================================================================
+
+ @Test
+ public void testRecordErrorByType() {
+   ErrorTracker errors = new ErrorTracker(NutchMetrics.GROUP_FETCHER);
+
+   // A fresh tracker has nothing recorded
+   assertEquals(0, errors.getCount(ErrorType.NETWORK));
+   assertEquals(0, errors.getTotalCount());
+
+   // First NETWORK error: only NETWORK and the total move
+   errors.recordError(ErrorType.NETWORK);
+   assertEquals(1, errors.getCount(ErrorType.NETWORK));
+   assertEquals(0, errors.getCount(ErrorType.TIMEOUT));
+   assertEquals(1, errors.getTotalCount());
+
+   // Second NETWORK error accumulates on the same type
+   errors.recordError(ErrorType.NETWORK);
+   assertEquals(2, errors.getCount(ErrorType.NETWORK));
+   assertEquals(2, errors.getTotalCount());
+
+   // A TIMEOUT error is counted separately and leaves NETWORK untouched
+   errors.recordError(ErrorType.TIMEOUT);
+   assertEquals(1, errors.getCount(ErrorType.TIMEOUT));
+   assertEquals(2, errors.getCount(ErrorType.NETWORK));
+   assertEquals(3, errors.getTotalCount());
+ }
+
+ @Test
+ public void testRecordErrorByThrowable() {
+   ErrorTracker errors = new ErrorTracker(NutchMetrics.GROUP_FETCHER);
+
+   // IOException -> expected under NETWORK
+   errors.recordError(new IOException("Test"));
+   assertEquals(1, errors.getCount(ErrorType.NETWORK));
+   assertEquals(1, errors.getTotalCount());
+
+   // SocketTimeoutException -> expected under TIMEOUT
+   errors.recordError(new SocketTimeoutException("Test"));
+   assertEquals(1, errors.getCount(ErrorType.TIMEOUT));
+   assertEquals(2, errors.getTotalCount());
+
+   // MalformedURLException -> expected under URL
+   errors.recordError(new MalformedURLException("Test"));
+   assertEquals(1, errors.getCount(ErrorType.URL));
+   assertEquals(3, errors.getTotalCount());
+ }
+
+ // =========================================================================
+ // Counter Name Mapping Tests
+ // =========================================================================
+
+ @Test
+ public void testGetCounterName() {
+   // Each of the eight ErrorType values must map onto its own NutchMetrics counter name
+   assertEquals(
+       NutchMetrics.ERROR_NETWORK_TOTAL, ErrorTracker.getCounterName(ErrorType.NETWORK));
+   assertEquals(
+       NutchMetrics.ERROR_TIMEOUT_TOTAL, ErrorTracker.getCounterName(ErrorType.TIMEOUT));
+   assertEquals(
+       NutchMetrics.ERROR_URL_TOTAL, ErrorTracker.getCounterName(ErrorType.URL));
+   assertEquals(
+       NutchMetrics.ERROR_PROTOCOL_TOTAL, ErrorTracker.getCounterName(ErrorType.PROTOCOL));
+   assertEquals(
+       NutchMetrics.ERROR_PARSING_TOTAL, ErrorTracker.getCounterName(ErrorType.PARSING));
+   assertEquals(
+       NutchMetrics.ERROR_SCORING_TOTAL, ErrorTracker.getCounterName(ErrorType.SCORING));
+   assertEquals(
+       NutchMetrics.ERROR_INDEXING_TOTAL, ErrorTracker.getCounterName(ErrorType.INDEXING));
+   assertEquals(
+       NutchMetrics.ERROR_OTHER_TOTAL, ErrorTracker.getCounterName(ErrorType.OTHER));
+ }
+
+ @Test
+ public void testGetCounterNameForThrowable() {
+   // JDK exceptions: the counter name is derived straight from the throwable
+   assertEquals(
+       NutchMetrics.ERROR_NETWORK_TOTAL, ErrorTracker.getCounterName(new IOException("Test")));
+   assertEquals(NutchMetrics.ERROR_TIMEOUT_TOTAL,
+       ErrorTracker.getCounterName(new SocketTimeoutException("Test")));
+   assertEquals(NutchMetrics.ERROR_URL_TOTAL,
+       ErrorTracker.getCounterName(new MalformedURLException("Test")));
+   assertEquals(NutchMetrics.ERROR_OTHER_TOTAL,
+       ErrorTracker.getCounterName(new RuntimeException("Test")));
+
+   // Nutch exceptions
+   assertEquals(NutchMetrics.ERROR_PROTOCOL_TOTAL,
+       ErrorTracker.getCounterName(new ProtocolException("Test")));
+   assertEquals(
+       NutchMetrics.ERROR_PARSING_TOTAL, ErrorTracker.getCounterName(new ParseException("Test")));
+   assertEquals(NutchMetrics.ERROR_SCORING_TOTAL,
+       ErrorTracker.getCounterName(new ScoringFilterException("Test")));
+   assertEquals(NutchMetrics.ERROR_INDEXING_TOTAL,
+       ErrorTracker.getCounterName(new IndexingException("Test")));
+ }
+
+ // =========================================================================
+ // Hadoop Context Integration Tests (Using Mocks)
+ // =========================================================================
+
+ @Test
+ public void testConstructorWithContext() {
+   // Only the constructor's interaction with the context matters; the instance is discarded
+   new ErrorTracker(NutchMetrics.GROUP_FETCHER, mockContext);
+
+   // Expected lookups during construction: the total counter plus one per error type
+   final int minLookups = 1 + 8;
+   verify(mockContext, atLeast(minLookups)).getCounter(anyString(), anyString());
+ }
+
+ @Test
+ public void testInitCounters() {
+   String group = NutchMetrics.GROUP_FETCHER;
+   ErrorTracker tracker = new ErrorTracker(group); // built without a context
+   tracker.initCounters(mockContext);
+
+   // Spot-check that initCounters() looked up the total counter and two of
+   // the per-type counters under the tracker's group.
+   verify(mockContext).getCounter(group, NutchMetrics.ERROR_TOTAL);
+   verify(mockContext).getCounter(group, NutchMetrics.ERROR_NETWORK_TOTAL);
+   verify(mockContext).getCounter(group, NutchMetrics.ERROR_TIMEOUT_TOTAL);
+ }
+
+ @Test
+ public void testIncrementCountersWithType() {
+   ErrorTracker contextBound = new ErrorTracker(NutchMetrics.GROUP_FETCHER, mockContext);
+
+   // Bump the Hadoop counters directly for an explicit error type
+   contextBound.incrementCounters(ErrorType.NETWORK);
+
+   // Every lookup yields mockCounter, so "total" and "NETWORK" appear as two increments on it
+   verify(mockCounter, times(2)).increment(1L);
+ }
+
+ @Test
+ public void testIncrementCountersWithThrowable() {
+   ErrorTracker contextBound = new ErrorTracker(NutchMetrics.GROUP_FETCHER, mockContext);
+
+   // Bump the Hadoop counters from a throwable (an IOException, expected as NETWORK)
+   contextBound.incrementCounters(new IOException("Test"));
+
+   // Every lookup yields mockCounter, so "total" and "NETWORK" appear as two increments on it
+   verify(mockCounter, times(2)).increment(1L);
+ }
+
+ @Test
+ public void testIncrementCountersWithoutInit() {
+   ErrorTracker uninitialized = new ErrorTracker(NutchMetrics.GROUP_FETCHER);
+
+   // No context was supplied and initCounters() was never called, so a direct
+   // counter increment must be rejected with an IllegalStateException.
+   assertThrows(IllegalStateException.class,
+       () -> uninitialized.incrementCounters(ErrorType.NETWORK));
+ }
+
+ @Test
+ public void testEmitCounters() {
+   String group = NutchMetrics.GROUP_FETCHER;
+   ErrorTracker tracker = new ErrorTracker(group);
+   // Accumulate 2 NETWORK + 1 TIMEOUT locally; the tracker was built without a context
+   tracker.recordError(ErrorType.NETWORK);
+   tracker.recordError(ErrorType.NETWORK);
+   tracker.recordError(ErrorType.TIMEOUT);
+
+   // Emit against the mock context
+   tracker.emitCounters(mockContext);
+
+   // Only the counter lookups are verified here, not the increment amounts
+   verify(mockContext).getCounter(group, NutchMetrics.ERROR_TIMEOUT_TOTAL);
+   verify(mockContext).getCounter(group, NutchMetrics.ERROR_NETWORK_TOTAL);
+   verify(mockContext).getCounter(group, NutchMetrics.ERROR_TOTAL);
+ }
+
+ @Test
+ public void testEmitCountersWithCachedCounters() {
+   ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER, mockContext);
+
+   // Start from a clean counter mock: forget any interaction from construction
+   reset(mockCounter);
+
+   // Accumulate 2 NETWORK + 1 TIMEOUT locally
+   tracker.recordError(ErrorType.NETWORK);
+   tracker.recordError(ErrorType.NETWORK);
+   tracker.recordError(ErrorType.TIMEOUT);
+
+   // Flush the accumulated counts
+   tracker.emitCounters(mockContext);
+
+   // Each accumulated value is expected to arrive as exactly one increment call
+   verify(mockCounter).increment(1L); // TIMEOUT count
+   verify(mockCounter).increment(2L); // NETWORK count
+   verify(mockCounter).increment(3L); // total count
+ }
+
+ // =========================================================================
+ // Thread Safety Tests
+ // =========================================================================
+
+ @Test
+ public void testThreadSafety() throws InterruptedException {
+   final int errorsPerWorker = 100;
+   ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER);
+
+   // Ten workers record NETWORK errors on the same tracker instance.
+   Runnable recordBatch = () -> {
+     for (int n = 0; n < errorsPerWorker; n++) {
+       tracker.recordError(ErrorType.NETWORK);
+     }
+   };
+   Thread[] workers = new Thread[10];
+   for (int w = 0; w < workers.length; w++) {
+     workers[w] = new Thread(recordBatch);
+   }
+
+   // Launch all workers before waiting on any, so their batches can overlap.
+   for (Thread worker : workers) {
+     worker.start();
+   }
+   for (Thread worker : workers) {
+     worker.join();
+   }
+
+   // 10 workers x 100 errors each: no update may be lost.
+   assertEquals(1000, tracker.getTotalCount());
+   assertEquals(1000, tracker.getCount(ErrorType.NETWORK));
+ }
+
+ @Test
+ public void testThreadSafetyMixedErrorTypes() throws InterruptedException {
+ ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER);
+
+ // Create threads that record different error types concurrently
+ Thread networkThread = new Thread(() -> {
+ for (int i = 0; i < 500; i++) {
+ tracker.recordError(ErrorType.NETWORK);
+ }
+ });
+
+ Thread timeoutThread = new Thread(() -> {
+ for (int i = 0; i < 300; i++) {
+ tracker.recordError(ErrorType.TIMEOUT);
+ }
+ });
+
+ Thread urlThread = new Thread(() -> {
+ for (int i = 0; i < 200; i++) {
+ tracker.recordError(ErrorType.URL);
+ }
+ });
+
+ networkThread.start();
+ timeoutThread.start();
+ urlThread.start();
+
+ networkThread.join();
+ timeoutThread.join();
+ urlThread.join();
+
+ // Verify counts
+ assertEquals(1000, tracker.getTotalCount());
+ assertEquals(500, tracker.getCount(ErrorType.NETWORK));
+ assertEquals(300, tracker.getCount(ErrorType.TIMEOUT));
+ assertEquals(200, tracker.getCount(ErrorType.URL));
+ }
+}
diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java
index b14b55af09..4d8ae07971 100644
--- a/src/test/org/apache/nutch/util/TestURLUtil.java
+++ b/src/test/org/apache/nutch/util/TestURLUtil.java
@@ -146,7 +146,9 @@ public void testGetDomainSuffix() throws Exception {
url = new URL("http://www.example.2000.hu");
assertEquals("2000.hu", URLUtil.getDomainSuffix(url));
- // test non-ascii
+ // test non-ASCII
+ url = new URL("https://www.taiuru.māori.nz/");
+ assertEquals("xn--mori-qsa.nz", URLUtil.getDomainSuffix(url));
url = new URL("http://www.example.flå.no");
assertEquals("xn--fl-zia.no", URLUtil.getDomainSuffix(url));
url = new URL("http://www.example.栃木.jp");