" + title + "

diff --git a/.github/workflows/cc-build.yml b/.github/workflows/cc-build.yml new file mode 100644 index 0000000000..2ef2fb6fca --- /dev/null +++ b/.github/workflows/cc-build.yml @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: cc ci + +on: + push: + branches: [cc] + pull_request: + types: [opened, synchronize, reopened] + branches: [cc] +jobs: + test: + strategy: + matrix: + java: ['11'] + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v5 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v5 + with: + java-version: ${{ matrix.java }} + distribution: 'temurin' + - name: Install CLD2 + run: | + sudo apt-get update + sudo apt-get install libcld2-0 libcld2-dev + - name: Install language-detection-cld2 + run: | + git clone https://github.com/commoncrawl/language-detection-cld2.git + cd language-detection-cld2/ + mvn install + - name: Install crawler-commons development version + run: | + git clone https://github.com/commoncrawl/crawler-commons.git + cd crawler-commons/ + git checkout commons-io-downgrade + mvn install + - name: Install recent public suffix list + run: | + curl https://publicsuffix.org/list/public_suffix_list.dat -o conf/effective_tld_names.dat + - 
name: Cache Ivy dependencies + uses: actions/cache@v4 + with: + path: ~/.ivy2/cache + key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }} + restore-keys: | + ${{ runner.os }}-ivy- + - name: Test + run: ant clean test test-protocol-integration -buildfile build.xml diff --git a/.github/workflows/junit-report.yml b/.github/workflows/junit-report.yml index ead3e5b325..e2359737ba 100644 --- a/.github/workflows/junit-report.yml +++ b/.github/workflows/junit-report.yml @@ -25,30 +25,41 @@ jobs: checks: runs-on: ubuntu-latest steps: - - name: Download Test Report + - name: Download Test Report (Ubuntu) uses: dawidd6/action-download-artifact@v11 with: - name: junit-test-results + name: junit-test-results-ubuntu-latest workflow: master-build.yml run_id: ${{ github.event.workflow_run.id }} + continue-on-error: true - name: Publish Test Report - uses: mikepenz/action-junit-report@v5 + uses: mikepenz/action-junit-report@v6 with: report_paths: |- ./test/TEST-*.xml ./**/test/TEST-*.xml + check_name: |- + JUnit Test Report + JUnit Test Report Plugins commit: ${{ github.event.workflow_run.head_sha }} - comment: true - pr_id: ${{ github.event.workflow_run.pull_requests[0].number }} - fail_on_failure: true + fail_on_failure: false + fail_on_parse_error: true + require_tests: true + require_passed_tests: true + include_passed: false + include_skipped: true + check_annotations: true + annotate_notice: true job_summary: true detailed_summary: true - truncate_stack_traces: false - fail_on_parse_error: false # temporary while debugging TestMimeUtil - require_tests: true + flaky_summary: true + skip_success_summary: true include_time_in_summary: true - include_passed: true + group_suite: true + comment: true + updateComment: true + skip_comment_without_tests: true job_name: tests - check_name: |- - JUnit Test Report Core - JUnit Test Report Plugins + truncate_stack_traces: false + annotations_limit: 50 + pr_id: ${{ 
github.event.workflow_run.pull_requests[0].number || '' }} diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index f7265e5b52..4a1604d928 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -24,7 +24,7 @@ jobs: javadoc: strategy: matrix: - java: ['11'] + java: ['17'] os: [ubuntu-latest] runs-on: ${{ matrix.os }} steps: @@ -34,12 +34,19 @@ jobs: with: java-version: ${{ matrix.java }} distribution: 'temurin' + - name: Cache Ivy dependencies + uses: actions/cache@v4 + with: + path: ~/.ivy2/cache + key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }} + restore-keys: | + ${{ runner.os }}-ivy- - name: Javadoc run: ant clean javadoc -buildfile build.xml rat: strategy: matrix: - java: ['11'] + java: ['17'] os: [ubuntu-latest] runs-on: ${{ matrix.os }} steps: @@ -49,6 +56,13 @@ jobs: with: java-version: ${{ matrix.java }} distribution: 'temurin' + - name: Cache Ivy dependencies + uses: actions/cache@v4 + with: + path: ~/.ivy2/cache + key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }} + restore-keys: | + ${{ runner.os }}-ivy- - name: Run Apache Rat run: ant clean run-rat -buildfile build.xml - name: Cache unknown licenses @@ -62,10 +76,10 @@ jobs: tests: strategy: matrix: - java: ['11'] + java: ['17'] os: [ubuntu-latest, macos-latest] runs-on: ${{ matrix.os }} - timeout-minutes: 30 + timeout-minutes: 45 steps: - uses: actions/checkout@v5 - name: Set up JDK ${{ matrix.java }} @@ -73,6 +87,13 @@ jobs: with: java-version: ${{ matrix.java }} distribution: 'temurin' + - name: Cache Ivy dependencies + uses: actions/cache@v4 + with: + path: ~/.ivy2/cache + key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }} + restore-keys: | + ${{ runner.os }}-ivy- - uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 id: filter with: @@ -83,6 +104,10 @@ jobs: - 'src/testresources/**' plugins: - 'src/plugin/**' + 
indexer_plugins: + - 'src/plugin/indexer-*/**' + protocol_plugins: + - 'src/plugin/protocol-*/**' buildconf: - 'build.xml' - 'ivy/ivy.xml' @@ -99,13 +124,31 @@ jobs: - name: test plugins if: ${{ steps.filter.outputs.plugins == 'true' && steps.filter.outputs.core == 'false' && steps.filter.outputs.buildconf == 'false' }} run: ant clean test-plugins -buildfile build.xml + # run indexer integration tests when indexer plugin files change (Docker required, ubuntu-latest only) + - name: test indexer integration + if: ${{ steps.filter.outputs.indexer_plugins == 'true' && matrix.os == 'ubuntu-latest' }} + run: ant clean test-indexer-integration -buildfile build.xml + # run protocol integration tests when protocol plugin files change (Docker required, ubuntu-latest only) + - name: test protocol integration + if: ${{ steps.filter.outputs.protocol_plugins == 'true' && matrix.os == 'ubuntu-latest' }} + run: ant clean test-protocol-integration -buildfile build.xml + - name: Check for test results + id: check_tests + if: always() && matrix.os == 'ubuntu-latest' + run: | + shopt -s globstar nullglob + files=(./build/test/TEST-*.xml ./build/**/test/TEST-*.xml) + if [ ${#files[@]} -gt 0 ]; then + echo "has_results=true" >> $GITHUB_OUTPUT + else + echo "has_results=false" >> $GITHUB_OUTPUT + fi - name: Upload Test Report uses: actions/upload-artifact@v4 - if: always() + if: always() && matrix.os == 'ubuntu-latest' && steps.check_tests.outputs.has_results == 'true' with: - name: junit-test-results + name: junit-test-results-${{ matrix.os }} path: | ./build/test/TEST-*.xml ./build/**/test/TEST-*.xml - retention-days: 1 - overwrite: true \ No newline at end of file + retention-days: 1 \ No newline at end of file diff --git a/LICENSE-binary b/LICENSE-binary index 538e3baf7c..addc4a2824 100644 --- a/LICENSE-binary +++ b/LICENSE-binary @@ -245,7 +245,6 @@ com.google.inject.extensions:guice-servlet com.google.j2objc:j2objc-annotations com.healthmarketscience.jackcess:jackcess 
com.healthmarketscience.jackcess:jackcess-encrypt -com.intellij:annotations com.maxmind.db:maxmind-db com.maxmind.geoip2:geoip2 com.nimbusds:nimbus-jose-jwt @@ -257,7 +256,12 @@ com.rometools:rome-utils com.shapesecurity:salvation2 com.squareup.okhttp3:okhttp com.squareup.okhttp3:okhttp-brotli +com.squareup.okhttp3:okhttp-jvm +com.squareup.okhttp3:okhttp-zstd com.squareup.okio:okio +com.squareup.okio:okio-jvm +com.squareup.zstd:zstd-kmp-jvm +com.squareup.zstd:zstd-kmp-okio-jvm com.tdunning:t-digest com.typesafe.netty:netty-reactive-streams com.typesafe.scala-logging:scala-logging_2.12 @@ -275,13 +279,14 @@ commons-lang:commons-lang commons-logging:commons-logging commons-net:commons-net commons-validator:commons-validator +de.l3s.boilerpipe:boilerpipe de.vandermeer:ascii-utf-themes de.vandermeer:asciitable de.vandermeer:char-translation de.vandermeer:skb-interfaces dev.failsafe:failsafe +info.picocli:picocli io.dropwizard.metrics:metrics-core -io.netty:netty io.netty:netty-all io.netty:netty-buffer io.netty:netty-codec @@ -378,7 +383,7 @@ org.apache.hadoop:hadoop-yarn-api org.apache.hadoop:hadoop-yarn-client org.apache.hadoop:hadoop-yarn-common org.apache.hadoop.thirdparty:hadoop-shaded-guava -org.apache.hadoop.thirdparty:hadoop-shaded-protobuf_3_7 +org.apache.hadoop.thirdparty:hadoop-shaded-protobuf_3_25 org.apache.httpcomponents:httpasyncclient org.apache.httpcomponents:httpclient org.apache.httpcomponents:httpcore @@ -398,21 +403,13 @@ org.apache.kafka:kafka-storage org.apache.kafka:kafka-storage-api org.apache.kafka:kafka-tools-api org.apache.kafka:kafka_2.12 -org.apache.kerby:kerb-admin -org.apache.kerby:kerb-client -org.apache.kerby:kerb-common org.apache.kerby:kerb-core org.apache.kerby:kerb-crypto -org.apache.kerby:kerb-identity -org.apache.kerby:kerb-server -org.apache.kerby:kerb-simplekdc org.apache.kerby:kerb-util org.apache.kerby:kerby-asn1 org.apache.kerby:kerby-config org.apache.kerby:kerby-pkix org.apache.kerby:kerby-util -org.apache.kerby:kerby-xdr 
-org.apache.kerby:token-provider org.apache.logging.log4j:log4j-api org.apache.logging.log4j:log4j-core org.apache.logging.log4j:log4j-slf4j2-impl @@ -435,6 +432,7 @@ org.apache.pdfbox:fontbox org.apache.pdfbox:jbig2-imageio org.apache.pdfbox:jempbox org.apache.pdfbox:pdfbox +org.apache.pdfbox:pdfbox-io org.apache.pdfbox:pdfbox-tools org.apache.pdfbox:xmpbox org.apache.poi:poi @@ -443,6 +441,7 @@ org.apache.poi:poi-ooxml-lite org.apache.poi:poi-scratchpad org.apache.solr:solr-solrj org.apache.tika:tika-core +org.apache.tika:tika-handler-boilerpipe org.apache.tika:tika-langdetect-optimaize org.apache.tika:tika-parser-apple-module org.apache.tika:tika-parser-audiovideo-module @@ -476,8 +475,6 @@ org.asynchttpclient:async-http-client org.asynchttpclient:async-http-client-netty-utils org.bitbucket.b_c:jose4j org.ccil.cowan.tagsoup:tagsoup -org.codehaus.jackson:jackson-core-asl -org.codehaus.jackson:jackson-mapper-asl org.codehaus.jettison:jettison org.eclipse.jetty:jetty-alpn-client org.eclipse.jetty:jetty-alpn-java-client @@ -515,9 +512,6 @@ org.gagravarr:vorbis-java-core org.gagravarr:vorbis-java-tika org.jetbrains:annotations org.jetbrains.kotlin:kotlin-stdlib -org.jetbrains.kotlin:kotlin-stdlib-common -org.jetbrains.kotlin:kotlin-stdlib-jdk7 -org.jetbrains.kotlin:kotlin-stdlib-jdk8 org.jspecify:jspecify org.littleshoot:littleproxy org.locationtech.spatial4j:spatial4j @@ -595,9 +589,7 @@ BSD 2-Clause com.barchart.udt:barchart-udt-bundle com.github.luben:zstd-jni -com.google.protobuf:protobuf-java dk.brics:automaton -dnsjava:dnsjava org.codehaus.woodstox:stax2-api org.jline:jline @@ -609,6 +601,7 @@ BSD 3-Clause com.adobe.xmp:xmpcore com.github.virtuald:curvesapi +dnsjava:dnsjava org.fusesource.leveldbjni:leveldbjni-all org.ow2.asm:asm @@ -633,7 +626,7 @@ Bouncy Castle Licence (licenses-binary/LICENSE-bouncy-castle-licence.txt) -org.bouncycastle:bcmail-jdk18on +org.bouncycastle:bcjmail-jdk18on org.bouncycastle:bcpkix-jdk18on org.bouncycastle:bcprov-jdk18on 
org.bouncycastle:bcutil-jdk18on @@ -717,6 +710,8 @@ jakarta.jws:jakarta.jws-api jakarta.xml.bind:jakarta.xml.bind-api jakarta.xml.soap:jakarta.xml.soap-api jakarta.xml.ws:jakarta.xml.ws-api +org.eclipse.angus:angus-activation +org.glassfish.jaxb:jaxb-core org.glassfish.jaxb:jaxb-runtime org.glassfish.jaxb:txw2 @@ -724,6 +719,8 @@ org.glassfish.jaxb:txw2 Eclipse Public License - Version 2.0 ------------------------------------ +(licenses-binary/LICENSE-eclipse-public-license---version-2.0.txt) + org.eclipse.jetty:jetty-http org.eclipse.jetty:jetty-io org.eclipse.jetty:jetty-security @@ -734,6 +731,8 @@ org.eclipse.jetty:jetty-util MIT --- +(licenses-binary/LICENSE-mit-license.txt) + net.sourceforge.argparse4j:argparse4j org.slf4j:slf4j-api @@ -781,7 +780,6 @@ Public Domain (licenses-binary/LICENSE-public-domain.txt) aopalliance:aopalliance -org.tukaani:xz Public Domain, per Creative Commons CC0 diff --git a/NOTICE-binary b/NOTICE-binary index 99fea523a4..412ce7d38e 100644 --- a/NOTICE-binary +++ b/NOTICE-binary @@ -48,7 +48,7 @@ Apache projects # org.apache.avro:avro -Apache Avro (http://avro.apache.org) +Apache Avro (https://avro.apache.org) # org.apache.commons:commons-collections4 Apache Commons Collections (https://commons.apache.org/proper/commons-collections/) @@ -60,6 +60,8 @@ Apache Commons Configuration (https://commons.apache.org/proper/commons-configur Apache Commons CSV (https://commons.apache.org/proper/commons-csv/) # org.apache.commons:commons-exec Apache Commons Exec (http://commons.apache.org/proper/commons-exec/) +# org.apache.commons:commons-exec +Apache Commons Exec (https://commons.apache.org/proper/commons-exec/) # org.apache.commons:commons-jexl3 Apache Commons JEXL (https://commons.apache.org/proper/commons-jexl/) # org.apache.commons:commons-lang3 @@ -68,8 +70,6 @@ Apache Commons Lang (https://commons.apache.org/proper/commons-lang/) Apache Commons Lang (http://commons.apache.org/proper/commons-lang/) # org.apache.commons:commons-math3 
Apache Commons Math (http://commons.apache.org/proper/commons-math/) -# org.apache.commons:commons-math3 -Apache Commons Math (http://commons.apache.org/math/) # org.apache.commons:commons-text Apache Commons Text (https://commons.apache.org/proper/commons-text) @@ -132,8 +132,8 @@ Apache Hadoop YARN Common # org.apache.hadoop.thirdparty:hadoop-shaded-guava Apache Hadoop shaded Guava -# org.apache.hadoop.thirdparty:hadoop-shaded-protobuf_3_7 -Apache Hadoop shaded Protobuf 3.7 +# org.apache.hadoop.thirdparty:hadoop-shaded-protobuf_3_25 +Apache Hadoop shaded Protobuf # org.apache.httpcomponents:httpasyncclient Apache HttpAsyncClient (http://hc.apache.org/httpcomponents-asyncclient) @@ -146,6 +146,8 @@ Apache HttpCore (http://hc.apache.org/httpcomponents-core-ga) # org.apache.httpcomponents:httpcore-nio Apache HttpCore NIO (http://hc.apache.org/httpcomponents-core-ga) # org.apache.httpcomponents:httpmime +Apache HttpClient Mime (http://hc.apache.org/httpcomponents-client-ga) +# org.apache.httpcomponents:httpmime Apache HttpClient Mime (http://hc.apache.org/httpcomponents-client) # org.apache.james:apache-mime4j-core @@ -178,22 +180,10 @@ Apache Kafka (https://kafka.apache.org) # org.apache.kafka:kafka_2.12 Apache Kafka (https://kafka.apache.org) -# org.apache.kerby:kerb-admin -Apache Kerby-kerb Admin -# org.apache.kerby:kerb-client -Apache Kerby-kerb Client -# org.apache.kerby:kerb-common -Apache Kerby-kerb Common # org.apache.kerby:kerb-core Apache Kerby-kerb core # org.apache.kerby:kerb-crypto Apache Kerby-kerb Crypto -# org.apache.kerby:kerb-identity -Apache Kerby-kerb Identity -# org.apache.kerby:kerb-server -Apache Kerby-kerb Server -# org.apache.kerby:kerb-simplekdc -Apache Kerb Simple Kdc # org.apache.kerby:kerb-util Apache Kerby-kerb Util # org.apache.kerby:kerby-asn1 @@ -204,10 +194,6 @@ Apache Kerby Config Apache Kerby PKIX Project # org.apache.kerby:kerby-util Apache Kerby Util -# org.apache.kerby:kerby-xdr -Apache Kerby XDR Project -# 
org.apache.kerby:token-provider -Apache Token provider # org.apache.logging.log4j:log4j-api Apache Log4j API @@ -258,6 +244,8 @@ Apache PDFBox JBIG2 ImageIO plugin Apache JempBox # org.apache.pdfbox:pdfbox Apache PDFBox +# org.apache.pdfbox:pdfbox-io +Apache PDFBox io # org.apache.pdfbox:pdfbox-tools Apache PDFBox tools # org.apache.pdfbox:xmpbox @@ -277,6 +265,8 @@ Apache Solr Solrj # org.apache.tika:tika-core Apache Tika core (https://tika.apache.org/) +# org.apache.tika:tika-handler-boilerpipe +Apache # org.apache.tika:tika-langdetect-optimaize Apache Tika Optimaize langdetect # org.apache.tika:tika-parser-apple-module @@ -391,10 +381,10 @@ Jackson-annotations (http://github.com/FasterXML/jackson) Jackson-annotations (https://github.com/FasterXML/jackson) - license: The Apache Software License, Version 2.0 # com.fasterxml.jackson.core:jackson-core -Jackson-core (https://github.com/FasterXML/jackson) +Jackson-core (https://github.com/FasterXML/jackson-core) - license: The Apache Software License, Version 2.0 # com.fasterxml.jackson.core:jackson-core -Jackson-core (https://github.com/FasterXML/jackson-core) +Jackson-core (https://github.com/FasterXML/jackson) - license: The Apache Software License, Version 2.0 # com.fasterxml.jackson.core:jackson-databind jackson-databind (http://github.com/FasterXML/jackson) @@ -519,10 +509,10 @@ error-prone annotations # com.google.guava:failureaccess Guava InternalFutureFailureAccess and InternalFutures -- license: The Apache Software License, Version 2.0 +- license: Apache License, Version 2.0 # com.google.guava:failureaccess Guava InternalFutureFailureAccess and InternalFutures -- license: Apache License, Version 2.0 +- license: The Apache Software License, Version 2.0 # com.google.guava:guava Guava: Google Core Libraries for Java (https://github.com/google/guava) - license: Apache License, Version 2.0 @@ -548,14 +538,10 @@ J2ObjC Annotations (https://github.com/google/j2objc/) J2ObjC Annotations 
(https://github.com/google/j2objc/) - license: The Apache Software License, Version 2.0 -# com.google.protobuf:protobuf-java -Protocol Buffer Java API (http://code.google.com/p/protobuf) -- license: New BSD license - (licenses-binary/LICENSE-bsd-2-clause.txt) - # com.google.re2j:re2j re2j (http://github.com/google/re2j) - license: The Go license + (licenses-binary/LICENSE-the-go-license.txt) # com.googlecode.juniversalchardet:juniversalchardet juniversalchardet (http://juniversalchardet.googlecode.com/) @@ -577,10 +563,7 @@ Jackcess Encrypt (http://jackcessencrypt.sf.net) # com.ibm.icu:icu4j ICU4J (https://icu.unicode.org/) - license: Unicode-3.0 - -# com.intellij:annotations -IntelliJ IDEA Annotations (http://www.jetbrains.org) -- license: Apache License 2 + (licenses-binary/LICENSE-unicode-icu-license.txt) # com.jcraft:jsch JSch (http://www.jcraft.com/jsch/) @@ -633,14 +616,30 @@ salvation (http://cspvalidator.org) - license: Apache License, Version 2.0 # com.squareup.okhttp3:okhttp -OkHttp (https://square.github.io/okhttp/) +okhttp (https://square.github.io/okhttp/) - license: The Apache Software License, Version 2.0 # com.squareup.okhttp3:okhttp-brotli okhttp-brotli (https://square.github.io/okhttp/) - license: The Apache Software License, Version 2.0 +# com.squareup.okhttp3:okhttp-jvm +okhttp (https://square.github.io/okhttp/) +- license: The Apache Software License, Version 2.0 +# com.squareup.okhttp3:okhttp-zstd +okhttp-zstd (https://square.github.io/okhttp/) +- license: The Apache Software License, Version 2.0 # com.squareup.okio:okio -Okio (https://github.com/square/okio/) +okio (https://github.com/square/okio/) +- license: The Apache Software License, Version 2.0 +# com.squareup.okio:okio-jvm +okio (https://github.com/square/okio/) +- license: The Apache Software License, Version 2.0 + +# com.squareup.zstd:zstd-kmp-jvm +zstd-kmp (https://github.com/square/okio-zstd/) +- license: The Apache Software License, Version 2.0 +# 
com.squareup.zstd:zstd-kmp-okio-jvm +zstd-kmp-okio (https://github.com/square/okio-zstd/) - license: The Apache Software License, Version 2.0 # com.sun.activation:jakarta.activation @@ -778,6 +777,10 @@ Apache Commons Net (https://commons.apache.org/proper/commons-net/) Apache Commons Validator (http://commons.apache.org/proper/commons-validator/) - license: Apache License, Version 2.0 +# de.l3s.boilerpipe:boilerpipe +Apache License 2.0 (http://code.google.com/p/boilerpipe/) +- license: Apache License 2.0 + # de.vandermeer:ascii-utf-themes ASCII and UTF Themes (https://github.com/vdmeer/ascii-utf-themes) - license: Apache 2 @@ -801,17 +804,18 @@ dk.brics.automaton (https://www.brics.dk/automaton) (licenses-binary/LICENSE-bsd-2-clause.txt) # dnsjava:dnsjava -dnsjava (http://www.dnsjava.org) -- license: BSD 2-Clause license - (licenses-binary/LICENSE-bsd-2-clause.txt) +dnsjava (https://github.com/dnsjava/dnsjava) +- license: BSD-3-Clause + (licenses-binary/LICENSE-bsd-3-clause.txt) + +# info.picocli:picocli +picocli (https://picocli.info) +- license: The Apache Software License, version 2.0 # io.dropwizard.metrics:metrics-core Metrics Core - license: Apache License 2.0 -# io.netty:netty -Netty (http://netty.io/) -- license: Apache License, Version 2.0 # io.netty:netty-all Netty/All-in-One (https://netty.io/netty-all/) - license: Apache License, Version 2.0 @@ -969,6 +973,10 @@ Google S2 geometry library (https://github.com/sgr-io/s2-geometry-library-java) # jakarta.activation:jakarta.activation-api Jakarta Activation API jar +- license: EDL 1.0 + (licenses-binary/LICENSE-eclipse-distribution-license-v1.0.txt) +# jakarta.activation:jakarta.activation-api +Jakarta Activation API (https://github.com/jakartaee/jaf-api) - license: EDL 1.0 (licenses-binary/LICENSE-eclipse-distribution-license-v1.0.txt) @@ -1019,7 +1027,7 @@ javax.ws.rs-api (https://github.com/eclipse-ee4j/jaxrs-api) (licenses-binary/LICENSE-epl-2.0.txt) # javax.ws.rs:jsr311-api jsr311-api 
(https://jsr311.dev.java.net) -- license: CDDL License +- license: CDDL License (licenses-binary/LICENSE-cddl-license.txt) # javax.xml.bind:jaxb-api @@ -1060,6 +1068,7 @@ JOpt Simple (http://jopt-simple.github.io/jopt-simple) # net.sourceforge.argparse4j:argparse4j argparse4j (http://argparse4j.github.io) - license: MIT + (licenses-binary/LICENSE-mit-license.txt) # net.sourceforge.htmlunit:htmlunit HtmlUnit (http://htmlunit.sourceforge.net) @@ -1105,20 +1114,24 @@ Asynchronous Http Client Netty Utils jose4j (https://bitbucket.org/b_c/jose4j/) - license: The Apache Software License, Version 2.0 -# org.bouncycastle:bcmail-jdk18on -Bouncy Castle S/MIME API (https://www.bouncycastle.org/java.html) +# org.bouncycastle:bcjmail-jdk18on +Bouncy Castle JavaMail Jakarta S/MIME APIs (https://www.bouncycastle.org/download/bouncy-castle-java/) - license: Bouncy Castle Licence (licenses-binary/LICENSE-bouncy-castle-licence.txt) # org.bouncycastle:bcpkix-jdk18on -Bouncy Castle PKIX, CMS, EAC, TSP, PKCS, OCSP, CMP, and CRMF APIs (https://www.bouncycastle.org/java.html) +Bouncy Castle PKIX, CMS, EAC, TSP, PKCS, OCSP, CMP, and CRMF APIs (https://www.bouncycastle.org/download/bouncy-castle-java/) - license: Bouncy Castle Licence (licenses-binary/LICENSE-bouncy-castle-licence.txt) # org.bouncycastle:bcprov-jdk18on Bouncy Castle Provider (https://www.bouncycastle.org/java.html) +- license: Bouncy Castle Licence + (licenses-binary/LICENSE-bouncy-castle-licence.txt) +# org.bouncycastle:bcprov-jdk18on +Bouncy Castle Provider (https://www.bouncycastle.org/download/bouncy-castle-java/) - license: Bouncy Castle Licence (licenses-binary/LICENSE-bouncy-castle-licence.txt) # org.bouncycastle:bcutil-jdk18on -Bouncy Castle ASN.1 Extension and Utility APIs (https://www.bouncycastle.org/java.html) +Bouncy Castle ASN.1 Extension and Utility APIs (https://www.bouncycastle.org/download/bouncy-castle-java/) - license: Bouncy Castle Licence (licenses-binary/LICENSE-bouncy-castle-licence.txt) @@ -1140,13 
+1153,6 @@ Checker Qual (https://checkerframework.org/) - license: The MIT License (licenses-binary/LICENSE-mit-license.txt) -# org.codehaus.jackson:jackson-core-asl -Jackson (http://jackson.codehaus.org) -- license: The Apache Software License, Version 2.0 -# org.codehaus.jackson:jackson-mapper-asl -Data Mapper for Jackson (http://jackson.codehaus.org) -- license: The Apache Software License, Version 2.0 - # org.codehaus.jettison:jettison Jettison (https://github.com/jettison-json/jettison) - license: Apache License, Version 2.0 @@ -1163,7 +1169,12 @@ Stax2 API (http://github.com/FasterXML/stax2-api) # org.codelibs:jhighlight JHighlight (https://github.com/codelibs/jhighlight) - license: CDDL, v1.0 - (licenses-binary/LICENSE-cddl-v1.0.txt) + (licenses-binary/LICENSE-cddl-1.0.txt) + +# org.eclipse.angus:angus-activation +Angus Activation Registries +- license: EDL 1.0 + (licenses-binary/LICENSE-eclipse-distribution-license-v1.0.txt) # org.eclipse.jetty:jetty-alpn-client Jetty :: ALPN :: Client @@ -1180,18 +1191,22 @@ Jetty :: Http Utility # org.eclipse.jetty:jetty-http Jetty :: Http Utility - license: Eclipse Public License - Version 2.0 + (licenses-binary/LICENSE-epl-2.0.txt) # org.eclipse.jetty:jetty-io Jetty :: IO Utility - license: Apache Software License - Version 2.0 # org.eclipse.jetty:jetty-io Jetty :: IO Utility - license: Eclipse Public License - Version 2.0 + (licenses-binary/LICENSE-epl-2.0.txt) # org.eclipse.jetty:jetty-security Jetty :: Security - license: Eclipse Public License - Version 2.0 + (licenses-binary/LICENSE-epl-2.0.txt) # org.eclipse.jetty:jetty-server Jetty :: Server Core - license: Eclipse Public License - Version 2.0 + (licenses-binary/LICENSE-epl-2.0.txt) # org.eclipse.jetty:jetty-servlet Jetty :: Servlet Handling - license: Apache Software License - Version 2.0 @@ -1201,6 +1216,7 @@ Jetty :: Utilities # org.eclipse.jetty:jetty-util Jetty :: Utilities - license: Eclipse Public License - Version 2.0 + 
(licenses-binary/LICENSE-epl-2.0.txt) # org.eclipse.jetty:jetty-util-ajax Jetty :: Utilities :: Ajax(JSON) - license: Apache Software License - Version 2.0 @@ -1295,6 +1311,10 @@ Ogg and Vorbis for Java, Core (https://github.com/Gagravarr/VorbisJava) Apache Tika plugin for Ogg, Vorbis and FLAC (https://github.com/Gagravarr/VorbisJava) - license: The Apache Software License, Version 2.0 +# org.glassfish.jaxb:jaxb-core +JAXB Core (https://eclipse-ee4j.github.io/jaxb-ri/) +- license: Eclipse Distribution License - v 1.0 + (licenses-binary/LICENSE-eclipse-distribution-license-v1.0.txt) # org.glassfish.jaxb:jaxb-runtime JAXB Runtime (https://eclipse-ee4j.github.io/jaxb-ri/) - license: Eclipse Distribution License - v 1.0 @@ -1326,22 +1346,16 @@ JDOM (http://www.jdom.org) JDOM (http://www.jdom.org) - license: Similar to Apache License but with the acknowledgment clause removed +# org.jetbrains:annotations +JetBrains Java Annotations (https://github.com/JetBrains/java-annotations) +- license: The Apache Software License, Version 2.0 # org.jetbrains:annotations IntelliJ IDEA Annotations (http://www.jetbrains.org) - license: The Apache Software License, Version 2.0 # org.jetbrains.kotlin:kotlin-stdlib -org.jetbrains.kotlin:kotlin-stdlib (https://kotlinlang.org/) -- license: The Apache License, Version 2.0 -# org.jetbrains.kotlin:kotlin-stdlib-common -org.jetbrains.kotlin:kotlin-stdlib-common (https://kotlinlang.org/) -- license: The Apache License, Version 2.0 -# org.jetbrains.kotlin:kotlin-stdlib-jdk7 -org.jetbrains.kotlin:kotlin-stdlib-jdk7 (https://kotlinlang.org/) -- license: The Apache License, Version 2.0 -# org.jetbrains.kotlin:kotlin-stdlib-jdk8 -org.jetbrains.kotlin:kotlin-stdlib-jdk8 (https://kotlinlang.org/) -- license: The Apache License, Version 2.0 +Kotlin Stdlib (https://kotlinlang.org/) +- license: Apache-2.0 # org.jline:jline JLine Bundle @@ -1349,6 +1363,10 @@ JLine Bundle (licenses-binary/LICENSE-bsd-2-clause.txt) # org.jsoup:jsoup +jsoup Java HTML Parser 
(https://jsoup.org/) +- license: The MIT License + (licenses-binary/LICENSE-mit-license.txt) +# org.jsoup:jsoup jsoup (http://jsoup.org/) - license: The MIT License (licenses-binary/LICENSE-mit-license.txt) @@ -1517,6 +1535,9 @@ org.seleniumhq.selenium:selenium-support (https://selenium.dev/) # org.slf4j:jcl-over-slf4j JCL 1.2 implemented over SLF4J (http://www.slf4j.org) - license: Apache License, Version 2.0 +# org.slf4j:jcl-over-slf4j +JCL 1.2 implemented over SLF4J (http://www.slf4j.org) +- license: Apache-2.0 # org.slf4j:slf4j-api SLF4J API Module (http://www.slf4j.org) - license: MIT License @@ -1524,6 +1545,7 @@ SLF4J API Module (http://www.slf4j.org) # org.slf4j:slf4j-api SLF4J API Module (http://www.slf4j.org) - license: MIT + (licenses-binary/LICENSE-mit-license.txt) # org.tallison:jmatio JMatIO (https://github.com/tballison/jmatio) @@ -1532,8 +1554,7 @@ JMatIO (https://github.com/tballison/jmatio) # org.tukaani:xz XZ for Java (https://tukaani.org/xz/java.html) -- license: Public Domain - (licenses-binary/LICENSE-public-domain.txt) +- license: Zero-Clause BSD (0BSD) # org.xerial.snappy:snappy-java Apache-2.0 (https://github.com/xerial/snappy-java) diff --git a/README.md b/README.md index f1322aa5e5..07af23b4d1 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,47 @@ -Apache Nutch README -=================== +Common Crawl Fork of Apache Nutch +================================= + +Please also have a look at the [Apache Nutch](https://github.com/apache/nutch) repository and all information about Apache Nutch given below. + +Notable additions in Common Crawl's fork of Nutch (not yet pushed to upstream Nutch although this is planned): +- WARC and CDX writer integrated into Fetcher and able to detect the language of HTML pages using the CLD2 language detector +- [Generator2](src/java/org/apache/nutch/crawl/Generator2.java): alternative implementation of Generator + - allowing to combine per-domain and per-host limits and + - optimized to create many (eg. 
100) segments in a single job +- Unused plugins disabled in `build.xml`, to achieve a considerably more lightweight installation for our massively parallel setup. + +How to install additional requirements to build this fork of Nutch: +- [crawler-commons](https://github.com/crawler-commons/crawler-commons) development snapshot package: + ``` + git clone https://github.com/crawler-commons/crawler-commons.git + cd crawler-commons/ + mvn install + ``` +- install the latest public suffix list into `conf/` to ensure that it is definitely used (see [#17](https://github.com/commoncrawl/nutch/issues/17)): + ``` + wget https://publicsuffix.org/list/public_suffix_list.dat -O conf/effective_tld_names.dat + ``` +- [Java wrapper for CLD2 language detection](https://github.com/commoncrawl/language-detection-cld2) + ``` + git clone https://github.com/commoncrawl/language-detection-cld2.git + cd language-detection-cld2/ + mvn install + ``` + For runtime, if WARC language detection is enabled (`warc.detect.language` = true), also the CLD2 shared objects are required, e.g. on Ubuntu + ``` + sudo apt install libcld2-0 libcld2-dev + ``` + +- An example for running this version can be found [here](https://github.com/commoncrawl/cc-nutch-example). + +Apache Nutch +============ [![master pull request ci](https://github.com/apache/nutch/actions/workflows/master-build.yml/badge.svg)](https://github.com/apache/nutch/actions/workflows/master-build.yml)

-For the latest information about Nutch, please visit our website at: +For the latest information about Nutch, please visit the Nutch website at: https://nutch.apache.org/ diff --git a/build.xml b/build.xml index 278c19e257..8e68ebd9a0 100644 --- a/build.xml +++ b/build.xml @@ -48,6 +48,8 @@ + + @@ -126,7 +128,7 @@ + @@ -456,7 +459,7 @@ - + @@ -509,7 +512,7 @@ - + @@ -532,6 +535,14 @@ + + + + + + + + @@ -702,6 +713,7 @@ + @@ -1108,19 +1120,6 @@ - - - - - - - - - - - - - @@ -1130,7 +1129,6 @@ - @@ -1141,18 +1139,24 @@ - + + + + + + + dest="${ivy.dir}/ant-eclipse-1.0.bin.tar.bz2" usetimestamp="false" /> - + - + @@ -1167,7 +1171,7 @@ + classpath="${ant-eclipse.jar}" /> @@ -1191,7 +1195,7 @@ - + @@ -1203,24 +1207,24 @@ - + - - - - - + + + + + - + - + - + @@ -1237,31 +1241,33 @@ - + - + - + - - + + + + - - - + + + diff --git a/conf/adaptive-scoring.txt.template b/conf/adaptive-scoring.txt.template new file mode 100644 index 0000000000..c0a7276a1c --- /dev/null +++ b/conf/adaptive-scoring.txt.template @@ -0,0 +1,17 @@ +# +# Configuration file for scoring-adaptive +# +# See also properties +# scoring.adaptive.sort.by_status.file +# scoring.adaptive.factor.fetchtime +# scoring.adaptive.penalty.fetch_retry +# scoring.adaptive.boost.injected +# +# Format: +# +# e.g. +# db_unfetched .1 +# db_gone -.5 +# +# The sort value is added to other sort values (score, fetch time). +# It may be negative to penalize fetch items. diff --git a/conf/generate-domain-limits.txt.template b/conf/generate-domain-limits.txt.template new file mode 100644 index 0000000000..7d6e32e394 --- /dev/null +++ b/conf/generate-domain-limits.txt.template @@ -0,0 +1,15 @@ +# +# Fetch list limits by domain +# +# Note: please register this file using the property `generate.domain.limits.file`. +# +# Fields (tab-separated): +# 1 - domain name +# 2 - max. number of URLs per domain and segment +# 3 - max. number of URLs per host (in domain) and segment +# 4 - max. 
number of hosts per domain over all segments +# 5 - max. number of partitions (fetcher tasks) for domain +# +# Lines starting with `#` are ignored. +# +# wikipedia.org 5000 500 20 1 diff --git a/conf/log4j2.xml b/conf/log4j2.xml index 713bfdc7fe..6faf4329fa 100644 --- a/conf/log4j2.xml +++ b/conf/log4j2.xml @@ -19,16 +19,16 @@ - ${sys:hadoop.log.dir:-./logs} - ${sys:hadoop.log.file:-hadoop.log} + ${sys:hadoop.log.dir:-./logs} + ${sys:hadoop.log.file:-hadoop.log} - + - + diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index b41f7ecdab..8ae225735d 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -996,6 +996,43 @@ https://issues.apache.org/jira/browse/NUTCH-1248 + + + + generate.count.keep.min.urls.per.segment + 120 + (Generator2 only) When distributing URLs of the same + host/domain over segments keep at least this number of URLs together + in one segment to avoid needless robots.txt fetches and DNS + look-ups. + + + + generate.max.count.per.host.by.domain + -1 + (Generator2 only) Max. number of URLs per host if + `generate.count.mode == domain`. Unlimited if -1. + + + + + generate.max.hosts.per.domain + -1 + (Generator2 only) Max. number of different hosts + allowed per domain. Unlimited if -1. Used only if + `generate.count.mode == domain`. + + + + generate.domain.limits.file + + (Generator2 only) Name of file with domain-specific + limits. Used only if not empty and `generate.count.mode == domain`. + See generate-domain-limits.txt.template for the file format. + + + + @@ -1142,6 +1179,15 @@ + + fetcher.detect.canonical.link + false + If true, fetcher will detect canonical links in HTML content + relying on the class org.commoncrawl.util.CanonicalLinkDetector. Found + links are stored in CrawlDatum metadata as "canonical.link". 
+ + + fetcher.timelimit.mins -1 @@ -1306,7 +1352,6 @@ - fetcher.store.robotstxt false If true (and fetcher.store.content is also true), @@ -1316,6 +1361,62 @@ + + fetcher.store.404s + false + If true, fetcher will also store the content of 404s and other + non-successful responses for debugging or archival purposes. + + + + fetcher.store.warc + false + If true, fetcher will write WARC files. + + + + + fetcher.robotstxt.archiving.filter.url + false + Skip archiving of a robots.txt response if the URL is + rejected by URL filters. + + + + fetcher.robotstxt.archiving.filter.url.always + false + Skip archiving of a robots.txt response also if the + combination of URL path and query equals /robots.txt and the URL is + rejected by URL filters. According to the robots.txt RFC 9309 + "the /robots.txt URI is implicitly allowed." + + + + fetcher.robotstxt.archiving.filter.mime + false + Skip archiving of a robots.txt response if the detected + MIME type does not match one listed in the property + fetcher.robotstxt.archiving.filter.mime.accept + + + + fetcher.robotstxt.archiving.filter.mime.accept + text/plain,text/x-robots + Comma-separated list of MIME types allowed for + successfully fetched robots.txt responses. Only used if + fetcher.robotstxt.archiving.filter.mime is true. + + + + fetcher.robotstxt.archiving.check.robotstxt + false + Verify whether a redirected robots.txt is allowed by + the robots.txt of the target host. If not skip archiving the + robots.txt response. Not applied if the URL file part (path and + query) is `/robots.txt` as this path is assumed to be always + allowed. + + fetcher.publisher false @@ -1556,6 +1657,17 @@ + + urlnormalizer.basic.host.idna2008 + false + If true, let urlnormalizer-basic + normalize Internationalized Domain Names (IDNs) using the + standard IDNA2008 (RFC 5890). If false, use IDNA2003 (RFC 3490). + Note that urlnormalizer.basic.host.idn must be set, otherwise + this property has no effect. 
+ + + urlnormalizer.basic.host.trim-trailing-dot false @@ -1606,7 +1718,10 @@ plugins Directories where Nutch plugins are located. Each element may be a relative or absolute path. If absolute, it is used - as is. If relative, it is searched for on the classpath. + as is. If relative, it is searched for on the classpath. + For secure deployments, treat these directories as trusted code: use + read-only filesystem permissions or immutable images so untrusted + parties cannot add or replace plugin JARs or plugin.xml files. @@ -2053,6 +2168,126 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this + + + scoring.adaptive.factor.fetchtime + .01 + + Generator sort value factor for pages to be (re)fetched based on the time + (in days) elapsed since the scheduled fetch time: + generator_sort_value += (factor * days_elapsed) + + + + + scoring.adaptive.factor.fetchtime.random + .0 + + Random factor added to the sort value factor for pages to be + (re)fetched based on the time, see scoring.adaptive.factor.fetchtime: + `generator_sort_value += (factor * days_elapsed * (1.0 + random_factor * Random.nextGaussian()))` + With a value of 0.0 no random factor is added. + + + + + scoring.adaptive.factor.lastseentime + .005 + + Generator sort value factor for pages to be (re)fetched based on + the time (in days) elapsed since a URL has been seen as seed or + link. 
URLs not seen since long are penalized by this factor + (opposed to scoring.adaptive.factor.fetchtime which prefers pages + not revisited for a longer period of time): + generator_sort_value -= (factor * days_since_last_seen) + + + + + scoring.adaptive.penalty.fetch_retry + .1 + + Factor penalizing pages not successfully fetched for each failed fetch trial: + generator_sort_value -= (penalty * retries_since_fetch) + + + + + scoring.adaptive.boost.injected + .2 + + Boost recently injected URLs (injected within the last 7 days): + generator_sort_value += injected_boost + + + + + scoring.adaptive.sort.by_status.file + adaptive-scoring.txt + + File containing generator sort values (penalize/boost) by CrawlDatum status. + See file / template for more information. + + + + + scoring.adaptive.penalty.non_canonical + .07 + + Penalize non-canonical pages, i.e., pages with a canonical link not equal to the URL. + The default is to delay the revisit up to 7 days (7 * scoring.adaptive.factor.fetchtime). + + + + + scoring.adaptive.mark.orphan.after + 518400 + + Time span (in minutes) after which a page not seen anymore by inlink or + seed is marked as orphaned. Default = 518400 minutes = one year. + + + + + scoring.adaptive.mark.gone.orphan.after + 172800 + + Time span (in minutes) after which a gone page not seen anymore + by inlink or seed is marked as orphaned. Also duplicates and unfetched pages + with a retry count >= 3 are considered as gone. + Default = 172800 minutes = four month. + + + + + scoring.adaptive.mark.redirect.orphan.after + 518400 + + Time span (in minutes) after which a redirect not seen anymore by inlink + or seed is marked as orphaned. Default = 518400 minutes = one year. + + + + + scoring.adaptive.mark.unfetched.orphan.after + 518400 + + Time span (in minutes) after which a still unfetched page not seen anymore + by inlink or seed is marked as orphaned. Default = 518400 minutes = one year. 
+ + + + + scoring.adaptive.mark.orphan.last.seen.default.date + 2017-07-15T00:00:00 + + Date when a page is considered to be seen last time by default (as + fall-back) if there is no last seen time tracked in the CrawlDatum. + + + + + + + + + warc.export.operator + + + Information put into the "operator" field of the WARC info record + (see https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo). + + + + + warc.export.publisher + + + Information put into the "publisher" field of the WARC info record + (see https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo). + + + + + warc.export.software + + + Information put into the "software" field of the WARC info record + (see https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo). + + + + + warc.export.description + + + Information put into the "description" field of the WARC info record + (see https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo). + + + + + warc.export.isPartOf + + + Information put into the "isPartOf" field of the WARC info record + (see https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo). + + + + + warc.deduplicate + false + + Deduplicate WARC records. + + + + + warc.export.crawldiagnostics + true + + If true, the WARC writer also writes WARC files with + "unsuccessful" fetched (HTTP 404, redirects, etc.). + + + + + warc.export.robotstxt + true + + If true, the WARC writer also writes robots.txt WARC files. + + + + + warc.export.cdx + true + + If true, the WARC writer also writes a CDX file along with every + WARC file. The CDX file holds an index of WARC response records. + + + + + warc.export.prefix + + + WARC file name prefix. The WARC file name is: + prefix-date-enddate-partition.warc.gz + + + + + warc.export.path + + + Path to filesystem directory where WARC files are placed. 
+ + + + + warc.export.cdx.path + + + Path to filesystem directory where CDX files are placed. + + + + + warc.detect.language + false + + If true, detect the content language of HTML captures using + CLD2. Note: the CLD2 library needs to be installed, see + https://github.com/commoncrawl/language-detection-cld2 + + + + + warc.skip.mimetype.pattern + + + Regular expression pattern to match content (MIME) types which are + randomly skipped to reduce the storage footprint of + "undesired" file formats. E.g., the pattern + `application/(?:x-(?:7z-compressed|bzip2?|lzip|tar)|z(?:std|ip)|gzip)` + would select compressed content and archives to be skipped randomly. + See also `warc.skip.mimetype.factor` and `warc.skip.mimetype.truncated.factor`. + + + + + warc.skip.mimetype.factor + 0.0 + + Factor to configure the likelihood that captures of matched MIME + types (see `warc.skip.mimetype.pattern`) are skipped. The factor + is multiplied by the quotient of content size and max. content + size (http.content.limit). That is, larger captures are more + likely skipped. + + + + + warc.skip.mimetype.truncated.factor + 0.0 + + Additional factor added if page content is truncated. + See `warc.skip.mimetype.factor`. 
+ + + diff --git a/default.properties b/default.properties index 4b56086474..7aab590417 100644 --- a/default.properties +++ b/default.properties @@ -43,7 +43,7 @@ test.build.javadoc = ${test.build.dir}/docs/api javadoc.proxy.host=-J-DproxyHost= javadoc.proxy.port=-J-DproxyPort= javadoc.link.java=https://docs.oracle.com/en/java/javase/11/docs/api/ -javadoc.link.hadoop=https://hadoop.apache.org/docs/r3.4.2/api/ +javadoc.link.hadoop=https://hadoop.apache.org/docs/r3.3.6/api/ javadoc.packages=org.apache.nutch.* dist.dir=./dist @@ -73,6 +73,14 @@ ivy.shared.default.root=${ivy.default.ivy.user.dir}/shared ivy.shared.default.ivy.pattern=[organisation]/[module]/[revision]/[type]s/[artifact].[ext] ivy.shared.default.artifact.pattern=[organisation]/[module]/[revision]/[type]s/[artifact].[ext] +# work-around to fix failing dependency download of +# javax.ws.rs-api.jar +# required by Tika (1.19 and higher) +# cf. (also affects ant/ivy) +# https://github.com/eclipse-ee4j/jaxrs-api/issues/572 +# https://github.com/gradle/gradle/issues/3065 +packaging.type=jar + # # Plugins API # @@ -124,6 +132,7 @@ plugins.urlnormalizer=\ # Scoring Plugins # plugins.scoring=\ + org.apache.nutch.scoring.adaptive*:\ org.apache.nutch.scoring.depth*:\ org.apache.nutch.scoring.link*:\ org.apache.nutch.scoring.opic*:\ diff --git a/docker/README.md b/docker/README.md index 80e1a1d6d9..720fdf8165 100644 --- a/docker/README.md +++ b/docker/README.md @@ -56,6 +56,12 @@ $(boot2docker shellinit | grep export) #may not be necessary docker build -t apache/nutch . --build-arg BUILD_MODE=2 --build-arg SERVER_PORT=8081 --build-arg SERVER_HOST=0.0.0.0 --build-arg WEBAPP_PORT=8080 ``` +## Security and plugin directories + +Nutch loads executable code from the directories configured as `plugin.folders` (see `nutch-default.xml`). 
For production and shared images, treat those paths as **trusted**: mount them read-only where possible, rebuild images to change plugins, and run the crawl process under a dedicated low-privilege user so the filesystem cannot be abused to drop unexpected JARs or `plugin.xml` files into that tree. + +User-defined JEXL in configuration (for example `index.jexl.filter`, generator expressions, and `hostdb.filter.expression`) is evaluated in a **sandboxed** engine by default. The property `nutch.jexl.disable.sandbox` disables that protection and must not be set in untrusted environments. + ## Usage If not already running, start docker diff --git a/ivy/ivy.xml b/ivy/ivy.xml index f149ce13d5..5ed19206ca 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -43,38 +43,52 @@ - + - - + + - + - + - + - + - - + + + + + + + + + + + @@ -83,7 +97,11 @@ - + + @@ -101,27 +119,45 @@ - - + + + - + + - - + - - + + + + + + + + + + + + + + diff --git a/ivy/ivysettings.xml b/ivy/ivysettings.xml index 91de33c457..3a794df37f 100644 --- a/ivy/ivysettings.xml +++ b/ivy/ivysettings.xml @@ -29,6 +29,9 @@ value="[organisation]/[module]/[revision]/[module]-[revision](-[classifier])"/> + @@ -57,15 +60,34 @@ pattern="${maven2.pattern.ext}" m2compatible="true" /> + + + + + + + + + + + + + + + @@ -81,5 +103,7 @@ rather than look for them online. --> + + diff --git a/licenses-binary/LICENSE-bsd-licence.txt b/licenses-binary/LICENSE-bsd-licence.txt new file mode 100644 index 0000000000..ce7787d52f --- /dev/null +++ b/licenses-binary/LICENSE-bsd-licence.txt @@ -0,0 +1,39 @@ +(source: http://antlr.org/license.html) + +ANTLR v4 License + +ANTLR + +ANTLR 4 License +[The BSD License] +Copyright (c) 2012 Terence Parr and Sam Harwell +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +Neither the name of the author nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Developer's Certificate of Origin +As of 4.10, ANTLR uses the Linux Foundation's Developer Certificate of Origin, DCO, version 1.1. See certificate +of origin. To contribute: + +- fork the dev branch of the ANTLR v4 github repository +- make your changes +- commit your changes, signing your commits with git commit -s .... 
+- send a pull request diff --git a/licenses-binary/LICENSE-gnu-general-public-license-version-2-gpl2-with-the-classpath-exception.txt b/licenses-binary/LICENSE-gnu-general-public-license-version-2-gpl2-with-the-classpath-exception.txt deleted file mode 100644 index a25e8c704e..0000000000 --- a/licenses-binary/LICENSE-gnu-general-public-license-version-2-gpl2-with-the-classpath-exception.txt +++ /dev/null @@ -1,15 +0,0 @@ -(source: http://www.gnu.org/software/classpath/license.html) - - -GNU Classpath License - GNU Project - Free Software Foundation (FSF) - - - - -Classpath is distributed under the terms of the GNU General Public License with the following clarification and special exception. - - Linking this library statically or dynamically with other modules is making a combined work based on this library. Thus, the terms and conditions of the GNU General Public License cover the whole combination. - - As a special exception, the copyright holders of this library give you permission to link this library with independent modules to produce an executable, regardless of the license terms of these independent modules, and to copy and distribute the resulting executable under terms of your choice, provided that you also meet, for each linked independent module, the terms and conditions of the license of that module. An independent module is a module which is not derived from or based on this library. If you modify this library, you may extend this exception to your version of the library, but you are not obligated to do so. If you do not wish to do so, delete this exception statement from your version. - -As such, it can be used to run, create and distribute a large class of applications and applets. When GNU Classpath is used unmodified as the core class library for a virtual machine, compiler for the java languge, or for a program written in the java programming language it does not affect the licensing for distributing those programs directly. 
diff --git a/licenses-binary/LICENSE-indiana-university-extreme-lab-software-license-vesion-1.1.1.txt b/licenses-binary/LICENSE-indiana-university-extreme-lab-software-license-vesion-1.1.1.txt deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/bin/nutch b/src/bin/nutch index 0f08a35b0c..07b2defd45 100755 --- a/src/bin/nutch +++ b/src/bin/nutch @@ -75,6 +75,7 @@ if [ $# = 0 ]; then echo " readdb read / dump crawl db" echo " mergedb merge crawldb-s, with optional filtering" echo " dedup deduplicate entries in the crawldb and assign them a special status" + echo " dedupredirects deduplicate redirects in the crawldb (keep one of multiple redirects pointing to same target)" echo " domainstats calculate domain statistics from crawldb" echo " protocolstats calculate protocol status code stats from crawldb" echo " crawlcomplete calculate crawl completion stats from crawldb" @@ -280,6 +281,8 @@ elif [ "$COMMAND" = "index" ] ; then CLASS=org.apache.nutch.indexer.IndexingJob elif [ "$COMMAND" = "dedup" ] ; then CLASS=org.apache.nutch.crawl.DeduplicationJob +elif [ "$COMMAND" = "dedupredirects" ] ; then + CLASS=org.apache.nutch.crawl.DedupRedirectsJob elif [ "$COMMAND" = "clean" ] ; then CLASS=org.apache.nutch.indexer.CleaningJob elif [ "$COMMAND" = "parsechecker" ] ; then diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index 6575ccb886..68d65ba1ad 100644 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -35,6 +35,7 @@ import java.lang.invoke.MethodHandles; import java.net.URI; import java.net.URISyntaxException; +import java.time.Duration; /** * This class implements an adaptive re-fetch algorithm. This works as follows: @@ -219,15 +220,15 @@ private void setHostSpecificIntervals(String fileName, // The custom intervals should respect the boundaries of the default values. 
if (m < defaultMin) { LOG.error( - "Min. interval out of bounds on line {} in the config. file: `{}`", - lineNo, line); + "Min. interval out of bounds ({}) on line {} in the config. file: `{}`", + defaultMin, lineNo, line); continue; } if (M > defaultMax) { LOG.error( - "Max. interval out of bounds on line {} in the config. file: `{}`", - lineNo, line); + "Max. interval out of bounds ({}) on line {} in the config. file: `{}`", + defaultMax, lineNo, line); continue; } @@ -332,17 +333,30 @@ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, case FetchSchedule.STATUS_UNKNOWN: break; } - if (SYNC_DELTA) { - // try to synchronize with the time of change - long delta = (fetchTime - modifiedTime) / 1000L; - if (delta > interval) - interval = delta; - refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000); - } // Ensure the interval does not fall outside of bounds float minInterval = (getCustomMinInterval(url) != null) ? getCustomMinInterval(url) : MIN_INTERVAL; float maxInterval = (getCustomMaxInterval(url) != null) ? getCustomMaxInterval(url) : MAX_INTERVAL; + + if (SYNC_DELTA) { + // try to synchronize with the time of change + long delta = (fetchTime - modifiedTime); + if (delta > (interval * 1000)) + interval = delta / 1000L; + // offset: a fraction (sync_delta_rate) of the difference between the last modification time, and the last fetch time. 
+ long offset = Math.round(delta * SYNC_DELTA_RATE); + long maxIntervalMillis = (long) maxInterval * 1000L; + if (LOG.isTraceEnabled()) { + LOG.trace("delta (days): {}; offset (days): {}; maxInterval (days): {}", + Duration.ofMillis(delta).toDays(), Duration.ofMillis(offset).toDays(), Duration.ofMillis(maxIntervalMillis).toDays()); + } + // convert the offset to a ratio of max interval: avoid next fetchTime in the past, and mimic fetches within max interval + if (delta > 0 && offset > maxIntervalMillis) { + offset = offset / delta * maxIntervalMillis; // ex: 9/30*7 = 2.1 + } + refTime = fetchTime - offset; + } + if (interval < minInterval) { interval = minInterval; } else if (interval > maxInterval) { @@ -389,7 +403,8 @@ public static void main(String[] args) throws Exception { (p.getFetchInterval() / SECONDS_PER_DAY), miss); if (p.getFetchTime() <= curTime) { fetchCnt++; - fs.setFetchSchedule(new Text("http://www.example.com"), p, p + // Text (url) required by the API, but not relevant here. + fs.setFetchSchedule(new Text(), p, p .getFetchTime(), p.getModifiedTime(), curTime, lastModified, changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED); diff --git a/src/java/org/apache/nutch/crawl/CCFetchSchedule.java b/src/java/org/apache/nutch/crawl/CCFetchSchedule.java new file mode 100644 index 0000000000..55b6238ce4 --- /dev/null +++ b/src/java/org/apache/nutch/crawl/CCFetchSchedule.java @@ -0,0 +1,114 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.crawl; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * FetchSchedule which allows to reset the fetch interval to the default value + */ +public class CCFetchSchedule extends DefaultFetchSchedule { + + private final static Logger LOG = LoggerFactory + .getLogger(CCFetchSchedule.class); + + private static final String RESET_FETCH_INTERVAL = "db.fetch.interval.reset"; + private static final String FETCH_TIME_MAX_DAYS_AHEAD = "db.fetch.time.max.days.ahead"; + private static final String RESET_NOT_MODIFIED = "db.fetch.reset.notmodified.after"; + + private boolean resetFetchInterval = false; + private boolean resetFetchTime = false; + private long latestFetchTime; + private long minModifiedTime = 0; + + public void setConf(Configuration conf) { + super.setConf(conf); + if (conf == null) + return; + resetFetchInterval = conf.getBoolean(RESET_FETCH_INTERVAL, false); + if (resetFetchInterval) { + resetFetchInterval = true; + LOG.info("Resetting fetch interval if exceeding {} = {}", + "db.fetch.interval.max", maxInterval); + } + latestFetchTime = System.currentTimeMillis(); + int fetchTimeMaxDaysAhead = conf.getInt(FETCH_TIME_MAX_DAYS_AHEAD, 0); + if (fetchTimeMaxDaysAhead > 0) { + resetFetchTime = true; + latestFetchTime += fetchTimeMaxDaysAhead * 24 * 60 * 60 * 1000L; + LOG.info("Resetting fetch time if more than {} = {} days ahead", + FETCH_TIME_MAX_DAYS_AHEAD, fetchTimeMaxDaysAhead); + } + int secondsResetNotModifiedTime = 
conf.getInt(RESET_NOT_MODIFIED, 0); + if (secondsResetNotModifiedTime > 0) { + minModifiedTime = System.currentTimeMillis() - secondsResetNotModifiedTime * 1000; + } + } + + @Override + public boolean shouldFetch(Text url, CrawlDatum datum, long curTime) { + if (datum.getFetchTime() > curTime) { + return false; // not time yet + } + if (datum.getModifiedTime() > 0 + && datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED + && datum.getModifiedTime() < minModifiedTime) { + // trigger a full re-fetch of not-modified pages + // (do not send if-not-modified-since requests) + datum.setModifiedTime(0); + } + return true; + } + + @Override + public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum, + long prevFetchTime, long prevModifiedTime, long fetchTime) { + if (resetFetchInterval && datum.getFetchInterval() > maxInterval) { + datum.setFetchInterval(maxInterval); + } + return super.setPageGoneSchedule(url, datum, prevFetchTime, + prevModifiedTime, fetchTime); + } + + @Override + public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum, + long prevFetchTime, long prevModifiedTime, long fetchTime) { + if (resetFetchInterval && datum.getFetchInterval() > maxInterval) { + datum.setFetchInterval(maxInterval); + } + return super.setPageRetrySchedule(url, datum, prevFetchTime, + prevModifiedTime, fetchTime); + } + + @Override + public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, + long prevFetchTime, long prevModifiedTime, + long fetchTime, long modifiedTime, int state) { + if (resetFetchInterval && datum.getFetchInterval() > maxInterval) { + datum.setFetchInterval(maxInterval); + } + if (resetFetchTime && datum.getFetchTime() > latestFetchTime) { + datum.setFetchTime(latestFetchTime); + } + return super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime, + fetchTime, modifiedTime, state); + } +} diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java index 01598a5f18..32081e1d61 100644 
--- a/src/java/org/apache/nutch/crawl/CrawlDb.java +++ b/src/java/org/apache/nutch/crawl/CrawlDb.java @@ -43,6 +43,7 @@ import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.util.FSUtils; import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.LockUtil; @@ -145,7 +146,7 @@ public void update(Path crawlDb, Path[] segments, boolean normalize, if (filter) { long urlsFiltered = job.getCounters() - .findCounter("CrawlDB filter", "URLs filtered").getValue(); + .findCounter(NutchMetrics.GROUP_CRAWLDB_FILTER, NutchMetrics.CRAWLDB_URLS_FILTERED_TOTAL).getValue(); LOG.info( "CrawlDb update: Total number of existing URLs in CrawlDb rejected by URL filters: {}", urlsFiltered); diff --git a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java index d9ab0d3cc0..912c6e4abf 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java @@ -22,8 +22,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; @@ -49,6 +51,11 @@ public class CrawlDbFilter extends private String scope; + // Cached counter references for performance + private Counter goneRecordsRemovedCounter; + private Counter orphanRecordsRemovedCounter; + private Counter urlsFilteredCounter; + private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); @@ -67,6 +74,21 @@ public void setup(Mapper.Context context) { scope = conf.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_CRAWLDB); normalizers = new URLNormalizers(conf, 
scope); } + + // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initCounters(Context context) { + goneRecordsRemovedCounter = context.getCounter( + NutchMetrics.GROUP_CRAWLDB_FILTER, NutchMetrics.CRAWLDB_GONE_RECORDS_REMOVED_TOTAL); + orphanRecordsRemovedCounter = context.getCounter( + NutchMetrics.GROUP_CRAWLDB_FILTER, NutchMetrics.CRAWLDB_ORPHAN_RECORDS_REMOVED_TOTAL); + urlsFilteredCounter = context.getCounter( + NutchMetrics.GROUP_CRAWLDB_FILTER, NutchMetrics.CRAWLDB_URLS_FILTERED_TOTAL); } private Text newKey = new Text(); @@ -80,15 +102,13 @@ public void map(Text key, CrawlDatum value, // https://issues.apache.org/jira/browse/NUTCH-1101 check status first, // cheaper than normalizing or filtering if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) { - context.getCounter("CrawlDB filter", - "Gone records removed").increment(1); + goneRecordsRemovedCounter.increment(1); return; } // Whether to remove orphaned pages // https://issues.apache.org/jira/browse/NUTCH-1932 if (purgeOrphans && CrawlDatum.STATUS_DB_ORPHAN == value.getStatus()) { - context.getCounter("CrawlDB filter", - "Orphan records removed").increment(1); + orphanRecordsRemovedCounter.increment(1); return; } if (url != null && urlNormalizers) { @@ -108,7 +128,7 @@ public void map(Text key, CrawlDatum value, } } if (url == null) { - context.getCounter("CrawlDB filter", "URLs filtered").increment(1); + urlsFilteredCounter.increment(1); } else { // URL has passed filters newKey.set(url); // collect it diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java index 03cf0fbd39..57e684374c 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java @@ -906,7 +906,7 @@ public void setup( retry = config.getInt("retry", -1); if (config.get("expr", 
null) != null) { - expr = JexlUtil.parseExpression(config.get("expr", null)); + expr = JexlUtil.parseExpression(config, config.get("expr", null)); } sample = config.getFloat("sample", 1); } diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java index deb266af61..3454116575 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java @@ -18,19 +18,24 @@ import java.lang.invoke.MethodHandles; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Map.Entry; import java.io.IOException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.util.PriorityQueue; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.ErrorTracker; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.StringUtil; @@ -48,6 +53,10 @@ public class CrawlDbReducer extends private boolean additionsAllowed; private int maxInterval; private FetchSchedule schedule; + private ErrorTracker errorTracker; + + // Cached counter references for status-based metrics + private Map statusCounters = new HashMap<>(); @Override public void setup(Reducer.Context context) { @@ -59,6 +68,17 @@ public void setup(Reducer.Context context) { schedule = FetchScheduleFactory.getFetchSchedule(conf); int maxLinks = conf.getInt("db.update.max.inlinks", 10000); linked = new InlinkPriorityQueue(maxLinks); + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_CRAWLDB, context); + } + + /** + * Get counter for status, 
caching for subsequent lookups. + */ + private Counter getStatusCounter(byte status, Context context) { + return statusCounters.computeIfAbsent(status, + s -> context.getCounter(NutchMetrics.GROUP_CRAWLDB, + CrawlDatum.getStatusName(s))); } @Override @@ -161,10 +181,11 @@ public void reduce(Text key, Iterable values, scfilters.orphanedScore(key, old); } catch (ScoringFilterException e) { LOG.warn("Couldn't update orphaned score, key={}: {}", key, e); + errorTracker.incrementCounters(e); } context.write(key, old); - context.getCounter("CrawlDB status", - CrawlDatum.getStatusName(old.getStatus())).increment(1); + // Dynamic counter based on status name + getStatusCounter(old.getStatus(), context).increment(1); } else { LOG.warn("Missing fetch and old value, signature={}", StringUtil.toHexString(signature)); @@ -206,6 +227,7 @@ public void reduce(Text key, Iterable values, } catch (ScoringFilterException e) { LOG.warn("Cannot filter init score for url {}, using default: {}", key, e.getMessage()); + errorTracker.incrementCounters(e); result.setScore(0.0f); } } @@ -315,12 +337,13 @@ public void reduce(Text key, Iterable values, scfilters.updateDbScore(key, oldSet ? 
old : null, result, linkList); } catch (Exception e) { LOG.warn("Couldn't update score, key={}: {}", key, e); + errorTracker.incrementCounters(e); } // remove generation time, if any result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY); context.write(key, result); - context.getCounter("CrawlDB status", - CrawlDatum.getStatusName(result.getStatus())).increment(1); + // Dynamic counter based on status name + getStatusCounter(result.getStatus(), context).increment(1); } } diff --git a/src/java/org/apache/nutch/crawl/CrawlDbToSeeds.java b/src/java/org/apache/nutch/crawl/CrawlDbToSeeds.java new file mode 100644 index 0000000000..8797040314 --- /dev/null +++ b/src/java/org/apache/nutch/crawl/CrawlDbToSeeds.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.crawl; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.lang.invoke.MethodHandles; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.RecordWriter; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.util.ReflectionUtils; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; + +/** + * Export tuples ⟨url, score⟩ from CrawlDb as seeds to be consumed by + * {@link Injector}. Output format is: + * + *

+ * https://example.com/ \t nutch.score=1.0
+ *

+ * + * Exported items from CrawlDb can be selected/filtered using the options + * available in "CrawlDbReader -dump". + */ +public class CrawlDbToSeeds extends CrawlDbReader { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + public static class CrawlDbToSeedsOutputFormat + extends TextOutputFormat { + + protected static class LineRecordWriter + extends TextOutputFormat.LineRecordWriter { + + public LineRecordWriter(DataOutputStream out) { + super(out, "\t"); + } + + protected float normalizeScore(float score) { + if (score > 10.0) { + return 10.0f; + } + return score; + } + + @Override + public synchronized void write(Text key, CrawlDatum value) + throws IOException { + out.write(key.getBytes(), 0, key.getLength()); // UTF-8 bytes of the URL; writeBytes(String) would drop the high byte of every non-ASCII char + out.writeBytes("\tnutch.score="); + out.writeBytes(Float.toString(normalizeScore(value.getScore()))); + out.writeByte('\n'); + } + + } + + public RecordWriter getRecordWriter( + TaskAttemptContext job) throws IOException, InterruptedException { + Configuration conf = job.getConfiguration(); + boolean isCompressed = getCompressOutput(job); + CompressionCodec codec = null; + String extension = ""; + if (isCompressed) { + Class codecClass = getOutputCompressorClass( + job, GzipCodec.class); + codec = ReflectionUtils.newInstance(codecClass, conf); + extension = codec.getDefaultExtension(); + } + Path file = getDefaultWorkFile(job, extension); + FileSystem fs = file.getFileSystem(conf); + FSDataOutputStream fileOut = fs.create(file, false); + if (isCompressed) { + return new LineRecordWriter( + new DataOutputStream(codec.createOutputStream(fileOut))); + } else { + return new LineRecordWriter(fileOut); + } + } + } + + public void crawlDbToSeeds(String crawlDb, String output, String regex, + String status, Integer retry, String expr, Float sample) + throws IOException, ClassNotFoundException, InterruptedException { + + LOG.info("CrawlDbToSeeds: starting"); + LOG.info("CrawlDb: {}", crawlDb); + + Path outFolder = 
new Path(output); + + Job job = NutchJob.getInstance(getConf()); + job.setJobName("dump " + crawlDb); + Configuration jobConf = job.getConfiguration(); + + FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME)); + job.setInputFormatClass(SequenceFileInputFormat.class); + FileOutputFormat.setOutputPath(job, outFolder); + job.setOutputFormatClass(CrawlDbToSeedsOutputFormat.class); + + if (status != null) + jobConf.set("status", status); + if (regex != null) + jobConf.set("regex", regex); + if (retry != null) + jobConf.setInt("retry", retry); + if (expr != null) { + jobConf.set("expr", expr); + LOG.info("CrawlDb: expr: {}", expr); + } + if (sample != null) { + jobConf.setFloat("sample", sample); + } + job.setMapperClass(CrawlDbDumpMapper.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(CrawlDatum.class); + job.setJarByClass(CrawlDbToSeeds.class); + + try { + boolean success = job.waitForCompletion(true); + if (!success) { + String message = "CrawlDbToSeeds job did not succeed, job status:" + + job.getStatus().getState() + ", reason: " + + job.getStatus().getFailureInfo(); + LOG.error(message); + throw new RuntimeException(message); + } + } catch (IOException | InterruptedException | ClassNotFoundException e) { + LOG.error(StringUtils.stringifyException(e)); + throw e; + } + + LOG.info("CrawlDbToSeeds: done"); + } + + public static int help() { + System.err.println("Usage: CrawlDbToSeeds [options] \n"); + System.err.println( + "Select items from CrawlDb and export URLs and score as Nutch seed file\n"); + System.err + .println("\nOptions to filter records (cf. 
CrawlDbReader -dump):"); + System.err.println("\t\t[-regex ]\tfilter records with expression"); + System.err.println("\t\t[-retry ]\tminimum retry count"); + System.err + .println("\t\t[-status ]\tfilter records by CrawlDatum status"); + System.err.println( + "\t\t[-expr ]\tJexl expression to evaluate for this record"); + System.err.println( + "\t\t[-sample ]\tOnly process a random sample with this ratio"); + return -1; + } + + public int run(String[] args) throws IOException, InterruptedException, + ClassNotFoundException, Exception { + + if (args.length < 2) { + return help(); + } + + String regex = null; + Integer retry = null; + String status = null; + String expr = null; + Float sample = null; + String outputDir = null; + for (int i = 0; i < args.length; i++) { + if (args[i].equals("-regex")) { + regex = args[++i]; + } else if (args[i].equals("-retry")) { + retry = Integer.parseInt(args[++i]); + } else if (args[i].equals("-status")) { + status = args[++i]; + } else if (args[i].equals("-expr")) { + expr = args[++i]; + } else if (args[i].equals("-sample")) { + sample = Float.parseFloat(args[++i]); + } else if (crawlDb == null) { + crawlDb = args[i]; + } else if (outputDir == null) { + outputDir = args[i]; + } else { + System.err.println("Unknown argument: " + args[i] + "\n\n"); + return help(); + } + } + crawlDbToSeeds(crawlDb, outputDir, regex, status, retry, expr, sample); + return 0; + } + + public static void main(String[] args) throws Exception { + int result = ToolRunner.run(NutchConfiguration.create(), + new CrawlDbToSeeds(), args); + System.exit(result); + } + +} diff --git a/src/java/org/apache/nutch/crawl/DedupRedirectsJob.java b/src/java/org/apache/nutch/crawl/DedupRedirectsJob.java new file mode 100644 index 0000000000..3b77878211 --- /dev/null +++ b/src/java/org/apache/nutch/crawl/DedupRedirectsJob.java @@ -0,0 +1,384 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.crawl; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.Random; +import java.util.concurrent.TimeUnit; + +import org.apache.commons.lang3.time.StopWatch; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Counter; +import org.apache.hadoop.mapreduce.CounterGroup; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; +import org.apache.nutch.protocol.ProtocolStatus; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Simple deduplication of redirects pointing to the same target. If two + * redirects point to the same target and + *

the target is contained in the CrawlDb: both redirects are marked as + * duplicates
the target is not found in the CrawlDb: one redirect is marked as + * duplicate. Which one is chosen depends on the criteria defined by the + * -compareOrder argument.

+ * + * Unlike {@link DeduplicationJob} which deduplicates based on content + * signatures, deduplication of redirects is not done to clean up the index + * – redirects are not indexed resp. are removed from the index when the + * indexer is called with -deleteGone. Instead, the aim is to mark + * URLs in the CrawlDb which would cause unnecessary re-fetches when the fetcher + * is following redirects (http.redirects.max > 0). Duplicates can + * be removed from the CrawlDb by setting db.update.purge.404 to true. + */ +public class DedupRedirectsJob extends DeduplicationJob { + + public static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + /** + * Test whether CrawlDatum is a redirect. + * + * @param datum + * @return true if datum is a redirect, false otherwise + */ + public static boolean isRedirect(CrawlDatum datum) { + byte status = datum.getStatus(); + if (status == CrawlDatum.STATUS_DB_REDIR_PERM + || status == CrawlDatum.STATUS_DB_REDIR_TEMP) { + return true; + } + if (status == CrawlDatum.STATUS_DB_DUPLICATE) { + // check for redirects already marked as duplicate + ProtocolStatus pStatus = getProtocolStatus(datum); + if (pStatus != null && (pStatus.getCode() == ProtocolStatus.MOVED + || pStatus.getCode() == ProtocolStatus.TEMP_MOVED)) { + return true; + } + } + return false; + } + + /** + * Return protocol status of CrawlDatum if present. + * + * @param datum + * @return protocol status or null if not present + */ + public static ProtocolStatus getProtocolStatus(CrawlDatum datum) { + if (datum.getMetaData().containsKey(Nutch.WRITABLE_PROTO_STATUS_KEY)) + return (ProtocolStatus) datum.getMetaData() + .get(Nutch.WRITABLE_PROTO_STATUS_KEY); + return null; + } + + /** + * Get target URL of a redirect. Note: CrawlDatum is assumed to be a redirect. 
+ * + * @param datum + * @return redirect target URL or null if not available or datum is not a + * redirect + */ + public static String getTargetURL(CrawlDatum datum) { + ProtocolStatus pStatus = getProtocolStatus(datum); + if (pStatus != null) { + return pStatus.getMessage(); + } + return null; + } + + /** + * Reset duplicate status of a redirect marked as duplicate and restore old + * status (permanent or temporary redirect). + * + * @param datum + */ + private static void unsetDuplicateStatus(CrawlDatum datum) { + byte status = datum.getStatus(); + if (status == CrawlDatum.STATUS_DB_DUPLICATE) { + ProtocolStatus pStatus = getProtocolStatus(datum); + if (pStatus != null) { + int code = pStatus.getCode(); + if (code == ProtocolStatus.MOVED) + datum.setStatus(CrawlDatum.STATUS_DB_REDIR_PERM); + else if (code == ProtocolStatus.TEMP_MOVED) + datum.setStatus(CrawlDatum.STATUS_DB_REDIR_TEMP); + } + } + } + + public static class RedirTargetMapper + extends Mapper { + + @Override + public void map(Text key, CrawlDatum value, Context context) + throws IOException, InterruptedException { + + // output independent of status + context.write(key, value); + + if (isRedirect(value)) { + String redirTarget = getTargetURL(value); + if (redirTarget != null) { + // keep original URL in CrawlDatum's meta data and emit + // + value.getMetaData().put(urlKey, key); + Text redirKey = new Text(redirTarget); + context.getCounter(NutchMetrics.GROUP_DEDUP, + NutchMetrics.DEDUP_REDIRECTS_IN_CRAWLDB_TOTAL).increment(1); + if (redirKey.equals(key)) { + // exclude self-referential redirects + context + .getCounter(NutchMetrics.GROUP_DEDUP, + NutchMetrics.DEDUP_REDIRECTS_SELF_REFERENTIAL_TOTAL) + .increment(1); + } else { + context.write(redirKey, value); + } + } + } + } + + } + + public static class DedupRedirectReducer + extends DeduplicationJob.DedupReducer { + + @Override + public void reduce(Text key, Iterable values, Context context) + throws IOException, InterruptedException { + 
CrawlDatum existingDoc = null; + for (CrawlDatum newDoc : values) { + if (existingDoc == null) { + existingDoc = new CrawlDatum(); + existingDoc.set(newDoc); + continue; + } + CrawlDatum duplicate = null; + if (isRedirect(existingDoc) + && !newDoc.getMetaData().containsKey(urlKey)) { + // newDoc is known as redirect target + writeOutAsDuplicate(existingDoc, context); + existingDoc.set(newDoc); + } else if (isRedirect(newDoc) + && !existingDoc.getMetaData().containsKey(urlKey)) { + // existingDoc is known as redirect target + writeOutAsDuplicate(newDoc, context); + } else { + // existingDoc and newDoc are redirects and point to the same target + duplicate = getDuplicate(existingDoc, newDoc); + if (duplicate == null) { + // no decision possible in getDuplicate() + // and both are redirects: dedup newDoc + duplicate = newDoc; + } + writeOutAsDuplicate(duplicate, context); + if (duplicate == existingDoc) { + existingDoc.set(newDoc); + } + } + } + // finally output existingDoc as non-duplicate if + if (!isRedirect(existingDoc)) { + // (a) it is not a redirect + // Text url = (Text) existingDoc.getMetaData().remove(urlKey); + context.write(key, existingDoc); + } else { + Text origURL = (Text) existingDoc.getMetaData().remove(urlKey); + if (origURL != null) { + // (b) it is the value passed to the reducer under the target URL and + // not under the original URL key. It's a redirect and not a + // duplicate! 
+ unsetDuplicateStatus(existingDoc); + context.write(origURL, existingDoc); + context.getCounter(NutchMetrics.GROUP_DEDUP, + NutchMetrics.DEDUP_REDIRECTS_NOT_DUPLICATES_TOTAL).increment(1); + } else { + // (c) it is a self-referential redirect + String targetURL = getTargetURL(existingDoc); + if (key.toString().equals(targetURL)) { + context.write(key, existingDoc); + context.getCounter(NutchMetrics.GROUP_DEDUP, + NutchMetrics.DEDUP_REDIRECTS_SELF_REFERENTIAL_NOT_DUPLICATES_TOTAL) + .increment(1); + } + // else: ignore redirects emitted under original URL because they are + // collected under the target URL + } + } + } + } + + public int run(String[] args) throws IOException { + + if (args.length < 1) { + System.err.println( + "Usage: DedupRedirectsJob [-compareOrder ,,,] [-noSort]"); + return 1; + } + + Path crawlDb = new Path(args[0]); + String compareOrder = "score,fetchTime,urlLength"; + + boolean noSortingAfterDedup = false; + for (int i = 1; i < args.length; i++) { + if (args[i].equals("-noSort")) + noSortingAfterDedup = true; + if (args[i].equals("-compareOrder")) { + compareOrder = args[++i]; + + if (compareOrder.indexOf("score") == -1 + || compareOrder.indexOf("fetchTime") == -1 + || compareOrder.indexOf("urlLength") == -1) { + System.err.println( + "DedupRedirectsJob: compareOrder must contain score, fetchTime and urlLength."); + return 1; + } + } + } + + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("DedupRedirectsJob: starting"); + + Path tempDir = new Path(crawlDb, "dedup-temp-" + + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); + + Job job = NutchJob.getInstance(getConf()); + Configuration conf = job.getConfiguration(); + job.setJobName("Redirect deduplication on " + crawlDb); + conf.set(DEDUPLICATION_COMPARE_ORDER, compareOrder); + job.setJarByClass(DedupRedirectsJob.class); + + FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME)); + job.setInputFormatClass(SequenceFileInputFormat.class); + 
+ FileOutputFormat.setOutputPath(job, tempDir); + job.setOutputFormatClass(SequenceFileOutputFormat.class); + + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(CrawlDatum.class); + + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(CrawlDatum.class); + + job.setMapperClass(RedirTargetMapper.class); + job.setReducerClass(DedupRedirectReducer.class); + + FileSystem fs = tempDir.getFileSystem(getConf()); + long numDuplicates = 0; + try { + boolean success = job.waitForCompletion(true); + if (!success) { + String message = "Crawl job did not succeed, job status:" + + job.getStatus().getState() + ", reason: " + + job.getStatus().getFailureInfo(); + LOG.error(message); + fs.delete(tempDir, true); + throw new RuntimeException(message); + } + CounterGroup g = job.getCounters().getGroup(NutchMetrics.GROUP_DEDUP); + if (g != null) { + Counter counter = g + .findCounter(NutchMetrics.DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL); + numDuplicates = counter.getValue(); + LOG.info("Deduplication: {} documents marked as duplicates", + numDuplicates); + } + } catch (IOException | InterruptedException | ClassNotFoundException e) { + LOG.error("DeduplicationJob: ", e); + fs.delete(tempDir, true); + return -1; + } + + if (numDuplicates == 0) { + LOG.info("No duplicates found, skip writing CrawlDb"); + + } else if (noSortingAfterDedup) { + LOG.info("Skipping step to sort CrawlDb"); + LOG.warn("Make sure that the CrawlDb is sorted again by a following job"); + CrawlDb.install(job, crawlDb); + + } else { + // temporary output is the deduped crawldb but not in proper sorting + // (sorted by redirect target), use "merge" job to achieve proper sorting + LOG.info("Redirect deduplication: writing CrawlDb."); + + Job mergeJob = CrawlDb.createJob(getConf(), crawlDb); + FileInputFormat.addInputPath(mergeJob, tempDir); + mergeJob.setReducerClass(StatusUpdateReducer.class); + mergeJob.setJarByClass(DedupRedirectsJob.class); + 
mergeJob.setReducerClass(StatusUpdateReducer.class); + + fs = crawlDb.getFileSystem(getConf()); + Path outPath = FileOutputFormat.getOutputPath(job); + Path lock = CrawlDb.lock(getConf(), crawlDb, false); + try { + boolean success = mergeJob.waitForCompletion(true); + if (!success) { + String message = "Crawl job did not succeed, job status:" + + mergeJob.getStatus().getState() + ", reason: " + + mergeJob.getStatus().getFailureInfo(); + LOG.error(message); + fs.delete(tempDir, true); + NutchJob.cleanupAfterFailure(outPath, lock, fs); + throw new RuntimeException(message); + } + } catch (IOException | InterruptedException | ClassNotFoundException e) { + LOG.error("DedupRedirectsJob: ", e); + fs.delete(tempDir, true); + NutchJob.cleanupAfterFailure(outPath, lock, fs); + return -1; + } + + CrawlDb.install(mergeJob, crawlDb); + } + + // clean up + fs.delete(tempDir, true); + + stopWatch.stop(); + LOG.info("DedupRedirectsJob finished, elapsed: {} ms", + stopWatch.getTime(TimeUnit.MILLISECONDS)); + + return 0; + } + + public static void main(String[] args) throws Exception { + int result = ToolRunner.run(NutchConfiguration.create(), + new DedupRedirectsJob(), args); + System.exit(result); + } + +} diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java index 3e12d4598c..52bf422308 100644 --- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java +++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java @@ -17,9 +17,9 @@ package org.apache.nutch.crawl; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.lang.invoke.MethodHandles; import java.net.URLDecoder; +import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; @@ -34,7 +34,6 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.Counter; -import org.apache.hadoop.mapreduce.CounterGroup; import 
org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; @@ -45,6 +44,7 @@ import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; @@ -69,7 +69,7 @@ public class DeduplicationJob extends NutchTool implements Tool { protected final static Text urlKey = new Text("_URLTEMPKEY_"); protected final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode"; protected final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order"; - protected final static String UTF_8 = StandardCharsets.UTF_8.toString(); + protected final static Charset UTF_8 = StandardCharsets.UTF_8; public static class DBFilter extends Mapper { @@ -127,11 +127,25 @@ public static class DedupReducer protected String[] compareOrder; + // Cached counter reference for performance + private Counter documentsMarkedDuplicateCounter; + @Override public void setup( Reducer.Context context) { Configuration conf = context.getConfiguration(); compareOrder = conf.get(DEDUPLICATION_COMPARE_ORDER).split(","); + + // Initialize cached counter reference + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. 
+ */ + private void initCounters(Context context) { + documentsMarkedDuplicateCounter = context.getCounter( + NutchMetrics.GROUP_DEDUP, NutchMetrics.DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL); } protected void writeOutAsDuplicate(CrawlDatum datum, @@ -139,8 +153,7 @@ protected void writeOutAsDuplicate(CrawlDatum datum, throws IOException, InterruptedException { datum.setStatus(CrawlDatum.STATUS_DB_DUPLICATE); Text key = (Text) datum.getMetaData().remove(urlKey); - context.getCounter("DeduplicationJobStatus", - "Documents marked as duplicate").increment(1); + documentsMarkedDuplicateCounter.increment(1); context.write(key, datum); } @@ -210,13 +223,13 @@ protected CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc) { String urlnewDoc = newDoc.getMetaData().get(urlKey).toString(); try { urlExisting = URLDecoder.decode(urlExisting, UTF_8); - } catch (UnsupportedEncodingException | IllegalArgumentException e) { + } catch (IllegalArgumentException e) { LOG.error("Error decoding: {}", urlExisting, e); // use the encoded URL } try { urlnewDoc = URLDecoder.decode(urlnewDoc, UTF_8); - } catch (UnsupportedEncodingException | IllegalArgumentException e) { + } catch (IllegalArgumentException e) { LOG.error("Error decoding: {}", urlnewDoc, e); // use the encoded URL } @@ -334,12 +347,10 @@ public int run(String[] args) throws IOException { fs.delete(tempDir, true); throw new RuntimeException(message); } - CounterGroup g = job.getCounters().getGroup("DeduplicationJobStatus"); - if (g != null) { - Counter counter = g.findCounter("Documents marked as duplicate"); - long dups = counter.getValue(); - LOG.info("Deduplication: {} documents marked as duplicates", dups); - } + long dups = job.getCounters() + .findCounter(NutchMetrics.GROUP_DEDUP, NutchMetrics.DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL) + .getValue(); + LOG.info("Deduplication: {} documents marked as duplicates", dups); } catch (IOException | InterruptedException | ClassNotFoundException e) { 
LOG.error("DeduplicationJob:", e); fs.delete(tempDir, true); diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index 82475af5b8..aa8cfcbbfa 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -67,6 +67,8 @@ import org.apache.hadoop.io.WritableComparator; import org.apache.nutch.hostdb.HostDatum; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.ErrorTracker; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilterException; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; @@ -190,6 +192,18 @@ public static class SelectorMapper private int intervalThreshold = -1; private byte restrictStatus = -1; private JexlScript expr = null; + private ErrorTracker errorTracker; + + // Cached counter references for performance + private Counter urlFiltersRejectedCounter; + private Counter scheduleRejectedCounter; + private Counter waitForUpdateCounter; + private Counter exprRejectedCounter; + private Counter statusRejectedCounter; + private Counter scoreTooLowCounter; + private Counter intervalRejectedCounter; + private Counter hostsAffectedPerHostOverflowCounter; + private Counter urlsSkippedPerHostOverflowCounter; @Override public void setup( @@ -213,7 +227,35 @@ public void setup( if (!restrictStatusString.isEmpty()) { restrictStatus = CrawlDatum.getStatusByName(restrictStatusString); } - expr = JexlUtil.parseExpression(conf.get(GENERATOR_EXPR, null)); + expr = JexlUtil.parseExpression(conf, conf.get(GENERATOR_EXPR, null)); + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context); + // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. 
+ */ + private void initCounters(Context context) { + urlFiltersRejectedCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_URL_FILTERS_REJECTED_TOTAL); + scheduleRejectedCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_SCHEDULE_REJECTED_TOTAL); + waitForUpdateCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_WAIT_FOR_UPDATE_TOTAL); + exprRejectedCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_EXPR_REJECTED_TOTAL); + statusRejectedCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_STATUS_REJECTED_TOTAL); + scoreTooLowCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_SCORE_TOO_LOW_TOTAL); + intervalRejectedCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_INTERVAL_REJECTED_TOTAL); + hostsAffectedPerHostOverflowCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL); + urlsSkippedPerHostOverflowCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL); } @Override @@ -225,11 +267,11 @@ public void map(Text key, CrawlDatum value, Context context) // URLFilters try { if (filters.filter(url.toString()) == null) { - context.getCounter("Generator", "URL_FILTERS_REJECTED").increment(1); + urlFiltersRejectedCounter.increment(1); return; } } catch (URLFilterException e) { - context.getCounter("Generator", "URL_FILTER_EXCEPTION").increment(1); + errorTracker.incrementCounters(e); LOG.warn("Couldn't filter url: {} ({})", url, e.getMessage()); } } @@ -239,7 +281,7 @@ public void map(Text key, CrawlDatum value, Context context) if (!schedule.shouldFetch(url, crawlDatum, curTime)) { LOG.debug("-shouldFetch rejected '{}', fetchTime={}, curTime={}", url, crawlDatum.getFetchTime(), curTime); - 
context.getCounter("Generator", "SCHEDULE_REJECTED").increment(1); + scheduleRejectedCounter.increment(1); return; } @@ -248,7 +290,7 @@ public void map(Text key, CrawlDatum value, Context context) if (oldGenTime != null) { // awaiting fetch & update if (oldGenTime.get() + genDelay > curTime) { // still wait for // update - context.getCounter("Generator", "WAIT_FOR_UPDATE").increment(1); + waitForUpdateCounter.increment(1); return; } } @@ -256,25 +298,26 @@ public void map(Text key, CrawlDatum value, Context context) try { sort = scfilters.generatorSortValue(key, crawlDatum, sort); } catch (ScoringFilterException sfe) { + errorTracker.incrementCounters(sfe); LOG.warn("Couldn't filter generatorSortValue for {}: {}", key, sfe); } // check expr if (expr != null) { if (!crawlDatum.execute(expr, key.toString())) { - context.getCounter("Generator", "EXPR_REJECTED").increment(1); + exprRejectedCounter.increment(1); return; } } if (restrictStatus != -1 && restrictStatus != crawlDatum.getStatus()) { - context.getCounter("Generator", "STATUS_REJECTED").increment(1); + statusRejectedCounter.increment(1); return; } // consider only entries with a score superior to the threshold if (!Float.isNaN(scoreThreshold) && sort < scoreThreshold) { - context.getCounter("Generator", "SCORE_TOO_LOW").increment(1); + scoreTooLowCounter.increment(1); return; } @@ -282,7 +325,7 @@ public void map(Text key, CrawlDatum value, Context context) // threshold if (intervalThreshold != -1 && crawlDatum.getFetchInterval() > intervalThreshold) { - context.getCounter("Generator", "INTERVAL_REJECTED").increment(1); + intervalRejectedCounter.increment(1); return; } @@ -317,6 +360,11 @@ public static class SelectorReducer extends private JexlScript maxCountExpr = null; private JexlScript fetchDelayExpr = null; private Map hostDatumCache = new HashMap<>(); + private ErrorTracker errorTracker; + + // Cached counter references for performance + private Counter hostsAffectedPerHostOverflowCounter; + private 
Counter urlsSkippedPerHostOverflowCounter; public void readHostDb() throws IOException { if (conf.get(GENERATOR_HOSTDB) == null) { @@ -405,15 +453,29 @@ public void setup(Context context) throws IOException { URLNormalizers.SCOPE_GENERATE_HOST_COUNT); if (conf.get(GENERATOR_HOSTDB) != null) { - maxCountExpr = JexlUtil - .parseExpression(conf.get(GENERATOR_MAX_COUNT_EXPR, null)); - fetchDelayExpr = JexlUtil - .parseExpression(conf.get(GENERATOR_FETCH_DELAY_EXPR, null)); + maxCountExpr = JexlUtil.parseExpression(conf, + conf.get(GENERATOR_MAX_COUNT_EXPR, null)); + fetchDelayExpr = JexlUtil.parseExpression(conf, + conf.get(GENERATOR_FETCH_DELAY_EXPR, null)); } + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context); + // Initialize cached counter references + initReducerCounters(context); readHostDb(); } + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initReducerCounters(Context context) { + hostsAffectedPerHostOverflowCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL); + urlsSkippedPerHostOverflowCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL); + } + @Override public void cleanup(Context context) throws IOException, InterruptedException { @@ -507,7 +569,7 @@ public void reduce(FloatWritable key, Iterable values, } catch (MalformedURLException e) { LOG.warn("Malformed URL: '{}', skipping ({})", urlString, StringUtils.stringifyException(e)); - context.getCounter("Generator", "MALFORMED_URL").increment(1); + errorTracker.incrementCounters(e); continue; } @@ -539,16 +601,13 @@ public void reduce(FloatWritable key, Iterable values, hostCount[1] = 1; } else { if (hostCount[1] == (maxCount+1)) { - context - .getCounter("Generator", "HOSTS_AFFECTED_PER_HOST_OVERFLOW") - .increment(1); + 
hostsAffectedPerHostOverflowCounter.increment(1); LOG.info( "Host or domain {} has more than {} URLs for all {} segments. Additional URLs won't be included in the fetchlist.", hostordomain, maxCount, maxNumSegments); } // skip this entry - context.getCounter("Generator", "URLS_SKIPPED_PER_HOST_OVERFLOW") - .increment(1); + urlsSkippedPerHostOverflowCounter.increment(1); continue; } } @@ -812,7 +871,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, * maximum number of segments to generate * @param expr * a Jexl expression to use in the Generator job. - * @see JexlUtil#parseExpression(String) + * @see JexlUtil#parseExpression(Configuration, String) * @throws IOException * if an I/O exception occurs. * @see LockUtil#createLockFile(Configuration, Path, boolean) @@ -863,7 +922,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, * @param hostdb * name of a hostdb from which to execute Jexl expressions in a bid * to determine the maximum URL count and/or fetch delay per host. - * @see JexlUtil#parseExpression(String) + * @see JexlUtil#parseExpression(Configuration, String) * @throws IOException * if an I/O exception occurs. 
* @see LockUtil#createLockFile(Configuration, Path, boolean) @@ -959,10 +1018,13 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, } LOG.info("Generator: number of items rejected during selection:"); - for (Counter counter : job.getCounters().getGroup("Generator")) { - LOG.info("Generator: {} {}", - String.format(Locale.ROOT, "%6d", counter.getValue()), - counter.getName()); + for (Counter counter : job.getCounters() + .getGroup(NutchMetrics.GROUP_GENERATOR)) { + long counterValue = counter.getValue(); + if (counterValue > 0) { + LOG.info("Generator: {} {}", + String.format(Locale.ROOT, "%6d", counterValue), counter.getName()); + } } if (!getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) { /* diff --git a/src/java/org/apache/nutch/crawl/Generator2.java b/src/java/org/apache/nutch/crawl/Generator2.java new file mode 100644 index 0000000000..6b619445b7 --- /dev/null +++ b/src/java/org/apache/nutch/crawl/Generator2.java @@ -0,0 +1,1518 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.crawl; + +import java.io.BufferedReader; +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; +import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import java.util.function.BiPredicate; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.time.StopWatch; +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.io.WritableComparator; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Partitioner; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.TaskCounter; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; +import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.hadoop.util.hash.MurmurHash; +import 
org.apache.nutch.crawl.Generator2.SelectorReducer.DomainLimits; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.ErrorTracker; +import org.apache.nutch.metrics.NutchMetrics; +import org.apache.nutch.net.URLFilterException; +import org.apache.nutch.net.URLFilters; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.scoring.ScoringFilterException; +import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Generates a subset of a crawl db to fetch. + * + * This version works differently from the original in that it doesn't try to + * keep highest scoring things in earlier segments across domains. + * + * In order to speed up performance of generating dozens or hundreds of segments + * for multi-billion entry URL databases, we group by host (or domain) and + * secondary sort by score descending. No per-host (or per-domain) counts are + * held in memory. Grouping by IP is not supported. + * + * If fetch lists are grouped by domain (see generate.count.mode) + * additional limits are configurable (per domain): max. number of hosts, max. + * number of URLs per host, number of partitions URLs of a single domain are + * distributed on. The per-domain limits configuration file + * (generate.domain.limits.file) can be kept on HDFS and then can + * easily hold many millions of domains because reduce tasks load only the + * limits required for their partition. + * + * URLs of a single host or domain are distributed over multiple segments but + * with a configurable number of URLs kept in a single segment (see + * {@link #GENERATOR_COUNT_KEEP_MIN_IN_SEGMENT}). Keeping a minimum number of + * same-host/domain URLs together minimizes the overhead caused by DNS lookups + * and robots.txt fetches, parsing and storing robots.txt rules. 
+ * + * All segments are partitioned in a single job which saves time when many + * segments (e.g., -maxNumSegments 100) are generated + * ({@link Generator} launches one partition job per segment). + **/ +public class Generator2 extends Configured implements Tool { + + protected static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + public static final String GENERATE_UPDATE_CRAWLDB = "generate.update.crawldb"; + public static final String GENERATOR_MIN_SCORE = "generate.min.score"; + public static final String GENERATOR_MIN_INTERVAL = "generate.min.interval"; + public static final String GENERATOR_RESTRICT_STATUS = "generate.restrict.status"; + public static final String GENERATOR_FILTER = "generate.filter"; + public static final String GENERATOR_NORMALISE = "generate.normalise"; + public static final String GENERATOR_MAX_COUNT = "generate.max.count"; + public static final String GENERATOR_COUNT_MODE = "generate.count.mode"; + public static final String GENERATOR_COUNT_VALUE_DOMAIN = "domain"; + public static final String GENERATOR_COUNT_VALUE_HOST = "host"; + public static final String GENERATOR_TOP_N = "generate.topN"; + public static final String GENERATOR_CUR_TIME = "generate.curTime"; + public static final String GENERATOR_DELAY = "crawl.gen.delay"; + public static final String GENERATOR_MAX_NUM_SEGMENTS = "generate.max.num.segments"; + /** + * When distributing URLs of the same host/domain over segments keep at least + * this number of URLs together in one segment + */ + public static final String GENERATOR_COUNT_KEEP_MIN_IN_SEGMENT = "generate.count.keep.min.urls.per.segment"; + /** Name of file with domain-specific limits */ + public static final String GENERATOR_DOMAIN_LIMITS_FILE = "generate.domain.limits.file"; + /** Max. number of hosts per domain (if generate.count.mode == domain) */ + public static final String GENERATOR_MAX_HOSTS_PER_DOMAIN = "generate.max.hosts.per.domain"; + /** Max. 
number of URLs per host (if generate.count.mode == domain) */ + public static final String GENERATOR_MAX_COUNT_PER_HOST = "generate.max.count.per.host.by.domain"; + + protected static Random random = new Random(); + + public static class DomainScorePair + implements WritableComparable { + private Text domain = new Text(); + private FloatWritable score = new FloatWritable(); + + public void set(String domain, float score) { + this.domain.set(domain); + this.score.set(score); + } + + public void set(Text domain, float score) { + this.domain.set(domain); + this.score.set(score); + } + + public Text getDomain() { + return domain; + } + + public FloatWritable getScore() { + return score; + } + + @Override + public void readFields(DataInput in) throws IOException { + domain.readFields(in); + score.readFields(in); + } + + @Override + public void write(DataOutput out) throws IOException { + domain.write(out); + score.write(out); + } + + @Override + public int hashCode() { + return domain.hashCode() + score.hashCode(); + } + + @Override + public boolean equals(Object right) { + if (right instanceof DomainScorePair) { + DomainScorePair r = (DomainScorePair) right; + return r.domain.equals(domain) && r.score.equals(score); + } else { + return false; + } + } + + /* Sorts domain ascending, score in descending order */ + @Override + public int compareTo(DomainScorePair o) { + if (!domain.equals(o.getDomain())) { + return domain.compareTo(o.getDomain()); + } else if (!score.equals(o.getScore())) { + return o.getScore().compareTo(score); + } else { + return 0; + } + } + } + + public static class DomainComparator extends WritableComparator { + public DomainComparator() { + super(DomainScorePair.class, true); + } + + public int compare(DomainScorePair a, DomainScorePair b) { + return a.getDomain().compareTo(b.getDomain()); + } + + @Override + public int compare(Object a, Object b) { + return compare((DomainScorePair) a, (DomainScorePair) b); + } + + @SuppressWarnings("rawtypes") + 
@Override + public int compare(WritableComparable a, WritableComparable b) { + return compare((DomainScorePair) a, (DomainScorePair) b); + } + } + + public static class ScoreComparator extends WritableComparator { + public ScoreComparator() { + super(DomainScorePair.class, true); + } + + // Some versions of hadoop don't seem to have a FloatWritable.compareTo + // also inverted for descending order + public int compare(DomainScorePair a, DomainScorePair b) { + return a.compareTo(b); + } + + @Override + public int compare(Object a, Object b) { + return compare((DomainScorePair) a, (DomainScorePair) b); + } + + @SuppressWarnings("rawtypes") + @Override + public int compare(WritableComparable a, WritableComparable b) { + return compare((DomainScorePair) a, (DomainScorePair) b); + } + } + + public static class SelectorEntry implements Writable { + public Text url; + public CrawlDatum datum; + public IntWritable segnum; + + public SelectorEntry() { + url = new Text(); + datum = new CrawlDatum(); + segnum = new IntWritable(0); + } + + @Override + public void readFields(DataInput in) throws IOException { + url.readFields(in); + datum.readFields(in); + segnum.readFields(in); + } + + @Override + public void write(DataOutput out) throws IOException { + url.write(out); + datum.write(out); + segnum.write(out); + } + + @Override + public String toString() { + return "url=" + url.toString() + ", datum=" + datum.toString() + + ", segnum=" + segnum.toString(); + } + } + + /* + * Takes the entire crawl db and filters down to those that are scheduled to + * be output, have a high enough score and limits by host/domain + */ + public static class Selector extends Partitioner + implements Configurable { + + private Configuration conf; + private MurmurHash hasher = (MurmurHash) MurmurHash.getInstance(); + private int seed; + + @Override + public Configuration getConf() { + return conf; + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + seed = 
conf.getInt("partition.url.seed", 0); + } + + /** + * Partition by host / domain, use MurmurHash because of better hashCode + * distribution + */ + @Override + public int getPartition(DomainScorePair key, Writable value, + int numReduceTasks) { + Text domain = key.getDomain(); + /* + * Note: the backing byte[] array of Text may include zero bytes or + * garbage, must not use more than Text::getLength() bytes + */ + return (hasher.hash(domain.getBytes(), domain.getLength(), seed) + & Integer.MAX_VALUE) % numReduceTasks; + } + + } + + public static class SelectorMapper + extends Mapper { + + private Configuration conf; + private LongWritable genTime = new LongWritable(System.currentTimeMillis()); + private long curTime; + private boolean byDomain = false; + private URLFilters filters; + private URLNormalizers normalizers; + private ScoringFilters scfilters; + private SelectorEntry entry = new SelectorEntry(); + private boolean filter; + private boolean normalise; + private long genDelay; + private FetchSchedule schedule; + private float scoreThreshold = 0f; + private int intervalThreshold = -1; + private String restrictStatus = null; + private DomainScorePair outputKey = new DomainScorePair(); + private ErrorTracker errorTracker; + + @Override + public void setup( + Mapper.Context context) + throws IOException { + conf = context.getConfiguration(); + curTime = conf.getLong(GENERATOR_CUR_TIME, System.currentTimeMillis()); + filters = new URLFilters(conf); + scfilters = new ScoringFilters(conf); + filter = conf.getBoolean(GENERATOR_FILTER, true); + normalise = conf.getBoolean(GENERATOR_NORMALISE, true); + if (normalise) { + normalizers = new URLNormalizers(conf, + URLNormalizers.SCOPE_GENERATE_HOST_COUNT); + } + genDelay = conf.getLong(GENERATOR_DELAY, 7L) * 3600L * 24L * 1000L; + long time = conf.getLong(Nutch.GENERATE_TIME_KEY, 0L); + if (time > 0) + genTime.set(time); + schedule = FetchScheduleFactory.getFetchSchedule(conf); + scoreThreshold = 
conf.getFloat(GENERATOR_MIN_SCORE, Float.NaN); + intervalThreshold = conf.getInt(GENERATOR_MIN_INTERVAL, -1); + restrictStatus = conf.get(GENERATOR_RESTRICT_STATUS, null); + + if (GENERATOR_COUNT_VALUE_DOMAIN.equals(conf.get(GENERATOR_COUNT_MODE))) { + byDomain = true; + } + + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context); + } + + /** Select & invert subset due for fetch. */ + @Override + public void map(Text key, CrawlDatum value, Context context) + throws IOException, InterruptedException { + String urlString = key.toString(); + + if (filter) { + // If filtering is on don't generate URLs that don't pass + // URLFilters + try { + if (filters.filter(urlString) == null) { + context + .getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_URL_FILTERS_REJECTED_TOTAL) + .increment(1); + return; + } + } catch (URLFilterException e) { + LOG.warn("Couldn't filter url {}: {}", key, e.getMessage()); + errorTracker.incrementCounters(e); + } + } + + // check fetch schedule + if (!schedule.shouldFetch(key, value, curTime)) { + LOG.debug("-shouldFetch rejected '{}', fetchTime={}, curTime={}", key, + value.getFetchTime(), curTime); + context.getCounter( + NutchMetrics.GROUP_GENERATOR_SCHEDULE_REJECTED_BY_STATUS, + CrawlDatum.getStatusName(value.getStatus())).increment(1); + return; + } + + LongWritable oldGenTime = (LongWritable) value.getMetaData() + .get(Nutch.WRITABLE_GENERATE_TIME_KEY); + if (oldGenTime != null) { // awaiting fetch & update + if (oldGenTime.get() + genDelay > curTime) // still wait for + // update + return; + } + float sort = 1.0f; + try { + sort = scfilters.generatorSortValue(key, value, sort); + } catch (ScoringFilterException sfe) { + LOG.warn("Couldn't filter generatorSortValue for {}: {}", key, sfe); + } + + if (restrictStatus != null && !restrictStatus + .equalsIgnoreCase(CrawlDatum.getStatusName(value.getStatus()))) + return; + + // consider only entries with a 
score superior to the threshold + if (!Float.isNaN(scoreThreshold) && sort < scoreThreshold) { + context + .getCounter(NutchMetrics.GROUP_GENERATOR_SCORE_REJECTED_BY_STATUS, + CrawlDatum.getStatusName(value.getStatus())) + .increment(1); + return; + } + + // consider only entries with a retry (or fetch) interval lower than + // threshold + if (intervalThreshold != -1 + && value.getFetchInterval() > intervalThreshold) + return; + + String hostordomain; + + try { + if (normalise && normalizers != null) { + urlString = normalizers.normalize(urlString, + URLNormalizers.SCOPE_GENERATE_HOST_COUNT); + } + URL u = new URL(urlString); + if (byDomain) { + hostordomain = URLPartitioner.getDomainName(u.getHost()); + } else { + hostordomain = u.getHost().toLowerCase(Locale.ROOT); + } + } catch (Exception e) { + LOG.warn("Malformed URL: '{}', skipping ({})", urlString, + e.getMessage()); + errorTracker.incrementCounters(e); + return; + } + + outputKey.set(hostordomain, sort); + + // record generation time + value.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime); + entry.datum = value; + entry.url = key; + context.write(outputKey, entry); + } + + } + + public static class SelectorReducer extends + Reducer { + + private Configuration conf; + private int maxCount; + private int maxNumSegments = 1; + private int currentSegment; + private int keepMinUrlsPerSegment; + private int segmentIncrement = 1; + + // additional host limits if byDomain + private boolean byDomainWithHostLimits = false; + private int maxCountPerHost = -1; + private int maxHostsPerDomain = -1; + private int partition = -1; + private int numReduces = 1; + private Selector selector = new Selector(); + private Map domainLimits = null; + + public static class DomainLimits { + int maxURLs; + int maxURLsPerHost; + int maxHosts; + int numPartitions; + public DomainLimits(int urls, int urlsHost, int hosts, int parts) { + maxURLs = urls; + maxURLsPerHost = urlsHost; + maxHosts = hosts; + numPartitions = parts; 
+ } + @Override + public String toString() { + if (numPartitions > 1) { + return String.format( + "max. urls = %d, urls/host = %d, hosts = %d, partitions = %d", + maxURLs, maxURLsPerHost, maxHosts, numPartitions); + } else { + return String.format("max. urls = %d, urls/host = %d, hosts = %d", + maxURLs, maxURLsPerHost, maxHosts); + } + } + } + + @Override + public void setup(Context context) throws IOException { + conf = context.getConfiguration(); + maxNumSegments = conf.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1); + maxCount = conf.getInt(GENERATOR_MAX_COUNT, -1); + partition = conf.getInt("mapreduce.task.partition", -1); + numReduces = conf.getInt("mapreduce.job.reduces", 1); + LOG.info("Selecting fetch lists for partition {} (out of {} partitions)", + partition, numReduces); + selector.setConf(conf); + if (GENERATOR_COUNT_VALUE_DOMAIN.equals(conf.get(GENERATOR_COUNT_MODE))) { + maxHostsPerDomain = conf.getInt(GENERATOR_MAX_HOSTS_PER_DOMAIN, -1); + maxCountPerHost = conf.getInt(GENERATOR_MAX_COUNT_PER_HOST, -1); + domainLimits = readLimitsFile(conf, partition, numReduces, selector); + if (domainLimits != null || maxHostsPerDomain > 0 + || maxCountPerHost > 0) { + byDomainWithHostLimits = true; + LOG.info("Domain limits enabled:"); + LOG.info(" - {}: {}", GENERATOR_MAX_HOSTS_PER_DOMAIN, + maxHostsPerDomain); + LOG.info(" - {}: {}", GENERATOR_MAX_COUNT_PER_HOST, + maxCountPerHost); + if (domainLimits != null) { + LOG.info( + " - domain limits file ({}) contains limits for {} domains", + GENERATOR_DOMAIN_LIMITS_FILE, domainLimits.size()); + } + } + } + currentSegment = 0; + keepMinUrlsPerSegment = conf.getInt(GENERATOR_COUNT_KEEP_MIN_IN_SEGMENT, + 100); + segmentIncrement = 1; // increment to select next segment + int prime[] = { 2, 3, 5, 7, 11, 13, 17, 23, 29 }; + // select next segment with a larger step, so that fetching of smaller + // sites is paused between consecutive segments. 
Select a prime number + // - which is not bigger than 1/3 of the number of segments + // - number of segments isn't a multiple of + for (int i = 0; i < prime.length; i++) { + if (prime[i] >= (maxNumSegments / 3)) + break; + if (0 == (maxNumSegments % prime[i])) + continue; + segmentIncrement = prime[i]; + } + LOG.info("Segment increment for {} segments = {}", maxNumSegments, + segmentIncrement); + } + + private int nextSegment(int segment) { + segment += segmentIncrement; + if (segment >= maxNumSegments) { + segment = (segment % maxNumSegments); + } + return segment; + } + + private int nextSegment() { + currentSegment = nextSegment(currentSegment); + return currentSegment; + } + + /** + * Read domain-specific limits into a map + * + * @param conf + * @param acceptor + * predicate to select limits: the BiPredicate is passed a (domain, limits) pair + * to decide whether it shall load the limits + * rules for a particular domain + * @return map + */ + public static Map readLimitsFile(Configuration conf, + BiPredicate acceptor) { + String limitsFile = conf.get(GENERATOR_DOMAIN_LIMITS_FILE); + if (limitsFile != null) { + LOG.info("Reading domain-specific limits from {}", limitsFile); + Path limitsFilePath = new Path(limitsFile); + if (limitsFilePath.toUri().getScheme() != null) { + try { + FileSystem fs = limitsFilePath.getFileSystem(conf); + Reader limitsReader = new InputStreamReader(fs.open(limitsFilePath)); + return readLimitsFile(limitsReader, acceptor); + } catch (IOException e) { + LOG.error("Failed to read domain-specific limits", e); + } + } + try (Reader limitsReader = conf.getConfResourceAsReader(limitsFile)) { + return readLimitsFile(limitsReader, acceptor); + } catch (IOException e) { + LOG.error("Failed to read domain-specific limits", e); + } + } + return null; + } + + /** + * Read domain-specific limits into a map. Read selectively only those limits + * which are applicable for the given partition (the domain is processed in + * the given partition). 
+ * + * @param conf + * @param partition + * partition ID of current task: domains not matching the partition + * ID are skipped to keep the map small + * @param numReduces + * number of reduce tasks determining number of partitions (fetch + * lists) + * @param selector + * Selector to determine partition ID for a given domain + * @return map + */ + private static Map readLimitsFile(Configuration conf, + int partition, int numReduces, Selector selector) { + BiPredicate domainAcceptor = (String d, + DomainLimits l) -> true; + if (partition >= 0) { + LOG.info(" - filtering domains by partition ID {}", partition); + domainAcceptor = (String domain, DomainLimits limits) -> { + DomainScorePair key = new DomainScorePair(); + key.set(domain, .0f); + int p = selector.getPartition(key, null, numReduces); + return p == partition; + }; + } + return readLimitsFile(conf, domainAcceptor); + } + + private static Map readLimitsFile(Reader limitsReader, + BiPredicate acceptor) throws IOException { + + if (limitsReader == null) { + throw new IOException("Limits reader is null"); + } + + Map domainLimits = new HashMap<>(); + + BufferedReader reader = new BufferedReader(limitsReader); + + String line = null; + String[] splits = null; + int skipped = 0; + + // Read all lines + while ((line = reader.readLine()) != null) { + // Skip blank lines and comments + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { + // Split the line by TAB + splits = line.split("\t"); + + if (splits.length < 5) { + LOG.warn("Invalid line: {}", line); + continue; + } + int urls, urlsHost, hosts, parts; + try { + urls = Integer.parseInt(splits[1]); + urlsHost = Integer.parseInt(splits[2]); + hosts = Integer.parseInt(splits[3]); + parts = Integer.parseInt(splits[4]); + } catch (NumberFormatException e) { + LOG.warn("Invalid number in line: {} - {}", line, e.getMessage()); + continue; + } + String domain = splits[0].toLowerCase(Locale.ROOT); + DomainLimits limits = new DomainLimits(urls, urlsHost, 
hosts, parts); + if (!acceptor.test(domain, limits)) { + skipped++; + continue; + } + domainLimits.put(domain, limits); + } + } + LOG.info("Loaded domain limits for {} domains", domainLimits.size()); + if (skipped > 0) { + LOG.info(" - {} per-domain limits were rejected by acceptor predicate", + skipped); + } + return domainLimits; + } + + /* + * Limit the number of URLs per host/domain and assign segment number to + * every record. + */ + @Override + public void reduce(DomainScorePair key, Iterable values, + Context context) throws IOException, InterruptedException { + + int hostOrDomainCount = 0; + int segment = nextSegment(); + + Map hosts = null; + int[] segments = null; + int maxCountPerSegment = maxCount; + int maxCountPerHostTotal = -1; + if (maxCountPerHost > 0) { + maxCountPerHostTotal = maxCountPerHost * maxNumSegments; + } + int maxHosts = maxHostsPerDomain; + int maxHostsOverflowCount = 0; + int maxUrlsPerHostOverflowCount = 0; + boolean maxUrlsOverflow = false; + String domain = null; + if (byDomainWithHostLimits) { + hosts = new HashMap<>(); + segments = new int[maxNumSegments]; + domain = key.getDomain().toString(); + int p = selector.getPartition(key, null, numReduces); + if (p != partition) { + LOG.error( + "Partition for domain {} not equal task partition: {} <> {}", + domain, p, partition); + } + if (domainLimits != null) { + DomainLimits limits = domainLimits.get(key.domain.toString()); + if (limits != null) { + LOG.info("Domain-specific limits for {}: {}", key.domain, limits); + maxCountPerSegment = limits.maxURLs; + maxCountPerHostTotal = limits.maxURLsPerHost * maxNumSegments; + maxHosts = limits.maxHosts; + } + } + } + int maxCountTotal = -1; + if (maxCountPerSegment > 0) { + maxCountTotal = maxCountPerSegment * maxNumSegments; + if (keepMinUrlsPerSegment > maxCountPerSegment) { + // if more URLs are allowed per segment and host, + // need to allow this number also per domain + maxCountPerSegment = keepMinUrlsPerSegment; + } + } + + for 
(SelectorEntry entry : values) { + + if (maxCountTotal > 0 && hostOrDomainCount >= maxCountTotal) { + LOG.info( + "Host or domain {} has more than {} URLs for all {} segments. Additional URLs won't be included in the fetchlist.", + key.getDomain(), maxCountTotal, maxNumSegments); + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_DOMAINS_AFFECTED_PER_DOMAIN_OVERFLOW_TOTAL) + .increment(1); + maxUrlsOverflow = true; + break; + } + + hostOrDomainCount++; + + if (byDomainWithHostLimits) { + String host = null; + try { + host = new URL(entry.url.toString()).getHost().toLowerCase(Locale.ROOT); + if (host.endsWith(domain)) { + // clip common domain name suffix to save storage space in map keys + host = host.substring(0, host.length()-domain.length()); + } else { + LOG.warn("Host {} does not have domain {} as suffix!", host, domain); + } + } catch (MalformedURLException e) { + hostOrDomainCount--; + continue; + } + int[] counts = hosts.get(host); + if (counts == null) { + if (maxHosts > 0 && hosts.size() >= maxHosts) { + // skip current host, max. number of unique hosts reached for domain + hostOrDomainCount--; + maxHostsOverflowCount++; + continue; + } + counts = new int[3]; + counts[0] = 0; + if (hosts.isEmpty()) { + // first host in domain + counts[1] = segment; + } else { + counts[1] = nextSegment(); + } + counts[2] = 0; + hosts.put(host, counts); + } else if (maxCountPerHostTotal > 0 && counts[0] >= maxCountPerHostTotal) { + hostOrDomainCount--; + if (counts[0] == maxCountPerHostTotal) { + LOG.info( + "Host {}{} (domain: {}) has more than {} URLs for all {} segments. 
Additional URLs won't be included in the fetchlist.", + host, domain, domain, maxCountPerHostTotal, maxNumSegments); + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL) + .increment(1); + } + context + .getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL) + .increment(1); + maxUrlsPerHostOverflowCount++; + counts[0]++; + continue; + } else if ((counts[2] % keepMinUrlsPerSegment) == 0) { + counts[1] = nextSegment(counts[1]); + counts[2] = 0; + } + segment = counts[1]; + // select next segment if there are already too many URLs from the current domain + if (maxNumSegments > 1 && segments[segment] >= maxCountPerSegment) { + for (int i = 1; i < maxNumSegments; i++) { + int s = (segment + i) % maxNumSegments; + if (segments[s] < maxCountPerSegment) { + counts[1] = segment = s; + counts[2] = 0; + break; + } + } + } + counts[0]++; + counts[2]++; + segments[segment]++; + entry.segnum.set(segment); + } else { + entry.segnum.set(segment); + if ((hostOrDomainCount % keepMinUrlsPerSegment) == 0) { + segment = nextSegment(); + } + } + + context.getCounter(NutchMetrics.GROUP_GENERATOR_SELECTED_BY_STATUS, + CrawlDatum.getStatusName(entry.datum.getStatus())).increment(1); + + context.write(key.getScore(), entry); + } + + if (maxHostsOverflowCount > 0) { + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_DOMAINS_AFFECTED_PER_MAX_NUM_HOSTS_OVERFLOW_TOTAL) + .increment(1); + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_URLS_SKIPPED_PER_MAX_NUM_HOSTS_OVERFLOW_TOTAL) + .increment(maxHostsOverflowCount); + LOG.info( + "Domain {} has more than {} hosts, skipped {} URLs from remaining hosts", + key.getDomain(), maxHosts, maxHostsOverflowCount); + } + + // log metrics per host/domain + LOG.info( + "{} :: selected={}, selected_hosts={}, max_urls_overflow={}, max_hosts_overflow={}, max_urls_per_host_overflow={}", + 
key.getDomain(), hostOrDomainCount, + (hosts == null ? 0 : hosts.size()), maxUrlsOverflow, + maxHostsOverflowCount, maxUrlsPerHostOverflowCount); + } + + } + + /** + * Composite key of the segmenter job: a URL together with the number of the + * segment the URL was assigned to. The natural order ({@link #compareTo}) + * is by segment first and by URL second. + */ + public static class SegmenterKey implements WritableComparable<SegmenterKey> { + private Text url = new Text(); + private IntWritable segment = new IntWritable(); + + /* copies the given values into the objects held by this key */ + public void set(String url, int segment) { + this.url.set(url); + this.segment.set(segment); + } + + /* keeps references to the given objects, nothing is copied */ + public void set(Text url, IntWritable segment) { + this.url = url; + this.segment = segment; + } + + public Text getUrl() { + return url; + } + + public IntWritable getSegment() { + return segment; + } + + /* serialization order: URL first, then segment (must match write) */ + @Override + public void readFields(DataInput in) throws IOException { + url.readFields(in); + segment.readFields(in); + } + + @Override + public void write(DataOutput out) throws IOException { + url.write(out); + segment.write(out); + } + + @Override + public int hashCode() { + return url.hashCode() + segment.hashCode(); + } + + @Override + public boolean equals(Object right) { + if (right instanceof SegmenterKey) { + SegmenterKey r = (SegmenterKey) right; + return r.url.equals(url) && r.segment.equals(segment); + } else { + return false; + } + } + + /* Sorts primary by segment, secondary by URL */ + @Override + public int compareTo(SegmenterKey o) { + if (!segment.equals(o.getSegment())) { + return segment.compareTo(o.getSegment()); + } else if (!url.equals(o.getUrl())) { + return url.compareTo(o.getUrl()); + } else { + return 0; + } + } + + } + + public static class UrlHashComparator extends WritableComparator { + + HashComparator comp = new HashComparator(); + + public UrlHashComparator() { + super(SegmenterKey.class, true); + } + + public int compare(SegmenterKey a, SegmenterKey b) { + return comp.compare(a.getUrl(), b.getUrl()); + } + + @Override + public int compare(Object a, Object b) { + return compare((SegmenterKey) a, (SegmenterKey) b); + } + + @SuppressWarnings("rawtypes") + @Override + public int compare(WritableComparable a, WritableComparable 
b) { + return compare((SegmenterKey) a, (SegmenterKey) b); + } + } + + /** Compares {@link SegmenterKey}s by segment number only, the URL is ignored. */ + public static class SegmentComparator extends WritableComparator { + public SegmentComparator() { + super(SegmenterKey.class, true); + } + + public int compare(SegmenterKey a, SegmenterKey b) { + return a.getSegment().compareTo(b.getSegment()); + } + + @Override + public int compare(Object a, Object b) { + return compare((SegmenterKey) a, (SegmenterKey) b); + } + + @SuppressWarnings("rawtypes") + @Override + public int compare(WritableComparable a, WritableComparable b) { + return compare((SegmenterKey) a, (SegmenterKey) b); + } + } + + /** + * Assigns a key to the partition given by its segment number modulo the + * number of reduce tasks. + */ + public static class SegmentPartitioner + extends Partitioner<SegmenterKey, Writable> implements Configurable { + + private Configuration conf; + + @Override + public Configuration getConf() { + return conf; + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + } + + @Override + public int getPartition(SegmenterKey key, Writable value, + int numReduceTasks) { + return key.segment.get() % numReduceTasks; + } + + } + + /** + * Re-keys every {@link SelectorEntry} by (URL, segment number), the entry + * itself is passed through unchanged. + */ + public static class SegmenterMapper extends + Mapper<FloatWritable, SelectorEntry, SegmenterKey, SelectorEntry> { + + /* reused for every record, it is filled again on each call of map */ + SegmenterKey outputKey = new SegmenterKey(); + + @Override + public void map(FloatWritable key, SelectorEntry value, Context context) + throws IOException, InterruptedException { + outputKey.set(value.url, value.segnum); + context.write(outputKey, value); + } + + } + + /* + * This takes the filtered records from the Selector job, limits the number of + * records per segment in the reducer and saves each segment to its own file. 
+ */ + public static class SegmenterReducer + extends Reducer<SegmenterKey, SelectorEntry, Text, SelectorEntry> { + + /* upper limit of URLs written per segment, read from GENERATOR_TOP_N */ + private long maxPerSegment; + private MultipleOutputs<Text, SelectorEntry> mos; + + @Override + public void setup(Context context) { + Configuration conf = context.getConfiguration(); + maxPerSegment = conf.getLong(GENERATOR_TOP_N, Long.MAX_VALUE); + mos = new MultipleOutputs<>(context); + } + + @Override + public void reduce(SegmenterKey key, Iterable<SelectorEntry> values, + Context context) throws IOException, InterruptedException { + long count = 0; + int segnum = -1; + String fileName = null; + for (SelectorEntry entry : values) { + if (segnum == -1) { + segnum = entry.segnum.get(); // same as key.getSegment().get() + fileName = generateFileName(entry); + LOG.info("Writing segment {} to {}", segnum, fileName); + } + if (count < maxPerSegment) { + mos.write("sequenceFiles", entry.url, entry, fileName); + } else { + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_URLS_SKIPPED_PER_SEGMENT_OVERFLOW_TOTAL) + .increment(1); + if (count == maxPerSegment) { + LOG.info( + "Maximum number of URLs per segment reached for segment {}, skipping remaining URLs", + key.getSegment().get()); + } + } + count++; + } + } + + private String generateFileName(SelectorEntry entry) { + return "fetchlist-" + entry.segnum.toString() + "/part"; + } + + @Override + public void cleanup(Context context) + throws IOException, InterruptedException { + mos.close(); + } + + } + + /** + * Map-only step which writes the {@link CrawlDatum} of every selected entry + * to a named output file chosen via the {@link URLPartitioner}. + */ + public static class SelectorInverseMapper + extends Mapper<Text, SelectorEntry, Text, CrawlDatum> { + + private int numLists = 1; + private int mapno = 0; + private long currentTime = System.currentTimeMillis(); + private MultipleOutputs<Text, CrawlDatum> out; + URLPartitioner partitioner = new URLPartitioner(); + + @Override + public void setup(Context context) { + out = new MultipleOutputs<>(context); + Configuration conf = context.getConfiguration(); + mapno = conf.getInt(Context.TASK_PARTITION, + random.nextInt(Integer.MAX_VALUE)); + numLists = conf.getInt("num.lists", 1); + partitioner.setConf(conf); + BiPredicate<String, DomainLimits> 
acceptor = (String domain, + DomainLimits limits) -> { + return limits.numPartitions > 1; + }; + partitioner.setDomainLimits(SelectorReducer.readLimitsFile(conf, acceptor)); + } + + @Override + public void map(Text key, SelectorEntry value, Context context) + throws IOException, InterruptedException { + out.write("sequenceFilesPartitions", key, value.datum, + generateFileName(key, value.datum, numLists)); + } + + private String generateFileName(Text key, CrawlDatum value, int numLists) { + int partition = partitioner.getPartition(key, value, numLists); + return "" + currentTime + "." + mapno + "/" + CrawlDatum.GENERATE_DIR_NAME + + "/subfetchlist-" + partition; + } + + @Override + protected void cleanup(Context context) + throws IOException, InterruptedException { + out.close(); + } + + } + + /** Sort fetch lists by hash of URL. */ + public static class HashComparator extends WritableComparator { + public HashComparator() { + super(Text.class); + } + + @Override + @SuppressWarnings("rawtypes") + public int compare(WritableComparable a, WritableComparable b) { + Text url1 = (Text) a; + Text url2 = (Text) b; + int hash1 = hash(url1.getBytes(), 0, url1.getLength()); + int hash2 = hash(url2.getBytes(), 0, url2.getLength()); + return (hash1 < hash2 ? -1 : (hash1 == hash2 ? 0 : 1)); + } + + @Override + public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { + int hash1 = hash(b1, s1, l1); + int hash2 = hash(b2, s2, l2); + return (hash1 < hash2 ? -1 : (hash1 == hash2 ? 0 : 1)); + } + + private static int hash(byte[] bytes, int start, int length) { + int hash = 1; + // make later bytes more significant in hash code, so that sorting + // by hashcode correlates less with by-host ordering. + for (int i = length - 1; i >= 0; i--) + hash = (31 * hash) + bytes[start + i]; + return hash; + } + } + + public Generator2() { + } + + public Generator2(Configuration conf) { + setConf(conf); + } + + /** + * Generate fetchlists in one or more segments. 
Whether to filter URLs or not + * is read from the crawl.generate.filter property in the configuration files. + * If the property is not found, the URLs are filtered. Same for the + * normalisation. + * + * @param dbDir + * Crawl database directory + * @param segments + * Segments directory + * @param numLists + * Number of reduce tasks + * @param topN + * Number of top URLs to be selected + * @param curTime + * Current time in milliseconds + * + * @return Path to generated segment or null if no entries were selected + * @throws Exception + */ + public Path[] generate(Path dbDir, String dbVersion, Path segments, + int numLists, long topN, long curTime, boolean filter, boolean norm, + boolean force, int maxNumSegments, boolean keep, String stage2, String stage1) + throws Exception { + + Path tempDir = new Path(getConf().get("mapreduce.cluster.temp.dir", ".") + + "/generate-temp-" + System.currentTimeMillis()); + + Path lock = new Path(dbDir, CrawlDb.LOCK_NAME); + FileSystem fs = lock.getFileSystem(getConf()); + FileSystem tempFs = tempDir.getFileSystem(getConf()); + Path stage1Dir = null, stage2Dir = null; + + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("Generator: starting"); + LOG.info("Generator: Selecting best-scoring urls due for fetch."); + LOG.info("Generator: filtering: {}", filter); + LOG.info("Generator: normalizing: {}", norm); + if (topN != Long.MAX_VALUE) { + LOG.info("Generator: perSegment: {}", topN); + } + LOG.info("Generator: generate.count.mode = {}", getConf().get(GENERATOR_COUNT_MODE)); + LOG.info("Generator: partition.url.mode = {}", getConf().get(URLPartitioner.PARTITION_MODE_KEY)); + + if (stage2 == null && stage1 == null) { + // map to inverted subset due for fetch, sort by score + Job job = NutchJob.getInstance(getConf()); + job.setJobName("generate: select from " + dbDir); + + Configuration conf = job.getConfiguration(); + if (numLists == -1) { + // for politeness create a single partition per fetcher map task + 
numLists = Integer.parseInt(conf.get("mapreduce.job.maps")); + } + if ("local".equals(conf.get("mapreduce.framework.name")) + && numLists != 1) { + // override + LOG.info( + "Generator: running in local mode, generating exactly one partition."); + numLists = 1; + } + conf.setLong(GENERATOR_CUR_TIME, curTime); + // record real generation time + long generateTime = System.currentTimeMillis(); + conf.setLong(Nutch.GENERATE_TIME_KEY, generateTime); + conf.setBoolean(GENERATOR_FILTER, filter); + conf.setBoolean(GENERATOR_NORMALISE, norm); + conf.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments); + conf.setInt("partition.url.seed", new Random().nextInt()); + job.setSpeculativeExecution(true); + job.setReduceSpeculativeExecution(true); + + FileInputFormat.addInputPath(job, new Path(dbDir, dbVersion)); + job.setInputFormatClass(SequenceFileInputFormat.class); + + job.setMapperClass(SelectorMapper.class); + job.setPartitionerClass(Selector.class); + job.setReducerClass(SelectorReducer.class); + job.setJarByClass(Selector.class); + + stage1Dir = tempDir.suffix("/stage1"); + FileOutputFormat.setOutputPath(job, stage1Dir); + job.setOutputFormatClass(SequenceFileOutputFormat.class); + job.setMapOutputKeyClass(DomainScorePair.class); + job.setOutputKeyClass(FloatWritable.class); + job.setSortComparatorClass(ScoreComparator.class); + job.setGroupingComparatorClass(DomainComparator.class); + job.setOutputValueClass(SelectorEntry.class); + + try { + boolean success = job.waitForCompletion(true); + if (!success) { + String message = "Generator job did not succeed, job status:" + + job.getStatus().getState() + ", reason: " + + job.getStatus().getFailureInfo(); + LOG.error(message); + NutchJob.cleanupAfterFailure(tempDir, lock, fs); + throw new RuntimeException(message); + } + } catch (IOException | InterruptedException | ClassNotFoundException e) { + LOG.error("Generator job failed: {}", e.getMessage()); + NutchJob.cleanupAfterFailure(tempDir, lock, fs); + throw e; + } + + long 
selected = job.getCounters() + .findCounter(TaskCounter.REDUCE_OUTPUT_RECORDS).getValue(); + if (selected == 0) { + LOG.warn("Generator: 0 records selected for fetching, exiting ..."); + NutchJob.cleanupAfterFailure(tempDir, lock, fs); + return null; + } + } else if (stage1 != null) { + stage1Dir = new Path(stage1); + } + + if (stage2 == null) { + // Read through the generated URL list and output individual segment files + Job job = NutchJob.getInstance(getConf()); + job.setJobName("generate: segmenter"); + Configuration conf = job.getConfiguration(); + conf.setLong(GENERATOR_TOP_N, topN); + + job.setSpeculativeExecution(true); + job.setReduceSpeculativeExecution(true); + + FileInputFormat.addInputPath(job, stage1Dir); + job.setInputFormatClass(SequenceFileInputFormat.class); + + job.setMapperClass(SegmenterMapper.class); + job.setReducerClass(SegmenterReducer.class); + job.setJarByClass(SegmenterMapper.class); + + /* + * Every reduce partition contains all data from a single segment in a + * single group to check for the max. segment size. + */ + job.setGroupingComparatorClass(SegmentComparator.class); + job.setPartitionerClass(SegmentPartitioner.class); + /* + * URLs are shuffled (sorted by pseudo-random hash value). 
+ */ + job.setSortComparatorClass(UrlHashComparator.class); + /* ensure that every segments gets its own partition */ + job.setNumReduceTasks(maxNumSegments); + + job.setMapOutputKeyClass(SegmenterKey.class); + job.setMapOutputValueClass(SelectorEntry.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(SelectorEntry.class); + + stage2Dir = tempDir.suffix("/stage2"); + FileOutputFormat.setOutputPath(job, stage2Dir); + MultipleOutputs.addNamedOutput(job, "sequenceFiles", + SequenceFileOutputFormat.class, Text.class, SelectorEntry.class); + LazyOutputFormat.setOutputFormatClass(job, + SequenceFileOutputFormat.class); + + try { + boolean success = job.waitForCompletion(true); + if (!success) { + String message = "Generator job did not succeed, job status:" + + job.getStatus().getState() + ", reason: " + + job.getStatus().getFailureInfo(); + LOG.error(message); + if (!keep) { + NutchJob.cleanupAfterFailure(tempDir, lock, fs); + } + throw new RuntimeException(message); + } + } catch (IOException | InterruptedException | ClassNotFoundException e) { + LOG.error("Generator job failed: {}", e.getMessage()); + if (!keep) { + NutchJob.cleanupAfterFailure(tempDir, lock, fs); + } + throw e; + } + } else { + stage2Dir = new Path(stage2); + } + // read the subdirectories generated in the temporary + // output and turn them into segments, shuffle URLs + // to spread URLs of the same host/domain over the entire + // fetch list + List generatedSegments; + + try { + FileStatus[] status = tempFs.listStatus(stage2Dir); + List inputDirs = new ArrayList(); + for (FileStatus stat : status) { + Path subfetchlist = stat.getPath(); + if (!subfetchlist.getName().startsWith("fetchlist-")) + continue; + inputDirs.add(subfetchlist); + } + generatedSegments = partitionSegments(segments.getFileSystem(getConf()), + segments, inputDirs, numLists); + } catch (Exception e) { + LOG.warn("Generator: exception while partitioning segments, exiting ...", + e); + if (!keep) { + 
tempFs.delete(tempDir, true); + } + return null; + } + + if (!keep) { + tempFs.delete(tempDir, true); + } + + stopWatch.stop(); + LOG.info("Generator: finished, elapsed: {} ms", + stopWatch.getTime(TimeUnit.MILLISECONDS)); + + Path[] patharray = new Path[generatedSegments.size()]; + return generatedSegments.toArray(patharray); + } + + /** + * Partition segments: one partition is the fetch lists of a single fetcher + * task. + */ + private List partitionSegments(FileSystem fs, Path segmentsDir, + List inputDirs, int numLists) throws Exception { + LOG.info("Generator: Partitioning selected urls for politeness."); + + List generatedSegments = new ArrayList(); + + LOG.info("Generator: partitionSegment: {}", segmentsDir); + + Job job = NutchJob.getInstance(getConf()); + job.setJobName("generate: partition " + segmentsDir); + + Configuration conf = job.getConfiguration(); + conf.setInt("partition.url.seed", new Random().nextInt()); + + for (Path p : inputDirs) { + FileInputFormat.addInputPath(job, p); + } + job.setInputFormatClass(SequenceFileInputFormat.class); + + job.setSpeculativeExecution(false); + job.setMapSpeculativeExecution(false); + job.setMapperClass(SelectorInverseMapper.class); + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(CrawlDatum.class); + job.setJarByClass(SelectorInverseMapper.class); + + /* Ensure output files from step 2 are not split. */ + conf.setLong("mapreduce.input.fileinputformat.split.minsize", + Long.MAX_VALUE); + /* + * Reduce the replication factor to limit the number of open HDFS + * files/blocks - we may write 100 segments each with 100 partitions / + * fetchers. 
+ */ + conf.set("dfs.replication", "1"); + + /* Set number of fetchers */ + conf.setInt("num.lists", numLists); + job.setNumReduceTasks(0); + + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(CrawlDatum.class); + FileOutputFormat.setOutputPath(job, segmentsDir); + MultipleOutputs.addNamedOutput(job, "sequenceFilesPartitions", + SequenceFileOutputFormat.class, Text.class, CrawlDatum.class); + LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); + if (LOG.isDebugEnabled()) { + // (for debugging) count records per output file + // NOTE: may create too many counters if used in production with many + // segments and partitions + MultipleOutputs.setCountersEnabled(job, true); + } + + try { + boolean success = job.waitForCompletion(true); + if (!success) { + String message = "Generator job did not succeed, job status:" + + job.getStatus().getState() + ", reason: " + + job.getStatus().getFailureInfo(); + LOG.error(message); + throw new RuntimeException(message); + } + } catch (IOException | InterruptedException | ClassNotFoundException e) { + LOG.error("Generator job failed: {}", e.getMessage()); + throw e; + } + + return generatedSegments; + } + + /** + * Generate a fetchlist from the crawldb. 
+ */ + public static void main(String args[]) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), new Generator2(), + args); + System.exit(res); + } + + @Override + public int run(String[] args) throws Exception { + if (args.length < 2) { + System.out.println( + "Usage: Generator2 [-force] [-keep] [-numPerSegment N] [-numFetchers numFetchers] [-adddays numDays] [-noFilter] [-noNorm] [-maxNumSegments num]"); + return -1; + } + + Path dbDir = new Path(args[0]); + Path segmentsDir = new Path(args[1]); + long curTime = System.currentTimeMillis(); + long topN = Long.MAX_VALUE; + int numFetchers = -1; + boolean filter = true; + boolean norm = true; + boolean force = false; + boolean keep = false; + int maxNumSegments = 1; + String stage2 = null; + String stage1 = null; + String dbVersion = CrawlDb.CURRENT_NAME; + + for (int i = 2; i < args.length; i++) { + if ("-numPerSegment".equals(args[i])) { + topN = Long.parseLong(args[i + 1]); + i++; + } else if ("-numFetchers".equals(args[i])) { + numFetchers = Integer.parseInt(args[i + 1]); + i++; + } else if ("-adddays".equals(args[i])) { + long numDays = Integer.parseInt(args[i + 1]); + curTime += numDays * 1000L * 60 * 60 * 24; + } else if ("-noFilter".equals(args[i])) { + filter = false; + } else if ("-noNorm".equals(args[i])) { + norm = false; + } else if ("-force".equals(args[i])) { + force = true; + } else if ("-maxNumSegments".equals(args[i])) { + maxNumSegments = Integer.parseInt(args[i + 1]); + } else if ("-keep".equals(args[i])) { + keep = true; + } else if ("-stage1".equals(args[i])) { + stage1 = args[++i]; + } else if ("-stage2".equals(args[i])) { + stage2 = args[++i]; + } else if ("-dbVersion".equals(args[i])) { + dbVersion = args[++i]; + } + } + + try { + Path[] segs = generate(dbDir, dbVersion, segmentsDir, numFetchers, topN, + curTime, filter, norm, force, maxNumSegments, keep, stage2, stage1); + if (segs == null) + return -1; + } catch (Exception e) { + LOG.error("Generator failed with", 
e); + return -1; + } + return 0; + } + +} diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index 819c91e3a8..ae154350ef 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -24,6 +24,7 @@ import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; @@ -36,6 +37,8 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.ErrorTracker; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.scoring.ScoringFilterException; @@ -118,14 +121,21 @@ public static class InjectMapper public static final String EQUAL_CHARACTER = "="; private URLNormalizers urlNormalizers; - private int interval; + protected int interval; private float scoreInjected; private URLFilters filters; - private ScoringFilters scfilters; - private long curTime; + protected ScoringFilters scfilters; + protected long curTime; private boolean url404Purging; private String scope; private boolean filterNormalizeAll = false; + private ErrorTracker errorTracker; + + // Cached counter references for performance + private Counter urlsFilteredCounter; + private Counter urlsInjectedCounter; + private Counter urlsPurged404Counter; + private Counter urlsPurgedFilterCounter; @Override public void setup(Context context) { @@ -146,10 +156,28 @@ public void setup(Context context) { curTime = conf.getLong("injector.current.time", System.currentTimeMillis()); url404Purging = conf.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false); + // Initialize error tracker with cached counters + errorTracker = new 
ErrorTracker(NutchMetrics.GROUP_INJECTOR, context); + // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initCounters(Context context) { + urlsFilteredCounter = context.getCounter( + NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_FILTERED_TOTAL); + urlsInjectedCounter = context.getCounter( + NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_INJECTED_TOTAL); + urlsPurged404Counter = context.getCounter( + NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_PURGED_404_TOTAL); + urlsPurgedFilterCounter = context.getCounter( + NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_PURGED_FILTER_TOTAL); } /* Filter and normalize the input url */ - private String filterNormalize(String url) { + protected String filterNormalize(String url) { if (url != null) { try { if (urlNormalizers != null) @@ -218,7 +246,7 @@ public void map(Text key, Writable value, Context context) url = filterNormalize(url); if (url == null) { - context.getCounter("injector", "urls_filtered").increment(1); + urlsFilteredCounter.increment(1); } else { CrawlDatum datum = new CrawlDatum(); datum.setStatus(CrawlDatum.STATUS_INJECTED); @@ -237,8 +265,9 @@ public void map(Text key, Writable value, Context context) LOG.warn( "Cannot filter injected score for url {}, using default ({})", url, e.getMessage()); + errorTracker.incrementCounters(e); } - context.getCounter("injector", "urls_injected").increment(1); + urlsInjectedCounter.increment(1); context.write(key, datum); } } else if (value instanceof CrawlDatum) { @@ -248,14 +277,14 @@ public void map(Text key, Writable value, Context context) // remove 404 urls if (url404Purging && CrawlDatum.STATUS_DB_GONE == datum.getStatus()) { - context.getCounter("injector", "urls_purged_404").increment(1); + urlsPurged404Counter.increment(1); return; } if (filterNormalizeAll) { String url = 
filterNormalize(key.toString()); if (url == null) { - context.getCounter("injector", "urls_purged_filter").increment(1); + urlsPurgedFilterCounter.increment(1); } else { key.set(url); context.write(key, datum); @@ -270,22 +299,35 @@ public void map(Text key, Writable value, Context context) /** Combine multiple new entries for a url. */ public static class InjectReducer extends Reducer { - private int interval; - private float scoreInjected; private boolean overwrite = false; private boolean update = false; private CrawlDatum old = new CrawlDatum(); private CrawlDatum injected = new CrawlDatum(); + // Cached counter references for performance + private Counter urlsInjectedUniqueCounter; + private Counter urlsMergedCounter; + @Override public void setup(Context context) { Configuration conf = context.getConfiguration(); - interval = conf.getInt("db.fetch.interval.default", 2592000); - scoreInjected = conf.getFloat("db.score.injected", 1.0f); overwrite = conf.getBoolean("db.injector.overwrite", false); update = conf.getBoolean("db.injector.update", false); LOG.info("Injector: overwrite: {}", overwrite); LOG.info("Injector: update: {}", update); + + // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initCounters(Context context) { + urlsInjectedUniqueCounter = context.getCounter( + NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_INJECTED_UNIQUE_TOTAL); + urlsMergedCounter = context.getCounter( + NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_MERGED_TOTAL); } /** @@ -334,16 +376,20 @@ public void reduce(Text key, Iterable values, Context context) if (injectedSet && update) { // corresponds to rule (3.b.ii) in the method description old.putAllMetaData(injected); - old.setScore(injected.getScore() != scoreInjected - ? injected.getScore() : old.getScore()); - old.setFetchInterval(injected.getFetchInterval() != interval - ? 
injected.getFetchInterval() : old.getFetchInterval()); +// old.setScore(injected.getScore() != scoreInjected +// ? injected.getScore() : old.getScore()); +// old.setFetchInterval(injected.getFetchInterval() != interval +// ? injected.getFetchInterval() : old.getFetchInterval()); + // smoothly update score and interval + old.setScore((injected.getScore() + old.getScore()) / 2.0f); + old.setFetchInterval( + (injected.getFetchInterval() + old.getFetchInterval()) / 2); } } if (injectedSet) { - context.getCounter("injector", "urls_injected_unique").increment(1); + urlsInjectedUniqueCounter.increment(1); if (oldSet) { - context.getCounter("injector", "urls_merged").increment(1); + urlsMergedCounter.increment(1); } } context.write(key, result); @@ -454,17 +500,23 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, if (LOG.isInfoEnabled()) { long urlsInjected = job.getCounters() - .findCounter("injector", "urls_injected").getValue(); + .findCounter(NutchMetrics.GROUP_INJECTOR, + NutchMetrics.INJECTOR_URLS_INJECTED_TOTAL).getValue(); long urlsInjectedUniq = job.getCounters() - .findCounter("injector", "urls_injected_unique").getValue(); + .findCounter(NutchMetrics.GROUP_INJECTOR, + NutchMetrics.INJECTOR_URLS_INJECTED_UNIQUE_TOTAL).getValue(); long urlsFiltered = job.getCounters() - .findCounter("injector", "urls_filtered").getValue(); + .findCounter(NutchMetrics.GROUP_INJECTOR, + NutchMetrics.INJECTOR_URLS_FILTERED_TOTAL).getValue(); long urlsMerged = job.getCounters() - .findCounter("injector", "urls_merged").getValue(); + .findCounter(NutchMetrics.GROUP_INJECTOR, + NutchMetrics.INJECTOR_URLS_MERGED_TOTAL).getValue(); long urlsPurged404 = job.getCounters() - .findCounter("injector", "urls_purged_404").getValue(); + .findCounter(NutchMetrics.GROUP_INJECTOR, + NutchMetrics.INJECTOR_URLS_PURGED_404_TOTAL).getValue(); long urlsPurgedFilter = job.getCounters() - .findCounter("injector", "urls_purged_filter").getValue(); + 
.findCounter(NutchMetrics.GROUP_INJECTOR, + NutchMetrics.INJECTOR_URLS_PURGED_FILTER_TOTAL).getValue(); LOG.info("Injector: Total urls rejected by filters: {}", urlsFiltered); LOG.info( "Injector: Total urls injected after normalization and filtering: {} (unique URLs: {})", @@ -482,10 +534,12 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, "Injector: Total urls with status gone removed from CrawlDb (db.update.purge.404): {}", urlsPurged404); } - - stopWatch.stop(); - LOG.info("Injector: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); } + + stopWatch.stop(); + LOG.info("Injector: finished, elapsed: {} ms", + stopWatch.getTime(TimeUnit.MILLISECONDS)); + } catch (IOException | InterruptedException | ClassNotFoundException | NullPointerException e) { LOG.error("Injector job failed: {}", e.getMessage()); NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs); diff --git a/src/java/org/apache/nutch/crawl/NutchWritable.java b/src/java/org/apache/nutch/crawl/NutchWritable.java index 589b8b973c..e9eddec0b7 100644 --- a/src/java/org/apache/nutch/crawl/NutchWritable.java +++ b/src/java/org/apache/nutch/crawl/NutchWritable.java @@ -48,7 +48,9 @@ public class NutchWritable extends GenericWritableConfigurable { org.apache.nutch.protocol.Content.class, org.apache.nutch.protocol.ProtocolStatus.class, org.apache.nutch.scoring.webgraph.LinkDatum.class, - org.apache.nutch.hostdb.HostDatum.class }; + org.apache.nutch.hostdb.HostDatum.class, + org.commoncrawl.util.WarcCapture.class + }; } public NutchWritable() { diff --git a/src/java/org/apache/nutch/crawl/SitemapInjector.java b/src/java/org/apache/nutch/crawl/SitemapInjector.java new file mode 100644 index 0000000000..a8a9152afe --- /dev/null +++ b/src/java/org/apache/nutch/crawl/SitemapInjector.java @@ -0,0 +1,1324 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.crawl; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.PriorityQueue; +import java.util.Random; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.commons.lang3.time.StopWatch; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Counter; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.TaskCounter; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat; +import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper; +import 
org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metrics.NutchMetrics; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.ProtocolNotFound; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.scoring.ScoringFilterException; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.util.concurrent.ThreadFactoryBuilder; + +import crawlercommons.domains.EffectiveTldFinder; +import crawlercommons.robots.BaseRobotRules; +import crawlercommons.sitemaps.AbstractSiteMap; +import crawlercommons.sitemaps.SiteMap; +import crawlercommons.sitemaps.SiteMapIndex; +import crawlercommons.sitemaps.SiteMapParser; +import crawlercommons.sitemaps.SiteMapURL; +import crawlercommons.sitemaps.extension.Extension; +import crawlercommons.sitemaps.extension.ExtensionMetadata; +import crawlercommons.sitemaps.extension.LinkAttributes; + +/** + * Inject URLs from sitemaps (https://www.sitemaps.org/). + * + * Sitemap URLs are given same way as "ordinary" seeds URLs - one URL per line. + * Each URL points to one of + *

XML sitemap
plain text sitemap (possibly compressed)
sitemap index (XML)
and all + * other + * formats supported by the Sitemap parser of crawler-commons.

+ * + *

+ * All sitemap URLs on the input path are fetched and the URLs contained in the + * sitemaps are "injected" into the CrawlDb. If a sitemap specifies modification + * time, refresh rate, and/or priority for a page, these values are stored in + * the CrawlDb but adjusted so that they fit into global limits. E.g., + * + *

+ * <changefreq>yearly</changefreq>
+ *

+ * + * may be limited to the value of property + * db.fetch.schedule.max_interval and/or + * db.fetch.interval.max. + *

+ * + * The typical use case for the SitemapInjector is to feed the crawl with a list + * of URLs maintained by the site's owner (generated, e.g., via a content + * management system). + * + * Fetching sitemaps is done by Nutch protocol plugins to make use of special + * settings, e.g., HTTP proxy configurations. + * + * The way entries in the CrawlDb are overwritten by injected entries + * does not differ from {@link Injector}. However, it is possible to run + * SitemapInjector in two steps: + *

Step 1: Extract URLs from sitemaps, store the URLs in a new CrawlDb.
Step 2: Inject URLs from the CrawlDb created in Step 1 into another + * CrawlDb.

+ * + *

Specifics and Limitations

+ * + * SitemapInjector does not support: + *

Retry scheduling if fetching a sitemap fails.
Guarantee polite delays between fetching sitemaps from the same host. + * Usually, there is only one sitemap per host, so this does not matter that + * much. But it should be ensured that the input list of sitemap URLs does not + * contain multiple or many sitemaps hosted on the same system.

+ * + * The following features are implemented: + *

Respect robots.txt rules: do not access sitemaps disallowed per + * robots.txt
Apply URL filters and normalization rules to URLs of sitemaps and URLs + * listed in sitemaps.
Follow redirects.
Check for + * "cross + * submits": if a sitemap URL is explicitly given it is assumed the + * sitemap's content is trustworthy.
Configure multiple limits on sitemap fetching and processing, to avoid + * that the sitemap processing is overloaded, gets stuck, or too many URLs are + * emitted. See + * {@link SitemapInjector.SitemapInjectMapper.SitemapProcessor#processSitemap(AbstractSiteMap, Set, int)} + * for more details.

+ * + */ +public class SitemapInjector extends Injector { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + protected int threads = 8; + protected boolean keepTemp; + protected boolean runStepOneOnly; + protected boolean runStepTwoOnly; + + /** Fetch and parse sitemaps, output extracted URLs as seeds */ + public static class SitemapInjectMapper extends InjectMapper { + + private static final String SITEMAP_MAX_URLS = "db.injector.sitemap.max_urls"; + private static final String SITEMAP_MAX_HOSTS = "db.injector.sitemap.max_hosts"; + private static final String SITEMAP_CROSS_SUBMIT_CHECK = "db.injector.sitemap.check-cross-submits"; + private static final String SITEMAP_CROSS_SUBMIT_CHECK_TYPE = "db.injector.sitemap.check-cross-submit.type"; + private static final String SITEMAP_CROSS_SUBMITS = "db.injector.sitemap.cross-submits"; + + protected float minInterval; + protected float maxInterval; + + protected int maxRecursiveSitemaps = 50001; + /** limit for (deeply) nested sitemap indexes */ + protected int maxRecursiveSitemapDepth = 3; + protected long maxUrlsPerSitemapIndex = 50000L * 50000; + + /** + * Need a limit on the branching factor, sitemaps from spam hosts may refer + * to hundreds of hosts + */ + protected int maxHostsPerSitemapIndex = 100; + + protected int maxSitemapFetchTime = 180; + protected int maxSitemapProcessingTime; + protected int maxUrlLength = 512; + + protected boolean checkRobotsTxt = true; + protected boolean checkCrossSubmits = true; + + enum CrossSubmitType { + PUBLIC_DOMAIN, PRIVATE_DOMAIN, HOST + } + + protected CrossSubmitType checkCrossSubmitsType = CrossSubmitType.PRIVATE_DOMAIN; + + protected int maxFailuresPerHost = 5; + protected int maxRedirect = 3; + + private ProtocolFactory protocolFactory; + private SiteMapParser sitemapParser; + private ExecutorService executorService; + private Map failuresPerHost = new HashMap<>(); + + @Override + public void setup(Context context) { 
+ super.setup(context); + + Configuration conf = context.getConfiguration(); + + protocolFactory = new ProtocolFactory(conf); + + /* + * SiteMapParser to allow "cross submits" from different prefixes (up to + * the last slash), cf. https://www.sitemaps.org/protocol.html#location. + * + * strict = true : do not allow cross submits. This would need to pass a + * set of cross-submit allowed hosts beforehand which is not supported by + * the sitemap parser. Done in SitemapInjector, see below. + */ + boolean strict = conf.getBoolean("db.injector.sitemap.strict", false); + sitemapParser = new SiteMapParser(strict, true); + sitemapParser.setStrictNamespace(true); + sitemapParser.addAcceptedNamespace( + crawlercommons.sitemaps.Namespace.SITEMAP_LEGACY); + sitemapParser + .addAcceptedNamespace(crawlercommons.sitemaps.Namespace.NEWS); + sitemapParser + .addAcceptedNamespace(crawlercommons.sitemaps.Namespace.EMPTY); + // enable support for localized links in sitemaps + sitemapParser.enableExtension(Extension.LINKS); + + maxRecursiveSitemaps = conf.getInt("db.injector.sitemap.index_max_size", + 50001); + maxRecursiveSitemapDepth = conf + .getInt("db.injector.sitemap.index_max_depth", 3); + maxUrlsPerSitemapIndex = conf.getLong(SITEMAP_MAX_URLS, 50000L * 50000); + maxHostsPerSitemapIndex = conf.getInt(SITEMAP_MAX_HOSTS, 100); + + checkRobotsTxt = conf.getBoolean("db.injector.sitemap.checkrobotstxt", + true); + checkCrossSubmits = conf.getBoolean(SITEMAP_CROSS_SUBMIT_CHECK, true); + checkCrossSubmitsType = CrossSubmitType + .valueOf(conf.get(SITEMAP_CROSS_SUBMIT_CHECK_TYPE, "PRIVATE_DOMAIN")); + + /* + * Make sure a sitemap is entirely, even recursively processed within 80% + * of the task timeout, do not start processing a subsitemap if fetch and + * parsing time may hit the task timeout + */ + int taskTimeout = conf.getInt("mapreduce.task.timeout", 900000); + LOG.info("mapreduce.task.timeout = {} ms", taskTimeout); + taskTimeout /= 1000; // now in seconds + 
LOG.info("http.time.limit = {} seconds", + conf.getInt("http.time.limit", 120)); + maxSitemapFetchTime = (int) (conf.getInt("http.time.limit", 120) * 1.5); + maxSitemapProcessingTime = taskTimeout - (2 * maxSitemapFetchTime); + if ((taskTimeout * .8) < maxSitemapProcessingTime + || maxSitemapProcessingTime < 1) { + maxSitemapProcessingTime = (int) (taskTimeout * .8); + } + LOG.info("Max. sitemap processing time: {} seconds", + maxSitemapProcessingTime); + maxFailuresPerHost = conf + .getInt("db.injector.sitemap.max.fetch.failures.per.host", 5); + + maxRedirect = conf.getInt("db.injector.sitemap.max.redirect", 3); + + // fetch intervals defined in sitemap should within the defined range + minInterval = conf.getFloat("db.fetch.schedule.adaptive.min_interval", + 60); + maxInterval = conf.getFloat("db.fetch.schedule.max_interval", + 365 * 24 * 3600); + if (maxInterval > conf.getInt("db.fetch.interval.max", 365 * 24 * 3600)) { + maxInterval = conf.getInt("db.fetch.interval.max", 365 * 24 * 3600); + } + + /* + * Sitemaps can be quite large, so it is desirable to increase the content + * limits above defaults (1 MiB) to the 50 MiB specified in the sitemaps + * protocol: + */ + String[] contentLimitProperties = { "http.content.limit", + "ftp.content.limit", "file.content.limit" }; + for (int i = 0; i < contentLimitProperties.length; i++) { + conf.setInt(contentLimitProperties[i], SiteMapParser.MAX_BYTES_ALLOWED); + } + + executorService = Executors.newCachedThreadPool(new ThreadFactoryBuilder() + .setNameFormat("sitemapinj-%d").setDaemon(true).build()); + } + + public void map(Text key, Writable value, Context context) + throws IOException, InterruptedException { + + // one line in the text file (= one sitemap URL): + // - key is the first field, + // - metadata is contained in value + String url = key.toString().trim(); + if (url.isEmpty() || url.startsWith("#")) { + // skip empty URLs or comment lines starting with '#' + return; + } + + float customScore = 0.0f; + long 
maxUrls = maxUrlsPerSitemapIndex; + int maxHosts = maxHostsPerSitemapIndex; + Set crossSubmits = new HashSet<>(); + Metadata customMetadata = new Metadata(); + String metadata = value.toString().trim(); + if (metadata.length() > 0) { + String[] splits = metadata.split("[\t ]"); + for (String split : splits) { + int indexEquals = split.indexOf("="); + if (indexEquals == -1) + continue; + String metaname = split.substring(0, indexEquals); + String metavalue = split.substring(indexEquals + 1); + if (metaname.equals(nutchScoreMDName)) { + try { + customScore = Float.parseFloat(metavalue); + } catch (NumberFormatException nfe) { + LOG.error("Invalid custom score for sitemap seed {}: {} - {}", + url, metavalue, nfe.getMessage()); + } + } else if (metaname.equals(SITEMAP_MAX_URLS)) { + try { + maxUrls = Long.parseLong(metavalue); + LOG.info("Setting max. number of URLs per sitemap for {} = {}", + url, maxUrls); + } catch (NumberFormatException nfe) { + LOG.error("Invalid URL limit for sitemap seed {}: {} - {}", url, + metavalue, nfe.getMessage()); + } + } else if (metaname.equals(SITEMAP_MAX_HOSTS)) { + try { + maxHosts = Integer.parseInt(metavalue); + LOG.info("Setting max. 
number of hosts per sitemap for {} = {}", + url, maxHosts); + } catch (NumberFormatException nfe) { + LOG.error("Invalid host limit for sitemap seed {}: {} - {}", url, + metavalue, nfe.getMessage()); + } + } else if (metaname.equals(SITEMAP_CROSS_SUBMITS) + && checkCrossSubmits) { + for (String target : metavalue.split(",")) { + crossSubmits.add(target); + } + } else { + customMetadata.add(metaname, metavalue); + } + } + } + + SitemapProcessor sp = new SitemapProcessor(context, customScore, maxUrls, + maxHosts, crossSubmits); + sp.process(url); + } + + class FetchSitemapCallable implements Callable { + private Protocol protocol; + private String url; + private Context context; + + public FetchSitemapCallable(Protocol protocol, String url, + Context context) { + this.protocol = protocol; + this.url = url; + this.context = context; + } + + @Override + public ProtocolOutput call() throws Exception { + Text turl = new Text(url); + if (checkRobotsTxt) { + BaseRobotRules rules = protocol.getRobotRules(turl, null, null); + if (!rules.isAllowed(url)) { + LOG.info("Fetch of sitemap forbidden by robots.txt: {}", url); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_ROBOTSTXT_DISALLOW_TOTAL).increment(1); + return null; + } + } + return protocol.getProtocolOutput(turl, new CrawlDatum()); + } + } + + class ParseSitemapCallable implements Callable { + private Content content; + private String url; + private AbstractSiteMap sitemap; + + public ParseSitemapCallable(Content content, Object urlOrSitemap) { + this.content = content; + if (urlOrSitemap instanceof String) + this.url = (String) urlOrSitemap; + else if (urlOrSitemap instanceof AbstractSiteMap) + this.sitemap = (AbstractSiteMap) urlOrSitemap; + else + throw new IllegalArgumentException( + "URL (String) or sitemap (AbstractSiteMap) required as argument"); + } + + @Override + public AbstractSiteMap call() throws Exception { + if (sitemap != null) { + return 
sitemapParser.parseSiteMap(content.getContentType(), + content.getContent(), sitemap); + } else { + return sitemapParser.parseSiteMap(content.getContentType(), + content.getContent(), new URL(url)); + } + } + } + + class ScoredSitemap implements Comparable { + double score; + AbstractSiteMap sitemap; + + public ScoredSitemap(double score, AbstractSiteMap sitemap) { + this.score = score; + this.sitemap = sitemap; + } + + @Override + public int compareTo(ScoredSitemap other) { + return Double.compare(other.score, this.score); + } + } + + private void incrementFailuresPerHost(String hostName) { + int failures = 1; + if (failuresPerHost.containsKey(hostName)) { + failures += failuresPerHost.get(hostName); + } + failuresPerHost.put(hostName, failures); + } + + /** Wrapper for (recursively) fetching and parsing a sitemap */ + class SitemapProcessor { + Context context; + float customScore; + long maxUrls; + int maxHosts; + + long startTime = System.currentTimeMillis(); + long totalUrls = 0; + Set injectedHosts = new HashSet<>(); + Set crossSubmits; + + public SitemapProcessor(Context context, float customScore, long maxUrls, + int maxHosts, Set crossSubmits) { + this.context = context; + this.maxUrls = maxUrls; + this.maxHosts = maxHosts; + this.crossSubmits = crossSubmits; + + // distribute site score to outlinks + // TODO: should be by real number of outlinks not the maximum allowed + customScore /= maxUrls; + this.customScore = customScore; + } + + /** + * Within limited time: parse and process a sitemap (recursively, in case + * of a sitemap index) and inject URLs + */ + public void process(String url) { + Content content = getContent(url); + if (content == null) { + return; + } + + AbstractSiteMap sitemap = null; + try { + sitemap = parseSitemap(content, url); + } catch (Exception e) { + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_FAILED_TO_PARSE_TOTAL).increment(1); + LOG.warn("failed to parse sitemap {}: {}", url, + 
StringUtils.stringifyException(e)); + return; + } + LOG.info("parsed sitemap {} ({})", url, sitemap.getType()); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_TYPE_PREFIX + + sitemap.getType().toString().toLowerCase(Locale.ROOT)) + .increment(1); + + if (checkCrossSubmits) { + String host = sitemap.getUrl().getHost(); + String crossSubmit = host; + if (checkCrossSubmitsType == CrossSubmitType.PRIVATE_DOMAIN) { + crossSubmit = EffectiveTldFinder.getAssignedDomain(host, false, + false); + } else if (checkCrossSubmitsType == CrossSubmitType.PUBLIC_DOMAIN) { + crossSubmit = EffectiveTldFinder.getAssignedDomain(host, false, + true); + } + if (crossSubmit != null) { + crossSubmits.add(crossSubmit); + } + } + + try { + processSitemap(sitemap, null, 0); + } catch (IOException | InterruptedException e) { + LOG.warn("failed to process sitemap {}: {}", url, + StringUtils.stringifyException(e)); + } + LOG.info("Injected total {} URLs for {}", totalUrls, url); + + } + + /** + * Parse a sitemap and inject all contained URLs. In case of a sitemap + * index, sitemaps are fetched and processed recursively until one of the + * configurable limits apply: + *

max. depth (db.injector.sitemap.index_max_depth)
max. processing time (recursively, depends on + * mapreduce.task.timeout)
no URLs found at 50% of the processing time
max. number of recursive sitemaps + * (db.injector.sitemap.index_max_size)
50% of the max. number of recursive sitemaps failed to process
max. number of URLs for this sitemap + * (db.injector.sitemap.max_urls)

+ * + * Subsitemaps from a sitemap index are selected randomly but giving + * precedence to sitemaps recently published or coming in front of the + * list of subsitemaps. + * + * @param sitemap + * the sitemap to process + * @param processedSitemaps + * set of recursively processed sitemaps, required to skip + * duplicates and to apply limits + * @param depth + * the current depth when processing sitemaps recursively + * @throws IOException + * @throws InterruptedException + */ + public void processSitemap(AbstractSiteMap sitemap, + Set processedSitemaps, int depth) + throws IOException, InterruptedException { + + if (sitemap.isIndex()) { + processSitemapIndex((SiteMapIndex) sitemap, processedSitemaps, depth); + return; + } + + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_PROCESSED_TOTAL).increment(1); + injectURLs((SiteMap) sitemap); + if (totalUrls >= maxUrls) { + LOG.warn( + "Sitemap index URL limit reached, skipped remaining urls of {}", + sitemap.getUrl()); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_INDEX_AFFECTED_BY_URL_LIMIT_TOTAL) + .increment(1); + } + sitemap.setProcessed(true); + } + + private void processSitemapIndex(SiteMapIndex sitemapIndex, + Set processedSitemaps, int depth) + throws IOException, InterruptedException { + if (processedSitemaps == null) { + processedSitemaps = new HashSet(); + processedSitemaps.add(sitemapIndex.getUrl().toString()); + } + if (++depth > maxRecursiveSitemapDepth) { + LOG.warn( + "Depth limit reached recursively processing sitemap index {}", + sitemapIndex.getUrl()); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_INDEX_AFFECTED_BY_DEPTH_LIMIT_TOTAL) + .increment(1); + return; + } + + // choose subsitemaps randomly with a preference for elements in front + // and recently published sitemaps + PriorityQueue sitemaps = new PriorityQueue<>(); + int subSitemaps = 0; + for (AbstractSiteMap s : 
sitemapIndex.getSitemaps()) { + subSitemaps++; + double publishScore = 0.3; + if (s.getLastModified() != null) { + double elapsedMonthsSincePublished = (System.currentTimeMillis() + - s.getLastModified().getTime()) / (1000.0 * 60 * 60 * 24 * 30); + publishScore = (1.0 / Math.log(1.0 + elapsedMonthsSincePublished)); + } + double score = (1.0 / subSitemaps) + publishScore + Math.random(); + sitemaps.add(new ScoredSitemap(score, s)); + } + + int failedSubSitemaps = 0; + while (sitemaps.size() > 0) { + + long elapsed = (System.currentTimeMillis() - startTime) / 1000; + if (elapsed > maxSitemapProcessingTime) { + LOG.warn( + "Max. processing time reached, skipped remaining sitemaps of sitemap index {}", + sitemapIndex.getUrl()); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_INDEX_AFFECTED_BY_TIME_LIMIT_TOTAL) + .increment(1); + return; + } + if ((totalUrls == 0) && (elapsed > (maxSitemapProcessingTime / 2))) { + LOG.warn( + "Half of processing time elapsed and no URLs injected, skipped remaining sitemaps of sitemap index {}", + sitemapIndex.getUrl()); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_INDEX_NO_URLS_AFTER_50_PERCENT_OF_TIME_LIMIT_TOTAL) + .increment(1); + return; + } + if (failedSubSitemaps > (maxRecursiveSitemaps / 2)) { + // do not spend too much time to fetch broken subsitemaps + LOG.warn( + "Too many failures, skipped remaining sitemaps of sitemap index {}", + sitemapIndex.getUrl()); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_INDEX_TOO_MANY_FAILURES_TOTAL) + .increment(1); + return; + } + + AbstractSiteMap nextSitemap = sitemaps.poll().sitemap; + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_INDEX_PROCESSED_SITEMAPS_TOTAL) + .increment(1); + + String url = nextSitemap.getUrl().toString(); + if (processedSitemaps.contains(url)) { + LOG.warn("skipped duplicated or recursive sitemap URL {}", url); + 
context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_SKIPPED_DUPLICATE_OR_RECURSIVE_URL_TOTAL) + .increment(1); + nextSitemap.setProcessed(true); + continue; + } + if (processedSitemaps.size() > maxRecursiveSitemaps) { + LOG.warn("{} sitemaps processed for {}, skipped remaining sitemaps", + processedSitemaps.size(), sitemapIndex.getUrl()); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_INDEX_MAX_SITEMAPS_LIMIT_TOTAL) + .increment(1); + return; + } + if (totalUrls >= maxUrls) { + LOG.warn( + "URL limit reached, skipped remaining sitemaps of sitemap index {}", + sitemapIndex.getUrl()); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_INDEX_AFFECTED_BY_URL_LIMIT_TOTAL) + .increment(1); + return; + } + + processedSitemaps.add(url); + + Content content = getContent(url); + if (content == null) { + nextSitemap.setProcessed(true); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_FAILED_TO_FETCH_TOTAL).increment(1); + failedSubSitemaps++; + continue; + } + + try { + AbstractSiteMap parsedSitemap = parseSitemap(content, nextSitemap); + processSitemap(parsedSitemap, processedSitemaps, depth); + } catch (Exception e) { + LOG.warn("failed to parse sitemap {}: {}", nextSitemap.getUrl(), + StringUtils.stringifyException(e)); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_FAILED_TO_PARSE_TOTAL).increment(1); + failedSubSitemaps++; + } + nextSitemap.setProcessed(true); + } + sitemapIndex.setProcessed(true); + } + + private Content getContent(String url) { + if (url.length() > maxUrlLength) { + LOG.warn( + "Not fetching sitemap with overlong URL: {} ... 
(truncated, length = {} characters)", + url.substring(0, maxUrlLength), url.length()); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_SKIPPED_OVERLONG_URL_TOTAL).increment(1); + return null; + } + String origUrl = url; + url = filterNormalize(url); + if (url == null) { + LOG.warn("Sitemap rejected by URL filters: {}", origUrl); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_REJECTED_BY_URL_FILTERS_TOTAL) + .increment(1); + return null; + } + String hostName; + try { + hostName = new URL(url).getHost(); + } catch (MalformedURLException e) { + return null; + } + if (failuresPerHost.containsKey(hostName) + && failuresPerHost.get(hostName) > maxFailuresPerHost) { + LOG.info("Skipped, too many failures per host: {}", url); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_SKIPPED_TOO_MANY_FAILURES_PER_HOST_TOTAL) + .increment(1); + return null; + } + Protocol protocol = null; + try { + protocol = protocolFactory.getProtocol(url); + } catch (ProtocolNotFound e) { + LOG.error("Protocol not found: {}", url); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_PROTOCOL_NOT_SUPPORTED_TOTAL) + .increment(1); + return null; + } + + LOG.info("Fetching sitemap: {}", url); + ProtocolOutput protocolOutput = null; + origUrl = url; + int redirects = 0; + do { + if (redirects > 0) { + LOG.info("Fetching redirected sitemap: {}", url); + } + FetchSitemapCallable fetch = new FetchSitemapCallable(protocol, url, + context); + Future task = executorService.submit(fetch); + try { + protocolOutput = task.get(maxSitemapFetchTime, TimeUnit.SECONDS); + } catch (Exception e) { + if (e instanceof TimeoutException) { + LOG.error("fetch of sitemap {} timed out", url); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_FAILED_TO_FETCH_TIMEOUT_TOTAL) + .increment(1); + } else { + LOG.error("fetch of sitemap {} failed with: {}", url, 
+ StringUtils.stringifyException(e)); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_FAILED_TO_FETCH_EXCEPTION_TOTAL) + .increment(1); + } + task.cancel(true); + incrementFailuresPerHost(hostName); + return null; + } finally { + fetch = null; + } + + if (protocolOutput == null) { + return null; + } + + if (protocolOutput.getStatus().isRedirect()) { + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_REDIRECT_TOTAL).increment(1); + String redirUrl = protocolOutput.getStatus().getArgs()[0]; + url = filterNormalize(redirUrl); + if (url == null) { + LOG.info( + "Redirect target of sitemap {} rejected by URL filters: {}", + origUrl, redirUrl); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_REDIRECT_TARGET_REJECTED_BY_URL_FILTERS_TOTAL) + .increment(1); + return null; + } + // TODO: cross-submitting via redirects? + // - dangerous: if a spammer redirects sitemaps + // it would allow arbitrary domains + // try { + // String host = new URL(url).getHost(); + // String domain = EffectiveTldFinder.getAssignedDomain(host, true, + // true); + // crossSubmitDomains.add(domain); + // } catch (MalformedURLException e) { + // // should not happen, as URL already has been checked by + // filters/normalizers + // } + redirects++; + if (redirects >= maxRedirect) { + LOG.warn("sitemap redirect limit exceeded: {}", origUrl); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_REDIRECT_LIMIT_EXCEEDED_TOTAL) + .increment(1); + // return to avoid that exceeded redirects are counted twice + // (also as non-success fetch status) + return null; + } + } + } while (protocolOutput.getStatus().isRedirect() + && redirects < maxRedirect); + + if (!protocolOutput.getStatus().isSuccess()) { + LOG.error("fetch of sitemap {} failed with status code {}", url, + protocolOutput.getStatus().getCode()); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + 
NutchMetrics.SITEMAP_FAILED_TO_FETCH_CONTENT_HTTP_STATUS_CODE_NOT_200_TOTAL) + .increment(1); + incrementFailuresPerHost(hostName); + return null; + } + + Content content = protocolOutput.getContent(); + if (content == null) { + LOG.error("No content for {}, status: {}", url, + protocolOutput.getStatus().getMessage()); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_EMPTY_CONTENT_TOTAL).increment(1); + incrementFailuresPerHost(hostName); + return null; + } + return content; + } + + private AbstractSiteMap parseSitemap(Content content, Object urlOrSitemap) + throws Exception { + ParseSitemapCallable parse = new ParseSitemapCallable(content, + urlOrSitemap); + Future task = executorService.submit(parse); + AbstractSiteMap sitemap = null; + try { + // not a recursive task, should be fast + sitemap = task.get((1 + maxSitemapProcessingTime / 5), + TimeUnit.SECONDS); + } finally { + parse = null; + } + return sitemap; + } + + /** + * Inject all URLs contained in one {@link SiteMap}. 
+ */ + public void injectURLs(SiteMap sitemap) + throws IOException, InterruptedException { + + Collection sitemapURLs = sitemap.getSiteMapUrls(); + if (sitemapURLs.size() == 0) { + LOG.info("No URLs in sitemap {}", sitemap.getUrl()); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_EMPTY_TOTAL).increment(1); + return; + } + LOG.info("Found {} URLs in {}", sitemapURLs.size(), sitemap.getUrl()); + + // random selection of URLs in case the sitemap contains more than + // accepted + // TODO: + // - for sitemap index: should be done over multiple sub-sitemaps + // - need to consider that URLs may be filtered away + // => use "reservoir sampling" + // (https://en.wikipedia.org/wiki/Reservoir_sampling) + Random random = null; + float randomSelect = 0.0f; + if (sitemapURLs.size() > (maxUrls - totalUrls)) { + randomSelect = (maxUrls - totalUrls) / (.95f * sitemapURLs.size()); + if (randomSelect < 1.0f) { + random = new Random(); + } + } + + AtomicLong crossSubmitsRejected = new AtomicLong(0); + AtomicLong hostLimitRejected = new AtomicLong(0); + + for (SiteMapURL siteMapURL : sitemapURLs) { + + if (totalUrls >= maxUrls) { + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_URL_LIMIT_REACHED_TOTAL).increment(1); + LOG.info("URL limit ({}) reached for {}", maxUrls, + sitemap.getUrl()); + break; + } + + if (random != null) { + if (randomSelect > random.nextFloat()) { + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_RANDOM_SKIP_TOTAL).increment(1); + continue; + } + } + + // TODO: score and fetch interval should be transparently overridden + float sitemapScore = (float) siteMapURL.getPriority(); + sitemapScore *= customScore; + int sitemapInterval = getChangeFrequencySeconds( + siteMapURL.getChangeFrequency()); + long lastModified = -1; + if (siteMapURL.getLastModified() != null) { + lastModified = siteMapURL.getLastModified().getTime(); + } + + injectURL(siteMapURL.getUrl(), 
sitemapScore, sitemapInterval, lastModified, + crossSubmitsRejected, hostLimitRejected); + + /* + * Inject localized links if there are any. See + * + * and + * + */ + ExtensionMetadata[] linkAttrs = siteMapURL + .getAttributesForExtension(Extension.LINKS); + if (linkAttrs != null) { + for (ExtensionMetadata attr : linkAttrs) { + LinkAttributes linkAttr = (LinkAttributes) attr; + URL href = linkAttr.getHref(); + if (href != null) { + injectURL(href, sitemapScore, sitemapInterval, lastModified, + crossSubmitsRejected, hostLimitRejected); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + "sitemap_extension_localized_link").increment(1); + } + } + } + + } + if (crossSubmitsRejected.get() > 0) { + LOG.info("Rejected {} cross-submits for {} ({})", + crossSubmitsRejected.get(), sitemap.getUrl(), + sitemap.getType().toString()); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_URLS_SKIPPED_NOT_ALLOWED_BY_CROSS_SUBMITS_TOTAL) + .increment(crossSubmitsRejected.get()); + } + if (hostLimitRejected.get() > 0) { + LOG.info( + "Rejected {} URLs because max. number of linked hosts is reached for {} ({})", + hostLimitRejected.get(), sitemap.getUrl(), + sitemap.getType().toString()); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_URLS_SKIPPED_HOST_LIMIT_REACHED_TOTAL) + .increment(hostLimitRejected.get()); + } + } + + public void injectURL(URL u, float sitemapScore, int sitemapInterval, + long lastModified, AtomicLong crossSubmitsRejected, AtomicLong hostLimitRejected) throws IOException, InterruptedException { + String url = u.toString(); + if (url.length() > maxUrlLength) { + LOG.warn( + "Skipping overlong URL: {} ... 
(truncated, length = {} characters)", + url.substring(0, maxUrlLength), url.length()); + return; + } + + // for simplicity do host and domain checks before normalization + String host = u.getHost(); + if (injectedHosts.size() >= maxHosts && !injectedHosts.contains(host)) { + hostLimitRejected.incrementAndGet(); + return; + } + + if (checkCrossSubmits) { + String crossSubmit = host; + if (checkCrossSubmitsType == CrossSubmitType.PRIVATE_DOMAIN) { + crossSubmit = EffectiveTldFinder.getAssignedDomain(host, false, + false); + } else if (checkCrossSubmitsType == CrossSubmitType.PUBLIC_DOMAIN) { + crossSubmit = EffectiveTldFinder.getAssignedDomain(host, false, + true); + } + if (crossSubmit == null || !crossSubmits.contains(crossSubmit)) { + crossSubmitsRejected.incrementAndGet(); + return; + } + } + try { + url = filterNormalize(url); + } catch (Exception e) { + LOG.warn("Skipping {}:", url, e); + url = null; + } + if (url == null) { + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_URLS_FROM_REJECTED_BY_URL_FILTERS) + .increment(1); + } else { + // URL passed normalizers and filters + totalUrls++; + Text value = new Text(url); + CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED, + sitemapInterval, sitemapScore); + if (lastModified != -1) { + // datum.setModifiedTime(lastModified); + } + datum.setFetchTime(curTime); + + try { + scfilters.injectedScore(value, datum); + } catch (ScoringFilterException e) { + LOG.warn( + "Cannot filter injected score for url {}, using default ({})", + url, e.getMessage()); + } + + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_URLS_INJECTED).increment(1); + context.write(value, datum); + injectedHosts.add(host); + } + } + } + + /** + * Determine fetch schedule intervals based on given + * changefrequency but adjusted to min. and max. 
intervals + * + * @param changeFrequency + * @return interval in seconds + */ + private int getChangeFrequencySeconds( + SiteMapURL.ChangeFrequency changeFrequency) { + float cf = interval; + if (changeFrequency != null) { + switch (changeFrequency) { + case NEVER: + cf = maxInterval; + break; + case YEARLY: + cf = 365 * 24 * 3600; + break; + case MONTHLY: + cf = 30 * 24 * 3600; + break; + case WEEKLY: + cf = 7 * 24 * 3600; + break; + case DAILY: + cf = 24 * 3600; + break; + case HOURLY: + cf = 3600; + break; + case ALWAYS: + cf = minInterval; + break; + } + } + if (cf < minInterval) { + cf = minInterval; + } else if (cf > maxInterval) { + cf = maxInterval; + } + return (int) cf; + } + + } + + public void inject(Path crawlDb, Path urlDir, boolean overwrite, + boolean update, boolean normalize, boolean filter, + boolean filterNormalizeAll) + throws IOException, ClassNotFoundException, InterruptedException { + + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("SitemapInjector: starting"); + LOG.info("SitemapInjector: crawlDb: {}", crawlDb); + LOG.info("SitemapInjector: urlDir: {}", urlDir); + // for all sitemap URLs listed in text input file(s) + // fetch and parse the sitemap, and map the contained URLs to + // pairs + LOG.info( + "SitemapInjector: Fetching sitemaps, injecting URLs from sitemaps to crawl db entries."); + + // set configuration + Configuration conf = getConf(); + conf.setLong("injector.current.time", System.currentTimeMillis()); + conf.setBoolean("db.injector.overwrite", overwrite); + conf.setBoolean("db.injector.update", update); + conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize); + conf.setBoolean(CrawlDbFilter.URL_FILTERING, filter); + conf.setBoolean(URL_FILTER_NORMALIZE_ALL, filterNormalizeAll); + conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); + + Path tempDir; + Path lock = null; + if (runStepOneOnly) { + tempDir = crawlDb; + } else { + if (runStepTwoOnly) { + tempDir = urlDir; + } 
else { + tempDir = new Path(getConf().get("mapreduce.cluster.temp.dir", ".") + + "/sitemap-inject-temp-" + + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); + } + // lock an existing crawldb to prevent multiple simultaneous updates + lock = CrawlDb.lock(conf, crawlDb, false); + } + + if (!runStepTwoOnly) { + Job sitemapJob = NutchJob.getInstance(getConf()); + sitemapJob.setJobName("process sitemaps " + urlDir); + sitemapJob.setJarByClass(SitemapInjector.class); + sitemapJob.setInputFormatClass(KeyValueTextInputFormat.class); + KeyValueTextInputFormat.addInputPath(sitemapJob, urlDir); + + sitemapJob.setMapperClass(MultithreadedMapper.class); + MultithreadedMapper.setMapperClass(sitemapJob, SitemapInjectMapper.class); + MultithreadedMapper.setNumberOfThreads(sitemapJob, threads); + sitemapJob.setMapSpeculativeExecution(false); // mappers are fetching + // sitemaps + + FileOutputFormat.setOutputPath(sitemapJob, tempDir); + sitemapJob.setOutputFormatClass(SequenceFileOutputFormat.class); + sitemapJob.setOutputKeyClass(Text.class); + sitemapJob.setOutputValueClass(CrawlDatum.class); + + conf = sitemapJob.getConfiguration(); + conf.setLong("injector.current.time", System.currentTimeMillis()); + try { + // run the job + boolean success = sitemapJob.waitForCompletion(true); + if (!success) { + String message = "SitemapInjector job did not succeed, job status: " + + sitemapJob.getStatus().getState() + ", reason: " + + sitemapJob.getStatus().getFailureInfo(); + LOG.error(message); + NutchJob.cleanupAfterFailure(tempDir, lock, + tempDir.getFileSystem(conf)); + // throw exception so that calling routine can exit with error + throw new RuntimeException(message); + } + } catch (IOException | InterruptedException | ClassNotFoundException + | NullPointerException e) { + LOG.error("SitemapInjector job failed: {}", e.getMessage()); + NutchJob.cleanupAfterFailure(tempDir, lock, + tempDir.getFileSystem(conf)); + throw e; + } + + for (Counter counter : 
sitemapJob.getCounters() + .getGroup(NutchMetrics.GROUP_SITEMAP_INJECTOR)) { + LOG.info(String.format("SitemapInjector: %8d %s", counter.getValue(), + counter.getName())); + } + + stopWatch.suspend(); + LOG.info( + "SitemapInjector: finished fetching and processing sitemaps, elapsed: {}", + stopWatch.getTime(TimeUnit.MILLISECONDS)); + + if (runStepOneOnly) { + return; + } + stopWatch.resume(); + + long numOutputRecords = sitemapJob.getCounters() + .findCounter(TaskCounter.REDUCE_OUTPUT_RECORDS).getValue(); + if (numOutputRecords == 0) { + LOG.warn( + "No URLs found in sitemaps, skipping step 2 merging URLs into CrawlDb"); + return; + } + } + + // merge with existing CrawlDb + if (LOG.isInfoEnabled()) { + LOG.info("SitemapInjector: Merging injected urls into crawl db."); + } + Job mergeJob = CrawlDb.createJob(getConf(), crawlDb); + FileInputFormat.addInputPath(mergeJob, tempDir); + mergeJob.setReducerClass(InjectReducer.class); + conf = mergeJob.getConfiguration(); + if (filterNormalizeAll) { + conf.setBoolean(CrawlDbFilter.URL_FILTERING, filter); + conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize); + } else { + conf.setBoolean(CrawlDbFilter.URL_FILTERING, false); + conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, false); + } + + try { + // run the job + boolean success = mergeJob.waitForCompletion(true); + if (!success) { + String message = "SitemapInjector job did not succeed, job status: " + + mergeJob.getStatus().getState() + ", reason: " + + mergeJob.getStatus().getFailureInfo(); + LOG.error(message); + NutchJob.cleanupAfterFailure(tempDir, lock, + tempDir.getFileSystem(conf)); + // throw exception so that calling routine can exit with error + throw new RuntimeException(message); + } + } catch (IOException | InterruptedException | ClassNotFoundException + | NullPointerException e) { + LOG.error("SitemapInjector job failed: {}", e.getMessage()); + NutchJob.cleanupAfterFailure(tempDir, lock, tempDir.getFileSystem(conf)); + throw e; + } + + 
CrawlDb.install(mergeJob, crawlDb); + + // clean up + if (!(keepTemp || runStepOneOnly || runStepTwoOnly)) { + tempDir.getFileSystem(conf).delete(tempDir, true); + } + + stopWatch.stop(); + LOG.info("SitemapInjector: finished, elapsed: ", + stopWatch.getTime(TimeUnit.MILLISECONDS)); + } + + public void usage(String errorMessage) { + System.err.println(errorMessage + "\n"); + usage(); + } + + public void usage() { + System.err.println( + "Usage: SitemapInjector [-D...] [-threads ] [-overwrite|-update] [-noFilter] [-noNormalize] [-filterNormalizeAll]\n"); + System.err.println("\nFor sitemap URLs listed in seed input files:"); + System.err.println("\t- fetch and parse the sitemap (step 1)"); + System.err + .println("\t- inject URLs from sitemaps into the CrawlDb (step 2)"); + System.err.println( + "\t- using fetch intervals and scores from sitemaps if applicable"); + System.err.println("Options and properties of SitemapInjector"); + System.err.println( + "\t-threads \tNumber of threads created per mapper to fetch sitemap urls (default: 8)"); + System.err.println( + "\t-keepTemp\tDo not delete the temporary directory which contains the output of step 1"); + System.err.println( + "\t-step1\tOnly run step 1 ( is used as output path and must not exist)"); + System.err.println( + "\t-step2\tOnly run step 2 ( must point to the output of step 1)"); + System.err.println( + "\nIn addition, all options of Injector are supported, see below.\n"); + super.usage(); + } + + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), new SitemapInjector(), + args); + System.exit(res); + } + + public int run(String[] args) throws Exception { + if (args.length < 2) { + usage(); + return -1; + } + List superArguments = new ArrayList<>(); + for (int i = 0; i < args.length; i++) { + if (i < 2) { + superArguments.add(args[i]); + continue; + } + switch (args[i]) { + case "-threads": + i++; + if (i == args.length) { + usage("Argument 
-threads requires parameter"); + return -1; + } + threads = Integer.parseInt(args[i]); + break; + case "-keepTemp": + keepTemp = true; + break; + case "-step1": + runStepOneOnly = true; + break; + case "-step2": + runStepTwoOnly = true; + break; + default: + superArguments.add(args[i]); + } + } + if (runStepOneOnly && runStepTwoOnly) { + LOG.warn("Running step 1 and 2 as both -step1 and -step2 are defined."); + runStepOneOnly = false; + runStepTwoOnly = false; + return -1; + } + return super.run(superArguments.toArray(new String[0])); + } +} diff --git a/src/java/org/apache/nutch/crawl/URLPartitioner.java b/src/java/org/apache/nutch/crawl/URLPartitioner.java index cd1140a698..464bdbd44f 100644 --- a/src/java/org/apache/nutch/crawl/URLPartitioner.java +++ b/src/java/org/apache/nutch/crawl/URLPartitioner.java @@ -21,15 +21,21 @@ import java.net.URL; import java.net.MalformedURLException; import java.net.UnknownHostException; +import java.util.Locale; +import java.util.Map; +import java.util.TreeMap; import org.apache.hadoop.conf.Configurable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import crawlercommons.domains.EffectiveTldFinder; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.nutch.crawl.Generator2.SelectorReducer; import org.apache.nutch.net.URLNormalizers; -import org.apache.nutch.util.URLUtil; import org.apache.hadoop.mapreduce.Partitioner; /** @@ -50,6 +56,8 @@ public class URLPartitioner extends Partitioner implements Confi private URLNormalizers normalizers; private String mode = PARTITION_MODE_HOST; + private Map partitionsPerDomain = null; + private Configuration conf; @Override @@ -66,11 +74,38 @@ public void setConf(Configuration conf) { normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_PARTITION); } + /** + * Set domain-specific partition numbers. 
+ * + * @param limits + * domain-specific limits returned by + * {@link SelectorReducer#readLimitsFile(java.io.Reader)} + */ + public void setDomainLimits(Map limits) { + if (limits == null) { + return; + } + for (Map.Entry e : limits.entrySet()) { + if (e.getValue().numPartitions > 1) { + if (partitionsPerDomain == null) { + partitionsPerDomain = new TreeMap<>(); + } + partitionsPerDomain.put(e.getKey(), e.getValue().numPartitions); + } + } + LOG.info("Loaded domain-specific numbers of fetch lists for {} domains", + (partitionsPerDomain != null ? partitionsPerDomain.size() : 0)); + } + @Override public Configuration getConf() { return conf; } + public static String getDomainName(String host) { + return EffectiveTldFinder.getAssignedDomain(host, false, true); + } + /** Hash by host or domain name or IP address. */ @Override public int getPartition(Text key, Writable value, int numReduceTasks) { @@ -89,9 +124,13 @@ public int getPartition(Text key, Writable value, int numReduceTasks) { // failed to parse URL, must take URL string as fall-back hashCode = urlString.hashCode(); } else if (mode.equals(PARTITION_MODE_HOST)) { - hashCode = url.getHost().hashCode(); + hashCode = url.getHost().toLowerCase().hashCode(); } else if (mode.equals(PARTITION_MODE_DOMAIN)) { - hashCode = URLUtil.getDomainName(url).hashCode(); + String domainName = getDomainName(url.getHost()); + hashCode = domainName.hashCode(); + if (partitionsPerDomain != null && partitionsPerDomain.containsKey(domainName)) { + hashCode += ((url.getHost().toLowerCase(Locale.ROOT).hashCode() & Integer.MAX_VALUE) % partitionsPerDomain.get(domainName)); + } } else if (mode.equals(PARTITION_MODE_IP)) { try { InetAddress address = InetAddress.getByName(url.getHost()); diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java index ead0167dbe..12d1b88bae 100644 --- a/src/java/org/apache/nutch/fetcher/Fetcher.java +++ b/src/java/org/apache/nutch/fetcher/Fetcher.java @@ 
-34,6 +34,7 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.JobContext; @@ -48,6 +49,7 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.util.MimeUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -140,6 +142,10 @@ public static boolean isStoringContent(Configuration conf) { return conf.getBoolean("fetcher.store.content", true); } + public static boolean isStoringWarc(Configuration conf) { + return conf.getBoolean("fetcher.store.warc", false); + } + public static class FetcherRun extends Mapper { @@ -154,6 +160,13 @@ public static class FetcherRun extends private boolean storingContent; private boolean parsing; + // Cached counter references for performance + private Counter bytesDownloadedCounter; + private Counter hitByThroughputThresholdCounter; + private Counter hitByTimelimitCounter; + private Counter hungThreadsCounter; + private Counter hitByTimeoutCounter; + private AtomicInteger getActiveThreads() { return activeThreads; } @@ -192,11 +205,28 @@ public void setup(Mapper.Context context) parsing = isParsing(conf); } + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. 
+ */ + private void initCounters(Context context) { + bytesDownloadedCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_BYTES_DOWNLOADED_TOTAL); + hitByThroughputThresholdCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_THROUGHPUT_THRESHOLD_TOTAL); + hitByTimelimitCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL); + hungThreadsCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HUNG_THREADS_TOTAL); + hitByTimeoutCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_TIMEOUT_TOTAL); + } + @Override public void run(Context innerContext) throws IOException, InterruptedException { setup(innerContext); + initCounters(innerContext); try { Configuration conf = innerContext.getConfiguration(); LinkedList fetcherThreads = new LinkedList<>(); @@ -291,8 +321,7 @@ public void run(Context innerContext) pagesLastSec = pages.get() - pagesLastSec; bytesLastSec = (int) bytes.get() - bytesLastSec; - innerContext.getCounter("FetcherStatus", "bytes_downloaded") - .increment(bytesLastSec); + bytesDownloadedCounter.increment(bytesLastSec); reportStatus(innerContext, fetchQueues, pagesLastSec, bytesLastSec); @@ -330,9 +359,7 @@ public void run(Context innerContext) int hitByThrougputThreshold = fetchQueues.emptyQueues(); if (hitByThrougputThreshold != 0) - innerContext - .getCounter("FetcherStatus", "hitByThrougputThreshold") - .increment(hitByThrougputThreshold); + hitByThroughputThresholdCounter.increment(hitByThrougputThreshold); } } } @@ -413,8 +440,7 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) { if (!feeder.isAlive()) { int hitByTimeLimit = fetchQueues.checkTimelimit(); if (hitByTimeLimit != 0) - innerContext.getCounter("FetcherStatus", "hitByTimeLimit") - .increment(hitByTimeLimit); + hitByTimelimitCounter.increment(hitByTimeLimit); } /* @@ -426,17 +452,21 @@ else if 
(bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) { * fetcher.threads.timeout.divisor. */ if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) { - LOG.warn("Timeout reached with no new requests since {} seconds.", + LOG.warn( + "Timeout reached with no new requests since {} milliseconds.", timeout); - LOG.warn("Aborting with {} hung threads{}.", activeThreads, + LOG.warn("Aborting with {} hung or idle threads{}.", activeThreads, feeder.isAlive() ? " (queue feeder still alive)" : ""); - innerContext.getCounter("FetcherStatus", "hungThreads") - .increment(activeThreads.get()); + hungThreadsCounter.increment(activeThreads.get()); for (int i = 0; i < fetcherThreads.size(); i++) { FetcherThread thread = fetcherThreads.get(i); if (thread.isAlive()) { - LOG.warn("Thread #{} hung while processing {}", i, - thread.getReprUrl()); + if (thread.getReprUrl() != null) { + LOG.warn("Thread #{} hung while processing {}", i, + thread.getReprUrl()); + } else { + LOG.warn("Thread #{} idle", i); + } StackTraceElement[] stack = thread.getStackTrace(); StringBuilder sb = new StringBuilder(); sb.append("Stack of thread #").append(i).append(":\n"); @@ -466,8 +496,7 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) { fetchQueues.getTotalSize(), fetchQueues.getQueueCount(), feeder.isAlive() ? 
" (queue feeder still alive)" : ""); int hitByTimeout = fetchQueues.emptyQueues(); - innerContext.getCounter("FetcherStatus", "hitByTimeout") - .increment(hitByTimeout); + hitByTimeoutCounter.increment(hitByTimeout); return; } diff --git a/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java b/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java index 12dca9a945..b78a6c56e4 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java +++ b/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java @@ -17,31 +17,38 @@ package org.apache.nutch.fetcher; import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Locale; +import java.util.TimeZone; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.NutchWritable; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.MapFile; import org.apache.hadoop.io.MapFile.Writer.Option; import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.Text; import org.apache.hadoop.io.SequenceFile.CompressionType; -import org.apache.hadoop.util.Progressable; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.InvalidJobConfException; +import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.RecordWriter; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.hadoop.mapreduce.TaskAttemptContext; -import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; +import org.apache.hadoop.util.Progressable; +import org.apache.nutch.crawl.CrawlDatum; +import 
org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseOutputFormat; import org.apache.nutch.protocol.Content; +import org.commoncrawl.util.S3FileOutputFormat; +import org.commoncrawl.util.WarcCapture; +import org.commoncrawl.util.WarcOutputFormat; /** Splits FetcherOutput entries into multiple map files. */ -public class FetcherOutputFormat extends FileOutputFormat { +public class FetcherOutputFormat extends S3FileOutputFormat { @Override public void checkOutputSpecs(JobContext job) throws IOException { @@ -80,6 +87,7 @@ public RecordWriter getRecordWriter(TaskAttemptContext cont return new RecordWriter() { private MapFile.Writer contentOut; private RecordWriter parseOut; + private RecordWriter warcOut; { if (Fetcher.isStoringContent(conf)) { @@ -94,6 +102,38 @@ public RecordWriter getRecordWriter(TaskAttemptContext cont if (Fetcher.isParsing(conf)) { parseOut = new ParseOutputFormat().getRecordWriter(context); } + + if (Fetcher.isStoringWarc(conf)) { + // set start and end time of WARC capture + /* + * Note: start and end time are only reliable if fetcher is configured + * with a time limit (fetcher.timelimit.mins), otherwise the time is + * that of reduce task started which is after the fetch has happened + * in the map tasks. + */ + long timelimitMins = conf.getLong("fetcher.timelimit.mins", -1); + long startTime = System.currentTimeMillis(); + long endTime = startTime; + if (timelimitMins > 0) { + long timelimit = conf.getLong("fetcher.timelimit", -1); + /* + * Note: the current time might be before the timelimit, however, if + * a reduce task fails it gets assigned another end time and + * consequently output file name. This may cause duplicate WARC + * files in the worst case. We take the latest possible end time + * (when the time limit applies) as end time. 
+ */ + endTime = timelimit; + startTime = endTime - (timelimitMins * 60 * 1000); + } + SimpleDateFormat fileDate = new SimpleDateFormat("yyyyMMddHHmmss", + Locale.US); + fileDate.setTimeZone(TimeZone.getTimeZone("GMT")); + conf.set("warc.export.date", fileDate.format(new Date(startTime))); + conf.set("warc.export.date.end", fileDate.format(new Date(endTime))); + + warcOut = new WarcOutputFormat().getRecordWriter(context); + } } @Override @@ -107,6 +147,8 @@ else if (w instanceof Content && contentOut != null) contentOut.append(key, w); else if (w instanceof Parse && parseOut != null) parseOut.write(key, (Parse) w); + else if (w instanceof WarcCapture) + warcOut.write(key, (WarcCapture) w); } @Override @@ -118,6 +160,9 @@ public void close(TaskAttemptContext context) throws IOException, InterruptedExc if (parseOut != null) { parseOut.close(context); } + if (warcOut != null) { + warcOut.close(context); + } } }; diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index f886aed422..ba07b8250f 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -26,17 +26,24 @@ import java.util.LinkedList; import java.util.List; import java.util.Map.Entry; +import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.util.StringUtils; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.crawl.SignatureFactory; import org.apache.nutch.fetcher.Fetcher.FetcherRun; import org.apache.nutch.fetcher.FetcherThreadEvent.PublishEventType; +import org.apache.nutch.metrics.ErrorTracker; +import 
org.apache.nutch.metrics.LatencyTracker; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.net.URLExemptionFilters; @@ -44,6 +51,7 @@ import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.net.protocols.ProtocolLogUtil; +import org.apache.nutch.net.protocols.Response; import org.apache.nutch.parse.Outlink; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseData; @@ -57,6 +65,7 @@ import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.ProtocolNotFound; import org.apache.nutch.protocol.ProtocolOutput; import org.apache.nutch.protocol.ProtocolStatus; import org.apache.nutch.scoring.ScoringFilterException; @@ -64,6 +73,8 @@ import org.apache.nutch.service.NutchServer; import org.apache.nutch.util.StringUtil; import org.apache.nutch.util.URLUtil; +import org.commoncrawl.util.CanonicalLinkDetector; +import org.commoncrawl.util.WarcCapture; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -77,6 +88,8 @@ public class FetcherThread extends Thread { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); + private static final Writable EMPTY_VALUE = NullWritable.get(); + private Configuration conf; private URLFilters urlFilters; private URLExemptionFilters urlExemptionFilters; @@ -131,17 +144,28 @@ public class FetcherThread extends Thread { private FetcherRun.Context context; private boolean storingContent; + private boolean storingWarc; + private boolean storing404s; + private boolean storingProtocolVersions; private boolean signatureWithoutParsing; + private boolean detectCanonicalLink; private AtomicInteger pages; private AtomicLong bytes; - private List robotsTxtContent = null; private long robotsDeferVisitsDelay; private int 
robotsDeferVisitsRetries; + private List robotsTxtContent = null; + + private boolean robotsTxtArchivingFilterUrl = false; + private boolean robotsTxtArchivingFilterUrlAlways = false; + private boolean robotsTxtArchivingFilterMime = false; + private boolean robotsTxtArchivingCheckRobotsTxt = false; + private Set robotsTxtArchivingAcceptedMimeTypes = new HashSet<>(); + //Used by the REST service private FetchNode fetchNode; private boolean reportToNutchServer; @@ -152,6 +176,29 @@ public class FetcherThread extends Thread { private ProtocolLogUtil logUtil = new ProtocolLogUtil(); + // Cached counters for performance (avoid repeated lookups in hot paths) + private Counter robotsDeniedCounter; + private Counter robotsDeniedMaxCrawlDelayCounter; + private Counter robotsDeferVisitsDroppedCounter; + private Counter redirectCountExceededCounter; + private Counter redirectDeduplicatedCounter; + private Counter redirectNotCreatedCounter; + private Counter hitByTimeLimitCounter; + private Counter aboveExceptionThresholdCounter; + private Counter outlinksDetectedCounter; + private Counter outlinksFollowingCounter; + private Counter robotsTxtArchivingFilteredCounter; + private Counter ipv4Counter; + private Counter ipv6Counter; + private Counter robotsTxtArchivingFilteredMimeCounter; + private Counter robotsTxtArchivingRobotsDeniedCounter; + + // Latency tracker for fetch timing metrics + private LatencyTracker fetchLatencyTracker; + + // Error tracker for categorized error metrics + private ErrorTracker errorTracker; + public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQueues fetchQueues, QueueFeeder feeder, AtomicInteger spinWaiting, AtomicLong lastRequestStart, FetcherRun.Context context, AtomicInteger errors, String segmentName, boolean parsing, boolean storingContent, @@ -165,6 +212,8 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ this.parseUtil = new ParseUtil(conf); this.skipTruncated = 
conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true); this.signatureWithoutParsing = conf.getBoolean("fetcher.signature", false); + this.detectCanonicalLink = conf.getBoolean("fetcher.detect.canonical.link", + false); this.protocolFactory = new ProtocolFactory(conf); this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER); this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000; @@ -181,6 +230,10 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ this.segmentName = segmentName; this.parsing = parsing; this.storingContent = storingContent; + this.storing404s = conf.getBoolean("fetcher.store.404s", false); + this.storingWarc = Fetcher.isStoringWarc(conf); + this.storingProtocolVersions = conf.getBoolean("store.protocol.versions", + false); this.pages = pages; this.bytes = bytes; @@ -233,14 +286,79 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ maxOutlinkDepthNumLinks = conf.getInt( "fetcher.follow.outlinks.num.links", 4); if (conf.getBoolean("fetcher.store.robotstxt", false)) { - if (storingContent) { + if (storingContent || storingWarc) { robotsTxtContent = new LinkedList<>(); + robotsTxtArchivingFilterUrl = conf + .getBoolean("fetcher.robotstxt.archiving.filter.url", false); + robotsTxtArchivingFilterUrlAlways = conf + .getBoolean("fetcher.robotstxt.archiving.filter.url.always", false); + robotsTxtArchivingFilterMime = conf + .getBoolean("fetcher.robotstxt.archiving.filter.mime", false); + robotsTxtArchivingCheckRobotsTxt = conf + .getBoolean("fetcher.robotstxt.archiving.check.robotstxt", false); + if (robotsTxtArchivingFilterMime) { + robotsTxtArchivingAcceptedMimeTypes.addAll(conf.getStringCollection( + "fetcher.robotstxt.archiving.filter.mime.accept")); + } } else { LOG.warn( "{} {} Ignoring fetcher.store.robotstxt because not storing content (fetcher.store.content)!", getName(), Thread.currentThread().getId()); } } + + // Initialize cached counters for 
performance + initCounters(); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initCounters() { + robotsDeniedCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_ROBOTS_DENIED_TOTAL); + robotsDeniedMaxCrawlDelayCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_ROBOTS_DENIED_MAXCRAWLDELAY_TOTAL); + robotsDeferVisitsDroppedCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_ROBOTS_DEFER_VISITS_DROPPED_TOTAL); + redirectCountExceededCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_REDIRECT_COUNT_EXCEEDED_TOTAL); + redirectDeduplicatedCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_REDIRECT_DEDUPLICATED_TOTAL); + redirectNotCreatedCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_REDIRECT_NOT_CREATED_TOTAL); + hitByTimeLimitCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL); + aboveExceptionThresholdCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_ABOVE_EXCEPTION_THRESHOLD_TOTAL); + outlinksDetectedCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER_OUTLINKS, NutchMetrics.FETCHER_OUTLINKS_DETECTED_TOTAL); + outlinksFollowingCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER_OUTLINKS, NutchMetrics.FETCHER_OUTLINKS_FOLLOWING_TOTAL); + + // Common Crawl specific counters + ipv4Counter = context.getCounter( + NutchMetrics.FETCHER_IP_ADDRESS_VERSION_GROUP, + NutchMetrics.FETCHER_IPV4_TOTAL); + ipv6Counter = context.getCounter( + NutchMetrics.FETCHER_IP_ADDRESS_VERSION_GROUP, + NutchMetrics.FETCHER_IPV6_TOTAL); + robotsTxtArchivingFilteredCounter = context.getCounter( + NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_GROUP, + NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_FILTERED_TOTAL); + robotsTxtArchivingFilteredMimeCounter = 
context.getCounter( + NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_GROUP, + NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_FILTERED_MIME_TOTAL); + robotsTxtArchivingRobotsDeniedCounter = context.getCounter( + NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_GROUP, + NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_ROBOTS_DENIED_TOTAL); + + // Initialize latency tracker for fetch timing + fetchLatencyTracker = new LatencyTracker( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_LATENCY); + + // Initialize error tracker for categorized error metrics + errorTracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER); } @Override @@ -315,7 +433,7 @@ public void run() { LOG.debug("redirectCount={}", redirectCount); redirecting = false; Protocol protocol = this.protocolFactory.getProtocol(fit.u); - BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum, + BaseRobotRules rules = protocol.getRobotRules(fit.u, fit.datum, robotsTxtContent); if (robotsTxtContent != null) { outputRobotsTxt(robotsTxtContent); @@ -334,20 +452,18 @@ public void run() { fit.getQueueID(), this.robotsDeferVisitsRetries + 1, this.robotsDeferVisitsDelay); if (killedURLs != 0) { - context - .getCounter("FetcherStatus", "robots_defer_visits_dropped") - .increment(killedURLs); + robotsDeferVisitsDroppedCounter.increment(killedURLs); } continue; } - if (!rules.isAllowed(fit.url.toString())) { + if (!rules.isAllowed(fit.u)) { // unblock fetchQueues.finishFetchItem(fit, true); LOG.info("Denied by robots.txt: {}", fit.url); output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE); - context.getCounter("FetcherStatus", "robots_denied").increment(1); + robotsDeniedCounter.increment(1); continue; } if (rules.getCrawlDelay() > 0) { @@ -359,8 +475,7 @@ public void run() { output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE); - context.getCounter("FetcherStatus", - "robots_denied_maxcrawldelay").increment(1); + 
robotsDeniedMaxCrawlDelayCounter.increment(1); continue; } else { FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID); @@ -377,8 +492,11 @@ public void run() { fit.queueID, fiq.crawlDelay, fit.url); } } + // Track fetch latency + long fetchStart = System.currentTimeMillis(); ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.datum); + fetchLatencyTracker.record(System.currentTimeMillis() - fetchStart); ProtocolStatus status = output.getStatus(); Content content = output.getContent(); ParseStatus pstatus = null; @@ -398,7 +516,12 @@ public void run() { endEvent.addEventData("status", status.getName()); publisher.publish(endEvent, conf); } - context.getCounter("FetcherStatus", status.getName()).increment(1); + // Dynamic counter for protocol status - can't cache as status varies + context.getCounter(NutchMetrics.GROUP_FETCHER, status.getName()).increment(1); + + if (storingProtocolVersions && content != null) { + countProtocolVersions(content.getMetadata()); + } switch (status.getCode()) { @@ -447,12 +570,11 @@ public void run() { int killedURLs = fetchQueues .checkExceptionThreshold(fit.getQueueID()); if (killedURLs != 0) - context.getCounter("FetcherStatus", - "AboveExceptionThresholdInQueue").increment(killedURLs); + aboveExceptionThresholdCounter.increment(killedURLs); /* FALLTHROUGH */ case ProtocolStatus.RETRY: // retry - output(fit.url, fit.datum, null, status, + output(fit.url, fit.datum, (storing404s ? content : null), status, CrawlDatum.STATUS_FETCH_RETRY); break; @@ -460,26 +582,25 @@ public void run() { case ProtocolStatus.NOTFOUND: case ProtocolStatus.ACCESS_DENIED: case ProtocolStatus.ROBOTS_DENIED: - output(fit.url, fit.datum, null, status, + output(fit.url, fit.datum, (storing404s ? content : null), status, CrawlDatum.STATUS_FETCH_GONE); break; case ProtocolStatus.NOTMODIFIED: - output(fit.url, fit.datum, null, status, + output(fit.url, fit.datum, (storing404s ? 
content : null), status, CrawlDatum.STATUS_FETCH_NOTMODIFIED); break; default: LOG.warn("{} {} Unknown ProtocolStatus: {}", getName(), Thread.currentThread().getId(), status.getCode()); - output(fit.url, fit.datum, null, status, + output(fit.url, fit.datum, (storing404s ? content : null), status, CrawlDatum.STATUS_FETCH_RETRY); } if (redirecting && redirectCount > maxRedirect) { fetchQueues.finishFetchItem(fit); - context.getCounter("FetcherStatus", "redirect_count_exceeded") - .increment(1); + redirectCountExceededCounter.increment(1); LOG.info("{} {} - redirect count exceeded {} ({})", getName(), Thread.currentThread().getId(), fit.url, maxRedirectExceededSkip ? "skipped" : "linked"); @@ -498,18 +619,13 @@ public void run() { } catch (Throwable t) { // unexpected exception // unblock fetchQueues.finishFetchItem(fit); - String message; - if (LOG.isDebugEnabled()) { - message = StringUtils.stringifyException(t); - } else if (logUtil.logShort(t)) { - message = t.getClass().getName(); - } else { - message = StringUtils.stringifyException(t); - } - logError(fit.url, message); + logError(fit.url, t); output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY); } + + // done: unset reprUrl for reporting + setReprUrl(null); } } catch (Throwable e) { @@ -518,6 +634,10 @@ public void run() { if (fit != null) { fetchQueues.finishFetchItem(fit); } + // Emit fetch latency metrics + fetchLatencyTracker.emitCounters(context); + // Emit error metrics + errorTracker.emitCounters(context); activeThreads.decrementAndGet(); // count threads LOG.info("{} {} -finishing thread {}, activeThreads={}", getName(), Thread.currentThread().getId(), getName(), activeThreads); @@ -613,13 +733,13 @@ private FetchItem queueRedirect(Text redirUrl, FetchItem fit) throws ScoringFilterException { if (fetchQueues.redirectIsQueuedRecently(redirUrl)) { redirecting = false; - context.getCounter("FetcherStatus", "redirect_deduplicated").increment(1); + 
redirectDeduplicatedCounter.increment(1); LOG.debug(" - ignoring redirect from {} to {} as duplicate", fit.url, redirUrl); return null; } else if (fetchQueues.timelimitExceeded()) { redirecting = false; - context.getCounter("FetcherStatus", "hitByTimeLimit").increment(1); + hitByTimeLimitCounter.increment(1); LOG.debug(" - ignoring redirect from {} to {} - timelimit reached", fit.url, redirUrl); return null; @@ -632,15 +752,53 @@ private FetchItem queueRedirect(Text redirUrl, FetchItem fit) } else { // stop redirecting redirecting = false; - context.getCounter("FetcherStatus", "FetchItem.notCreated.redirect").increment(1); + redirectNotCreatedCounter.increment(1); } return fit; } + private void logError(Text url, Throwable t) { + String message = t.getClass().getName() + ": " + t.getMessage(); + LOG.info("{} {} fetch of {} failed with: {}", getName(), + Thread.currentThread().getId(), url, message); + errors.incrementAndGet(); + errorTracker.recordError(t); + } + private void logError(Text url, String message) { LOG.info("{} {} fetch of {} failed with: {}", getName(), Thread.currentThread().getId(), url, message); errors.incrementAndGet(); + errorTracker.recordError(ErrorTracker.ErrorType.OTHER); + } + + private void countProtocolVersions(Metadata contentMetadata) { + if (contentMetadata == null) { + return; + } + String versionStr = contentMetadata.get(Response.PROTOCOL_VERSIONS); + if (versionStr != null) { + String[] versions = versionStr.split(","); + if (versions.length >= 1) { + context.getCounter(NutchMetrics.FETCHER_HTTP_PROTOCOL_VERSION_GROUP, + versions[0]).increment(1); + } else { + context.getCounter(NutchMetrics.FETCHER_HTTP_PROTOCOL_VERSION_GROUP, + NutchMetrics.FETCHER_HTTP_PROTOCOL_UNKNOWN).increment(1); + } + for (int i = 1; i < versions.length; i++) { + context.getCounter(NutchMetrics.FETCHER_TLS_PROTOCOL_VERSION_GROUP, + versions[i]).increment(1); + } + } + String ipaddress = contentMetadata.get(Response.IP_ADDRESS); + if (ipaddress == null) { + 
// IP address is not recorded + } else if (ipaddress.indexOf(':') != -1) { + ipv6Counter.increment(1); + } else { + ipv4Counter.increment(1); + } } private ParseStatus output(Text key, CrawlDatum datum, Content content, @@ -691,6 +849,14 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content, .calculate(content, new ParseStatus().getEmptyParse(conf)); datum.setSignature(signature); } + + if (detectCanonicalLink) { + /* + * TODO: if parsing, then canonical links should be detected on the + * DOM tree. + */ + addCanonicalLink(key, datum, content); + } } /* @@ -705,6 +871,10 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content, context.write(key, new NutchWritable(datum)); if (content != null && storingContent) context.write(key, new NutchWritable(content)); + if (storingWarc) { + WarcCapture warcCapture = new WarcCapture(key, datum, content); + context.write(key, new NutchWritable(warcCapture)); + } if (parseResult != null) { for (Entry entry : parseResult) { Text url = entry.getKey(); @@ -805,8 +975,7 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content, FetchItemQueue queue = fetchQueues.getFetchItemQueue(ft.queueID); queue.alreadyFetched.add(url.toString().hashCode()); - context.getCounter("FetcherOutlinks", "outlinks_detected").increment( - outlinks.size()); + outlinksDetectedCounter.increment(outlinks.size()); // Counter to limit num outlinks to follow per page int outlinkCounter = 0; @@ -838,7 +1007,7 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content, new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1); - context.getCounter("FetcherOutlinks", "outlinks_following").increment(1); + outlinksFollowingCounter.increment(1); fetchQueues.addFetchItem(fit); @@ -864,7 +1033,8 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content, if (parseResult != null && !parseResult.isEmpty()) { Parse p = parseResult.get(content.getUrl()); if (p != 
null) { - context.getCounter("ParserStatus", ParseStatus.majorCodes[p + // Dynamic counter for parse status - can't cache as status varies + context.getCounter(NutchMetrics.GROUP_PARSER, ParseStatus.majorCodes[p .getData().getStatus().getMajorCode()]).increment(1); return p.getData().getStatus(); } @@ -872,19 +1042,122 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content, return null; } + private void addCanonicalLink(Text key, CrawlDatum datum, Content content) { + List canonicalLinks = CanonicalLinkDetector + .detectCanonicalLinks(content); + if (canonicalLinks.isEmpty() || canonicalLinks.get(0).isEmpty()) { + /* + * Add a null value, so that a CrawlDb update overwrites outdated + * canonical links. + */ + datum.getMetaData().put(Nutch.CANONICAL_LINK_KEY, EMPTY_VALUE); + } else { + LOG.debug("Found canonical links: {}", canonicalLinks); + String link = canonicalLinks.get(0); + String urlKey = key.toString(); + try { + if (!link.startsWith("http")) { + link = URLUtil.resolveURL(new URL(urlKey), link).toString(); + } + link = normalizers.normalize(link, URLNormalizers.SCOPE_FETCHER); + // do not filter, we just recording the canonical link + } catch (MalformedURLException e) { + link = null; + } + if (link != null) { + Text canonicalLink = new Text(link); + datum.getMetaData().put(Nutch.CANONICAL_LINK_KEY, canonicalLink); + } else { + datum.getMetaData().put(Nutch.CANONICAL_LINK_KEY, EMPTY_VALUE); + } + } + } + private void outputRobotsTxt(List robotsTxtContent) throws InterruptedException { for (Content robotsTxt : robotsTxtContent) { LOG.debug("Fetched and stored robots.txt {}", robotsTxt.getUrl()); try { - context.write(new Text(robotsTxt.getUrl()), - new NutchWritable(robotsTxt)); + Text tUrl = new Text(robotsTxt.getUrl()); + if (storingContent) { + context.write(tUrl, new NutchWritable(robotsTxt)); + } + if (storingWarc && robotsTxtArchivingIsAllowed(robotsTxt)) { + context.write(tUrl, + new NutchWritable(new WarcCapture(tUrl, null, 
robotsTxt))); + } } catch (IOException e) { LOG.error("Fetcher failed to store the robots.txt:", e); } } } + private boolean robotsTxtArchivingIsAllowed(Content robotsTxt) { + String url = robotsTxt.getUrl(); + URL u = null; + if (robotsTxtArchivingFilterUrl) { + try { + if (urlFilters.filter(url) == null) { + u = new URL(url); + if (robotsTxtArchivingFilterUrlAlways + || !u.getFile().equals("/robots.txt")) { + LOG.info("Archiving of robots.txt {} skipped by URL filters", url); + robotsTxtArchivingFilteredCounter.increment(1); + return false; + } + + } + } catch (URLFilterException | MalformedURLException e) { + return false; + } + } + + if (robotsTxtArchivingFilterMime) { + int status = 200; + try { + status = Integer + .parseInt(robotsTxt.getMetadata().get(Nutch.FETCH_STATUS_KEY)); + } catch (NumberFormatException e) { + // ignore + } + if (status == 200) { + String contentType = robotsTxt.getContentType(); + if (contentType != null) { + if (!robotsTxtArchivingAcceptedMimeTypes.contains(contentType)) { + LOG.info("Archiving of robots.txt {} ({}) skipped by MIME filter", + url, contentType); + robotsTxtArchivingFilteredMimeCounter.increment(1); + return false; + } + } + } + } + + if (robotsTxtArchivingCheckRobotsTxt) { + try { + if (u == null) { + u = new URL(url); + } + if (!u.getFile().equals("/robots.txt")) { + Protocol protocol = protocolFactory.getProtocol(u); + BaseRobotRules rules = protocol.getRobotRules(new Text(url), null, + null); + if (!rules.isAllowed(url)) { + LOG.info( + "Archiving of redirected robots.txt {} ({}) not allowed by robots.txt", + url, robotsTxt.getContentType()); + robotsTxtArchivingRobotsDeniedCounter.increment(1); + return false; + } + } + } catch (MalformedURLException | ProtocolNotFound e) { + return false; + } + } + + return true; + } + private void updateStatus(int bytesInPage) throws IOException { pages.incrementAndGet(); bytes.addAndGet(bytesInPage); diff --git a/src/java/org/apache/nutch/fetcher/QueueFeeder.java 
b/src/java/org/apache/nutch/fetcher/QueueFeeder.java index c48c4b8f31..5dfa24fd06 100644 --- a/src/java/org/apache/nutch/fetcher/QueueFeeder.java +++ b/src/java/org/apache/nutch/fetcher/QueueFeeder.java @@ -22,9 +22,11 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Counter; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.fetcher.FetchItemQueues.QueuingStatus; import org.apache.nutch.fetcher.Fetcher.FetcherRun; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilterException; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; @@ -47,6 +49,12 @@ public class QueueFeeder extends Thread { private URLNormalizers urlNormalizers = null; private String urlNormalizerScope = URLNormalizers.SCOPE_DEFAULT; + // Cached counter references to avoid repeated lookups in hot paths + private Counter hitByTimeoutCounter; + private Counter hitByTimelimitCounter; + private Counter filteredCounter; + private Counter aboveExceptionThresholdCounter; + public QueueFeeder(FetcherRun.Context context, FetchItemQueues queues, int size) { this.context = context; @@ -61,6 +69,21 @@ public QueueFeeder(FetcherRun.Context context, if (conf.getBoolean("fetcher.normalize.urls", false)) { urlNormalizers = new URLNormalizers(conf, urlNormalizerScope); } + initCounters(); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. 
+ */ + private void initCounters() { + hitByTimeoutCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_TIMEOUT_TOTAL); + hitByTimelimitCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL); + filteredCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_FILTERED_TOTAL); + aboveExceptionThresholdCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_ABOVE_EXCEPTION_THRESHOLD_TOTAL); } /** Filter and normalize the url */ @@ -94,14 +117,14 @@ public void run() { LOG.info("QueueFeeder stopping, timeout reached."); } queuingStatus[qstatus]++; - context.getCounter("FetcherStatus", "hitByTimeout").increment(1); + hitByTimeoutCounter.increment(1); } else { int qstatus = QueuingStatus.HIT_BY_TIMELIMIT.ordinal(); if (queuingStatus[qstatus] == 0) { LOG.info("QueueFeeder stopping, timelimit exceeded."); } queuingStatus[qstatus]++; - context.getCounter("FetcherStatus", "hitByTimeLimit").increment(1); + hitByTimelimitCounter.increment(1); } try { hasMore = context.nextKeyValue(); @@ -133,7 +156,7 @@ public void run() { String u = filterNormalize(url.toString()); if (u == null) { // filtered or failed to normalize - context.getCounter("FetcherStatus", "filtered").increment(1); + filteredCounter.increment(1); continue; } url = new Text(u); @@ -150,9 +173,7 @@ public void run() { QueuingStatus status = queues.addFetchItem(url, datum); queuingStatus[status.ordinal()]++; if (status == QueuingStatus.ABOVE_EXCEPTION_THRESHOLD) { - context - .getCounter("FetcherStatus", "AboveExceptionThresholdInQueue") - .increment(1); + aboveExceptionThresholdCounter.increment(1); } cnt++; feed--; diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java index 9f2e4a384e..23d94bc881 100644 --- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java +++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java @@ -44,11 
+44,10 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.SegmentReaderUtil; -import org.apache.commons.jexl3.JexlBuilder; import org.apache.commons.jexl3.JexlContext; import org.apache.commons.jexl3.JexlScript; -import org.apache.commons.jexl3.JexlEngine; import org.apache.commons.jexl3.MapContext; +import org.apache.nutch.util.JexlUtil; /** * @see Commons @@ -77,11 +76,7 @@ public void setup(Context context) { fieldHeader = context.getConfiguration().getBoolean(HOSTDB_DUMP_HEADER, true); String expr = context.getConfiguration().get(HOSTDB_FILTER_EXPRESSION); if (expr != null) { - // Create or retrieve a JexlEngine - JexlEngine jexl = new JexlBuilder().silent(true).strict(true).create(); - - // Create an expression object - this.expr = jexl.createScript(expr); + this.expr = JexlUtil.parseExpression(context.getConfiguration(), expr); } } diff --git a/src/java/org/apache/nutch/hostdb/ResolverThread.java b/src/java/org/apache/nutch/hostdb/ResolverThread.java index 2140ea52d1..05e4a940c8 100644 --- a/src/java/org/apache/nutch/hostdb/ResolverThread.java +++ b/src/java/org/apache/nutch/hostdb/ResolverThread.java @@ -21,9 +21,13 @@ import java.net.UnknownHostException; import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Reducer.Context; import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.metrics.ErrorTracker; +import org.apache.nutch.metrics.NutchMetrics; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,6 +45,17 @@ public class ResolverThread implements Runnable { protected Context context; protected int purgeFailedHostsThreshold; + // Cached counter references for performance + private Counter newKnownHostCounter; + private Counter rediscoveredHostCounter; + private Counter existingKnownHostCounter; + private Counter newUnknownHostCounter; + private Counter existingUnknownHostCounter; + private Counter purgedUnknownHostCounter; + private Counter 
checkedHostsCounter; + private Counter errorsCounter; + private Counter errorsNetworkCounter; + /** * Overloaded constructor. * @param host name of the host to lookup @@ -58,6 +73,33 @@ public ResolverThread(String host, HostDatum datum, this.datum = datum; this.context = context; this.purgeFailedHostsThreshold = purgeFailedHostsThreshold; + + // Initialize cached counters for performance + initCounters(); + } + + /** + * Initialize cached counter references to avoid repeated lookups. + */ + private void initCounters() { + newKnownHostCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_NEW_KNOWN_HOST_TOTAL); + rediscoveredHostCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_REDISCOVERED_HOST_TOTAL); + existingKnownHostCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_EXISTING_KNOWN_HOST_TOTAL); + newUnknownHostCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_NEW_UNKNOWN_HOST_TOTAL); + existingUnknownHostCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_EXISTING_UNKNOWN_HOST_TOTAL); + purgedUnknownHostCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_PURGED_UNKNOWN_HOST_TOTAL); + checkedHostsCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_CHECKED_HOSTS_TOTAL); + errorsCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.ERROR_TOTAL); + errorsNetworkCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.ERROR_NETWORK_TOTAL); } /** @@ -72,16 +114,16 @@ public void run() { InetAddress inetAddr = InetAddress.getByName(host); if (datum.isEmpty()) { - context.getCounter("UpdateHostDb", "new_known_host").increment(1); + newKnownHostCounter.increment(1); datum.setLastCheck(); LOG.info("{}: new_known_host {}", host, datum); } else if (datum.getDnsFailures() > 0) { - context.getCounter("UpdateHostDb", "rediscovered_host").increment(1); + 
rediscoveredHostCounter.increment(1); datum.setLastCheck(); datum.setDnsFailures(0l); LOG.info("{}: rediscovered_host {}", host, datum); } else { - context.getCounter("UpdateHostDb", "existing_known_host").increment(1); + existingKnownHostCounter.increment(1); datum.setLastCheck(); LOG.info("{}: existing_known_host {}", host, datum); } @@ -95,7 +137,7 @@ public void run() { datum.setLastCheck(); datum.setDnsFailures(1l); context.write(hostText, datum); - context.getCounter("UpdateHostDb", "new_unknown_host").increment(1); + newUnknownHostCounter.increment(1); LOG.info("{}: new_unknown_host {}", host, datum); } else { datum.setLastCheck(); @@ -106,23 +148,34 @@ public void run() { purgeFailedHostsThreshold < datum.getDnsFailures()) { context.write(hostText, datum); - context.getCounter("UpdateHostDb", "existing_unknown_host").increment(1); + existingUnknownHostCounter.increment(1); LOG.info("{}: existing_unknown_host {}", host, datum); } else { - context.getCounter("UpdateHostDb", "purged_unknown_host").increment(1); + purgedUnknownHostCounter.increment(1); LOG.info("{}: purged_unknown_host {}", host, datum); } } - context.getCounter("UpdateHostDb", createFailureCounterLabel(datum)).increment(1); + // Dynamic counter based on failure count - can't cache + context.getCounter(NutchMetrics.GROUP_HOSTDB, createFailureCounterLabel(datum)).increment(1); + // Common error counters for consistency + errorsCounter.increment(1); + errorsNetworkCounter.increment(1); } catch (Exception ioe) { LOG.warn(StringUtils.stringifyException(ioe)); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.ERROR_TOTAL).increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + ErrorTracker.getCounterName(ioe)).increment(1); } } catch (Exception e) { LOG.warn(StringUtils.stringifyException(e)); + errorsCounter.increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + ErrorTracker.getCounterName(e)).increment(1); } - context.getCounter("UpdateHostDb", 
"checked_hosts").increment(1); + checkedHostsCounter.increment(1); } private String createFailureCounterLabel(HostDatum datum) { diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java index ca6797ac0a..b1736348b8 100644 --- a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java +++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java @@ -24,12 +24,15 @@ import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.ErrorTracker; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.protocol.ProtocolStatus; @@ -60,6 +63,10 @@ public class UpdateHostDbMapper protected URLFilters filters = null; protected URLNormalizers normalizers = null; + // Cached counter references to avoid repeated lookups in hot paths + protected Counter filteredRecordsCounter; + protected ErrorTracker errorTracker; + @Override public void setup(Mapper.Context context) { Configuration conf = context.getConfiguration(); @@ -71,6 +78,19 @@ public void setup(Mapper.Context context) { filters = new URLFilters(conf); if (normalize) normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT); + + // Initialize cached counter references + initCounters(context); + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_HOSTDB, context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. 
+ */ + private void initCounters(Context context) { + filteredRecordsCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL); } /** @@ -136,7 +156,7 @@ public void map(Text key, Writable value, try { url = new URL(keyStr); } catch (MalformedURLException e) { - context.getCounter("UpdateHostDb", "malformed_url").increment(1); + errorTracker.incrementCounters(e); return; } String hostName = URLUtil.getHost(url); @@ -146,7 +166,7 @@ public void map(Text key, Writable value, // Filtered out? if (buffer == null) { - context.getCounter("UpdateHostDb", "filtered_records").increment(1); + filteredRecordsCounter.increment(1); LOG.debug("UpdateHostDb: {} crawldatum has been filtered", hostName); return; } @@ -219,7 +239,7 @@ public void map(Text key, Writable value, // Filtered out? if (buffer == null) { - context.getCounter("UpdateHostDb", "filtered_records").increment(1); + filteredRecordsCounter.increment(1); LOG.debug("UpdateHostDb: {} hostdatum has been filtered", keyStr); return; } @@ -243,7 +263,7 @@ public void map(Text key, Writable value, // Filtered out? 
if (buffer == null) { - context.getCounter("UpdateHostDb", "filtered_records").increment(1); + filteredRecordsCounter.increment(1); LOG.debug("UpdateHostDb: {} score has been filtered", keyStr); return; } diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java index 1431b56365..878216b3c6 100644 --- a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java +++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java @@ -31,11 +31,13 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.util.StringUtils; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.metrics.NutchMetrics; import com.tdunning.math.stats.TDigest; @@ -72,6 +74,11 @@ public class UpdateHostDbReducer protected BlockingQueue queue = new SynchronousQueue<>(); protected ThreadPoolExecutor executor = null; + // Cached counter references to avoid repeated lookups in hot paths + protected Counter urlLimitNotReachedCounter; + protected Counter totalHostsCounter; + protected Counter skippedNotEligibleCounter; + /** * Configures the thread pool and prestarts all resolver threads. */ @@ -145,6 +152,21 @@ public void setup(Reducer.Context context) // Run all threads in the pool executor.prestartAllCoreThreads(); } + + // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. 
+ */ + private void initCounters(Reducer.Context context) { + urlLimitNotReachedCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_URL_LIMIT_NOT_REACHED_TOTAL); + totalHostsCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_TOTAL_HOSTS_TOTAL); + skippedNotEligibleCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_SKIPPED_NOT_ELIGIBLE_TOTAL); } /** @@ -379,12 +401,12 @@ else if (value instanceof FloatWritable) { // Impose limits on minimum number of URLs? if (urlLimit > -1l) { if (hostDatum.numRecords() < urlLimit) { - context.getCounter("UpdateHostDb", "url_limit_not_reached").increment(1); + urlLimitNotReachedCounter.increment(1); return; } } - context.getCounter("UpdateHostDb", "total_hosts").increment(1); + totalHostsCounter.increment(1); // See if this record is to be checked if (shouldCheck(hostDatum)) { @@ -401,7 +423,7 @@ else if (value instanceof FloatWritable) { // Do not progress, the datum will be written in the resolver thread return; } else if (checkAny) { - context.getCounter("UpdateHostDb", "skipped_not_eligible").increment(1); + skippedNotEligibleCounter.increment(1); LOG.debug("UpdateHostDb: {}: skipped_not_eligible", key); } diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java b/src/java/org/apache/nutch/indexer/CleaningJob.java index cedee8e34c..dc466dad06 100644 --- a/src/java/org/apache/nutch/indexer/CleaningJob.java +++ b/src/java/org/apache/nutch/indexer/CleaningJob.java @@ -26,6 +26,7 @@ import org.apache.hadoop.io.ByteWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; @@ -36,6 +37,7 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.CrawlDb; +import 
org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.slf4j.Logger; @@ -88,6 +90,9 @@ public static class DeleterReducer extends IndexWriters writers = null; + // Cached counter reference for performance + private Counter deletedDocumentsCounter; + @Override public void setup(Reducer.Context context) { Configuration conf = context.getConfiguration(); @@ -98,6 +103,17 @@ public void setup(Reducer.Context contex throw new RuntimeException(e); } noCommit = conf.getBoolean("noCommit", false); + + // Initialize cached counter reference + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initCounters(Context context) { + deletedDocumentsCounter = context.getCounter( + NutchMetrics.GROUP_CLEANING, NutchMetrics.CLEANING_DELETED_DOCUMENTS_TOTAL); } @Override @@ -118,7 +134,7 @@ public void reduce(ByteWritable key, Iterable values, for (Text document : values) { writers.delete(document.toString()); totalDeleted++; - context.getCounter("CleaningJobStatus", "Deleted documents").increment(1); + deletedDocumentsCounter.increment(1); } } } diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java index 9fb8007715..50da12b8a2 100644 --- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java +++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java @@ -30,6 +30,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; @@ -40,6 +41,9 @@ import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.crawl.LinkDb; import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.metrics.ErrorTracker; 
+import org.apache.nutch.metrics.LatencyTracker; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.net.URLFilters; @@ -214,6 +218,22 @@ public static class IndexerReducer extends private URLNormalizers urlNormalizers; private URLFilters urlFilters; + // Latency tracker for indexing timing metrics + private LatencyTracker indexLatencyTracker; + + // Cached counter references to avoid repeated lookups in hot paths + private Counter deletedRobotsNoIndexCounter; + private Counter deletedGoneCounter; + private Counter deletedRedirectsCounter; + private Counter deletedDuplicatesCounter; + private Counter skippedNotModifiedCounter; + private Counter deletedByIndexingFilterCounter; + private Counter skippedByIndexingFilterCounter; + private Counter indexedCounter; + + // Error tracker with cached counters + private ErrorTracker errorTracker; + @Override public void setup(Reducer.Context context) { Configuration conf = context.getConfiguration(); @@ -238,6 +258,44 @@ public void setup(Reducer.Context c if (filter) { urlFilters = new URLFilters(conf); } + + // Initialize latency tracker for indexing timing + indexLatencyTracker = new LatencyTracker( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_LATENCY); + + // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. 
+ */ + private void initCounters(Reducer.Context context) { + deletedRobotsNoIndexCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_ROBOTS_NOINDEX_TOTAL); + deletedGoneCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_GONE_TOTAL); + deletedRedirectsCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_REDIRECTS_TOTAL); + deletedDuplicatesCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_DUPLICATES_TOTAL); + skippedNotModifiedCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_SKIPPED_NOT_MODIFIED_TOTAL); + deletedByIndexingFilterCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_BY_INDEXING_FILTER_TOTAL); + skippedByIndexingFilterCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_SKIPPED_BY_INDEXING_FILTER_TOTAL); + indexedCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_INDEXED_TOTAL); + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_INDEXER, context); + } + + @Override + public void cleanup(Reducer.Context context) + throws IOException, InterruptedException { + // Emit indexing latency metrics + indexLatencyTracker.emitCounters(context); } @Override @@ -283,7 +341,7 @@ public void reduce(Text key, Iterable values, .indexOf("noindex") != -1) { // Delete it! 
context.write(key, DELETE_ACTION); - context.getCounter("IndexerStatus", "deleted (robots=noindex)").increment(1); + deletedRobotsNoIndexCounter.increment(1); return; } } @@ -300,7 +358,7 @@ public void reduce(Text key, Iterable values, if (delete && fetchDatum != null) { if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) { - context.getCounter("IndexerStatus", "deleted (gone)").increment(1); + deletedGoneCounter.increment(1); context.write(key, DELETE_ACTION); return; } @@ -309,7 +367,7 @@ public void reduce(Text key, Iterable values, || fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) { - context.getCounter("IndexerStatus", "deleted (redirects)").increment(1); + deletedRedirectsCounter.increment(1); context.write(key, DELETE_ACTION); return; } @@ -321,14 +379,14 @@ public void reduce(Text key, Iterable values, // Whether to delete pages marked as duplicates if (delete && dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) { - context.getCounter("IndexerStatus", "deleted (duplicates)").increment(1); + deletedDuplicatesCounter.increment(1); context.write(key, DELETE_ACTION); return; } // Whether to skip DB_NOTMODIFIED pages if (skip && dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) { - context.getCounter("IndexerStatus", "skipped (not modified)").increment(1); + skippedNotModifiedCounter.increment(1); return; } @@ -337,6 +395,9 @@ public void reduce(Text key, Iterable values, return; } + // Start timing document indexing + long indexStart = System.currentTimeMillis(); + NutchDocument doc = new NutchDocument(); doc.add("id", key.toString()); @@ -355,7 +416,7 @@ public void reduce(Text key, Iterable values, boost = scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse, 
inlinks, boost); } catch (final ScoringFilterException e) { - context.getCounter("IndexerStatus", "errors (ScoringFilter)").increment(1); + errorTracker.incrementCounters(e); LOG.warn("Error calculating score {}: {}", key, e); return; } @@ -390,7 +451,7 @@ public void reduce(Text key, Iterable values, doc = filters.filter(doc, parse, key, fetchDatum, inlinks); } catch (final IndexingException e) { LOG.warn("Error indexing {}: ", key, e); - context.getCounter("IndexerStatus", "errors (IndexingFilter)").increment(1); + errorTracker.incrementCounters(e); return; } @@ -400,9 +461,9 @@ public void reduce(Text key, Iterable values, if (deleteSkippedByIndexingFilter) { NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE); context.write(key, action); - context.getCounter("IndexerStatus", "deleted (IndexingFilter)").increment(1); + deletedByIndexingFilterCounter.increment(1); } else { - context.getCounter("IndexerStatus", "skipped (IndexingFilter)").increment(1); + skippedByIndexingFilterCounter.increment(1); } return; } @@ -422,7 +483,10 @@ public void reduce(Text key, Iterable values, doc.add("binaryContent", binary); } - context.getCounter("IndexerStatus", "indexed (add/update)").increment(1); + // Record indexing latency + indexLatencyTracker.record(System.currentTimeMillis() - indexStart); + + indexedCounter.increment(1); NutchIndexAction action = new NutchIndexAction(doc, NutchIndexAction.ADD); context.write(key, action); diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java b/src/java/org/apache/nutch/indexer/IndexingJob.java index fc2c44a064..224b4118e6 100644 --- a/src/java/org/apache/nutch/indexer/IndexingJob.java +++ b/src/java/org/apache/nutch/indexer/IndexingJob.java @@ -30,6 +30,7 @@ import org.apache.commons.lang3.time.StopWatch; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.segment.SegmentChecker; import org.apache.hadoop.conf.Configuration; import 
org.apache.hadoop.fs.FileStatus; @@ -155,10 +156,14 @@ public void index(Path crawlDb, Path linkDb, List segments, throw e; } LOG.info("Indexer: number of documents indexed, deleted, or skipped:"); - for (Counter counter : job.getCounters().getGroup("IndexerStatus")) { - LOG.info("Indexer: {} {}", - String.format(Locale.ROOT, "%6d", counter.getValue()), - counter.getName()); + for (Counter counter : job.getCounters() + .getGroup(NutchMetrics.GROUP_INDEXER)) { + long counterValue = counter.getValue(); + if (counterValue > 0) { + LOG.info("Indexer: {} {}", + String.format(Locale.ROOT, "%6d", counterValue), + counter.getName()); + } } stopWatch.stop(); LOG.info("Indexer: finished, elapsed: {} ms", stopWatch.getTime( diff --git a/src/java/org/apache/nutch/metadata/Nutch.java b/src/java/org/apache/nutch/metadata/Nutch.java index 0cfb26369b..eea25e8e95 100644 --- a/src/java/org/apache/nutch/metadata/Nutch.java +++ b/src/java/org/apache/nutch/metadata/Nutch.java @@ -17,6 +17,7 @@ package org.apache.nutch.metadata; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; /** * A collection of Nutch internal metadata constants. @@ -114,4 +115,6 @@ public interface Nutch { public static final String FETCH_EVENT_FETCHTIME = "fetchTime"; /** Content-lanueage key in the Pub/Sub event metadata for the content-language of the parsed page*/ public static final String FETCH_EVENT_CONTENTLANG = "content-language"; + + public static final Writable CANONICAL_LINK_KEY = new Text("canonical.link"); } diff --git a/src/java/org/apache/nutch/metrics/ErrorTracker.java b/src/java/org/apache/nutch/metrics/ErrorTracker.java new file mode 100644 index 0000000000..1921071605 --- /dev/null +++ b/src/java/org/apache/nutch/metrics/ErrorTracker.java @@ -0,0 +1,383 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metrics; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.SocketException; +import java.net.SocketTimeoutException; +import java.net.UnknownHostException; +import java.util.EnumMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.hadoop.mapreduce.TaskInputOutputContext; + +/** + * A utility class for tracking errors by category with automatic classification. + * + *

This class provides thread-safe error counting with automatic categorization + * based on exception type. It uses a bounded set of error categories to stay within + * Hadoop's counter limits (~120 counters). + * + *

Usage: + *

+ * // In mapper/reducer setup or thread initialization
+ * errorTracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER);
+ * 
+ * // When catching exceptions
+ * try {
+ *     // ... operation ...
+ * } catch (Exception e) {
+ *     errorTracker.recordError(e);  // Auto-categorizes
+ * }
+ * 
+ * // Or with manual categorization
+ * errorTracker.recordError(ErrorTracker.ErrorType.NETWORK);
+ * 
+ * // In cleanup - emit all error counters
+ * errorTracker.emitCounters(context);
+ *

+ * + *

Emits the following counters: + *

errors_total - total number of errors across all categories
errors_network_total - network-related errors
errors_protocol_total - protocol errors
errors_parsing_total - parsing errors
errors_url_total - URL-related errors
errors_scoring_total - scoring filter errors
errors_indexing_total - indexing filter errors
errors_timeout_total - timeout errors
errors_other_total - uncategorized errors

+ * + * @since 1.22 + */ +public class ErrorTracker { + + /** + * Error type categories for classification. + * Uses a bounded set to stay within Hadoop's counter limits. + */ + public enum ErrorType { + /** Network-related errors (IOException, SocketException, etc.) */ + NETWORK, + /** Protocol errors (ProtocolException, ProtocolNotFound) */ + PROTOCOL, + /** Parsing errors (ParseException, ParserNotFound) */ + PARSING, + /** URL-related errors (MalformedURLException, URLFilterException) */ + URL, + /** Scoring filter errors */ + SCORING, + /** Indexing filter errors */ + INDEXING, + /** Timeout errors (SocketTimeoutException) */ + TIMEOUT, + /** Other uncategorized errors */ + OTHER + } + + private final String group; + private final Map counts; + private final AtomicLong totalCount; + + // Cached counter references for performance (optional - set via initCounters) + private org.apache.hadoop.mapreduce.Counter cachedTotalCounter; + private final Map cachedCounters; + + /** + * Creates a new ErrorTracker for the specified counter group. + * + *

This constructor creates an ErrorTracker without cached counters. + * Call {@link #initCounters(TaskInputOutputContext)} in setup() to cache + * counter references for better performance. + * + * @param group the Hadoop counter group name (e.g., NutchMetrics.GROUP_FETCHER) + */ + public ErrorTracker(String group) { + this.group = group; + this.counts = new EnumMap<>(ErrorType.class); + this.cachedCounters = new EnumMap<>(ErrorType.class); + this.totalCount = new AtomicLong(0); + + // Initialize all counts to 0 + for (ErrorType type : ErrorType.values()) { + counts.put(type, new AtomicLong(0)); + } + } + + /** + * Creates a new ErrorTracker with cached counter references. + * + *

This constructor caches all counter references at creation time, + * avoiding repeated counter lookups in hot paths. + * + * @param group the Hadoop counter group name + * @param context the Hadoop task context for caching counters + */ + public ErrorTracker(String group, TaskInputOutputContext context) { + this(group); + initCounters(context); + } + + /** + * Initializes cached counter references from the Hadoop context. + * + *

Call this method in the mapper/reducer setup() method to cache + * counter references and avoid repeated lookups during processing. + * + * @param context the Hadoop task context + */ + public void initCounters(TaskInputOutputContext context) { + cachedTotalCounter = context.getCounter(group, NutchMetrics.ERROR_TOTAL); + for (ErrorType type : ErrorType.values()) { + cachedCounters.put(type, context.getCounter(group, getCounterName(type))); + } + } + + /** + * Records an error with automatic categorization based on the throwable type. + * + * @param t the throwable to categorize and record + */ + public void recordError(Throwable t) { + recordError(categorize(t)); + } + + /** + * Records an error with explicit category. + * + * @param type the error type category + */ + public void recordError(ErrorType type) { + counts.get(type).incrementAndGet(); + totalCount.incrementAndGet(); + } + + /** + * Returns the count for a specific error type. + * + * @param type the error type + * @return the count for that error type + */ + public long getCount(ErrorType type) { + return counts.get(type).get(); + } + + /** + * Returns the total count of all errors. + * + * @return the total error count + */ + public long getTotalCount() { + return totalCount.get(); + } + + /** + * Emits all error counters to the Hadoop context. + * + *

Should be called once during cleanup to emit aggregated metrics. + * Only emits counters for error types that have non-zero counts. + * + *

If counters were cached via {@link #initCounters(TaskInputOutputContext)}, + * uses the cached references for better performance. + * + * @param context the Hadoop task context + */ + public void emitCounters(TaskInputOutputContext context) { + // Use cached counters if available, otherwise look up + if (cachedTotalCounter != null) { + cachedTotalCounter.increment(totalCount.get()); + for (ErrorType type : ErrorType.values()) { + long count = counts.get(type).get(); + if (count > 0) { + cachedCounters.get(type).increment(count); + } + } + } else { + // Fallback to direct lookup + context.getCounter(group, NutchMetrics.ERROR_TOTAL).increment(totalCount.get()); + for (ErrorType type : ErrorType.values()) { + long count = counts.get(type).get(); + if (count > 0) { + context.getCounter(group, getCounterName(type)).increment(count); + } + } + } + } + + /** + * Directly increments cached error counters without local accumulation. + * + *

Use this method when you want to immediately update Hadoop counters + * rather than accumulating locally and emitting in cleanup. + * Requires {@link #initCounters(TaskInputOutputContext)} to have been called. + * + * @param t the throwable to categorize and count + * @throws IllegalStateException if counters have not been initialized + */ + public void incrementCounters(Throwable t) { + incrementCounters(categorize(t)); + } + + /** + * Directly increments cached error counters without local accumulation. + * + *

Use this method when you want to immediately update Hadoop counters + * rather than accumulating locally and emitting in cleanup. + * Requires {@link #initCounters(TaskInputOutputContext)} to have been called. + * + * @param type the error type to count + * @throws IllegalStateException if counters have not been initialized + */ + public void incrementCounters(ErrorType type) { + if (cachedTotalCounter == null) { + throw new IllegalStateException( + "Counters not initialized. Call initCounters() first."); + } + cachedTotalCounter.increment(1); + cachedCounters.get(type).increment(1); + } + + /** + * Categorizes a throwable into an error type. + * + *

The categorization checks the exception class hierarchy to determine + * the most appropriate category. Timeout exceptions are checked first as + * they are a subclass of IOException. + * + * @param t the throwable to categorize + * @return the appropriate ErrorType for the throwable + */ + public static ErrorType categorize(Throwable t) { + if (t == null) { + return ErrorType.OTHER; + } + + String className = t.getClass().getName(); + + // Check for timeout first (before general IOException) + if (t instanceof SocketTimeoutException + || className.contains("TimeoutException") + || className.contains("Timeout")) { + return ErrorType.TIMEOUT; + } + + // Network errors + if (t instanceof SocketException + || t instanceof UnknownHostException + || className.contains("ConnectException") + || className.contains("NoRouteToHostException") + || className.contains("ConnectionRefusedException")) { + return ErrorType.NETWORK; + } + + // URL errors (check before general IOException since MalformedURLException extends IOException) + if (t instanceof MalformedURLException + || className.contains("URLFilterException") + || className.contains("URISyntaxException")) { + return ErrorType.URL; + } + + // General IOException (but not the specific subtypes above) + if (t instanceof IOException) { + return ErrorType.NETWORK; + } + + // Protocol errors + if (className.contains("ProtocolException") + || className.contains("ProtocolNotFound")) { + return ErrorType.PROTOCOL; + } + + // Parsing errors + if (className.contains("ParseException") + || className.contains("ParserNotFound") + || className.contains("SAXException") + || className.contains("ParserConfigurationException")) { + return ErrorType.PARSING; + } + + // Scoring errors + if (className.contains("ScoringFilterException")) { + return ErrorType.SCORING; + } + + // Indexing errors + if (className.contains("IndexingException")) { + return ErrorType.INDEXING; + } + + // Check cause chain for more specific categorization + Throwable 
cause = t.getCause(); + if (cause != null && cause != t) { + ErrorType causeType = categorize(cause); + if (causeType != ErrorType.OTHER) { + return causeType; + } + } + + return ErrorType.OTHER; + } + + /** + * Gets the counter name constant for a given error type. + * + * @param type the error type + * @return the counter name constant from NutchMetrics + */ + public static String getCounterName(ErrorType type) { + switch (type) { + case NETWORK: + return NutchMetrics.ERROR_NETWORK_TOTAL; + case PROTOCOL: + return NutchMetrics.ERROR_PROTOCOL_TOTAL; + case PARSING: + return NutchMetrics.ERROR_PARSING_TOTAL; + case URL: + return NutchMetrics.ERROR_URL_TOTAL; + case SCORING: + return NutchMetrics.ERROR_SCORING_TOTAL; + case INDEXING: + return NutchMetrics.ERROR_INDEXING_TOTAL; + case TIMEOUT: + return NutchMetrics.ERROR_TIMEOUT_TOTAL; + case OTHER: + default: + return NutchMetrics.ERROR_OTHER_TOTAL; + } + } + + /** + * Gets the counter name for a throwable based on its categorization. + * + *

This is a convenience method for direct use in catch blocks: + *

+   * } catch (Exception e) {
+   *     context.getCounter(group, ErrorTracker.getCounterName(e)).increment(1);
+   * }
+   *

+ * + * @param t the throwable to get the counter name for + * @return the counter name constant from NutchMetrics + */ + public static String getCounterName(Throwable t) { + return getCounterName(categorize(t)); + } +} diff --git a/src/java/org/apache/nutch/metrics/LatencyTracker.java b/src/java/org/apache/nutch/metrics/LatencyTracker.java new file mode 100644 index 0000000000..3777bb29e3 --- /dev/null +++ b/src/java/org/apache/nutch/metrics/LatencyTracker.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metrics; + +import org.apache.hadoop.mapreduce.TaskInputOutputContext; + +import com.tdunning.math.stats.TDigest; + +/** + * A utility class for tracking latency metrics using TDigest for percentile + * calculation. + * + *

This class wraps a TDigest data structure to collect latency samples and + * emit Hadoop counters with count, sum, and percentile values (p50, p95, p99). + * + *

Usage: + *

+ * // In mapper/reducer setup
+ * latencyTracker = new LatencyTracker(NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_LATENCY);
+ * 
+ * // During processing
+ * long start = System.currentTimeMillis();
+ * // ... operation ...
+ * latencyTracker.record(System.currentTimeMillis() - start);
+ * 
+ * // In cleanup
+ * latencyTracker.emitCounters(context);
+ *

+ * + *

Emits the following counters: + *

{prefix}_count_total - total number of samples
{prefix}_sum_ms - sum of all latencies in milliseconds
{prefix}_p50_ms - 50th percentile (median) latency
{prefix}_p95_ms - 95th percentile latency
{prefix}_p99_ms - 99th percentile latency

+ * + * @since 1.22 + */ +public class LatencyTracker { + + /** Default compression factor for TDigest (controls accuracy vs memory). */ + private static final double DEFAULT_COMPRESSION = 100.0; + + private final TDigest digest; + private final String group; + private final String prefix; + private long count = 0; + private long sum = 0; + + /** + * Creates a new LatencyTracker. + * + * @param group the Hadoop counter group name + * @param prefix the prefix for counter names (e.g., "fetch_latency") + */ + public LatencyTracker(String group, String prefix) { + this.digest = TDigest.createDigest(DEFAULT_COMPRESSION); + this.group = group; + this.prefix = prefix; + } + + /** + * Records a latency sample. + * + * @param latencyMs the latency in milliseconds + */ + public void record(long latencyMs) { + digest.add(latencyMs); + count++; + sum += latencyMs; + } + + /** + * Returns the number of recorded samples. + * + * @return the count of recorded latency samples + */ + public long getCount() { + return count; + } + + /** + * Returns the sum of all recorded latencies. + * + * @return the sum of latencies in milliseconds + */ + public long getSum() { + return sum; + } + + /** + * Returns the percentile value for the given quantile. + * + * @param quantile the quantile (0.0 to 1.0) + * @return the percentile value in milliseconds + */ + public long getPercentile(double quantile) { + if (count == 0) { + return 0; + } + return (long) digest.quantile(quantile); + } + + /** + * Emits all latency counters to the Hadoop context. + * + *

Should be called once during cleanup to emit aggregated metrics. + * + * @param context the Hadoop task context + */ + public void emitCounters(TaskInputOutputContext context) { + context.getCounter(group, prefix + "_count_total").setValue(count); + context.getCounter(group, prefix + "_sum_ms").setValue(sum); + + if (count > 0) { + context.getCounter(group, prefix + "_p50_ms").setValue((long) digest.quantile(0.50)); + context.getCounter(group, prefix + "_p95_ms").setValue((long) digest.quantile(0.95)); + context.getCounter(group, prefix + "_p99_ms").setValue((long) digest.quantile(0.99)); + } else { + // Set to 0 if no samples recorded + context.getCounter(group, prefix + "_p50_ms").setValue(0); + context.getCounter(group, prefix + "_p95_ms").setValue(0); + context.getCounter(group, prefix + "_p99_ms").setValue(0); + } + } +} + + diff --git a/src/java/org/apache/nutch/metrics/NutchMetrics.java b/src/java/org/apache/nutch/metrics/NutchMetrics.java new file mode 100644 index 0000000000..14979803a1 --- /dev/null +++ b/src/java/org/apache/nutch/metrics/NutchMetrics.java @@ -0,0 +1,720 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.metrics; + +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.FetchSchedule; + +/** + * Centralized constants for Hadoop metrics counter groups and names. + * + *

Follows Prometheus + * naming conventions: + *

Counter groups use the {@code nutch_} prefix namespace
Counter names use snake_case
Accumulating counters use {@code _total} suffix
Units are included in counter names where applicable (e.g., {@code _bytes})

+ * + * @since 1.22 + */ +public final class NutchMetrics { + + private NutchMetrics() { + // Utility class - prevent instantiation + } + + // ========================================================================= + // Counter Groups (Prometheus namespace style with nutch_ prefix) + // ========================================================================= + + /** Counter group for fetcher operations. */ + public static final String GROUP_FETCHER = "nutch_fetcher"; + + /** Counter group for fetcher outlink processing. */ + public static final String GROUP_FETCHER_OUTLINKS = "nutch_fetcher_outlinks"; + + /** Counter group for generator operations. */ + public static final String GROUP_GENERATOR = "nutch_generator"; + + /** Counter group for indexer operations. */ + public static final String GROUP_INDEXER = "nutch_indexer"; + + /** Counter group for CrawlDb operations. */ + public static final String GROUP_CRAWLDB = "nutch_crawldb"; + + /** Counter group for CrawlDb filter operations. */ + public static final String GROUP_CRAWLDB_FILTER = "nutch_crawldb_filter"; + + /** Counter group for injector operations. */ + public static final String GROUP_INJECTOR = "nutch_injector"; + + /** Counter group for HostDb operations. */ + public static final String GROUP_HOSTDB = "nutch_hostdb"; + + /** Counter group for parser operations. */ + public static final String GROUP_PARSER = "nutch_parser"; + + /** Counter group for deduplication operations. */ + public static final String GROUP_DEDUP = "nutch_dedup"; + + /** Counter group for cleaning job operations. */ + public static final String GROUP_CLEANING = "nutch_cleaning"; + + /** Counter group for WebGraph operations. */ + public static final String GROUP_WEBGRAPH = "nutch_webgraph"; + + /** Counter group for sitemap processing operations. */ + public static final String GROUP_SITEMAP = "nutch_sitemap"; + + /** Counter group for WARC export operations. 
*/ + public static final String GROUP_WARC_EXPORTER = "nutch_warc_exporter"; + + /** Counter group for Common Crawl data dumper tool. */ + public static final String GROUP_COMMONCRAWL_DUMPER = "nutch_commoncrawl_dumper"; + + /** Counter group for domain statistics operations. */ + public static final String GROUP_DOMAIN_STATS = "nutch_domain_stats"; + + // ========================================================================= + // Fetcher Counters + // ========================================================================= + + /** Total bytes downloaded by fetcher. */ + public static final String FETCHER_BYTES_DOWNLOADED_TOTAL = "bytes_downloaded_total"; + + /** URLs denied by robots.txt. */ + public static final String FETCHER_ROBOTS_DENIED_TOTAL = "robots_denied_total"; + + /** URLs denied due to crawl delay exceeding maximum. */ + public static final String FETCHER_ROBOTS_DENIED_MAXCRAWLDELAY_TOTAL = "robots_denied_maxcrawldelay_total"; + + /** URLs dropped due to robots.txt deferred visits. */ + public static final String FETCHER_ROBOTS_DEFER_VISITS_DROPPED_TOTAL = "robots_defer_visits_dropped_total"; + + /** Redirects that exceeded maximum redirect count. */ + public static final String FETCHER_REDIRECT_COUNT_EXCEEDED_TOTAL = "redirect_count_exceeded_total"; + + /** Redirects deduplicated (already seen). */ + public static final String FETCHER_REDIRECT_DEDUPLICATED_TOTAL = "redirect_deduplicated_total"; + + /** FetchItems not created for redirects. */ + public static final String FETCHER_REDIRECT_NOT_CREATED_TOTAL = "redirect_not_created_total"; + + /** URLs hit by time limit. */ + public static final String FETCHER_HIT_BY_TIMELIMIT_TOTAL = "hit_by_timelimit_total"; + + /** URLs hit by timeout. */ + public static final String FETCHER_HIT_BY_TIMEOUT_TOTAL = "hit_by_timeout_total"; + + /** URLs hit by throughput threshold. 
*/ + public static final String FETCHER_HIT_BY_THROUGHPUT_THRESHOLD_TOTAL = "hit_by_throughput_threshold_total"; + + /** Threads that hung during fetching. */ + public static final String FETCHER_HUNG_THREADS_TOTAL = "hung_threads_total"; + + /** URLs filtered during fetching. */ + public static final String FETCHER_FILTERED_TOTAL = "filtered_total"; + + /** URLs dropped due to exception threshold in queue. */ + public static final String FETCHER_ABOVE_EXCEPTION_THRESHOLD_TOTAL = "above_exception_threshold_total"; + + // ========================================================================= + // Fetcher Outlinks Counters + // ========================================================================= + + /** Outlinks detected during parsing. */ + public static final String FETCHER_OUTLINKS_DETECTED_TOTAL = "outlinks_detected_total"; + + /** Outlinks being followed. */ + public static final String FETCHER_OUTLINKS_FOLLOWING_TOTAL = "outlinks_following_total"; + + // ========================================================================= + // Fetcher Common Crawl extensions + // ========================================================================= + + /** HTTP protocol version group with dynamic counters. */ + public static final String FETCHER_HTTP_PROTOCOL_VERSION_GROUP = "http_protocol_version"; + + public static final String FETCHER_HTTP_PROTOCOL_UNKNOWN = "unknown"; + + /** SSL/TLS protocol version group with dynamic counters. */ + public static final String FETCHER_TLS_PROTOCOL_VERSION_GROUP = "tls_protocol_version"; + + /** IP address version group with two counters: ipv4 and ipv6. */ + public static final String FETCHER_IP_ADDRESS_VERSION_GROUP = "ip_address_version"; + + /** Number of fetches over IPv4. */ + public static final String FETCHER_IPV4_TOTAL = "ipv4"; + + /** Number of fetches over IPv6. */ + public static final String FETCHER_IPV6_TOTAL = "ipv6"; + + /** Archiving of robots.txt captures. 
*/ + public static final String FETCHER_ROBOTSTXT_ARCHIVING_GROUP = "robotstxt_archiving"; + + /** Robots.txt not archived: URL rejected by URL filters. */ + public static final String FETCHER_ROBOTSTXT_ARCHIVING_FILTERED_TOTAL = "filtered"; + + /** Robots.txt not archived: MIME type rejected. */ + public static final String FETCHER_ROBOTSTXT_ARCHIVING_FILTERED_MIME_TOTAL = "filtered_mime"; + + /** + * Robots.txt not archived: URL path not /robots.txt and + * disallowed by robots.txt. + */ + public static final String FETCHER_ROBOTSTXT_ARCHIVING_ROBOTS_DENIED_TOTAL = "robots_denied"; + + // ========================================================================= + // Common Crawl's WarcWriter + // ========================================================================= + + /** Counter group for Common Crawl's WARC writer. */ + public static final String GROUP_WARC_WRITER = "warc_writer"; + + /** Skipped records because no content (and protocol status) is available. */ + public static final String WARC_WRITER_SKIPPED_NO_CONTENT_TOTAL = "skipped_no_content"; + + /** Fixed records: invalid URI normalized. */ + public static final String WARC_WRITER_URI_NORMALIZED_TOTAL = "fixed_uri"; + + /** Skipped records because URL is not a valid URI (no WARC-Target-URI). */ + public static final String WARC_WRITER_SKIPPED_INVALID_URI_TOTAL = "skipped_invalid_uri"; + + /** Skipped records by content type / MIME type. */ + public static final String WARC_WRITER_SKIPPED_BY_CONTENT_TYPE_TOTAL = "skipped_by_content_type"; + + /** Skipped duplicate records. */ + public static final String WARC_WRITER_SKIPPED_DUPLICATE_TOTAL = "skipped_duplicate"; + + /** Skipped records: no protocol status. */ + public static final String WARC_WRITER_SKIPPED_NO_PROTOCOL_STATUS_TOTAL = "skipped_no_protocol_status"; + + /** Skipped records: unknown protocol status. 
*/ + public static final String WARC_WRITER_SKIPPED_UNKNOWN_PROTOCOL_STATUS_TOTAL = "skipped_unknown_protocol_status"; + + /** Prefix for error status of language identification (LID), returned by CLD2 Java bindings. */ + public static final String WARC_WRITER_LID_ERROR_PREFIX = "lid_error: "; + + /** Language identification (LID): no result. */ + public static final String WARC_WRITER_LID_NO_RESULT_TOTAL = "lid_no_result"; + + /** Language identification (LID): result is reliable. */ + public static final String WARC_WRITER_LID_RESULT_RELIABLE_TOTAL = "lid_reliable"; + + /** Language identification (LID): result is not reliable. */ + public static final String WARC_WRITER_LID_RESULT_NOT_RELIABLE_TOTAL = "lid_not_reliable"; + + // ========================================================================= + // Generator Counters + // ========================================================================= + + /** URLs rejected by URL filters. */ + public static final String GENERATOR_URL_FILTERS_REJECTED_TOTAL = "url_filters_rejected_total"; + + /** URLs rejected by fetch schedule. */ + public static final String GENERATOR_SCHEDULE_REJECTED_TOTAL = "schedule_rejected_total"; + + /** URLs waiting for CrawlDb update. */ + public static final String GENERATOR_WAIT_FOR_UPDATE_TOTAL = "wait_for_update_total"; + + /** URLs rejected by JEXL expression. */ + public static final String GENERATOR_EXPR_REJECTED_TOTAL = "expr_rejected_total"; + + /** URLs rejected due to status restriction. */ + public static final String GENERATOR_STATUS_REJECTED_TOTAL = "status_rejected_total"; + + /** URLs rejected due to score below threshold. */ + public static final String GENERATOR_SCORE_TOO_LOW_TOTAL = "score_too_low_total"; + + /** URLs rejected due to fetch interval exceeding threshold. */ + public static final String GENERATOR_INTERVAL_REJECTED_TOTAL = "interval_rejected_total"; + + /** URLs skipped due to per-host overflow. 
*/ + public static final String GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL = "urls_skipped_per_host_overflow_total"; + + /** Hosts affected by per-host overflow. */ + public static final String GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL = "hosts_affected_per_host_overflow_total"; + + // ========================================================================= + // Generator2-specific Counters + // ========================================================================= + + /** Domains affected by per-domain overflow. All remaining URLs of this domain have been skipped, but were not counted. */ + public static final String GENERATOR_DOMAINS_AFFECTED_PER_DOMAIN_OVERFLOW_TOTAL = "domains_affected_per_domain_overflow_total"; + + /** Domains affected by max. number of hosts per domain overflow. URLs from further hosts below this domain have been skipped. */ + public static final String GENERATOR_DOMAINS_AFFECTED_PER_MAX_NUM_HOSTS_OVERFLOW_TOTAL = "domains_affected_num_hosts_overflow_total"; + + /** URLs skipped due to the max. number of hosts per domain overflow. */ + public static final String GENERATOR_URLS_SKIPPED_PER_MAX_NUM_HOSTS_OVERFLOW_TOTAL = "urls_skipped_per_max_num_host_overflow_total"; + + /** URLs skipped due to per-segment overflow. */ + public static final String GENERATOR_URLS_SKIPPED_PER_SEGMENT_OVERFLOW_TOTAL = "urls_skipped_per_segment_overflow_total"; + + /** + * Counter group for items by status, rejected by the fetch schedule. See + * {@link FetchSchedule#shouldFetch(Text, CrawlDatum, long)}. + */ + public static final String GROUP_GENERATOR_SCHEDULE_REJECTED_BY_STATUS = "schedule_rejected_by_status"; + + /** + * Counter group for items by status, rejected because the generator score is + * lower than the minimum score defined per generate.min.score. + */ + public static final String GROUP_GENERATOR_SCORE_REJECTED_BY_STATUS = "score_rejected_by_status"; + + /** Counter group for items by status, selected for fetch. 
*/ + public static final String GROUP_GENERATOR_SELECTED_BY_STATUS = "selected_by_status"; + + // ========================================================================= + // Indexer Counters + // ========================================================================= + + /** Documents deleted due to robots noindex. */ + public static final String INDEXER_DELETED_ROBOTS_NOINDEX_TOTAL = "deleted_robots_noindex_total"; + + /** Documents deleted because they are gone. */ + public static final String INDEXER_DELETED_GONE_TOTAL = "deleted_gone_total"; + + /** Documents deleted due to redirects. */ + public static final String INDEXER_DELETED_REDIRECTS_TOTAL = "deleted_redirects_total"; + + /** Documents deleted as duplicates. */ + public static final String INDEXER_DELETED_DUPLICATES_TOTAL = "deleted_duplicates_total"; + + /** Documents deleted by indexing filter. */ + public static final String INDEXER_DELETED_BY_INDEXING_FILTER_TOTAL = "deleted_by_indexing_filter_total"; + + /** Documents skipped (not modified). */ + public static final String INDEXER_SKIPPED_NOT_MODIFIED_TOTAL = "skipped_not_modified_total"; + + /** Documents skipped by indexing filter. */ + public static final String INDEXER_SKIPPED_BY_INDEXING_FILTER_TOTAL = "skipped_by_indexing_filter_total"; + + /** Documents indexed (added or updated). */ + public static final String INDEXER_INDEXED_TOTAL = "indexed_total"; + + // ========================================================================= + // CrawlDb Counters + // ========================================================================= + + /** URLs filtered during CrawlDb operations. */ + public static final String CRAWLDB_URLS_FILTERED_TOTAL = "urls_filtered_total"; + + /** Gone (404) records removed during CrawlDb operations. */ + public static final String CRAWLDB_GONE_RECORDS_REMOVED_TOTAL = "gone_records_removed_total"; + + /** Orphan records removed during CrawlDb operations. 
*/ + public static final String CRAWLDB_ORPHAN_RECORDS_REMOVED_TOTAL = "orphan_records_removed_total"; + + // ========================================================================= + // Injector Counters + // ========================================================================= + + /** URLs filtered during injection. */ + public static final String INJECTOR_URLS_FILTERED_TOTAL = "urls_filtered_total"; + + /** URLs injected. */ + public static final String INJECTOR_URLS_INJECTED_TOTAL = "urls_injected_total"; + + /** Unique URLs injected. */ + public static final String INJECTOR_URLS_INJECTED_UNIQUE_TOTAL = "urls_injected_unique_total"; + + /** URLs merged with existing CrawlDb entries. */ + public static final String INJECTOR_URLS_MERGED_TOTAL = "urls_merged_total"; + + /** URLs purged due to 404 status. */ + public static final String INJECTOR_URLS_PURGED_404_TOTAL = "urls_purged_404_total"; + + /** URLs purged by filter. */ + public static final String INJECTOR_URLS_PURGED_FILTER_TOTAL = "urls_purged_filter_total"; + + // ========================================================================= + // HostDb Counters + // ========================================================================= + + /** Records filtered in HostDb. */ + public static final String HOSTDB_FILTERED_RECORDS_TOTAL = "filtered_records_total"; + + /** Total hosts processed. */ + public static final String HOSTDB_TOTAL_HOSTS_TOTAL = "total_hosts_total"; + + /** Hosts skipped (not eligible). */ + public static final String HOSTDB_SKIPPED_NOT_ELIGIBLE_TOTAL = "skipped_not_eligible_total"; + + /** Hosts where URL limit was not reached. */ + public static final String HOSTDB_URL_LIMIT_NOT_REACHED_TOTAL = "url_limit_not_reached_total"; + + /** New known hosts discovered. */ + public static final String HOSTDB_NEW_KNOWN_HOST_TOTAL = "new_known_host_total"; + + /** Rediscovered hosts. 
*/ + public static final String HOSTDB_REDISCOVERED_HOST_TOTAL = "rediscovered_host_total"; + + /** Existing known hosts. */ + public static final String HOSTDB_EXISTING_KNOWN_HOST_TOTAL = "existing_known_host_total"; + + /** New unknown hosts. */ + public static final String HOSTDB_NEW_UNKNOWN_HOST_TOTAL = "new_unknown_host_total"; + + /** Existing unknown hosts. */ + public static final String HOSTDB_EXISTING_UNKNOWN_HOST_TOTAL = "existing_unknown_host_total"; + + /** Purged unknown hosts. */ + public static final String HOSTDB_PURGED_UNKNOWN_HOST_TOTAL = "purged_unknown_host_total"; + + /** Hosts checked. */ + public static final String HOSTDB_CHECKED_HOSTS_TOTAL = "checked_hosts_total"; + + // ========================================================================= + // Deduplication Counters + // ========================================================================= + + /** Documents marked as duplicate. */ + public static final String DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL = "documents_marked_duplicate_total"; + + // ========================================================================= + // Redirect Deduplication Counters + // ========================================================================= + + /** Redirects kept as non-duplicates. */ + public static final String DEDUP_REDIRECTS_NOT_DUPLICATES_TOTAL = "redirects_marked_not_duplicate_total"; + + /** Redirects in CrawlDb. */ + public static final String DEDUP_REDIRECTS_IN_CRAWLDB_TOTAL = "redirects_in_crawldb_total"; + + /** Self-referential redirects in CrawlDb. */ + public static final String DEDUP_REDIRECTS_SELF_REFERENTIAL_TOTAL = "redirects_self_referential_total"; + + /** Self-referential redirects kept as non-duplicates. 
*/ + public static final String DEDUP_REDIRECTS_SELF_REFERENTIAL_NOT_DUPLICATES_TOTAL = "redirects_self_referential_marked_not_duplicate_total"; + + // ========================================================================= + // Cleaning Job Counters + // ========================================================================= + + /** Documents deleted during cleaning. */ + public static final String CLEANING_DELETED_DOCUMENTS_TOTAL = "deleted_documents_total"; + + // ========================================================================= + // WebGraph Counters + // ========================================================================= + + /** Links added to WebGraph. */ + public static final String WEBGRAPH_ADDED_LINKS_TOTAL = "added_links_total"; + + /** Links removed from WebGraph. */ + public static final String WEBGRAPH_REMOVED_LINKS_TOTAL = "removed_links_total"; + + // ========================================================================= + // Sitemap Counters + // ========================================================================= + + /** Filtered records in sitemap processing. */ + public static final String SITEMAP_FILTERED_RECORDS_TOTAL = "filtered_records_total"; + + /** Seeds extracted from sitemaps. */ + public static final String SITEMAP_SEEDS_TOTAL = "sitemap_seeds_total"; + + /** Sitemaps discovered from hostname. */ + public static final String SITEMAP_FROM_HOSTNAME_TOTAL = "sitemaps_from_hostname_total"; + + /** Sitemaps filtered from hostname. */ + public static final String SITEMAP_FILTERED_FROM_HOSTNAME_TOTAL = "filtered_sitemaps_from_hostname_total"; + + /** Failed sitemap fetches. */ + public static final String SITEMAP_FAILED_FETCHES_TOTAL = "failed_fetches_total"; + + /** Existing sitemap entries. */ + public static final String SITEMAP_EXISTING_ENTRIES_TOTAL = "existing_sitemap_entries_total"; + + /** New sitemap entries. 
*/ + public static final String SITEMAP_NEW_ENTRIES_TOTAL = "new_sitemap_entries_total"; + + // ========================================================================= + // SitemapInjector Counters + // ========================================================================= + + /** SitemapInjector counter group. */ + public static final String GROUP_SITEMAP_INJECTOR = "sitemap_injector"; + + /** Failed to fetch sitemap content, disallowed per robots.txt. */ + public static final String SITEMAP_ROBOTSTXT_DISALLOW_TOTAL = "sitemap_robotstxt_disallow"; + + /** Sitemap failed to parse. */ + public static final String SITEMAP_FAILED_TO_PARSE_TOTAL = "sitemaps_failed_to_parse"; + + /** Prefix for sitemap type counter. */ + public static final String SITEMAP_TYPE_PREFIX = "sitemap_type_"; + + /** Sitemaps processed total. */ + public static final String SITEMAP_PROCESSED_TOTAL = "sitemaps_processed"; + + /** Sitemap index: affected by URL limit. */ + public static final String SITEMAP_INDEX_AFFECTED_BY_URL_LIMIT_TOTAL = "sitemap_index_url_limit"; + + /** Sitemap index: affected by depth limit. */ + public static final String SITEMAP_INDEX_AFFECTED_BY_DEPTH_LIMIT_TOTAL = "sitemap_index_depth_limit"; + + /** Sitemap index: affected by time limit. */ + public static final String SITEMAP_INDEX_AFFECTED_BY_TIME_LIMIT_TOTAL = "sitemap_index_time_limit"; + + /** Sitemap index: skipped because no URLs found after 50% of time limit. */ + public static final String SITEMAP_INDEX_NO_URLS_AFTER_50_PERCENT_OF_TIME_LIMIT_TOTAL = "sitemap_index_no_urls_after_50_percent_of_time_limit"; + + /** Sitemap index: skipped because of too many fetch failures. */ + public static final String SITEMAP_INDEX_TOO_MANY_FAILURES_TOTAL = "sitemap_index_too_many_failures"; + + /** Sitemap index: processed sitemaps. */ + public static final String SITEMAP_INDEX_PROCESSED_SITEMAPS_TOTAL = "sitemap_index_processed_sitemaps"; + + /** Skipped duplicated or recursive sitemap URLs. 
*/ + public static final String SITEMAP_SKIPPED_DUPLICATE_OR_RECURSIVE_URL_TOTAL = "sitemap_skipped_duplicate_or_recursive_sitemap_url"; + + /** Sitemap index: affected by max. number of sitemaps in index. */ + public static final String SITEMAP_INDEX_MAX_SITEMAPS_LIMIT_TOTAL = "sitemap_index_max_sitemaps_limit"; + + /** Sitemap failed to fetch. */ + public static final String SITEMAP_FAILED_TO_FETCH_TOTAL = "sitemap_failed_to_fetch"; + + /** Sitemap skipped because of overlong URL. */ + public static final String SITEMAP_SKIPPED_OVERLONG_URL_TOTAL = "sitemap_skipped_overlong_url"; + + /** Sitemap rejected by URL filters. */ + public static final String SITEMAP_REJECTED_BY_URL_FILTERS_TOTAL = "sitemap_rejected_by_url_filters"; + + /** Sitemap skipped, too many failures per host. */ + public static final String SITEMAP_SKIPPED_TOO_MANY_FAILURES_PER_HOST_TOTAL = "sitemap_skipped_too_many_failures_per_host"; + + /** Could not fetch sitemap content, protocol not supported. */ + public static final String SITEMAP_PROTOCOL_NOT_SUPPORTED_TOTAL = "sitemap_protocol_not_supported"; + + /** Failed to fetch sitemap content because of timeout. */ + public static final String SITEMAP_FAILED_TO_FETCH_TIMEOUT_TOTAL = "sitemap_failed_to_fetch_timeout"; + + /** Failed to fetch sitemap content because of exception. */ + public static final String SITEMAP_FAILED_TO_FETCH_EXCEPTION_TOTAL = "sitemap_failed_to_fetch_exception"; + + /** Sitemap redirect. */ + public static final String SITEMAP_REDIRECT_TOTAL = "sitemap_redirect"; + + /** Sitemap redirect target rejected by URL filters. */ + public static final String SITEMAP_REDIRECT_TARGET_REJECTED_BY_URL_FILTERS_TOTAL = "sitemap_redirect_target_rejected_by_url_filters"; + + /** Sitemap redirect limit exceeded (max. number of redirects followed). */ + public static final String SITEMAP_REDIRECT_LIMIT_EXCEEDED_TOTAL = "sitemap_redirect_limit_exceeded"; + + /** Failed to fetch sitemap content, HTTP status != 200. 
*/ + public static final String SITEMAP_FAILED_TO_FETCH_CONTENT_HTTP_STATUS_CODE_NOT_200_TOTAL = "sitemap_failed_to_fetch_http_status_code_not_200"; + + /** Failed to fetch sitemap content, empty content. */ + public static final String SITEMAP_EMPTY_CONTENT_TOTAL = "sitemap_empty_content"; + + /** Empty sitemap. */ + public static final String SITEMAP_EMPTY_TOTAL = "sitemap_empty"; + + /** Sitemap URL limit reached. */ + public static final String SITEMAP_URL_LIMIT_REACHED_TOTAL = "sitemap_url_limit_reached"; + + /** URLs randomly skipped. */ + public static final String SITEMAP_RANDOM_SKIP_TOTAL = "urls_random_skip"; + + /** URLs from sitemaps rejected, host limit reached. */ + public static final String SITEMAP_URLS_SKIPPED_HOST_LIMIT_REACHED_TOTAL = "urls_skipped_host_limit_reached"; + + /** URLs from sitemaps rejected, target not allowed by cross-submit. */ + public static final String SITEMAP_URLS_SKIPPED_NOT_ALLOWED_BY_CROSS_SUBMITS_TOTAL = "urls_skipped_not_allowed_by_cross_submits"; + + /** URLs from sitemaps rejected by URL filters. */ + public static final String SITEMAP_URLS_FROM_REJECTED_BY_URL_FILTERS = "urls_from_sitemaps_rejected_by_url_filters"; + + /** URLs from sitemaps injected. */ + public static final String SITEMAP_URLS_INJECTED = "urls_from_sitemaps_injected"; + + // ========================================================================= + // WARC Exporter Counters + // ========================================================================= + + /** Missing content in WARC export. */ + public static final String WARC_MISSING_CONTENT_TOTAL = "missing_content_total"; + + /** Missing metadata in WARC export. */ + public static final String WARC_MISSING_METADATA_TOTAL = "missing_metadata_total"; + + /** Omitted empty responses in WARC export. */ + public static final String WARC_OMITTED_EMPTY_RESPONSE_TOTAL = "omitted_empty_response_total"; + + /** WARC records generated. 
*/ + public static final String WARC_RECORDS_GENERATED_TOTAL = "records_generated_total"; + + // ========================================================================= + // Domain Statistics Counters (enum-based, kept for compatibility) + // ========================================================================= + + /** Fetched URLs in domain statistics. */ + public static final String DOMAIN_STATS_FETCHED_TOTAL = "fetched_total"; + + /** Not fetched URLs in domain statistics. */ + public static final String DOMAIN_STATS_NOT_FETCHED_TOTAL = "not_fetched_total"; + + /** Empty results in domain statistics. */ + public static final String DOMAIN_STATS_EMPTY_RESULT_TOTAL = "empty_result_total"; + + // ========================================================================= + // UrlCleaner + // ========================================================================= + + public static final String GROUP_URLCLEANER = "urlcleaner"; + + public static final String URLCLEANER_REJECTED_TOTAL = "urls_rejected"; + + public static final String URLCLEANER_REJECTED_INVALID_DOMAIN_TOTAL = "urls_rejected_invalid_domain"; + + public static final String URLCLEANER_ACCEPTED_UNCHANGED_TOTAL = "urls_accepted_unchanged"; + + public static final String URLCLEANER_ACCEPTED_NORMALIZED_TOTAL = "urls_accepted_normalized"; + + // ========================================================================= + // UrlSampler and UrlSamplerHost + // ========================================================================= + + public static final String GROUP_URLSAMPLER = "urlsampler"; + + public static final String GROUP_URLSAMPLER_HOST = "urlsamplerhost"; + + public static final String URLSAMPLER_MALFORMED_URL_TOTAL = "malformed_url"; + + public static final String URLSAMPLER_SKIPPED_MAX_URLS_TOTAL = "skipped_max_urls"; + + public static final String URLSAMPLER_SKIPPED_MAX_URLS_PER_HOST_TOTAL = "skipped_max_urls_per_host"; + + public static final String URLSAMPLER_SKIPPED_MAX_HOSTS_TOTAL = 
"skipped_max_hosts"; + + public static final String URLSAMPLER_HOSTS = "hosts"; + + public static final String URLSAMPLER_URLS = "urls"; + + public static final String URLSAMPLER_HOSTS_WITH_LIMIT = "hosts_with_limit"; + + public static final String URLSAMPLER_URLS_HOST_WITH_LIMIT = "urls_host_with_limit"; + + public static final String URLSAMPLER_HOSTS_WITHOUT_LIMIT = "hosts_without_limit"; + + public static final String URLSAMPLER_URLS_HOST_WITHOUT_LIMIT = "urls_host_without_limit"; + + public static final String URLSAMPLER_URLS_SAMPLED = "urls_sampled"; + + public static final String URLSAMPLER_HOSTS_SAMPLED = "hosts_sampled"; + + public static final String URLSAMPLER_HOSTS_WITH_LIMIT_SAMPLED = "hosts_with_limit_sampled"; + + public static final String URLSAMPLER_URLS_HOST_WITH_LIMIT_SAMPLED = "urls_host_with_limit_sampled"; + + public static final String URLSAMPLER_HOSTS_WITHOUT_LIMIT_SAMPLED = "hosts_without_limit_sampled"; + + public static final String URLSAMPLER_URLS_HOST_WITHOUT_LIMIT_SAMPLED = "urls_host_without_limit_sampled"; + + public static final String URLSAMPLER_SKIPPED_MAX_URLS_PER_HOST = "skipped_max_urls_per_host"; + + public static final String URLSAMPLER_SKIPPED_RANDOM = "skipped_random"; + + // ========================================================================= + // Latency Metric Prefixes (used with LatencyTracker) + // ========================================================================= + + /** + * Prefix for fetch latency metrics. + * Used with {@link LatencyTracker} to emit fetch timing counters. + */ + public static final String FETCHER_LATENCY = "fetch_latency"; + + /** + * Prefix for parse latency metrics. + * Used with {@link LatencyTracker} to emit parse timing counters. + */ + public static final String PARSER_LATENCY = "parse_latency"; + + /** + * Prefix for indexer latency metrics. + * Used with {@link LatencyTracker} to emit indexing timing counters. 
+ */ + public static final String INDEXER_LATENCY = "index_latency"; + + // ========================================================================= + // Common Error Counter Names (used with component-specific groups) + // These constants are shared across all components for consistent error + // categorization. Use with ErrorTracker for automatic classification. + // ========================================================================= + + /** + * Total errors across all categories. + * This is incremented alongside any category-specific error counter. + */ + public static final String ERROR_TOTAL = "errors_total"; + + /** + * Network-related errors. + * Includes: IOException, SocketException, ConnectException, UnknownHostException + */ + public static final String ERROR_NETWORK_TOTAL = "errors_network_total"; + + /** + * Protocol errors. + * Includes: ProtocolException, ProtocolNotFound + */ + public static final String ERROR_PROTOCOL_TOTAL = "errors_protocol_total"; + + /** + * Parsing errors. + * Includes: ParseException, ParserNotFound + */ + public static final String ERROR_PARSING_TOTAL = "errors_parsing_total"; + + /** + * URL-related errors. + * Includes: MalformedURLException, URLFilterException + */ + public static final String ERROR_URL_TOTAL = "errors_url_total"; + + /** + * Scoring filter errors. + * Includes: ScoringFilterException + */ + public static final String ERROR_SCORING_TOTAL = "errors_scoring_total"; + + /** + * Indexing filter errors. + * Includes: IndexingException + */ + public static final String ERROR_INDEXING_TOTAL = "errors_indexing_total"; + + /** + * Timeout errors. + * Includes: SocketTimeoutException, connection timeouts + */ + public static final String ERROR_TIMEOUT_TOTAL = "errors_timeout_total"; + + /** + * Other uncategorized errors. + * Used as fallback for exceptions not matching any specific category. 
+ */ + public static final String ERROR_OTHER_TOTAL = "errors_other_total"; +} + diff --git a/src/java/org/apache/nutch/metrics/package-info.java b/src/java/org/apache/nutch/metrics/package-info.java new file mode 100644 index 0000000000..376605d043 --- /dev/null +++ b/src/java/org/apache/nutch/metrics/package-info.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Metrics infrastructure for Apache Nutch. + * + *

<p>This package provides centralized constants and utilities for Hadoop + * MapReduce metrics/counters following + * Prometheus naming + * conventions. + * + * <p>

The main class is {@link org.apache.nutch.metrics.NutchMetrics} which + * defines all counter group names and counter names as constants. + * + * @since 1.22 + */ +package org.apache.nutch.metrics; + diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java index 6b2fb5cee7..0b2a6f2290 100644 --- a/src/java/org/apache/nutch/parse/ParseSegment.java +++ b/src/java/org/apache/nutch/parse/ParseSegment.java @@ -37,6 +37,9 @@ import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.ErrorTracker; +import org.apache.nutch.metrics.LatencyTracker; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.Content; import org.apache.nutch.scoring.ScoringFilterException; @@ -80,12 +83,25 @@ public static class ParseSegmentMapper extends private Text newKey = new Text(); private ScoringFilters scfilters; private boolean skipTruncated; + private LatencyTracker parseLatencyTracker; + private ErrorTracker errorTracker; @Override public void setup(Mapper, Content, Text, ParseImpl>.Context context) { Configuration conf = context.getConfiguration(); scfilters = new ScoringFilters(conf); skipTruncated = conf.getBoolean(SKIP_TRUNCATED, true); + parseLatencyTracker = new LatencyTracker( + NutchMetrics.GROUP_PARSER, NutchMetrics.PARSER_LATENCY); + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_PARSER, context); + } + + @Override + public void cleanup(Mapper, Content, Text, ParseImpl>.Context context) + throws IOException, InterruptedException { + // Emit parse latency metrics + parseLatencyTracker.emitCounters(context); } @Override @@ -121,6 +137,7 @@ public void map(WritableComparable key, Content content, parseResult = parseUtil.parse(content); } catch (Exception e) { 
LOG.warn("Error parsing: {}: {}", key, StringUtils.stringifyException(e)); + errorTracker.incrementCounters(e); return; } @@ -129,7 +146,8 @@ public void map(WritableComparable key, Content content, Parse parse = entry.getValue(); ParseStatus parseStatus = parse.getData().getStatus(); - context.getCounter("ParserStatus", + // Dynamic counter based on parse status + context.getCounter(NutchMetrics.GROUP_PARSER, ParseStatus.majorCodes[parseStatus.getMajorCode()]).increment(1); if (!parseStatus.isSuccess()) { @@ -151,10 +169,13 @@ public void map(WritableComparable key, Content content, scfilters.passScoreAfterParsing(url, content, parse); } catch (ScoringFilterException e) { LOG.warn("Error passing score: {}: {}", url, e.getMessage()); + errorTracker.incrementCounters(ErrorTracker.ErrorType.SCORING); } long end = System.currentTimeMillis(); - LOG.info("Parsed ({}ms): {}", (end - start), url); + long parseTime = end - start; + parseLatencyTracker.record(parseTime); + LOG.info("Parsed ({}ms): {}", parseTime, url); context.write( url, diff --git a/src/java/org/apache/nutch/parse/ParseStatus.java b/src/java/org/apache/nutch/parse/ParseStatus.java index 052a342247..25b8ae1b47 100644 --- a/src/java/org/apache/nutch/parse/ParseStatus.java +++ b/src/java/org/apache/nutch/parse/ParseStatus.java @@ -56,7 +56,7 @@ public class ParseStatus implements Writable { // Secondary failure codes go here: /** - * Parsing failed. An Exception occured (which may be retrieved from the + * Parsing failed. An Exception occurred (which may be retrieved from the * arguments). 
*/ public static final short FAILED_EXCEPTION = 200; diff --git a/src/java/org/apache/nutch/plugin/PluginManifestParser.java b/src/java/org/apache/nutch/plugin/PluginManifestParser.java index 10ce4fdb7b..95208fa433 100644 --- a/src/java/org/apache/nutch/plugin/PluginManifestParser.java +++ b/src/java/org/apache/nutch/plugin/PluginManifestParser.java @@ -18,11 +18,11 @@ import java.io.File; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URL; import java.net.URLDecoder; +import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; @@ -124,10 +124,7 @@ public File getPluginFolder(String name) { String path = url.getPath(); if (WINDOWS && path.startsWith("/")) // patch a windows bug path = path.substring(1); - try { - path = URLDecoder.decode(path, "UTF-8"); // decode the url path - } catch (UnsupportedEncodingException e) { - } + path = URLDecoder.decode(path, StandardCharsets.UTF_8); // decode the url path directory = new File(path); } else if (!directory.exists()) { LOG.warn("Plugins: directory not found: {}", name); diff --git a/src/java/org/apache/nutch/protocol/Protocol.java b/src/java/org/apache/nutch/protocol/Protocol.java index ab4162c87f..2514eae33e 100644 --- a/src/java/org/apache/nutch/protocol/Protocol.java +++ b/src/java/org/apache/nutch/protocol/Protocol.java @@ -16,6 +16,7 @@ */ package org.apache.nutch.protocol; +import java.net.URL; import java.util.List; import org.apache.hadoop.conf.Configurable; @@ -57,4 +58,24 @@ public interface Protocol extends Pluggable, Configurable { BaseRobotRules getRobotRules(Text url, CrawlDatum datum, List robotsTxtContent); + /** + * Retrieve robot rules applicable for this URL. 
+ * + * @param url + * URL to check + * @param datum + * page datum + * @param robotsTxtContent + * container to store responses when fetching the robots.txt file for + * debugging or archival purposes. Instead of a robots.txt file, it + * may include redirects or an error page (404, etc.). Response + * {@link Content} is appended to the passed list. If null is passed + * nothing is stored. + * @return robot rules (specific for this URL or default), never null + */ + default BaseRobotRules getRobotRules(URL url, CrawlDatum datum, + List robotsTxtContent) { + return getRobotRules(new Text(url.toString()), datum, robotsTxtContent); + } + } diff --git a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java index 4daefcd8f3..fee0921d0a 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java +++ b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java @@ -48,6 +48,7 @@ import org.apache.hadoop.io.WritableUtils; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; import org.apache.hadoop.mapreduce.Mapper; @@ -58,6 +59,7 @@ import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.parse.Outlink; @@ -327,6 +329,10 @@ public static class OutlinkDbReducer extends // url normalizers, filters and job configuration private Configuration conf; + // Cached counter references for performance + private Counter addedLinksCounter; + private Counter removedLinksCounter; + /** * Configures the OutlinkDb job reducer. Sets up internal links and link limiting. 
*/ @@ -339,6 +345,18 @@ public void setup(Reducer.Context context) limitPages = conf.getBoolean("link.ignore.limit.page", true); limitDomains = conf.getBoolean("link.ignore.limit.domain", true); + // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initCounters(Context context) { + addedLinksCounter = context.getCounter( + NutchMetrics.GROUP_WEBGRAPH, NutchMetrics.WEBGRAPH_ADDED_LINKS_TOTAL); + removedLinksCounter = context.getCounter( + NutchMetrics.GROUP_WEBGRAPH, NutchMetrics.WEBGRAPH_REMOVED_LINKS_TOTAL); } @Override @@ -361,14 +379,14 @@ public void reduce(Text key, Iterable values, mostRecent = timestamp; } outlinkList.add(WritableUtils.clone(next, conf)); - context.getCounter("WebGraph.outlinks", "added links").increment(1); + addedLinksCounter.increment(1); } else if (value instanceof BooleanWritable) { BooleanWritable delete = (BooleanWritable) value; // Actually, delete is always true, otherwise we don't emit it in the // mapper in the first place if (delete.get() == true) { // This page is gone, do not emit it's outlinks - context.getCounter("WebGraph.outlinks", "removed links").increment(1); + removedLinksCounter.increment(1); return; } } diff --git a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java index d5d5035e89..8e37c21fcf 100644 --- a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java +++ b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java @@ -67,6 +67,8 @@ import org.apache.nutch.crawl.LinkDbReader; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.ErrorTracker; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.protocol.Content; import org.apache.nutch.util.DumpFileUtil; import org.apache.nutch.util.NutchConfiguration; @@ -188,6 +190,7 @@ public class 
CommonCrawlDataDumper extends NutchTool implements Tool { private GzipCompressorOutputStream gzipOutput = null; private TarArchiveOutputStream tarOutput = null; private ArrayList fileList = null; + private ErrorTracker errorTracker; /** * Main method for invoking this tool @@ -210,6 +213,7 @@ public static void main(String[] args) throws Exception { * @param config A populated {@link CommonCrawlConfig} */ public CommonCrawlDataDumper(CommonCrawlConfig config) { + this(); this.config = config; } @@ -217,6 +221,7 @@ public CommonCrawlDataDumper(CommonCrawlConfig config) { * Constructor */ public CommonCrawlDataDumper() { + this.errorTracker = new ErrorTracker(NutchMetrics.GROUP_COMMONCRAWL_DUMPER); } /** @@ -274,7 +279,8 @@ public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip, if (parts == null || parts.size() == 0) { LOG.error( "No segment directories found in {} ", segmentRootDir.getAbsolutePath()); - System.exit(1); + this.errorTracker.recordError(ErrorTracker.ErrorType.OTHER); + return; } LOG.info("Found {} segment parts", parts.size()); if (gzip && !warc) { diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java index bf824f9b3f..14b59ac85c 100644 --- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java +++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java @@ -41,6 +41,7 @@ import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.Job; @@ -57,6 +58,8 @@ import org.apache.nutch.parse.ParseText; import org.apache.nutch.protocol.Content; import org.apache.nutch.tools.WARCUtils; +import org.apache.nutch.metrics.ErrorTracker; +import org.apache.nutch.metrics.NutchMetrics; import 
org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -111,6 +114,35 @@ public static class WARCReducer // Metadata to JSON Gson gson = new Gson(); + // Cached counter references to avoid repeated lookups in hot paths + private Counter missingContentCounter; + private Counter missingMetadataCounter; + private Counter omittedEmptyResponseCounter; + private Counter recordsGeneratedCounter; + private ErrorTracker errorTracker; + + @Override + public void setup(Context context) { + // Initialize cached counter references + initCounters(context); + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_WARC_EXPORTER, context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initCounters(Context context) { + missingContentCounter = context.getCounter( + NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_MISSING_CONTENT_TOTAL); + missingMetadataCounter = context.getCounter( + NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_MISSING_METADATA_TOTAL); + omittedEmptyResponseCounter = context.getCounter( + NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_OMITTED_EMPTY_RESPONSE_TOTAL); + recordsGeneratedCounter = context.getCounter( + NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_RECORDS_GENERATED_TOTAL); + } + @Override public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { @@ -147,13 +179,13 @@ public void reduce(Text key, Iterable values, // check that we have everything we need if (content == null) { LOG.info("Missing content for {}", key); - context.getCounter("WARCExporter", "missing content").increment(1); + missingContentCounter.increment(1); return; } if (cd == null) { LOG.info("Missing fetch datum for {}", key); - context.getCounter("WARCExporter", "missing metadata").increment(1); + missingMetadataCounter.increment(1); 
return; } @@ -161,8 +193,7 @@ public void reduce(Text key, Iterable values, // Empty responses is everything that was not a regular response if (!(cd.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS || cd.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED)) { - context.getCounter("WARCExporter", "omitted empty response") - .increment(1); + omittedEmptyResponseCounter.increment(1); return; } } @@ -237,7 +268,7 @@ public void reduce(Text key, Iterable values, .append(uri.toASCIIString()).append(CRLF); } catch (Exception e) { LOG.error("Invalid URI {} ", key); - context.getCounter("WARCExporter", "invalid URI").increment(1); + errorTracker.incrementCounters(e); return; } @@ -269,12 +300,12 @@ public void reduce(Text key, Iterable values, new ByteArrayInputStream(bos.toByteArray())); WARCRecord record = new WARCRecord(in); context.write(NullWritable.get(), new WARCWritable(record)); - context.getCounter("WARCExporter", "records generated").increment(1); + recordsGeneratedCounter.increment(1); } catch (IOException | IllegalStateException exception) { LOG.error( "Exception when generating WARC resource record for {} : {}", key, exception.getMessage()); - context.getCounter("WARCExporter", "exception").increment(1); + errorTracker.incrementCounters(exception); } // Do we need to emit a metadata record too? 
@@ -316,7 +347,7 @@ public void reduce(Text key, Iterable values, .append(uri.toASCIIString()).append(CRLF); } catch (Exception e) { LOG.error("Invalid URI {} ", key); - context.getCounter("WARCExporter", "invalid URI").increment(1); + errorTracker.incrementCounters(e); return; } @@ -332,13 +363,12 @@ public void reduce(Text key, Iterable values, new ByteArrayInputStream(bos.toByteArray())); WARCRecord record = new WARCRecord(in); context.write(NullWritable.get(), new WARCWritable(record)); - context.getCounter("WARCExporter", "records generated") - .increment(1); + recordsGeneratedCounter.increment(1); } catch (IOException | IllegalStateException exception) { LOG.error( "Exception when generating WARC metadata record for {} : {}", key, exception.getMessage(), exception); - context.getCounter("WARCExporter", "exception").increment(1); + errorTracker.incrementCounters(exception); } } @@ -376,7 +406,7 @@ public void reduce(Text key, Iterable values, .append(uri.toASCIIString()).append(CRLF); } catch (Exception e) { LOG.error("Invalid URI {} ", key); - context.getCounter("WARCExporter", "invalid URI").increment(1); + errorTracker.incrementCounters(e); return; } @@ -392,13 +422,12 @@ public void reduce(Text key, Iterable values, new ByteArrayInputStream(bos.toByteArray())); WARCRecord record = new WARCRecord(in); context.write(NullWritable.get(), new WARCWritable(record)); - context.getCounter("WARCExporter", "records generated") - .increment(1); + recordsGeneratedCounter.increment(1); } catch (IOException | IllegalStateException exception) { LOG.error( "Exception when generating WARC metadata record for {} : {}", key, exception.getMessage(), exception); - context.getCounter("WARCExporter", "exception").increment(1); + errorTracker.incrementCounters(exception); } } } diff --git a/src/java/org/apache/nutch/util/DomainStatistics.java b/src/java/org/apache/nutch/util/DomainStatistics.java index 5ee09c846a..4057795d52 100644 --- 
a/src/java/org/apache/nutch/util/DomainStatistics.java +++ b/src/java/org/apache/nutch/util/DomainStatistics.java @@ -28,6 +28,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; @@ -38,6 +39,7 @@ import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metrics.NutchMetrics; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -52,10 +54,6 @@ public class DomainStatistics extends Configured implements Tool { private static final Text FETCHED_TEXT = new Text("FETCHED"); private static final Text NOT_FETCHED_TEXT = new Text("NOT_FETCHED"); - public static enum MyCounter { - FETCHED, NOT_FETCHED, EMPTY_RESULT - }; - private static final int MODE_HOST = 1; private static final int MODE_DOMAIN = 2; private static final int MODE_SUFFIX = 3; @@ -158,10 +156,29 @@ static class DomainStatisticsMapper extends Mapper { int mode = 0; + // Cached counter references for performance + private Counter fetchedCounter; + private Counter notFetchedCounter; + private Counter emptyResultCounter; + @Override public void setup(Context context) { mode = context.getConfiguration().getInt("domain.statistics.mode", MODE_DOMAIN); + // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. 
+ */ + private void initCounters(Context context) { + fetchedCounter = context.getCounter( + NutchMetrics.GROUP_DOMAIN_STATS, NutchMetrics.DOMAIN_STATS_FETCHED_TOTAL); + notFetchedCounter = context.getCounter( + NutchMetrics.GROUP_DOMAIN_STATS, NutchMetrics.DOMAIN_STATS_NOT_FETCHED_TOTAL); + emptyResultCounter = context.getCounter( + NutchMetrics.GROUP_DOMAIN_STATS, NutchMetrics.DOMAIN_STATS_EMPTY_RESULT_TOTAL); } @Override @@ -197,17 +214,17 @@ public void map(Text urlText, CrawlDatum datum, Context context) } if (out.trim().equals("")) { LOG.info("url : {}", url); - context.getCounter(MyCounter.EMPTY_RESULT).increment(1); + emptyResultCounter.increment(1); } context.write(new Text(out), new LongWritable(1)); } catch (Exception ex) { } - context.getCounter(MyCounter.FETCHED).increment(1); + fetchedCounter.increment(1); context.write(FETCHED_TEXT, new LongWritable(1)); } else { - context.getCounter(MyCounter.NOT_FETCHED).increment(1); + notFetchedCounter.increment(1); context.write(NOT_FETCHED_TEXT, new LongWritable(1)); } } diff --git a/src/java/org/apache/nutch/util/JexlUtil.java b/src/java/org/apache/nutch/util/JexlUtil.java index 549aebc419..29e8a4f204 100644 --- a/src/java/org/apache/nutch/util/JexlUtil.java +++ b/src/java/org/apache/nutch/util/JexlUtil.java @@ -23,33 +23,159 @@ import org.apache.commons.jexl3.JexlBuilder; import org.apache.commons.jexl3.JexlEngine; +import org.apache.commons.jexl3.JexlFeatures; import org.apache.commons.jexl3.JexlScript; +import org.apache.commons.jexl3.introspection.JexlSandbox; import org.apache.commons.lang3.time.DateUtils; +import org.apache.hadoop.conf.Configuration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * Utility methods for handling JEXL expressions + * Utility methods for handling JEXL expressions used in crawl and index + * pipelines. 
Expressions are evaluated under a {@link JexlSandbox} with + * {@link JexlFeatures#newInstance(boolean)} disabled so arbitrary classes cannot + * be instantiated from user-supplied configuration. */ public class JexlUtil { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); + /** + * When {@code true}, JEXL parsing skips the sandbox (unsafe). For trusted + * environments only; not recommended. + */ + public static final String DISABLE_SANDBOX_KEY = "nutch.jexl.disable.sandbox"; + /** Supported format for date parsing yyyy-MM-ddTHH:mm:ssZ */ - private static final Pattern DATE_PATTERN = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z"); + private static final Pattern DATE_PATTERN = Pattern + .compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z"); + + /** + * Classes and interfaces that may be introspected when evaluating Nutch JEXL + * scripts. Default-deny sandbox: anything not listed is blocked. + */ + private static final String[] SANDBOX_ALLOW_CLASSES = { + "java.lang.String", + "java.lang.Boolean", + "java.lang.Byte", + "java.lang.Character", + "java.lang.Short", + "java.lang.Integer", + "java.lang.Long", + "java.lang.Float", + "java.lang.Double", + "java.lang.Number", + "java.lang.Math", + "java.lang.Comparable", + "java.lang.CharSequence", + "java.util.Map", + "java.util.List", + "java.util.Collection", + "java.util.Set", + "java.util.SortedMap", + "java.util.SortedSet", + "java.util.Iterator", + "java.lang.Iterable", + "java.util.AbstractList", + "java.util.AbstractCollection", + "java.util.AbstractMap", + "java.util.AbstractSet", + "java.util.ArrayList", + "java.util.LinkedList", + "java.util.HashMap", + "java.util.LinkedHashMap", + "java.util.HashSet", + "java.util.LinkedHashSet", + "java.util.TreeMap", + "java.util.TreeSet", + "java.util.Collections", + "java.util.Arrays", + "java.util.regex.Pattern", + "java.util.regex.Matcher", + "org.apache.commons.jexl3.MapContext", + 
"org.apache.nutch.indexer.NutchDocument", + "org.apache.nutch.indexer.NutchField", + }; + + private static volatile JexlEngine sandboxedEngine; + private static volatile JexlEngine legacyEngine; + + private JexlUtil() { + } + + private static JexlSandbox createSandbox() { + JexlSandbox sandbox = new JexlSandbox(false); + for (String name : SANDBOX_ALLOW_CLASSES) { + sandbox.allow(name); + } + return sandbox; + } + + private static JexlFeatures createFeatures() { + return new JexlFeatures(JexlFeatures.createDefault()).newInstance(false); + } + + private static JexlEngine getSandboxedEngine() { + if (sandboxedEngine == null) { + synchronized (JexlUtil.class) { + if (sandboxedEngine == null) { + sandboxedEngine = new JexlBuilder().silent(true).strict(true) + .sandbox(createSandbox()).features(createFeatures()).create(); + } + } + } + return sandboxedEngine; + } + + private static JexlEngine getLegacyEngine() { + if (legacyEngine == null) { + synchronized (JexlUtil.class) { + if (legacyEngine == null) { + legacyEngine = new JexlBuilder().silent(true).strict(true).create(); + } + } + } + return legacyEngine; + } + + private static JexlEngine engineFor(Configuration conf) { + if (conf != null && conf.getBoolean(DISABLE_SANDBOX_KEY, false)) { + LOG.warn("{}=true: JEXL sandbox is disabled; only use in fully trusted environments.", + DISABLE_SANDBOX_KEY); + return getLegacyEngine(); + } + return getSandboxedEngine(); + } /** - * Parses the given expression to a JEXL expression. This supports - * date parsing. + * Parses a JEXL expression using the default (sandboxed) engine. Use + * {@link #parseExpression(Configuration, String)} when a {@link Configuration} + * is available so {@link #DISABLE_SANDBOX_KEY} can be honored. 
* * @param expr string JEXL expression * @return parsed JEXL expression or null in case of parse error */ public static JexlScript parseExpression(String expr) { - if (expr == null) return null; - + return parseExpression(null, expr); + } + + /** + * Parses a JEXL expression. Unless {@link #DISABLE_SANDBOX_KEY} is set to + * {@code true} in {@code conf}, the expression is parsed for execution under + * a restrictive sandbox. + * + * @param conf Hadoop configuration, or null to always use the sandbox + * @param expr string JEXL expression + * @return parsed JEXL expression or null in case of parse error + */ + public static JexlScript parseExpression(Configuration conf, String expr) { + if (expr == null) { + return null; + } + try { // Translate any date object into a long. Dates must be in the DATE_PATTERN // format. For example: 2016-03-20T00:00:00Z @@ -57,22 +183,21 @@ public static JexlScript parseExpression(String expr) { if (matcher.find()) { String date = matcher.group(); - + // parse the matched substring and get the epoch - Date parsedDate = DateUtils.parseDateStrictly(date, new String[] {"yyyy-MM-dd'T'HH:mm:ss'Z'"}); + Date parsedDate = DateUtils.parseDateStrictly(date, + new String[] { "yyyy-MM-dd'T'HH:mm:ss'Z'" }); long time = parsedDate.getTime(); - + // replace the original string date with the numeric value expr = expr.replace(date, Long.toString(time)); } - JexlEngine jexl = new JexlBuilder().silent(true).strict(true).create(); - - return jexl.createScript(expr); + return engineFor(conf).createScript(expr); } catch (Exception e) { LOG.error(e.getMessage()); } - + return null; } } diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java index d83a6e358c..21362223cd 100644 --- a/src/java/org/apache/nutch/util/SitemapProcessor.java +++ b/src/java/org/apache/nutch/util/SitemapProcessor.java @@ -31,6 +31,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import 
org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; @@ -45,6 +46,8 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.hostdb.HostDatum; +import org.apache.nutch.metrics.ErrorTracker; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.protocol.Content; @@ -113,6 +116,14 @@ private static class SitemapMapper extends Mapper values, Context context) originalDatum.setModifiedTime(sitemapDatum.getModifiedTime()); } - context.getCounter("Sitemap", "existing_sitemap_entries").increment(1); + existingEntriesCounter.increment(1); context.write(key, originalDatum); } else if(sitemapDatum != null) { // For the newly discovered links via sitemap, set the status as unfetched and emit - context.getCounter("Sitemap", "new_sitemap_entries").increment(1); + newEntriesCounter.increment(1); sitemapDatum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED); context.write(key, sitemapDatum); } @@ -457,11 +507,11 @@ public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean stric FSUtils.replace(fs, current, tempCrawlDb, true); LockUtil.removeLockFile(fs, lock); - long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue(); - long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue(); - long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue(); - long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue(); - long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue(); + long filteredRecords = job.getCounters().findCounter(NutchMetrics.GROUP_SITEMAP, 
NutchMetrics.SITEMAP_FILTERED_RECORDS_TOTAL).getValue(); + long fromHostname = job.getCounters().findCounter(NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_FROM_HOSTNAME_TOTAL).getValue(); + long fromSeeds = job.getCounters().findCounter(NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_SEEDS_TOTAL).getValue(); + long failedFetches = job.getCounters().findCounter(NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_FAILED_FETCHES_TOTAL).getValue(); + long newSitemapEntries = job.getCounters().findCounter(NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_NEW_ENTRIES_TOTAL).getValue(); LOG.info("SitemapProcessor: Total records rejected by filters: {}", filteredRecords); LOG.info("SitemapProcessor: Total sitemaps from host name: {}", fromHostname); diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java index 0cfce1c650..fd036480a6 100644 --- a/src/java/org/apache/nutch/util/URLUtil.java +++ b/src/java/org/apache/nutch/util/URLUtil.java @@ -16,6 +16,7 @@ */ package org.apache.nutch.util; +import java.lang.invoke.MethodHandles; import java.net.IDN; import java.net.MalformedURLException; import java.net.URI; @@ -23,11 +24,22 @@ import java.util.Locale; import java.util.regex.Pattern; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.ibm.icu.text.IDNA; + import crawlercommons.domains.EffectiveTldFinder; /** Utility class for URL analysis */ public class URLUtil { + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + private static final IDNA idna = IDNA.getUTS46Instance( + IDNA.NONTRANSITIONAL_TO_ASCII | IDNA.NONTRANSITIONAL_TO_UNICODE); + /** * Resolve relative URL-s and fix a java.net.URL error in handling of URLs * with pure query targets. 
@@ -103,7 +115,7 @@ static URL fixPureQueryTargets(URL base, String target) * https://publicsuffix.org/list/public_suffix_list.dat and are compared * using + * "https://crawler-commons.github.io/crawler-commons/1.6/crawlercommons/domains/EffectiveTldFinder.html"> * crawler-commons' EffectiveTldFinder. Only ICANN domain suffixes are * used. Because EffectiveTldFinder loads the public suffix list as file * "effective_tld_names.dat" from the Java classpath, it's possible to use the @@ -262,7 +274,7 @@ public static String getDomainSuffix(URL url) { EffectiveTldFinder.EffectiveTLD suffix = EffectiveTldFinder.getEffectiveTLD(host, true); if (suffix != null) { - return suffix.getDomain(); + return suffix.getSuffix(); } return null; @@ -520,17 +532,39 @@ public static String getProtocol(URL url) { return url.getProtocol(); } + public static boolean isAscii(String str) { + char[] chars = str.toCharArray(); + for (char c : chars) { + if (c > 127) { + return false; + } + } + return true; + } + + /** + * Convert URL with IDN host/domain name into the ASCII representation. + * + * @param url + * URL string to convert + * @return URL string with ASCII host/domain name or null if conversion fails. 
+ */ public static String toASCII(String url) { try { URL u = new URL(url); String host = u.getHost(); - if (host == null || host.isEmpty()) { - // no host name => no punycoded domain name - // also do not add additional slashes for file: URLs (NUTCH-1880) + String hostLowerCase = host.toLowerCase(Locale.ROOT); + if (host == null || host.isEmpty() + || (isAscii(host) && host.equals(hostLowerCase))) { + // - no host name => no punycoded domain name + // - also do not add additional slashes for file: URLs (NUTCH-1880) + // - do nothing if host is already ASCII-only + // - not already in lowercase => conversion also lowercases host name return url; } - URI p = new URI(u.getProtocol(), u.getUserInfo(), IDN.toASCII(host), - u.getPort(), u.getPath(), u.getQuery(), u.getRef()); + URI p = new URI(u.getProtocol(), u.getUserInfo(), + convertIDNA2008(hostLowerCase, true), u.getPort(), u.getPath(), + u.getQuery(), u.getRef()); return p.toString(); } catch (Exception e) { @@ -538,13 +572,25 @@ public static String toASCII(String url) { } } + /** + * Convert URL with IDN host/domain name to the Unicode representation. + * + * @param url + * URL string to convert + * @return URL string with Unicode host/domain name or null if conversion + * fails. 
+ */ public static String toUNICODE(String url) { try { URL u = new URL(url); String host = u.getHost(); - if (host == null || host.isEmpty()) { - // no host name => no punycoded domain name - // also do not add additional slashes for file: URLs (NUTCH-1880) + String hostLowerCase = host.toLowerCase(Locale.ROOT); + if (host == null || host.isEmpty() + || (!hostLowerCase.contains("xn--") && host.equals(hostLowerCase))) { + // - no host name => no punycoded domain name + // - also do not add additional slashes for file: URLs (NUTCH-1880) + // - contains 'xn--' => needs conversion + // - not already in lowercase => conversion also lowercases host name return url; } StringBuilder sb = new StringBuilder(); @@ -554,7 +600,7 @@ public static String toUNICODE(String url) { sb.append(u.getUserInfo()); sb.append('@'); } - sb.append(IDN.toUnicode(host)); + sb.append(convertIDNA2008(hostLowerCase, false)); if (u.getPort() != -1) { sb.append(':'); sb.append(u.getPort()); @@ -572,22 +618,83 @@ public static String toUNICODE(String url) { } /** - * For testing - * @param args print with no args to get help + * Convert IDN host to ASCII or Unicode using Java's built-in {@link IDN} + * class. + * + * The conversion supports only IDNA2003, it does not support IDNA2008. + * However, unless the parameter strictIDNA2003 is true, the + * methods {@link IDN#toASCII(String, int)} resp. + * {@link IDN#toUnicode(String, int)} are called passing the flag + * {@link IDN#ALLOW_UNASSIGNED} to avoid that the conversion fails on + * characters not in the repertoire of Unicode 3.2. 
+ * + * @param host + * host name to be converted (lowercase expected) + * @param toAscii + * if true convert to ASCII, otherwise to Unicode + * @param strictIDNA2003 + * if true, do not pass {@link IDN#ALLOW_UNASSIGNED} (strict IDNA2003 conversion) + * @return converted host name + * @throws MalformedURLException + * if the conversion fails */ - public static void main(String[] args) { - - if (args.length != 1) { - System.err.println("Usage : URLUtil "); - return; + public static String convertIDNA2003(String host, boolean toAscii, + boolean strictIDNA2003) throws MalformedURLException { + try { + if (toAscii) { + return IDN.toASCII(host, strictIDNA2003 ? 0 : IDN.ALLOW_UNASSIGNED); + } else { + return IDN.toUnicode(host, strictIDNA2003 ? 0 : IDN.ALLOW_UNASSIGNED); + } + } catch (IllegalArgumentException | IndexOutOfBoundsException e) { + // IllegalArgumentException: thrown if the input string contains + // non-convertible Unicode codepoints + // IndexOutOfBoundsException: thrown (undocumented) if one "label" + // (non-ASCII dot-separated segment) is longer than 256 characters, + // cf. https://bugs.openjdk.java.net/browse/JDK-6806873 + LOG.debug("Failed to convert IDN host {}: ", host, e); + throw (MalformedURLException) new MalformedURLException( + "Invalid IDN " + host + ": " + e.getMessage()).initCause(e); } + } - String url = args[0]; - try { - System.out.println(URLUtil.getDomainName(new URL(url))); - } catch (MalformedURLException ex) { - ex.printStackTrace(); + /** + * Convert IDN host to ASCII or Unicode using ICU's {@link IDNA} class. + * + * The conversion supports IDNA2008 names.
+ * + * @param host + * host name to be converted (lowercase expected) + * @param toAscii + * if true convert to ASCII, otherwise to Unicode + * @return converted host name + * @throws MalformedURLException + * if the conversion fails + */ + public static String convertIDNA2008(String host, boolean toAscii) + throws MalformedURLException { + final IDNA.Info idnaInfo = new IDNA.Info(); + final StringBuilder hostConverted = new StringBuilder(); + if (toAscii) { + idna.nameToASCII(host, hostConverted, idnaInfo); + } else { + idna.nameToUnicode(host, hostConverted, idnaInfo); } + if (idnaInfo.hasErrors()) { + StringBuilder msg = new StringBuilder(); + for (IDNA.Error error : idnaInfo.getErrors()) { + if (msg.length() == 0) { + msg.append("Invalid IDNA2008 host ").append(host).append(": "); + } else { + msg.append(", "); + } + msg.append(error.name()); + } + String errorMsg = msg.toString(); + LOG.debug("Failed to convert IDN host {}: {}", host, errorMsg); + throw new MalformedURLException(errorMsg); + } + return hostConverted.toString(); } /** @@ -610,4 +717,24 @@ public static boolean isHomePageOf(URL url, String hostName) { && url.getRef() == null // && url.getUserInfo() == null; } + + /** + * For testing + * @param args print with no args to get help + */ + public static void main(String[] args) { + + if (args.length != 1) { + System.err.println("Usage : URLUtil "); + System.err.println("\nExtract and print pay-level domain names for the input URL"); + return; + } + + String url = args[0]; + try { + System.out.println(URLUtil.getDomainName(new URL(url))); + } catch (MalformedURLException ex) { + ex.printStackTrace(); + } + } } diff --git a/src/java/org/commoncrawl/tools/UrlCleaner.java b/src/java/org/commoncrawl/tools/UrlCleaner.java new file mode 100644 index 0000000000..c4d92ca669 --- /dev/null +++ b/src/java/org/commoncrawl/tools/UrlCleaner.java @@ -0,0 +1,381 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.commoncrawl.tools; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.regex.Pattern; + +import org.apache.commons.lang3.time.StopWatch; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.metrics.NutchMetrics; +import org.apache.nutch.net.URLFilterException; +import org.apache.nutch.net.URLFilters; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import crawlercommons.domains.EffectiveTldFinder; + 
+public class UrlCleaner extends Configured implements Tool { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + public static enum OutputKeyType { + URL, + HOST, + DOMAIN, + HOST_REVERSED, + DOMAIN_REVERSED, + HOST_REVERSED_TRAILING_DOT, + DOMAIN_REVERSED_TRAILING_DOT; + + /** + * Resolve an output key type from its name, ignoring case. + * + * @param name + * type name, may be null + * @return matching type, or {@link #URL} if name is null or unknown + */ + public static OutputKeyType get(String name) { + for (OutputKeyType t : OutputKeyType.values()) { + if (t.toString().equalsIgnoreCase(name)) { + return t; + } + } + return URL; + } + } + + private static final String CHECK_DOMAIN = "urlcleaner.check.domain"; + private static final String OUTPUT_KEY_TYPE = "urlcleaner.output.key.type"; + private static final String SUM_VALUES = "urlcleaner.sum.values"; + + private Configuration config; + + + public static class UrlCleanerMapper extends Mapper { + + public static final String URL_NORMALIZING_SCOPE = "crawldb.url.normalizers.scope"; + + private static Pattern SPLIT_HOST_PATTERN = Pattern.compile("\\."); + + private URLNormalizers urlNormalizers; + private URLFilters filters; + private String scope; + private boolean checkDomain; + private boolean needDomain; + private boolean needHost; + private boolean sumValues; + private OutputKeyType outputType; + + @Override + public void setup(Context context) { + Configuration conf = context.getConfiguration(); + scope = conf.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_INJECT); + urlNormalizers = new URLNormalizers(conf, scope); + filters = new URLFilters(conf); + checkDomain = conf.getBoolean(CHECK_DOMAIN, false); + sumValues = conf.getBoolean(SUM_VALUES, false); + outputType = OutputKeyType.get(conf.get(OUTPUT_KEY_TYPE)); + LOG.info("check domain names: {}", checkDomain); + LOG.info("output type: {}", outputType); + needDomain = checkDomain || outputType == OutputKeyType.DOMAIN + || outputType == OutputKeyType.DOMAIN_REVERSED + || outputType == OutputKeyType.DOMAIN_REVERSED_TRAILING_DOT; + needHost = needDomain || outputType == OutputKeyType.HOST
+ || outputType == OutputKeyType.HOST_REVERSED + || outputType == OutputKeyType.HOST_REVERSED_TRAILING_DOT; + } + + public static String[] reverseHost(String hostName) { + String[] rev = SPLIT_HOST_PATTERN.split(hostName); + for (int i = 0; i < (rev.length/2); i++) { + String temp = rev[i]; + rev[i] = rev[rev.length - i - 1]; + rev[rev.length - i - 1] = temp; + } + return rev; + } + + @Override + public void map(Text key, Text value, Context context) + throws IOException, InterruptedException { + + String urlOrig = key.toString(); + String url = urlOrig.trim(); + + try { + url = urlNormalizers.normalize(url, scope); + } catch (MalformedURLException e) { + context.getCounter(NutchMetrics.GROUP_URLCLEANER, + NutchMetrics.URLCLEANER_REJECTED_TOTAL).increment(1); + return; + } + try { + url = filters.filter(url); + } catch (URLFilterException e) { + context.getCounter(NutchMetrics.GROUP_URLCLEANER, + NutchMetrics.URLCLEANER_REJECTED_TOTAL).increment(1); + return; + } + + if (url == null) { + context.getCounter(NutchMetrics.GROUP_URLCLEANER, + NutchMetrics.URLCLEANER_REJECTED_TOTAL).increment(1); + return; + } + + String host = null, domain = null; + if (needHost) { + try { + URL u = new URL(url); + host = u.getHost(); + if (needDomain) { + domain = EffectiveTldFinder.getAssignedDomain(host, true, true); + if (checkDomain && domain == null) { + context + .getCounter(NutchMetrics.GROUP_URLCLEANER, + NutchMetrics.URLCLEANER_REJECTED_INVALID_DOMAIN_TOTAL) + .increment(1); + return; + } + } + } catch (MalformedURLException e) { + context.getCounter(NutchMetrics.GROUP_URLCLEANER, + NutchMetrics.URLCLEANER_REJECTED_TOTAL).increment(1); + return; + } + } + + if (url.equals(urlOrig)) { + context.getCounter(NutchMetrics.GROUP_URLCLEANER, + NutchMetrics.URLCLEANER_ACCEPTED_UNCHANGED_TOTAL).increment(1); + } else { + context.getCounter(NutchMetrics.GROUP_URLCLEANER, + NutchMetrics.URLCLEANER_ACCEPTED_NORMALIZED_TOTAL).increment(1); + key.set(url); + } + + String keyVal = null; + 
String addVal = url; + switch (outputType) { + case HOST: + keyVal = (host != null) ? host : ""; + break; + case DOMAIN: + keyVal = (domain != null) ? domain : ""; + addVal = host + "\t" + url; + break; + case HOST_REVERSED: + keyVal = (host != null) ? host : ""; + keyVal = String.join(".", reverseHost(keyVal)); + break; + case DOMAIN_REVERSED: + keyVal = (domain != null) ? domain : ""; + keyVal = String.join(".", reverseHost(keyVal)); + addVal = String.join(".", reverseHost(host)) + "\t" + url; + break; + case HOST_REVERSED_TRAILING_DOT: + keyVal = (host != null) ? host : ""; + List parts = new ArrayList<>(); + parts.addAll(Arrays.asList(reverseHost(keyVal))); + parts.add(""); + keyVal = String.join(".", parts); + break; + case DOMAIN_REVERSED_TRAILING_DOT: + keyVal = (domain != null) ? domain : ""; + parts = new ArrayList<>(); + parts.addAll(Arrays.asList(reverseHost(keyVal))); + parts.add(""); + keyVal = String.join(".", parts); + parts = new ArrayList<>(); + parts.addAll(Arrays.asList(reverseHost(host))); + parts.add(""); + addVal = String.join(".", parts) + "\t" + url; + break; + default: + break; + } + + if (sumValues) { + if (outputType != OutputKeyType.URL) { + key.set(keyVal + "\t" + addVal); + } + context.write(key, value); + } else { + if (outputType != OutputKeyType.URL) { + key.set(keyVal); + value.set(addVal + "\t" + value.toString()); + } + context.write(key, value); + } + } + + } + + public static class UrlCleanerTextSumReducer + extends Reducer { + + private Text result = new Text(); + + @Override + public void reduce(Text key, Iterable values, Context context) + throws IOException, InterruptedException { + long sum = 0; + for (Text val : values) { + try { + long v = Long.parseLong(val.toString()); + sum += v; + } catch (NumberFormatException e) { + LOG.error("Value is not a long integer (key: {} value: {})", key, val); + sum += 1; + } + } + result.set(Long.toString(sum)); + context.write(key, result); + } + } + + @Override + public Configuration 
getConf() { + return config; + } + + @Override + public void setConf(Configuration conf) { + config = conf; + } + + public void clean(Path[] inputs, Path output, boolean checkDomain, + OutputKeyType outputKeyType, boolean sumValues) throws Exception { + + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("UrlCleaner: starting"); + + Configuration conf = getConf(); + conf.setBoolean(CHECK_DOMAIN, checkDomain); + conf.set(OUTPUT_KEY_TYPE, outputKeyType.toString()); + conf.setBoolean(SUM_VALUES, sumValues); + + Job job = Job.getInstance(conf, UrlCleaner.class.getName()); + job.setJarByClass(UrlCleaner.class); + job.setMapperClass(UrlCleanerMapper.class); + if (sumValues) { + job.setReducerClass(UrlCleanerTextSumReducer.class); + } else { + job.setReducerClass(Reducer.class); + } + job.setInputFormatClass(KeyValueTextInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(Text.class); + + for (Path input : inputs) { + KeyValueTextInputFormat.addInputPath(job, input); + } + FileOutputFormat.setOutputPath(job, output); + + try { + // run the job + boolean success = job.waitForCompletion(true); + if (!success) { + String message = "UrlCleaner job did not succeed, job status: " + + job.getStatus().getState() + ", reason: " + + job.getStatus().getFailureInfo(); + LOG.error(message); + // throw exception so that calling routine can exit with error + throw new RuntimeException(message); + } + + } catch (IOException | InterruptedException | ClassNotFoundException | NullPointerException e) { + LOG.error("UrlCleaner job failed: {}", e.getMessage()); + throw e; + } + + stopWatch.stop(); + LOG.info("UrlCleaner: finished, elapsed: {} ms", + stopWatch.getTime(TimeUnit.MILLISECONDS)); + } + + public void usage() { + System.err.println( + "Usage: UrlCleaner [-D...] [-checkDomain] [-sumValues] [-outputKey <...>] ... 
\n"); + } + + @Override + public int run(String[] args) throws Exception { + + boolean checkDomain = false; + OutputKeyType outputKeyType = OutputKeyType.URL; + boolean sumValues = false; + + int i = 0; + for (; i < (args.length - 2); i++) { + if (args[i].equals("-checkDomain")) { + checkDomain = true; + } else if (args[i].equals("-outputKey")) { + String key = args[++i]; + outputKeyType = OutputKeyType.get(key); + } else if (args[i].equals("-sumValues")) { + sumValues = true; + } else { + break; + } + } + + if ((args.length - i) < 2) { + usage(); + return -1; + } + + Path[] inputs = new Path[args.length - i - 1]; + for (int j = 0; j < (args.length - i - 1); j++) { + inputs[j] = new Path(args[i + j]); + } + Path output = new Path(args[args.length - 1]); + + try { + clean(inputs, output, checkDomain, outputKeyType, sumValues); + } catch (Exception e) { + LOG.error("UrlCleaner: ", e); + return -1; + } + return 0; + } + + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), new UrlCleaner(), + args); + System.exit(res); + } + +} diff --git a/src/java/org/commoncrawl/tools/UrlSampler.java b/src/java/org/commoncrawl/tools/UrlSampler.java new file mode 100644 index 0000000000..e2060e1f47 --- /dev/null +++ b/src/java/org/commoncrawl/tools/UrlSampler.java @@ -0,0 +1,376 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.commoncrawl.tools; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import java.util.regex.Pattern; + +import org.apache.commons.lang3.time.StopWatch; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.crawl.Generator2; +import org.apache.nutch.crawl.Generator2.DomainScorePair; +import org.apache.nutch.crawl.URLPartitioner; +import org.apache.nutch.metrics.NutchMetrics; +import org.apache.nutch.util.NutchConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Sample URLs used as Nutch seeds following per-domain limits. + * + * Input: + *

URL and inlink count + * + *
```
+ * <url> \t <inlink_count>
+ * 
```
+ * + *

domain name, limits and default score + * + *
```
+ * <domain_name> \t <rank> \t <max_urls> \t <max_hosts> \t <max_urls_per_host> \t <default_score>
+ *
```
+ * + *
```
+ * <url> \t nutch.score=<score>
+ *
```
```
+ * 0.001 * default_score * log10(1 + inlink_count)
+ *
```
{ + + private DomainScorePair outputKey = new DomainScorePair(); + private TextCountPair outputValue = new TextCountPair(); + + @Override + public void map(Text key, Text value, Context context) + throws IOException, InterruptedException { + + if (!URL_PATTERN.matcher(key.toString()).find()) { + // got + outputKey.set(key, Float.MAX_VALUE); + outputValue.set(value, -1); + context.write(outputKey, outputValue); + return; + } + + // got + String url = key.toString(); + String domain; + try { + URL u = new URL(url); + domain = URLPartitioner.getDomainName(u.getHost()); + } catch (Exception e) { + LOG.warn("Malformed URL: '{}', skipping ({})", url, e.getMessage()); + context.getCounter(NutchMetrics.GROUP_URLSAMPLER, + NutchMetrics.URLSAMPLER_MALFORMED_URL_TOTAL).increment(1); + return; + } + + long count = 0; + try { + count = Long.parseLong(value.toString()); + } catch (NumberFormatException e) { + LOG.error("Value is not a long integer (url: {} value: {})", url, + value); + } + outputKey.set(domain, (float) (Math.random() * count)); + outputValue.set(key, count); + context.write(outputKey, outputValue); + } + } + + public static class SampleReducer + extends Reducer { + + private int maxUrlsPerDomain = 40; + private int maxHostsPerDomain = 2; + private int maxUrlsPerHost = 20; + private float defaultScore = .001f; + private Text meta = new Text(); + + @Override + public void setup(Context context) { + Configuration conf = context.getConfiguration(); + maxUrlsPerDomain = conf.getInt("urlsample.urls.per.domain", + maxUrlsPerDomain); + maxUrlsPerHost = conf.getInt("urlsample.urls.per.host", maxUrlsPerHost); + maxHostsPerDomain = conf.getInt("urlsample.hosts.per.domain", + maxHostsPerDomain); + defaultScore = conf.getFloat("urlsample.default.score", defaultScore); + } + + @Override + public void reduce(DomainScorePair key, Iterable values, + Context context) throws IOException, InterruptedException { + int maxUrls = maxUrlsPerDomain; + int maxHosts = maxHostsPerDomain; + 
int maxPerHost = maxUrlsPerHost; + float domainScore = defaultScore; + int nUrls = 0; + int nUrlsSampled = 0; + int skippedMaxUrls = 0; + int skippedMaxHosts = 0; + int skippedMaxUrlsPerHost = 0; + double sumScores = .0; + String domain = null; + Text limits = null; + Map hosts = null; + + for (TextCountPair val : values) { + Text text = val.getText(); + long count = val.getCount().get(); + if (limits == null && count == -1) { + limits = new Text(text); + // store limits but parse later lazily (if there are URLs to sample) + continue; + } else if (limits != null) { + String[] l = limits.toString().split("\t"); + if (l.length >= 5) { + try { + // long rank = l[0] + maxUrls = Integer.parseInt(l[1]); + maxHosts = Integer.parseInt(l[2]); + maxPerHost = Integer.parseInt(l[3]); + domainScore = Float.parseFloat(l[4]); + } catch (NumberFormatException e) { + LOG.warn("Invalid domain limits: {}", limits, e); + } + } + limits = null; + } + if (domain == null) { + // processing first URL in values + domain = key.getDomain().toString(); + hosts = new HashMap<>(); + } + String host = null; + try { + host = new URL(text.toString()).getHost().toLowerCase(Locale.ROOT); + if (host.endsWith(domain)) { + // clip common domain name suffix to save storage space in map keys + host = host.substring(0, host.length() - domain.length()); + } else { + LOG.warn("Host {} does not have domain {} as suffix!", host, + domain); + } + } catch (MalformedURLException e) { + context.getCounter(NutchMetrics.GROUP_URLSAMPLER, + NutchMetrics.URLSAMPLER_MALFORMED_URL_TOTAL).increment(1); + continue; + } + nUrls++; + if (nUrlsSampled > maxUrls) { + skippedMaxUrls++; + continue; + } + int[] nUrlsPerHost = hosts.get(host); + if (nUrlsPerHost != null) { + if (nUrlsPerHost[0]++ > maxPerHost) { + skippedMaxUrlsPerHost++; + continue; + } + } else { + if (hosts.size() >= maxHosts) { + skippedMaxHosts++; + continue; + } + hosts.put(host, new int[] { 0 }); + } + nUrlsSampled++; + double score = .001d * domainScore 
* Math.log10(1 + count); + sumScores += score; + meta.set(String.format(Locale.ROOT, "nutch.score=%.12f", score)); + context.write(text, meta); + } + if (nUrls == 0) + return; + context.getCounter(NutchMetrics.GROUP_URLSAMPLER, + NutchMetrics.URLSAMPLER_SKIPPED_MAX_URLS_TOTAL).increment(skippedMaxUrls); + context + .getCounter(NutchMetrics.GROUP_URLSAMPLER, + NutchMetrics.URLSAMPLER_SKIPPED_MAX_URLS_PER_HOST_TOTAL) + .increment(skippedMaxUrlsPerHost); + context.getCounter(NutchMetrics.GROUP_URLSAMPLER, + NutchMetrics.URLSAMPLER_SKIPPED_MAX_HOSTS_TOTAL).increment(skippedMaxHosts); + LOG.info( + "Sampled for domain {} : {} hosts, {} URLs ({} skipped: {} max. URLs, {} max. per host, {} max. hosts), sum of scores = {}", + domain, hosts.size(), nUrlsSampled, (nUrls - nUrlsSampled), + skippedMaxUrls, skippedMaxUrlsPerHost, skippedMaxHosts, sumScores); + } + } + + private void sample(Path[] inputs, Path output) throws Exception { + + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("UrlSampler: starting"); + + Configuration conf = getConf(); + conf.setInt("partition.url.seed", new Random().nextInt()); + + Job job = Job.getInstance(conf, UrlSampler.class.getName()); + job.setJarByClass(UrlSampler.class); + job.setMapperClass(SampleMapper.class); + job.setPartitionerClass(Generator2.Selector.class); + job.setSortComparatorClass(Generator2.ScoreComparator.class); + job.setGroupingComparatorClass(Generator2.DomainComparator.class); + job.setReducerClass(SampleReducer.class); + job.setInputFormatClass(KeyValueTextInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + job.setMapOutputKeyClass(Generator2.DomainScorePair.class); + job.setMapOutputValueClass(TextCountPair.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(Text.class); + + for (Path input : inputs) { + KeyValueTextInputFormat.addInputPath(job, input); + } + FileOutputFormat.setOutputPath(job, output); + + try { + // run the job + boolean success = 
job.waitForCompletion(true); + if (!success) { + String message = "UrlSampler job did not succeed, job status: " + + job.getStatus().getState() + ", reason: " + + job.getStatus().getFailureInfo(); + LOG.error(message); + // throw exception so that calling routine can exit with error + throw new RuntimeException(message); + } + + } catch (IOException | InterruptedException | ClassNotFoundException + | NullPointerException e) { + LOG.error("UrlSampler job failed: {}", e.getMessage()); + throw e; + } + + stopWatch.stop(); + LOG.info("UrlSampler: finished}, elapsed: {} ms", + stopWatch.getTime(TimeUnit.MILLISECONDS)); + } + + public void usage() { + System.err.println( + "Usage: UrlSampler [-D...] ... \n"); + } + + @Override + public int run(String[] args) throws Exception { + if (args.length < 2) { + usage(); + return -1; + } + + Path[] inputs = new Path[args.length - 1]; + for (int i = 0; i < (args.length - 1); i++) { + inputs[i] = new Path(args[i]); + } + Path output = new Path(args[args.length - 1]); + + try { + sample(inputs, output); + } catch (Exception e) { + LOG.error("UrlSampler: ", e); + return -1; + } + return 0; + } + + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), new UrlSampler(), + args); + System.exit(res); + } + +} diff --git a/src/java/org/commoncrawl/tools/UrlSamplerHost.java b/src/java/org/commoncrawl/tools/UrlSamplerHost.java new file mode 100644 index 0000000000..bce68ad50f --- /dev/null +++ b/src/java/org/commoncrawl/tools/UrlSamplerHost.java @@ -0,0 +1,437 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.commoncrawl.tools; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.net.URL; +import java.util.Locale; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import java.util.regex.Pattern; + +import org.apache.commons.lang3.time.StopWatch; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.crawl.Generator2; +import org.apache.nutch.crawl.Generator2.DomainScorePair; +import org.apache.nutch.metrics.NutchMetrics; +import org.apache.nutch.util.NutchConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Sample URLs used as Nutch seeds following per-host limits. + * + * Input: + *

URL and inlink count + * + *
```
+ * <url> \t <inlink_count>
+ * 
```
+ * + *
host name (leading www. may be stripped), limits and default + * score + * + *
```
+ * <host_name> \t <rank> \t <max_urls> \t <default_score>
+ * 
```
+ * + *
```
+ * <url> \t nutch.score=<score>
+ *
```
```
+ * 0.001 * default_score * log10(1 + inlink_count)
+ *
```
{ + + private boolean hostStripWWW = false; + + private DomainScorePair outputKey = new DomainScorePair(); + private TextCountPair outputValue = new TextCountPair(); + + /** + * Strip leading www. from host name. + * + * But do not strip if the host name is "e;www.tld"e; (e.g., + * www.com). + * + * Stripping is required for per-host limit configurations before 2026, + * based on Common Crawl web graphs where the leading www. was + * stripped. + * + * @param host + * name + * @return host name with leading www. stripped + */ + private static String hostStripWWW(String host) { + // min. length: 4 + 3 + 1 = 8 (www. + 1-letter-domain + .2-letter-tld) + if (host.length() >= 8 && host.startsWith("www.") + && host.indexOf('.', 4) != -1) { + host = host.substring(4); + } + return host; + } + + @Override + public void setup(Context context) { + Configuration conf = context.getConfiguration(); + hostStripWWW = conf.getBoolean("urlsample.host.strip.www", false); + } + + @Override + public void map(Text key, Text value, Context context) + throws IOException, InterruptedException { + + if (!URL_PATTERN.matcher(key.toString()).find()) { + // got + outputKey.set(key, Float.MAX_VALUE); + outputValue.set(value, -1); + context.write(outputKey, outputValue); + return; + } + + // got + String url = key.toString(); + String host; + try { + URL u = new URL(url); + host = u.getHost(); + if (hostStripWWW) { + host = hostStripWWW(host); + } + } catch (Exception e) { + LOG.warn("Malformed URL: '{}', skipping ({})", url, e.getMessage()); + context.getCounter(NutchMetrics.GROUP_URLSAMPLER, + NutchMetrics.URLSAMPLER_MALFORMED_URL_TOTAL).increment(1); + return; + } + + long count = 0; + try { + count = Long.parseLong(value.toString()); + } catch (NumberFormatException e) { + LOG.error("Value is not a long integer (url: {} value: {})", url, + value); + } + outputKey.set(host, (float) (Math.random() * count)); + outputValue.set(key, count); + context.write(outputKey, outputValue); + } + } + + 
public static class SampleReducer + extends Reducer { + + private int maxUrlsPerHost = -1; // -1 : sample randomly + private float defaultScore = .001f; + private Text meta = new Text(); + + @Override + public void setup(Context context) { + Configuration conf = context.getConfiguration(); + maxUrlsPerHost = conf.getInt("urlsample.urls.per.host", maxUrlsPerHost); + defaultScore = conf.getFloat("urlsample.default.score", defaultScore); + } + + @Override + public void reduce(DomainScorePair key, Iterable values, + Context context) throws IOException, InterruptedException { + int maxUrls = maxUrlsPerHost; + float hostScore = defaultScore; + int nUrls = 0; + int nUrlsSampled = 0; + int skippedMaxUrlsPerHost = 0; + int skippedRandom = 0; + double sumScores = .0; + String host = null; + Text limits = null; + + for (TextCountPair val : values) { + Text text = val.getText(); + long count = val.getCount().get(); + if (limits == null && count == -1) { + limits = new Text(text); + // store limits but parse later lazily (if there are URLs to sample) + continue; + } else if (limits != null) { + String[] l = limits.toString().split("\t"); + if (l.length >= 3) { + try { + // long rank = l[0] + maxUrls = Integer.parseInt(l[1]); + hostScore = Float.parseFloat(l[2]); + } catch (NumberFormatException e) { + LOG.warn("Invalid host limits: {}", limits, e); + } + } + limits = null; + } + if (host == null) { + // processing first URL in values + host = key.getDomain().toString(); + } + nUrls++; + if (nUrlsSampled > maxUrls) { + if (maxUrls > -1) { + skippedMaxUrlsPerHost++; + continue; + } else { + // no limits, sample randomly with low probability + double sampleProb = (.1d / (1 + nUrlsSampled)) + * Math.log10(1 + count); + if (sampleProb < Math.random()) { + skippedRandom++; + continue; + } + } + } + nUrlsSampled++; + double score = .001d * hostScore * Math.log10(1 + count); + sumScores += score; + meta.set(String.format(Locale.ROOT, "nutch.score=%.12f", score)); + context.write(text, 
meta); + } + // hosts == reduce input groups + context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST, + NutchMetrics.URLSAMPLER_HOSTS).increment(1); + // URLs == map output records, reduce input records + context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST, + NutchMetrics.URLSAMPLER_URLS).increment(nUrls); + if (nUrls > 0) { + if (maxUrls > -1) { + context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST, + NutchMetrics.URLSAMPLER_HOSTS_WITH_LIMIT).increment(1); + context + .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST, + NutchMetrics.URLSAMPLER_URLS_HOST_WITH_LIMIT) + .increment(nUrls); + } else { + context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST, + NutchMetrics.URLSAMPLER_HOSTS_WITHOUT_LIMIT).increment(1); + context + .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST, + NutchMetrics.URLSAMPLER_URLS_HOST_WITHOUT_LIMIT) + .increment(nUrls); + } + if (nUrlsSampled > 0) { + context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST, + NutchMetrics.URLSAMPLER_URLS_SAMPLED).increment(nUrlsSampled); + context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST, + NutchMetrics.URLSAMPLER_HOSTS_SAMPLED).increment(1); + if (maxUrls > -1) { + context + .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST, + NutchMetrics.URLSAMPLER_HOSTS_WITH_LIMIT_SAMPLED) + .increment(1); + context + .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST, + NutchMetrics.URLSAMPLER_URLS_HOST_WITH_LIMIT_SAMPLED) + .increment(nUrlsSampled); + } else { + context + .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST, + NutchMetrics.URLSAMPLER_HOSTS_WITHOUT_LIMIT_SAMPLED) + .increment(1); + context + .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST, + NutchMetrics.URLSAMPLER_URLS_HOST_WITHOUT_LIMIT_SAMPLED) + .increment(nUrlsSampled); + } + } + context + .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST, + NutchMetrics.URLSAMPLER_SKIPPED_MAX_URLS_PER_HOST) + .increment(skippedMaxUrlsPerHost); + context + .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST, + NutchMetrics.URLSAMPLER_SKIPPED_RANDOM) + .increment(skippedRandom); + 
LOG.info( + "Sampled for host {} : {} URLs ({} skipped: {} max. per host, {} random), sum of scores = {}", + host, nUrlsSampled, (nUrls - nUrlsSampled), skippedMaxUrlsPerHost, + skippedRandom, sumScores); + } + } + } + + private void sample(Path[] inputs, Path output) throws Exception { + + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("UrlSamplerHost: starting"); + + Configuration conf = getConf(); + conf.setInt("partition.url.seed", new Random().nextInt()); + + Job job = Job.getInstance(conf, UrlSamplerHost.class.getName()); + job.setJarByClass(UrlSamplerHost.class); + job.setMapperClass(SampleMapper.class); + job.setPartitionerClass(Generator2.Selector.class); + job.setSortComparatorClass(Generator2.ScoreComparator.class); + job.setGroupingComparatorClass(Generator2.DomainComparator.class); + job.setReducerClass(SampleReducer.class); + job.setInputFormatClass(KeyValueTextInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + job.setMapOutputKeyClass(Generator2.DomainScorePair.class); + job.setMapOutputValueClass(TextCountPair.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(Text.class); + + for (Path input : inputs) { + KeyValueTextInputFormat.addInputPath(job, input); + } + FileOutputFormat.setOutputPath(job, output); + + try { + // run the job + boolean success = job.waitForCompletion(true); + if (!success) { + String message = "UrlSampler job did not succeed, job status: " + + job.getStatus().getState() + ", reason: " + + job.getStatus().getFailureInfo(); + LOG.error(message); + // throw exception so that calling routine can exit with error + throw new RuntimeException(message); + } + + } catch (IOException | InterruptedException | ClassNotFoundException + | NullPointerException e) { + LOG.error("UrlSampler job failed: {}", e.getMessage()); + throw e; + } + + stopWatch.stop(); + LOG.info("UrlSamplerHost: finished, elapsed: {} ms", + stopWatch.getTime(TimeUnit.MILLISECONDS)); + } + + public void 
usage() { + System.err.println( + "Usage: UrlSamplerHost [-D...] ... \n"); + System.err.println( + "\nThe host_limits file defines the maximum number of URLs to sample per host."); + System.err.println("\nProperties:"); + System.err.println( + "\t-Durlsample.host.strip.www=(true|false)\tstrip leading www. from host names"); + System.err.println( + "\t\t\t(depending on whether the limits file uses stripped host names)"); + System.err.println( + "Properties to configure defaults, if host is not in the limits file:"); + System.err.println( + "\t-Durlsample.urls.per.host\tmax. number of URLs to sample per host"); + System.err + .println("\t\t\t-1 : sample randomly with low probability (default)"); + System.err.println( + "\t-Durlsample.default.score\tdefault score for sampled URLs (default: 0.001)"); + } + + @Override + public int run(String[] args) throws Exception { + if (args.length < 2) { + usage(); + return -1; + } + + Path[] inputs = new Path[args.length - 1]; + for (int i = 0; i < (args.length - 1); i++) { + inputs[i] = new Path(args[i]); + } + Path output = new Path(args[args.length - 1]); + + try { + sample(inputs, output); + } catch (Exception e) { + LOG.error("UrlSamplerHost: ", e); + return -1; + } + return 0; + } + + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(NutchConfiguration.create(), new UrlSamplerHost(), + args); + System.exit(res); + } + +} diff --git a/src/java/org/commoncrawl/tools/WarcExport.java b/src/java/org/commoncrawl/tools/WarcExport.java new file mode 100644 index 0000000000..ab4a9aeafc --- /dev/null +++ b/src/java/org/commoncrawl/tools/WarcExport.java @@ -0,0 +1,252 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.commoncrawl.tools; + +import org.apache.commons.lang3.time.StopWatch; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.lib.input.MultipleInputs; +import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.HadoopFSUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.commoncrawl.util.WarcCapture; +import org.commoncrawl.util.WarcOutputFormat; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; + +public class WarcExport extends Configured implements Tool { + public static Logger LOG = 
LoggerFactory.getLogger(WarcExport.class); + + static { + Configuration.addDefaultResource("nutch-default.xml"); + Configuration.addDefaultResource("nutch-site.xml"); + } + + public static class ExportMapper + extends Mapper { + @Override + public void map(Text key, Writable value, Context context) + throws IOException, InterruptedException { + if (key.getLength() == 0) { + return; + } + context.write(key, new NutchWritable(value)); + } + } + + public static class ExportReducer + extends Reducer { + + private boolean generateCrawlDiagnostics = false; + private boolean generateRobotsTxt = false; + + @Override + public void setup(Context context) { + Configuration conf = context.getConfiguration(); + generateCrawlDiagnostics = conf.getBoolean("warc.export.crawldiagnostics", + false); + generateRobotsTxt = conf.getBoolean("warc.export.robotstxt", false); + } + + @Override + public void reduce(Text key, Iterable values, + Context context) throws IOException, InterruptedException { + CrawlDatum datum = null; + Content content = null; + + for (NutchWritable nutchValue : values) { + final Writable value = nutchValue.get(); // unwrap + if (value instanceof CrawlDatum) { + datum = (CrawlDatum) value; + } else if (value instanceof ParseData) { + ParseData parseData = (ParseData) value; + // Get the robots meta data + String robotsMeta = parseData.getMeta("robots"); + + // Has it a noindex for this url? 
+ if (robotsMeta != null + && robotsMeta.toLowerCase().contains("noindex")) { + return; + } + } else if (value instanceof Content) { + content = (Content) value; + } + } + + if (content == null) { + return; + } + + if (datum == null) { + if (!generateRobotsTxt) + return; + } else if (datum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) { + if (!generateCrawlDiagnostics) + return; + } + + WarcCapture completeData = new WarcCapture(key, datum, content); + + context.write(key, completeData); + } + } + + public void export(Path outputDir, List segments, + boolean generateCrawlDiagnostics, boolean generateRobotsTxt, Path cdxPath) + throws IOException, InterruptedException, ClassNotFoundException { + Configuration conf = getConf(); + + // We compress ourselves, so this isn't necessary + conf.setBoolean( + org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.COMPRESS, + false); + + conf.setBoolean("warc.export.crawldiagnostics", generateCrawlDiagnostics); + conf.setBoolean("warc.export.robotstxt", generateRobotsTxt); + if (cdxPath != null) { + conf.setBoolean("warc.export.cdx", true); + conf.set("warc.export.cdx.path", cdxPath.toString()); + } + + Job job = Job.getInstance(conf); + job.setJobName("WarcExport: " + outputDir.toString()); + job.setJarByClass(WarcExport.class); + + FileOutputFormat.setOutputPath(job, new Path("out")); + + for (final Path segment : segments) { + LOG.info("ExporterMapReduces: adding segment: {}", segment); + FileSystem fs = segment.getFileSystem(getConf()); + + MultipleInputs.addInputPath(job, + new Path(segment, CrawlDatum.FETCH_DIR_NAME), + SequenceFileInputFormat.class); + + Path parseDataPath = new Path(segment, ParseData.DIR_NAME); + if (fs.exists(parseDataPath)) { + MultipleInputs.addInputPath(job, parseDataPath, + SequenceFileInputFormat.class); + } + + MultipleInputs.addInputPath(job, new Path(segment, Content.DIR_NAME), + SequenceFileInputFormat.class); + } + + job.setMapperClass(ExportMapper.class); + 
job.setReducerClass(ExportReducer.class); + + job.setMapOutputValueClass(NutchWritable.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(WarcCapture.class); + + job.setOutputFormatClass(WarcOutputFormat.class); + WarcOutputFormat.setOutputPath(job, outputDir); + + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("WarcExport: starting"); + + try { + boolean success = job.waitForCompletion(true); + if (!success) { + String message = "WarcExport: job did not succeed, job status: " + + job.getStatus().getState() + ", reason: " + + job.getStatus().getFailureInfo(); + LOG.error(message); + throw new RuntimeException(message); + } + } catch (IOException | InterruptedException | ClassNotFoundException e) { + LOG.error("WarcExport job failed: {}", e.getMessage()); + throw e; + } + stopWatch.stop(); + LOG.info("WarcExport: finished, elapsed: {} ms", + stopWatch.getTime(TimeUnit.MILLISECONDS)); + } + + @Override + public int run(String[] args) throws Exception { + if (args.length < 2) { + System.err.println( + "Usage: WarcExport ( ... 
| -dir ) [-crawldiagnostics] [-robotstxt] [-cdx path]"); + return -1; + } + + final Path outputDir = new Path(args[0]); + + final List segments = new ArrayList(); + boolean generateCrawlDiagnostics = false; + boolean generateRobotsTxt = false; + Path cdxPath = null; + + for (int i = 1; i < args.length; i++) { + if (args[i].equals("-dir")) { + Path dir = new Path(args[++i]); + FileSystem fs = dir.getFileSystem(getConf()); + FileStatus[] fstats = fs.listStatus(dir, + HadoopFSUtil.getPassDirectoriesFilter(fs)); + Path[] files = HadoopFSUtil.getPaths(fstats); + for (Path p : files) { + segments.add(p); + } + } else if (args[i].equals("-crawldiagnostics")) { + generateCrawlDiagnostics = true; + } else if (args[i].equals("-robotstxt")) { + generateRobotsTxt = true; + } else if (args[i].equals("-cdx")) { + cdxPath = new Path(args[++i]); + } else { + segments.add(new Path(args[i])); + } + } + + try { + export(outputDir, segments, generateCrawlDiagnostics, generateRobotsTxt, + cdxPath); + return 0; + } catch (final Exception e) { + LOG.error("WARC Exporter:", e); + return -1; + } + } + + public static void main(String[] args) throws Exception { + final int res = ToolRunner.run(NutchConfiguration.create(), + new WarcExport(), args); + System.exit(res); + } +} diff --git a/src/java/org/commoncrawl/util/ByteArrayCharSequence.java b/src/java/org/commoncrawl/util/ByteArrayCharSequence.java new file mode 100644 index 0000000000..9946933a30 --- /dev/null +++ b/src/java/org/commoncrawl/util/ByteArrayCharSequence.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.commoncrawl.util; + +import java.nio.charset.StandardCharsets; + +/** + * Wrap a byte array as a {@link CharSequence} in + * {@link StandardCharsets#ISO_8859_1} encoding. + * + * For regular expression matching on ASCII characters only, the wrapper should + * be faster than creating a {@link String} from the byte array or a + * subsequence, because no bytes are converted to chars and no memory is + * allocated for a new String. + * + * Similar wrappers are part of + * extJWNL, + * BUbiNG, and other Java + * libraries. + */ +public class ByteArrayCharSequence implements CharSequence { + + private final byte[] data; + private final int length; + private final int offset; + + public ByteArrayCharSequence() { + this(new byte[0], 0, 0); + } + + public ByteArrayCharSequence(final byte[] data) { + this(data, 0, data.length); + } + + public ByteArrayCharSequence(final byte[] data, int length) { + this(data, 0, length); + } + + public ByteArrayCharSequence(final byte[] data, int offset, int length) { + this.data = data; + if (offset < 0) { + throw new ArrayIndexOutOfBoundsException("Negative offset: " + offset); + } + if (length < 0) { + throw new IllegalArgumentException("Negative length:" + length); + } + if ((offset + length) > data.length) { + throw new ArrayIndexOutOfBoundsException( + "(Offset + length) > array_length"); + } + this.length = length; + this.offset = offset; + } + + @Override + public int length() { + return this.length; + } + + @Override + public char charAt(int index) { + if (index >= length) { + throw new 
IndexOutOfBoundsException("" + index); + } + return (char) (data[offset + index] & 0xff); + } + + @Override + public CharSequence subSequence(int start, int end) { + return new ByteArrayCharSequence(data, offset + start, end - start); + } + + @Override + public String toString() { + return new String(data, offset, length, StandardCharsets.ISO_8859_1); + } +} \ No newline at end of file diff --git a/src/java/org/commoncrawl/util/CanonicalLinkDetector.java b/src/java/org/commoncrawl/util/CanonicalLinkDetector.java new file mode 100644 index 0000000000..8e40777013 --- /dev/null +++ b/src/java/org/commoncrawl/util/CanonicalLinkDetector.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.commoncrawl.util; + +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.http.HeaderElement; +import org.apache.http.NameValuePair; +import org.apache.http.ParseException; +import org.apache.http.message.BasicHeaderValueParser; +import org.apache.nutch.protocol.Content; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class CanonicalLinkDetector { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + protected static Set SUPPORTED_CONTENT_TYPES = new HashSet<>(); + static { + SUPPORTED_CONTENT_TYPES.add("text/html"); + SUPPORTED_CONTENT_TYPES.add("application/xhtml+xml"); + } + + /** + * Pattern to match canonical link elements in HTML. The length of the + * canonical link URL inside the element is limited to max. 2048 characters. + */ + private static Pattern canonicalLinkPattern = Pattern.compile( + "]{0,2054}rel=(?:'canonical'|\"canonical\"|canonical\\b)[^>]{0,2054}>", + Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); + private static Pattern hrefPattern = Pattern + .compile("href=['\"]?([^'\"\\s]{0,2048})", Pattern.CASE_INSENSITIVE); + + private static Pattern canonicalRelValuePattern = Pattern + .compile("\\bcanonical\\b", Pattern.CASE_INSENSITIVE); + private static final Pattern linkInParentheses = Pattern + .compile("^\\s*<\\s*(.*?)\\s*>\\s*$"); + + private static final List EMPTY_RESULT = List.of(); + + /** top-N bytes of HTML to look for canonical link */ + private static int CHUNK_SIZE = 65536; + + /** max. number canonical links to detect */ + private static int MAX_LINKS = 1; + + /** + * Extract canonical link from HTTP header. + * + * The extraction is delegated to {@link BasicHeaderValueParser} because + * parsing multi-valued link attributes is far from trivial, e.g. + * + *

+   Link: ; rel="canonical",; rel="shortlink",; rel="shortcut icon"
+   *

+ * + * @param "Link" + * header values + * @return the canonical links found, or an empty list if no canonical link is + * found + */ + protected static List detectCanonicalLinksHttpHeader( + String[] linkHeaders, int maxResults) { + List result = EMPTY_RESULT; + for (String httpHeaderLink : linkHeaders) { + HeaderElement elem; + try { + elem = BasicHeaderValueParser.parseHeaderElement(httpHeaderLink, + BasicHeaderValueParser.INSTANCE); + } catch (ParseException e) { + LOG.error("Failed to parse Link HTTP header: {}", httpHeaderLink, e); + continue; + } + for (NameValuePair param : elem.getParameters()) { + if ("rel".equalsIgnoreCase(param.getName()) + && canonicalRelValuePattern.matcher(param.getValue()).find()) { + String link = elem.getName(); + // match inside < ... > + Matcher urlMatcher = linkInParentheses.matcher(link); + if (urlMatcher.matches()) { + link = urlMatcher.group(1); + if (result == EMPTY_RESULT) { + result = new ArrayList(1); + } + result.add(link); + if (result.size() >= maxResults) { + break; + } + } + } + } + } + return result; + } + + public static boolean isEligibleContentType(String contentType) { + return SUPPORTED_CONTENT_TYPES.contains(contentType); + } + + /** + * Extract canonical link from HTTP header. + * + * The extraction is delegated to {@link BasicHeaderValueParser} because + * parsing multi-valued link attributes is far from trivial, e.g. + * + *

+   Link: ; rel="canonical",; rel="shortlink",; rel="shortcut icon"
+   *

+ * + * @param "Link" + * header values + * @return the canonical links found, or an empty list if no canonical link is + * found + */ + public static List detectCanonicalLinksHTML(byte[] content, int chunkSize, + int maxResults) { + List result = EMPTY_RESULT; + int length = content.length < chunkSize ? content.length : chunkSize; + CharSequence cs; + cs = new ByteArrayCharSequence(content, length); + Matcher clMatcher = canonicalLinkPattern.matcher(cs); + while (clMatcher.find()) { + CharSequence cls; + cls = cs.subSequence(clMatcher.start(), clMatcher.end()); + Matcher hrefMatcher = hrefPattern.matcher(cls); + if (hrefMatcher.find(5)) { + String cl = hrefMatcher.group(1); + if (result == EMPTY_RESULT) { + result = new ArrayList(1); + } + result.add(cl); + if (result.size() >= maxResults) { + break; + } + } + } + return result; + } + + public static List detectCanonicalLinks(Content content, + int chunkSize, int maxLinks) { + + /* + * Note: the HTTP header look-up is case-insensitive if + * CaseInsensitiveMetadata or SpellCheckedMetadata is used. 
+ */ + String[] linkHeaders = content.getMetadata().getValues("Link"); + List canonicalLinks = detectCanonicalLinksHttpHeader(linkHeaders, + maxLinks); + + if (canonicalLinks.size() < maxLinks + && isEligibleContentType(content.getContentType())) { + List linksHtml = detectCanonicalLinksHTML(content.getContent(), chunkSize, maxLinks); + if (linksHtml.size() > 0) { + if (canonicalLinks == EMPTY_RESULT) { + canonicalLinks = linksHtml; + } else { + canonicalLinks.addAll(linksHtml); + } + } + } + return canonicalLinks; + } + + public static List detectCanonicalLinks(Content content) { + return detectCanonicalLinks(content, CHUNK_SIZE, MAX_LINKS); + } + +} diff --git a/src/java/org/commoncrawl/util/LanguageDetector.java b/src/java/org/commoncrawl/util/LanguageDetector.java new file mode 100644 index 0000000000..092e534aaa --- /dev/null +++ b/src/java/org/commoncrawl/util/LanguageDetector.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.commoncrawl.util;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.net.URI;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.detect.CompositeEncodingDetector;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.html.HtmlEncodingDetector;
+import org.apache.tika.parser.txt.Icu4jEncodingDetector;
+import org.commoncrawl.langdetect.cld2.CLDHints;
+import org.commoncrawl.langdetect.cld2.Cld2;
+import org.commoncrawl.langdetect.cld2.Flags;
+import org.commoncrawl.util.LanguageDetector.Result.Status;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Utf8;
+
+/**
+ * Detect the character set and the content languages of HTML documents: the
+ * charset is detected by Tika encoding detectors, the languages by CLD2.
+ */
+public class LanguageDetector {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  /** Content types for which charset and languages are detected */
+  protected static Set SUPPORTED_CONTENT_TYPES = new HashSet<>();
+  static {
+    SUPPORTED_CONTENT_TYPES.add("text/html");
+    SUPPORTED_CONTENT_TYPES.add("application/xhtml+xml");
+  }
+
+  /** Flags passed to every call of the CLD2 language detector */
+  protected Flags flags = new Flags();
+
+  /**
+   * Charset detectors used to read the content, in this order: HTML encoding
+   * detector, ICU4J encoding detector
+   */
+  protected EncodingDetector charsetDetector = new CompositeEncodingDetector(
+      Arrays.asList(new HtmlEncodingDetector(), new Icu4jEncodingDetector()));
+
+  /** Set the "best effort" flag of the CLD2 flags. */
+  public void setBestEffort(boolean bestEffort) {
+    flags.setBestEffort(bestEffort);
+  }
+
+  /**
+   * Result of the charset and language detection. If the detection was not
+   * successful, {@link #errorStatus} is set and {@link #languages} is null.
+   */
+  public static class Result {
+    /** detected charset, null if the charset detection was skipped or failed */
+    Charset charset;
+    /** languages detected by CLD2 */
+    org.commoncrawl.langdetect.cld2.Result languages;
+    /** error message, not set for {@link Status#EMPTY_CONTENT} */
+    String errorReason;
+    /** Reasons why no languages have been detected */
+    enum Status {
+      EMPTY_CONTENT("empty content"),
+      CHARSET_DETECTION_FAILED("failed to detect charset"),
+      UNSUPPORTED_MIME_TYPE("MIME type not supported");
+      /** short description of the status */
+      String name;
+      Status(String name) {
+        this.name = name;
+      }
+    };
+    /** null if the detection was successful */
+    Status errorStatus;
+  }
+
+  /**
+   * Detect charset and languages of a document.
+   *
+   * Documents without content or of a content type not in
+   * {@link #SUPPORTED_CONTENT_TYPES} are skipped. The HTTP Content-Type header
+   * is passed to the charset detection, the detected charset, the URI and the
+   * HTTP Content-Language header are passed as hints to the language
+   * detection.
+   *
+   * @param uri
+   *          URI of the document, used for logging and as top-level domain
+   *          hint
+   * @param content
+   *          content of the document including the HTTP headers as metadata
+   * @return the detection result, or a result with only the error status (and
+   *         reason) set if the document was skipped or the charset detection
+   *         failed
+   */
+  protected Result detectLanguage(URI uri, Content content) {
+
+    LanguageDetector.Result result = new Result();
+
+    if (content.getContent().length == 0) {
+      // empty content, nothing to detect
+      LOG.debug("Skipping empty document for language and charset detection");
+      result.errorStatus = Status.EMPTY_CONTENT;
+      return result;
+    }
+
+    String detectedContentType = content.getContentType();
+    // never changed: only HTML content types pass the check below
+    boolean isPlainText = false;
+    if (!SUPPORTED_CONTENT_TYPES.contains(detectedContentType)) {
+      // TODO: as an improvement, parse documents of non-HTML content types and
+      // do the language detection on extracted text, for now skip them
+      LOG.debug("Skipping document of Content-Type {} for language detection",
+          detectedContentType);
+      result.errorReason = "Content-Type " + detectedContentType + " not supported";
+      result.errorStatus = Status.UNSUPPORTED_MIME_TYPE;
+      return result;
+    }
+
+    String httpContentLanguage = content.getMetadata()
+        .get(Response.CONTENT_LANGUAGE);
+    String httpContentType = WarcWriter.getMeta(content.getMetadata(),
+        Response.CONTENT_TYPE);
+
+    // Tika metadata (not Nutch metadata) handed over to the charset detectors
+    Metadata metadata = new Metadata();
+    if (httpContentType != null) {
+      LOG.debug(" Content-Type: {}", httpContentType);
+      metadata.add(Metadata.CONTENT_TYPE, httpContentType);
+    }
+    String text;
+    byte[] bytes = content.getContent();
+
+    // the reader is only used to detect the charset, no text is read from it
+    try (AutoDetectReader charsetDetectReader = new AutoDetectReader(
+        new ByteArrayInputStream(bytes), metadata, charsetDetector)) {
+      result.charset = charsetDetectReader.getCharset();
+      boolean isValidUtf8 = false;
+      if (result.charset.equals(StandardCharsets.UTF_8)) {
+        // need to validate UTF-8 because CLD2 may segfault on invalid UTF-8
+        LOG.debug("Validating UTF-8 for: {}", uri);
+        if (Utf8.isWellFormed(bytes)) {
+          isValidUtf8 = true;
+        }
+      }
+      if (isValidUtf8) {
+        // CLD2 requires that the input byte[] includes a trailing zero byte
+        bytes = Cld2.bytesToNative(bytes);
+      } else {
+        // other charsets and malformed UTF-8: decode to a String and encode
+        // it again for CLD2
+        LOG.debug("Recoding from {}: {}", result.charset, uri);
+        text = new String(bytes, result.charset);
+        bytes = Cld2.encodeNative(text);
+      }
+    } catch (IOException | TikaException e) {
+      LOG.error("Failed to convert charset:", e);
+      result.errorReason = "Failed to convert charset " + e.getMessage();
+      result.errorStatus = Status.CHARSET_DETECTION_FAILED;
+      return result;
+    }
+    CLDHints hints = new CLDHints();
+    hints.setEncodingHint(result.charset);
+    hints.setTopLevelDomainHint(uri);
+    if (httpContentLanguage != null) {
+      hints.setContentLanguageHint(httpContentLanguage);
+    }
+    result.languages = Cld2.detect(bytes, hints, flags, isPlainText);
+    // NOTE(review): the meaning of the pruning arguments is defined by the
+    // CLD2 binding and not visible here - confirm before changing them
+    result.languages.configurePruning(10, 2, 0.0);
+
+    return result;
+  }
+
+}
diff --git a/src/java/org/commoncrawl/util/NullOutputCommitter.java b/src/java/org/commoncrawl/util/NullOutputCommitter.java
new file mode 100644
index 0000000000..8975e5f655
--- /dev/null
+++ b/src/java/org/commoncrawl/util/NullOutputCommitter.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.commoncrawl.util;
+
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+import java.io.IOException;
+
+/**
+ * {@link OutputCommitter} which does nothing: job and task set-up, task commit
+ * and task abort are all no-ops, and no task is ever reported to require a
+ * commit.
+ */
+public class NullOutputCommitter extends OutputCommitter {
+
+  /** Does nothing. */
+  @Override
+  public void setupJob(JobContext jobContext) throws IOException {
+  }
+
+  /** Does nothing. */
+  @Override
+  public void setupTask(TaskAttemptContext taskContext) throws IOException {
+  }
+
+  /** @return always false */
+  @Override
+  public boolean needsTaskCommit(TaskAttemptContext taskContext)
+      throws IOException {
+    return false;
+  }
+
+  /** Does nothing. */
+  @Override
+  public void commitTask(TaskAttemptContext taskContext) throws IOException {
+  }
+
+  /** Does nothing, in particular no output is removed. */
+  @Override
+  public void abortTask(TaskAttemptContext taskContext) throws IOException {
+  }
+
+}
\ No newline at end of file
diff --git a/src/java/org/commoncrawl/util/S3FileOutputFormat.java b/src/java/org/commoncrawl/util/S3FileOutputFormat.java
new file mode 100644
index 0000000000..cd2c625d8a
--- /dev/null
+++ b/src/java/org/commoncrawl/util/S3FileOutputFormat.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.commoncrawl.util; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapreduce.OutputCommitter; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; + +/** + * {@link FileOutputFormat} using {@link NullOutputCommitter} when writing to + * AWS S3. + * + * Because default FileOutputCommitter is slow on S3, the + * {@link NullOutputCommitter} is used until a better solution is available, cf. + * Hadoop + * AWS committers and Hadoop + * AWS committer architecture. + */ +public abstract class S3FileOutputFormat extends FileOutputFormat { + + private OutputCommitter committer; + + @Override + public synchronized OutputCommitter getOutputCommitter( + TaskAttemptContext context) throws java.io.IOException { + if (committer == null) { + Path output = getOutputPath(context); + + String scheme = output.getFileSystem(context.getConfiguration()) + .getScheme(); + if (scheme.startsWith("s3")) { + committer = new NullOutputCommitter(); + } else { + committer = super.getOutputCommitter(context); + } + } + return committer; + } + +} \ No newline at end of file diff --git a/src/java/org/commoncrawl/util/WarcCapture.java b/src/java/org/commoncrawl/util/WarcCapture.java new file mode 100644 index 0000000000..f2ad2df497 --- /dev/null +++ b/src/java/org/commoncrawl/util/WarcCapture.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.commoncrawl.util; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.Content; + +/** + * Container to hold all Nutch objects related to a single page capture and + * necessary to write the WARC request and response records. + */ +public class WarcCapture implements Writable { + public Text url; + public CrawlDatum datum; + public Content content; + + public WarcCapture() { + url = new Text(); + datum = new CrawlDatum(); + content = new Content(); + } + + public WarcCapture(Text url, CrawlDatum datum, Content content) { + this.url = url; + this.datum = datum; + this.content = content; + } + + @Override + public void readFields(DataInput in) throws IOException { + url.readFields(in); + if (in.readBoolean()) { + datum.readFields(in); + } else { + datum = null; + } + if (in.readBoolean()) { + content.readFields(in); + } else { + content = null; + } + } + + @Override + public void write(DataOutput out) throws IOException { + url.write(out); + if (datum != null) { + out.writeBoolean(true); + datum.write(out); + } else { + out.writeBoolean(false); + } + if (content != null) { + out.writeBoolean(true); + content.write(out); + } else { + out.writeBoolean(false); + } + } + + @Override + public String toString() { + return "url=" + url.toString() + ", datum=" + datum.toString(); + } +} \ No newline at end of file diff --git 
a/src/java/org/commoncrawl/util/WarcCdxWriter.java b/src/java/org/commoncrawl/util/WarcCdxWriter.java new file mode 100644 index 0000000000..9c9c756da0 --- /dev/null +++ b/src/java/org/commoncrawl/util/WarcCdxWriter.java @@ -0,0 +1,226 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.commoncrawl.util; + +import java.io.IOException; +import java.io.OutputStream; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.LinkedHashMap; +import java.util.Locale; +import java.util.Map; +import java.util.TimeZone; + +import org.apache.commons.io.output.CountingOutputStream; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.Content; +import org.archive.url.WaybackURLKeyMaker; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.core.JsonGenerationException; +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.core.util.MinimalPrettyPrinter; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectWriter; + +public class WarcCdxWriter extends WarcWriter { + + public static Logger LOG = LoggerFactory.getLogger(WarcCdxWriter.class); + + private static final Charset UTF_8 = StandardCharsets.UTF_8; + + protected CountingOutputStream countingOut; + protected OutputStream cdxOut; + protected String warcFilename; + + private SimpleDateFormat timestampFormat; + private ObjectWriter jsonWriter; + private WaybackURLKeyMaker surtKeyMaker = new WaybackURLKeyMaker(true); + private URLNormalizers urlNormalizersRedirect; + + /** + * JSON indentation same as by Python WayBack + * (https://github.com/ikreymer/pywb) + */ + @SuppressWarnings("serial") + public static class JsonIndenter extends MinimalPrettyPrinter { + + // @Override + @Override + public void writeObjectFieldValueSeparator(JsonGenerator jg) + throws IOException, 
JsonGenerationException { + jg.writeRaw(": "); + } + + // @Override + @Override + public void writeObjectEntrySeparator(JsonGenerator jg) + throws IOException, JsonGenerationException { + jg.writeRaw(", "); + } + } + + public WarcCdxWriter(OutputStream warcOut, OutputStream cdxOut, + Path warcFilePath, URLNormalizers redirectNormalizers) { + super(new CountingOutputStream(warcOut)); + countingOut = (CountingOutputStream) this.out; + this.cdxOut = cdxOut; + timestampFormat = new SimpleDateFormat("yyyyMMddHHmmss", Locale.ROOT); + timestampFormat.setTimeZone(TimeZone.getTimeZone("UTC")); + warcFilename = warcFilePath.toUri().getPath().replaceFirst("^/", ""); + ObjectMapper jsonMapper = new ObjectMapper(); + jsonMapper.getFactory().configure(JsonGenerator.Feature.ESCAPE_NON_ASCII, + true); + jsonWriter = jsonMapper.writer(new JsonIndenter()); + urlNormalizersRedirect = redirectNormalizers; + } + + @Override + public URI writeWarcRevisitRecord(final URI targetUri, final String ip, + final int httpStatusCode, final Date date, final URI warcinfoId, + final URI relatedId, final String warcProfile, final Date refersToDate, + final String payloadDigest, final String blockDigest, + String[] protocolVersions, String[] cipherSuites, byte[] block, + Content content) throws IOException { + long offset = countingOut.getByteCount(); + URI recordId = super.writeWarcRevisitRecord(targetUri, ip, httpStatusCode, + date, warcinfoId, relatedId, warcProfile, refersToDate, payloadDigest, + blockDigest, protocolVersions, cipherSuites, block, content); + long length = (countingOut.getByteCount() - offset); + writeCdxLine(targetUri, date, offset, length, payloadDigest, content, true, + null, null); + return recordId; + } + + @Override + public URI writeWarcResponseRecord(final URI targetUri, final String ip, + final int httpStatusCode, final Date date, final URI warcinfoId, + final URI relatedId, final String payloadDigest, final String blockDigest, + final String truncated, String[] 
protocolVersions, String[] cipherSuites, + final byte[] block, Content content) throws IOException { + long offset = countingOut.getByteCount(); + URI recordId = super.writeWarcResponseRecord(targetUri, ip, httpStatusCode, + date, warcinfoId, relatedId, payloadDigest, blockDigest, truncated, + protocolVersions, cipherSuites, block, content); + long length = (countingOut.getByteCount() - offset); + String redirectLocation = null; + if (isRedirect(httpStatusCode)) { + redirectLocation = getMeta(content.getMetadata(), "Location"); + if (redirectLocation != null) { + try { + // convert redirects from relative to absolute URLs + redirectLocation = new URL(targetUri.toURL(), redirectLocation).toString(); + if (urlNormalizersRedirect != null) { + // normalize the redirect target URL + redirectLocation = urlNormalizersRedirect.normalize(redirectLocation, + URLNormalizers.SCOPE_FETCHER); + } + } catch (MalformedURLException e) { + redirectLocation = null; + } + } + } + writeCdxLine(targetUri, date, offset, length, payloadDigest, content, false, + redirectLocation, truncated); + return recordId; + } + + public void writeCdxLine(final URI targetUri, final Date date, long offset, + long length, String payloadDigest, Content content, boolean revisit, + String redirectLocation, String truncated) throws IOException { + String url = targetUri.toASCIIString(); + String surt = url; + Metadata meta = content.getMetadata(); + try { + surt = surtKeyMaker.makeKey(url); + } catch (URISyntaxException e) { + LOG.error("Failed to make SURT for {}: {}", url, + StringUtils.stringifyException(e)); + return; + } + if (payloadDigest == null) { + // no content, e.g., revisit record + } else if (payloadDigest.startsWith("sha1:")) { + payloadDigest = payloadDigest.substring(5); + } + cdxOut.write(surt.getBytes(UTF_8)); + cdxOut.write(' '); + cdxOut.write(timestampFormat.format(date).getBytes(UTF_8)); + cdxOut.write(' '); + Map data = new LinkedHashMap(); + data.put("url", url); + if (revisit) { + 
data.put("mime", "warc/revisit"); + } else { + data.put("mime", cleanMimeType(getMeta(meta, Response.CONTENT_TYPE))); + data.put("mime-detected", content.getContentType()); + } + data.put("status", meta.get(WarcWriter.HTTP_STATUS_CODE)); + if (payloadDigest != null) { + data.put("digest", payloadDigest); + } + data.put("length", String.format("%d", length)); + data.put("offset", String.format("%d", offset)); + data.put("filename", warcFilename); + String val = meta.get(WarcWriter.DETECTED_CHARSET); + if (val != null) { + data.put("charset", val); + } + val = meta.get(WarcWriter.DETECTED_LANGUAGE); + if (val != null) { + data.put("languages", val); + } + if (truncated != null) { + data.put("truncated", truncated); + } + if (redirectLocation != null) { + data.put("redirect", redirectLocation); + } + cdxOut.write(jsonWriter.writeValueAsBytes(data)); + cdxOut.write('\n'); + } + + protected static String cleanMimeType(String mime) { + if (mime == null) + return "unk"; + final char[] delimiters = { ';', ' ' }; + for (char delim : delimiters) { + int pos = mime.indexOf(delim); + if (pos > -1) + mime = mime.substring(0, pos); + } + if (mime.isEmpty()) + return "unk"; + return mime; + } + + protected static boolean isRedirect(int httpStatusCode) { + return httpStatusCode >= 300 && httpStatusCode < 400 + && httpStatusCode != 304; + } +} diff --git a/src/java/org/commoncrawl/util/WarcOutputFormat.java b/src/java/org/commoncrawl/util/WarcOutputFormat.java new file mode 100644 index 0000000000..0a993f09a5 --- /dev/null +++ b/src/java/org/commoncrawl/util/WarcOutputFormat.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.commoncrawl.util; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileAlreadyExistsException; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.InvalidJobConfException; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.OutputCommitter; +import org.apache.hadoop.mapreduce.RecordWriter; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.TaskID; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.security.TokenCache; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class WarcOutputFormat extends FileOutputFormat { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + private OutputCommitter committer; + + @Override + public RecordWriter getRecordWriter( + TaskAttemptContext context) throws IOException { + + TaskID taskid = context.getTaskAttemptID().getTaskID(); + int partition = taskid.getId(); + LOG.info("Partition: {}", partition); + + Configuration conf = context.getConfiguration(); + Path outputPath = getOutputPath(context); + + String warcOutputPath = conf.get("warc.export.path"); + if (warcOutputPath != null) { + LOG.info("Writing WARC output to {} as configured by warc.export.path", + warcOutputPath); + outputPath = new Path(warcOutputPath); + } + + return new WarcRecordWriter(conf, 
outputPath, partition, context); + } + + @Override + public synchronized OutputCommitter getOutputCommitter( + TaskAttemptContext context) throws java.io.IOException { + if (committer == null) { + Path output = getOutputPath(context); + + String scheme = output.getFileSystem(context.getConfiguration()).getScheme(); + if (scheme.startsWith("s3")) { + /* + * The default FileOutputCommitter is slow on S3, use + * NullOutputCommitter until a better solution is available, cf. + * https://hadoop.apache.org/docs/r3.1.0/hadoop-aws/tools/hadoop-aws/ + * committers.html and + * https://hadoop.apache.org/docs/r3.1.0/hadoop-aws/tools/hadoop-aws/ + * committer_architecture.html + */ + committer = new NullOutputCommitter(); + } else { + committer = super.getOutputCommitter(context); + } + } + return committer; + } + + @Override + public void checkOutputSpecs(JobContext job) + throws FileAlreadyExistsException, IOException { + // Ensure that the output directory is set and not already there + Path outDir = getOutputPath(job); + if (outDir == null) { + throw new InvalidJobConfException("Output directory not set."); + } + + // get delegation token for outDir's file system + TokenCache.obtainTokensForNamenodes(job.getCredentials(), + new Path[] { outDir }, job.getConfiguration()); + } + +} \ No newline at end of file diff --git a/src/java/org/commoncrawl/util/WarcRecordWriter.java b/src/java/org/commoncrawl/util/WarcRecordWriter.java new file mode 100644 index 0000000000..05f2a304f6 --- /dev/null +++ b/src/java/org/commoncrawl/util/WarcRecordWriter.java @@ -0,0 +1,1005 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.commoncrawl.util; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.net.InetAddress; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.UnknownHostException; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.text.NumberFormat; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.time.Instant; +import java.util.ArrayList; +import java.util.Date; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.TimeZone; +import java.util.concurrent.ThreadLocalRandom; +import java.util.regex.Pattern; +import java.util.zip.GZIPOutputStream; + +import org.apache.commons.codec.binary.Base32; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.RecordWriter; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.ProtocolStatus; +import org.slf4j.Logger; +import 
org.slf4j.LoggerFactory; + +import com.google.common.net.InetAddresses; + +class WarcRecordWriter extends RecordWriter { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + /** + * Holds duration of fetch in metadata, see + * {@link org.apache.nutch.protocol.http.api.HttpBase#RESPONSE_TIME} + */ + protected static final Text FETCH_DURATION = new Text("_rs_"); + public static final String CRLF = "\r\n"; + public static final String COLONSP = ": "; + protected static final Pattern PROBLEMATIC_HEADERS = Pattern + .compile("(?i)(?:Content-(?:Encoding|Length)|Transfer-Encoding)"); + protected static final String X_HIDE_HEADER = "X-Crawler-"; + + protected static final Pattern STATUS_LINE_PATTERN = Pattern + .compile("^HTTP/1\\.[01] [0-9]{3}(?: .*)?$"); + protected static final Pattern WS_PATTERN = Pattern.compile("\\s+"); + protected static final Pattern HTTP_VERSION_PATTERN = Pattern + .compile("^HTTP/1\\.[01]$"); + protected static final Pattern HTTP_STATUS_CODE_PATTERN = Pattern + .compile("^[0-9]{3}$"); + protected static final String HTTP_VERSION_FALLBACK = "HTTP/1.1"; + + private TaskAttemptContext context; + private DataOutputStream warcOut; + private WarcWriter warcWriter; + private DataOutputStream crawlDiagnosticsWarcOut; + private WarcWriter crawlDiagnosticsWarcWriter; + private DataOutputStream robotsTxtWarcOut; + private WarcWriter robotsTxtWarcWriter; + private DataOutputStream cdxOut; + private DataOutputStream crawlDiagnosticsCdxOut; + private DataOutputStream robotsTxtCdxOut; + private URI warcinfoId; + private URI crawlDiagnosticsWarcinfoId; + private URI robotsTxtWarcinfoId; + private MessageDigest sha1 = null; + private Base32 base32 = new Base32(); + private LanguageDetector langDetect; + private boolean generateCrawlDiagnostics; + private boolean generateRobotsTxt; + private boolean generateCdx; + private boolean deduplicate; + private boolean detectLanguage; + private boolean skipByContent; + 
Pattern mimetypeSkipPattern; + float mimetypeSkipFactor = .0f; + float truncatedSkipFactor = .0f; + int maxContent = Integer.MAX_VALUE; + private String precedingURL = ""; // for deduplication + private URLNormalizers urlNormalizers; + private URLNormalizers urlNormalizersRedirect; + + private SimpleDateFormat isoDate; + + public WarcRecordWriter(Configuration conf, Path outputPath, int partition, + TaskAttemptContext context) throws IOException { + + this.context = context; + + FileSystem fs = outputPath.getFileSystem(conf); + + SimpleDateFormat fileDate = new SimpleDateFormat("yyyyMMddHHmmss", + Locale.ROOT); + fileDate.setTimeZone(TimeZone.getTimeZone("UTC")); + + isoDate = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.ROOT); + isoDate.setTimeZone(TimeZone.getTimeZone("UTC")); + + String prefix = conf.get("warc.export.prefix", "NUTCH-CRAWL"); + + /* + * WARC-Date : "The timestamp shall represent the instant that data capture + * for record creation began." + * (http://iipc.github.io/warc-specifications/specifications/warc-format/ + * warc-1.1/#warc-date-mandatory) + */ + String date = conf.get("warc.export.date", fileDate.format(new Date())); + String endDate = conf.get("warc.export.date.end", date); + Date captureStartDate = new Date(); + try { + captureStartDate = fileDate.parse(date); + } catch (ParseException e) { + LOG.error("Failed to parse warc.export.date {}: {}", date, + e.getMessage()); + } + + String hostname = conf.get("warc.export.hostname", getHostname()); + String filename = getFileName(prefix, date, endDate, hostname, partition); + + String publisher = conf.get("warc.export.publisher", null); + String operator = conf.get("warc.export.operator", null); + String software = conf.get("warc.export.software", "Apache Nutch"); + String isPartOf = conf.get("warc.export.isPartOf", null); + String description = conf.get("warc.export.description", null); + generateCrawlDiagnostics = conf.getBoolean("warc.export.crawldiagnostics", + false); + 
generateRobotsTxt = conf.getBoolean("warc.export.robotstxt", false); + generateCdx = conf.getBoolean("warc.export.cdx", false); + deduplicate = conf.getBoolean("warc.deduplicate", false); + detectLanguage = conf.getBoolean("warc.detect.language", false); + mimetypeSkipPattern = conf.getPattern("warc.skip.mimetype.pattern", null); + mimetypeSkipFactor = conf.getFloat("warc.skip.mimetype.factor", .0f); + truncatedSkipFactor = conf.getFloat("warc.skip.mimetype.truncated.factor", .0f); + maxContent = conf.getInt("http.content.limit", Integer.MAX_VALUE); + if ((mimetypeSkipPattern != null && mimetypeSkipFactor > .0f) || truncatedSkipFactor > .0f) { + skipByContent = true; + } + urlNormalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_INDEXER); + if (generateCdx) { + // URL normalizers to normalize the redirect targets/locations put into + // the CDX index + urlNormalizersRedirect = new URLNormalizers(conf, + URLNormalizers.SCOPE_FETCHER); + } + + Path warcPath = new Path(new Path(outputPath, "warc"), filename); + warcOut = fs.create(warcPath); + Path cdxPath = null; + if (generateCdx) { + cdxPath = new Path( + conf.get("warc.export.cdx.path", outputPath.toString())); + cdxOut = openCdxOutputStream(new Path(cdxPath, "warc"), filename, conf); + } + warcWriter = openWarcWriter(warcPath, warcOut, cdxOut, + urlNormalizersRedirect); + warcinfoId = warcWriter.writeWarcinfoRecord(filename, hostname, publisher, + operator, software, isPartOf, description, captureStartDate); + + if (generateCrawlDiagnostics) { + Path crawlDiagnosticsWarcPath = new Path( + new Path(outputPath, "crawldiagnostics"), filename); + crawlDiagnosticsWarcOut = fs.create(crawlDiagnosticsWarcPath); + if (generateCdx) { + crawlDiagnosticsCdxOut = openCdxOutputStream( + new Path(cdxPath, "crawldiagnostics"), filename, conf); + } + crawlDiagnosticsWarcWriter = openWarcWriter(crawlDiagnosticsWarcPath, + crawlDiagnosticsWarcOut, crawlDiagnosticsCdxOut, + urlNormalizersRedirect); + 
crawlDiagnosticsWarcinfoId = crawlDiagnosticsWarcWriter + .writeWarcinfoRecord(filename, hostname, publisher, operator, + software, isPartOf, description, captureStartDate); + } + + if (generateRobotsTxt) { + Path robotsTxtWarcPath = new Path(new Path(outputPath, "robotstxt"), + filename); + robotsTxtWarcOut = fs.create(robotsTxtWarcPath); + if (generateCdx) { + robotsTxtCdxOut = openCdxOutputStream(new Path(cdxPath, "robotstxt"), + filename, conf); + } + robotsTxtWarcWriter = openWarcWriter(robotsTxtWarcPath, robotsTxtWarcOut, + robotsTxtCdxOut, urlNormalizersRedirect); + robotsTxtWarcinfoId = robotsTxtWarcWriter.writeWarcinfoRecord(filename, + hostname, publisher, operator, software, isPartOf, description, + captureStartDate); + } + + try { + sha1 = MessageDigest.getInstance("SHA1"); + } catch (NoSuchAlgorithmException e) { + LOG.error("Unable to instantiate SHA1 MessageDigest object"); + throw new RuntimeException(e); + } + + if (detectLanguage) { + boolean bestEffort = conf + .getBoolean("warc.detect.language.cld2.besteffort", false); + langDetect = new LanguageDetector(); + langDetect.setBestEffort(bestEffort); + } + } + + /** + * Compose a unique WARC file name. 
+ * + * The WARC specification recommends: + * Prefix-Timestamp-Serial-Crawlhost.warc.gz (WARC + * 1.1, Annex C) + * + * @param prefix + * WARC file name prefix + * @param startDate + * capture start date + * @param endDate + * capture end date + * @param hostname + * name of the crawling host + * @param partition + * MapReduce partition + * @return (unique) WARC file name + */ + protected String getFileName(String prefix, String startDate, String endDate, + String host, int partition) { + NumberFormat numberFormat = NumberFormat.getInstance(); + numberFormat.setMinimumIntegerDigits(5); + numberFormat.setGroupingUsed(false); + return prefix + "-" + startDate + "-" + endDate + "-" + + numberFormat.format(partition) + ".warc.gz"; + } + + protected String getSha1DigestWithAlg(byte[] bytes) { + sha1.reset(); + return "sha1:" + base32.encodeAsString(sha1.digest(bytes)); + } + + protected static String getStatusLine(String httpHeader) { + int eol = httpHeader.indexOf('\n'); + if (eol == -1) { + return httpHeader; + } + if (eol > 0 && httpHeader.charAt(eol - 1) == '\r') { + eol--; + } + return httpHeader.substring(0, eol); + } + + protected static int getStatusCode(String statusLine) { + int start = statusLine.indexOf(" "); + int end = statusLine.indexOf(" ", start + 1); + if (end == -1) + end = statusLine.length(); + int code = 200; + try { + code = Integer.parseInt(statusLine.substring(start + 1, end)); + } catch (NumberFormatException e) { + } + return code; + } + + /** Format status line and pair-wise list of headers as string */ + public static String formatHttpHeaders(String statusLine, List headers) { + StringBuilder sb = new StringBuilder(); + sb.append(statusLine).append(CRLF); + Iterator it = headers.iterator(); + while (it.hasNext()) { + String name = it.next(); + if (!it.hasNext()) { + // no value for name + break; + } + String value = it.next(); + sb.append(name).append(COLONSP).append(value).append(CRLF); + } + sb.append(CRLF); + return sb.toString(); + } + 
+ /** + * Fix the HTTP version in the status line - replace HTTP/2 + * by HTTP/1.1 ({@link this#HTTP_VERSION_FALLBACK}}. + * + * See also {@link #fixHttpHeaders(String, int)} + * + * @param headers + * HTTP 1.1 or 1.0 request header string, CR-LF-separated lines, + * first line is the status line + * @return safe HTTP request header + */ + public static String fixHttpRequestHeaders(String headers) { + String http2version = " HTTP/2\r\n"; + int pos = headers.indexOf(http2version); + if (pos >= 0) { + StringBuilder replacement = new StringBuilder(); + String statusLinePrefix = headers.substring(0, pos); + if (statusLinePrefix.indexOf(CRLF) > 0) { + // match in subsequent header lines (should not or rarely happen) + return headers; + } + replacement.append(statusLinePrefix); + replacement.append(' '); + replacement.append(HTTP_VERSION_FALLBACK); + replacement.append(CRLF); + replacement.append(headers.substring(pos + http2version.length())); + return replacement.toString(); + } + return headers; + } + + /** + * Modify verbatim HTTP response headers: fix, remove or replace headers + * Content-Length, Content-Encoding and + * Transfer-Encoding which may confuse WARC readers. Ensure that + * returned header end with a single empty line (\r\n\r\n). + * + * If the HTTP version in the status line is HTTP/2, replace it + * by HTTP/1.1 ({@link this#HTTP_VERSION_FALLBACK}}. 
*
 * @param headers
 *          HTTP 1.1 or 1.0 response header string, CR-LF-separated lines,
 *          first line is the status line
 * @param contentLength
 *          effective (uncompressed and unchunked) length of the content,
 *          written as the value of the {@code Content-Length} header
 * @return safe HTTP response header
 */
public static String fixHttpHeaders(String headers, int contentLength) {
  /*
   * start: offset of the line currently inspected
   * lineEnd: offset of the CR-LF terminating that line (or end of input)
   * last: offset up to which the input is already copied to (or dropped
   *   from) the replacement
   * trailingCrLf: number of CR-LF sequences consumed after the current line
   */
  int start = 0, lineEnd = 0, last = 0, trailingCrLf= 0;
  boolean hasContentLength = false;
  StringBuilder replacement = new StringBuilder();
  while (start < headers.length()) {
    lineEnd = headers.indexOf(CRLF, start);
    trailingCrLf = 1;
    if (lineEnd == -1) {
      // last line is not terminated by CR-LF
      lineEnd = headers.length();
      trailingCrLf = 0;
    }
    // position of the first colon in the current line, -1 if there is none
    int colonPos = -1;
    for (int i = start; i < lineEnd; i++) {
      if (headers.charAt(i) == ':') {
        colonPos = i;
        break;
      }
    }
    if (colonPos == -1) {
      // line without colon: status line, empty line or invalid header line
      boolean valid = true;
      if (start == 0) {
        // status line (without colon)
        final String statusLine = headers.substring(0, lineEnd);
        if (!STATUS_LINE_PATTERN.matcher(statusLine).matches()) {
          // try to rebuild the status line from its whitespace-separated
          // parts: version, status code and optional message
          final String[] parts = WS_PATTERN
              .split(headers.substring(0, lineEnd), 3);
          if (parts.length < 2
              || !HTTP_STATUS_CODE_PATTERN.matcher(parts[1]).matches()) {
            // nothing we can do here, leave status line as is
            LOG.warn(
                "WARC parsers may fail on non-standard HTTP 1.0 / 1.1 response status line: {}",
                statusLine);
          } else {
            if (HTTP_VERSION_PATTERN.matcher(parts[0]).matches()) {
              replacement.append(parts[0]);
            } else {
              replacement.append(HTTP_VERSION_FALLBACK);
            }
            replacement.append(' ');
            replacement.append(parts[1]); // status code
            replacement.append(' ');
            if (parts.length == 3) {
              replacement.append(parts[2]); // message
            }
            replacement.append(CRLF);
            // the original status line is replaced, do not copy it
            last = lineEnd + 2 * trailingCrLf;
          }
        }
      } else if ((lineEnd + 4) == headers.length()
          && headers.endsWith(CRLF + CRLF)) {
        // ok, trailing empty line
        // NOTE(review): this branch is taken when the current colon-less
        // line is followed by exactly CR-LF CR-LF at the end of the input;
        // the line itself is kept -- confirm this is the intended condition
        trailingCrLf = 2;
      } else if (start == lineEnd) {
        // skip/remove empty line
        LOG.debug("Skipping empty header line");
        valid = false;
      } else {
        LOG.warn("Invalid header line: {}",
            headers.substring(start, lineEnd));
        valid = false;
      }
      if (!valid) {
        // copy everything pending before the dropped line, then skip it
        if (last < start) {
          replacement.append(headers.substring(last, start));
        }
        last = lineEnd + 2 * trailingCrLf;
      }
      start = lineEnd + 2 * trailingCrLf;
      /*
       * skip over invalid header line, no further check for problematic
       * headers required
       */
      continue;
    }
    String name = headers.substring(start, colonPos);
    if (PROBLEMATIC_HEADERS.matcher(name).matches()) {
      boolean needsFix = true;
      if (name.equalsIgnoreCase("content-length")) {
        hasContentLength = true;
        String value = headers.substring(colonPos + 1, lineEnd).trim();
        try {
          int l = Integer.parseInt(value);
          if (l == contentLength) {
            // Content-Length already matches the effective length
            needsFix = false;
          }
        } catch (NumberFormatException e) {
          // needs to be fixed
        }
      }
      if (needsFix) {
        if (last < start) {
          replacement.append(headers.substring(last, start));
        }
        last = lineEnd + 2 * trailingCrLf;
        // keep the original header, but hide it behind the X_HIDE_HEADER
        // prefix
        replacement.append(X_HIDE_HEADER)
            .append(headers.substring(start, lineEnd + 2 * trailingCrLf));
        if (trailingCrLf == 0) {
          // the hidden header was the unterminated last line
          replacement.append(CRLF);
          trailingCrLf = 1;
        }
        if (name.equalsIgnoreCase("content-length")) {
          // add effective uncompressed and unchunked length of content
          replacement.append("Content-Length").append(COLONSP)
              .append(contentLength).append(CRLF);
        }
      }
    }
    start = lineEnd + 2 * trailingCrLf;
  }
  // return the rebuilt headers if anything was changed or dropped, if the
  // loop did not end in the two-CR-LF state, or if Content-Length must be
  // added; otherwise the input is returned as is
  if (last > 0 || trailingCrLf != 2 || !hasContentLength) {
    if (last < headers.length()) {
      // append trailing headers
      replacement.append(headers.substring(last));
    }
    if (!hasContentLength) {
      replacement.append("Content-Length").append(COLONSP).append(contentLength)
          .append(CRLF);
    }
    // terminate the header block by an empty line
    while (trailingCrLf < 2) {
      replacement.append(CRLF);
      trailingCrLf++;
    }
    return replacement.toString();
  }
  return headers;
}

/**
 * Get the name of the local host.
 *
 * @return the local host name, or {@code localhost} if it cannot be
 *         determined
 */
protected static String getHostname() {
  try {
    return InetAddress.getLocalHost().getHostName();
  } catch (UnknownHostException e) {
    LOG.warn("Failed to get hostname: {}", e.getMessage());
  }
  return "localhost";
}

/**
 *
Canonicalize IPv6 address strings. + * + * @param ip + * IP address string representation + * @return the canonical IP address + */ + protected static String canonicalizeIP(String ip) { + if (ip.indexOf(':') > -1) { + return InetAddresses.toAddrString(InetAddresses.forString(ip)); + } + return ip; + } + + private WarcWriter openWarcWriter(Path warcPath, DataOutputStream warcOut, + DataOutputStream cdxOut, URLNormalizers redirectNormalizers) { + if (cdxOut != null) { + return new WarcCdxWriter(warcOut, cdxOut, warcPath, redirectNormalizers); + } + return new WarcWriter(warcOut); + } + + protected static DataOutputStream openCdxOutputStream(Path cdxPath, + String warcFilename, Configuration conf) throws IOException { + String cdxFilename = warcFilename.replaceFirst("\\.warc\\.gz$", ".cdx.gz"); + Path cdxFile = new Path(cdxPath, cdxFilename); + FileSystem fs = cdxPath.getFileSystem(conf); + return new DataOutputStream(new GZIPOutputStream(fs.create(cdxFile))); + } + + @Override + public synchronized void write(Text key, WarcCapture value) + throws IOException { + + if (value.content == null) { + ProtocolStatus pstatus = null; + if (value.datum != null) { + pstatus = (ProtocolStatus) value.datum.getMetaData() + .get(Nutch.WRITABLE_PROTO_STATUS_KEY); + } + if (pstatus != null) { + LOG.warn( + "Cannot write WARC record, no content for {}, protocol status: {} - {}", + value.url, pstatus.getName(), pstatus.getMessage()); + } else { + LOG.warn( + "Cannot write WARC record, no content and protocol status for {}", + value.url); + } + context.getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_SKIPPED_NO_CONTENT_TOTAL).increment(1); + return; + } + + URI targetUri = null; + String url = value.url.toString(); + try { + targetUri = new URI(url); + } catch (URISyntaxException e) { + if (value.datum != null + && value.datum.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) { + // if a successful capture, try to normalize the URL + String urlNorm = null; + try { + 
urlNorm = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INDEXER); + } catch (MalformedURLException ee) { + // ignore, log exception observed on original URL + } + if (urlNorm != null && !url.equals(urlNorm)) { + try { + targetUri = new URI(urlNorm); + LOG.info("Normalized URL to valid URI: {} -> {}", url, urlNorm); + context.getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_URI_NORMALIZED_TOTAL).increment(1); + } catch (URISyntaxException ee) { + // ignore, log exception observed on original URL + } + } + } + if (targetUri == null) { + LOG.error("Cannot write WARC record, invalid URI: {}", url); + context + .getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_SKIPPED_INVALID_URI_TOTAL) + .increment(1); + return; + } + } + + if (skipByContent) { + float factor = .0f; + if (mimetypeSkipFactor > .0f && mimetypeSkipPattern.matcher(value.content.getContentType()).find()) { + factor += mimetypeSkipFactor; + } + if (factor > .0f) { + factor *= value.content.getContent().length / (1.0 * maxContent); + String truncated = value.content.getMetadata().get(Response.TRUNCATED_CONTENT_REASON); + if (truncatedSkipFactor > .0f && truncated != null) { + factor += truncatedSkipFactor; + } + if (ThreadLocalRandom.current().nextFloat() < factor) { + LOG.info( + "Skipped record by content (truncated: {}, content-type: {}, length: {}): {}", + (truncated != null ? 
truncated : "-"), + value.content.getContentType(), value.content.getContent().length, + value.url); + context + .getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_SKIPPED_BY_CONTENT_TYPE_TOTAL) + .increment(1); + return; + } + } + } + + if (deduplicate) { + // given that the reducer input is sorted, a comparison with the preceding + // URL is sufficient + if (precedingURL.equals(url)) { + // LOG.info("Skipping duplicate record: {}", value.url); + try { + String status = "?"; + if (value.content.getMetadata() + .get(Response.RESPONSE_HEADERS) != null) { + status = Integer.toString(getStatusCode(getStatusLine( + value.content.getMetadata().get(Response.RESPONSE_HEADERS)))); + } else if (value.datum != null) { + ProtocolStatus pstatus = (ProtocolStatus) value.datum.getMetaData() + .get(Nutch.WRITABLE_PROTO_STATUS_KEY); + status = pstatus.getName(); + } + Instant date = null; + if (value.datum != null) { + date = Instant.ofEpochMilli(value.datum.getFetchTime()); + } else { + String fetchTime = value.content.getMetadata() + .get(Nutch.FETCH_TIME_KEY); + if (fetchTime != null) { + try { + date = Instant.ofEpochMilli(Long.parseLong(fetchTime)); + } catch (NumberFormatException e) { + LOG.error("Invalid fetch time '{}' in content metadata of {}", + fetchTime, value.url.toString()); + } + } + } + LOG.info("Skipping duplicate record: {} ({}, status: {}, size: {})", + value.url, date, status, value.content.getContent().length); + } catch (Throwable t) { + LOG.error(t.getMessage()); + } + context.getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_SKIPPED_DUPLICATE_TOTAL).increment(1); + return; + } + precedingURL = url; + } + + String ip = "0.0.0.0"; + Date date = null; + boolean notModified = false; + Date lastModifiedDate = null; + String verbatimResponseHeaders = null; + String verbatimRequestHeaders = null; + List headers = new ArrayList<>(); + String responseHeaders = null; + String statusLine = ""; + int httpStatusCode = 200; + 
String fetchDuration = null; + String truncatedReason = null; + String[] protocolVersions = null; + String[] cipherSuites = null; + + if (value.datum != null) { + date = new Date(value.datum.getFetchTime()); + // This is for older crawl dbs that don't include the verbatim status + // line in the metadata + ProtocolStatus pstatus = (ProtocolStatus) value.datum.getMetaData() + .get(Nutch.WRITABLE_PROTO_STATUS_KEY); + if (pstatus == null) { + LOG.warn("Cannot write WARC record, no protocol status for {}", + value.url); + context + .getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_SKIPPED_NO_PROTOCOL_STATUS_TOTAL) + .increment(1); + return; + } + switch (pstatus.getCode()) { + case ProtocolStatus.SUCCESS: + statusLine = "HTTP/1.1 200 OK"; + httpStatusCode = 200; + break; + case ProtocolStatus.TEMP_MOVED: + statusLine = "HTTP/1.1 302 Found"; + httpStatusCode = 302; + break; + case ProtocolStatus.MOVED: + statusLine = "HTTP/1.1 301 Moved Permanently"; + httpStatusCode = 301; + break; + case ProtocolStatus.NOTMODIFIED: + statusLine = "HTTP/1.1 304 Not Modified"; + httpStatusCode = 304; + notModified = true; + long modifiedTime = value.datum.getModifiedTime(); + if (modifiedTime > 0) { + lastModifiedDate = new Date(modifiedTime); + } + break; + default: + if (value.content.getMetadata() + .get(Response.RESPONSE_HEADERS) == null) { + LOG.warn("Unknown or ambiguous protocol status: {}", pstatus); + context.getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_SKIPPED_UNKNOWN_PROTOCOL_STATUS_TOTAL) + .increment(1); + return; + } + } + String httpStatusCodeVal = value.datum.getMetaData() + .get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString(); + if (httpStatusCodeVal != null) { + try { + httpStatusCode = Integer.parseInt(httpStatusCodeVal); + } catch (NumberFormatException e) { + } + } + if (value.datum.getMetaData().get(FETCH_DURATION) != null) { + fetchDuration = value.datum.getMetaData().get(FETCH_DURATION) + .toString(); + } + } else { + // 
robots.txt, no CrawlDatum available + String fetchTime = value.content.getMetadata().get(Nutch.FETCH_TIME_KEY); + if (fetchTime != null) { + try { + date = new Date(Long.parseLong(fetchTime)); + } catch (NumberFormatException e) { + LOG.error("Invalid fetch time '{}' in content metadata of {}", + fetchTime, value.url.toString()); + } + } + if (date == null) { + String httpDate = value.content.getMetadata().get("Date"); + if (httpDate != null) { + try { + date = HttpDateFormat.toDate(httpDate); + } catch (ParseException e) { + LOG.warn("Failed to parse HTTP Date {} for {}", httpDate, + targetUri); + date = new Date(); + } + } else { + LOG.warn("No HTTP Date for {}", targetUri); + date = new Date(); + } + } + // status is taken from header + } + + boolean useVerbatimResponseHeaders = false; + + for (String name : value.content.getMetadata().names()) { + String val = value.content.getMetadata().get(name); + switch (name) { + case Response.IP_ADDRESS: + ip = canonicalizeIP(val); + break; + case Response.REQUEST: + verbatimRequestHeaders = val; + break; + case Response.RESPONSE_HEADERS: + verbatimResponseHeaders = val; + if (verbatimResponseHeaders.contains(CRLF)) { + useVerbatimResponseHeaders = true; + } + statusLine = getStatusLine(verbatimResponseHeaders); + httpStatusCode = getStatusCode(statusLine); + break; + case Response.TRUNCATED_CONTENT_REASON: + truncatedReason = val; + break; + case Response.PROTOCOL_VERSIONS: + protocolVersions = val.split(","); + break; + case Response.CIPHER_SUITES: + cipherSuites = val.split(","); + break; + case Nutch.SEGMENT_NAME_KEY: + case Nutch.FETCH_STATUS_KEY: + case Nutch.SCORE_KEY: + case Nutch.SIGNATURE_KEY: + case Response.FETCH_TIME: + break; // ignore, not required for WARC record + default: + // We have to fix up a few headers because we don't have the raw + // responses to avoid that WARC readers try to read the content + // as chunked or gzip-compressed. 
+ if (name.equalsIgnoreCase(Response.CONTENT_LENGTH)) { + int origContentLength = -1; + try { + origContentLength = Integer.parseInt(val); + } catch (NumberFormatException e) { + // ignore + } + headers.add(Response.CONTENT_LENGTH); + if (origContentLength != value.content.getContent().length) { + headers.add("" + value.content.getContent().length); + headers.add(X_HIDE_HEADER + Response.CONTENT_LENGTH); + } + } else if (name.equalsIgnoreCase(Response.CONTENT_ENCODING)) { + if (val.equalsIgnoreCase("identity")) { + headers.add(name); + } else { + headers.add(X_HIDE_HEADER + Response.CONTENT_ENCODING); + } + } else if (name.equalsIgnoreCase(Response.TRANSFER_ENCODING)) { + if (val.equalsIgnoreCase("identity")) { + headers.add(name); + } else { + headers.add(X_HIDE_HEADER + Response.TRANSFER_ENCODING); + } + } else { + headers.add(name); + } + headers.add(val); + } + } + + if (verbatimRequestHeaders == null) { + LOG.error("No request headers for {}", url); + } + + if (useVerbatimResponseHeaders && verbatimResponseHeaders != null) { + responseHeaders = fixHttpHeaders(verbatimResponseHeaders, value.content.getContent().length); + } else { + responseHeaders = formatHttpHeaders(statusLine, headers); + } + + WarcWriter writer = warcWriter; + URI infoId = this.warcinfoId; + if (value.datum == null) { + // no CrawlDatum: must be a robots.txt + if (!generateRobotsTxt) + return; + writer = robotsTxtWarcWriter; + infoId = robotsTxtWarcinfoId; + } else if (value.datum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) { + if (!generateCrawlDiagnostics) + return; + writer = crawlDiagnosticsWarcWriter; + infoId = crawlDiagnosticsWarcinfoId; + } + + LOG.info("WARC {} record {} ({}, status: {}, size: {})", + (notModified ? 
"revisit" : "response"), targetUri, isoDate.format(date), + httpStatusCode, value.content.getContent().length); + + URI requestId = null; + if (verbatimRequestHeaders != null) { + requestId = writer.writeWarcRequestRecord(targetUri, ip, date, infoId, + protocolVersions, cipherSuites, + fixHttpRequestHeaders(verbatimRequestHeaders) + .getBytes(StandardCharsets.UTF_8)); + } + + if (generateCdx) { + value.content.getMetadata().add(WarcWriter.HTTP_STATUS_CODE, + String.format("%d", httpStatusCode)); + } + + LanguageDetector.Result ldres = null; + if (detectLanguage && writer == warcWriter) { + // detect language only for successfully fetched primary documents + ldres = langDetect.detectLanguage(targetUri, value.content); + if (ldres.errorReason != null) { + context.getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_LID_ERROR_PREFIX + ldres.errorStatus.name) + .increment(1); + } else if (ldres.languages == null) { + context.getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_LID_NO_RESULT_TOTAL).increment(1); + } else if (ldres.languages.isReliable()) { + context + .getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_LID_RESULT_RELIABLE_TOTAL) + .increment(1); + } else { + context + .getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_LID_RESULT_NOT_RELIABLE_TOTAL) + .increment(1); + } + if (generateCdx) { + if (ldres.charset != null) { + value.content.getMetadata().add(WarcWriter.DETECTED_CHARSET, + ldres.charset.name()); + } + org.commoncrawl.langdetect.cld2.Result lr = ldres.languages; + if (lr != null) { + String codes = lr.getLanguageCodesISO639_3(",", true); + if (codes != null && !codes.isEmpty()) { + value.content.getMetadata().add(WarcWriter.DETECTED_LANGUAGE, + codes); + } + } + } + } + + URI responseId = null; + if (notModified) { + /* + * revisit record of profile WarcWriter.PROFILE_REVISIT_NOT_MODIFIED + * + * Note: "revisits" identified by signature comparison + * 
(WarcWriter.PROFILE_REVISIT_IDENTICAL_DIGEST) are stored as response + * records. + * + * The modified date of the CrawlDatum is the date of the last successful + * fetch with content (status 200). It is used for the + * WARC-Refers-To-Date. + */ + byte[] responseHeaderBytes = responseHeaders + .getBytes(StandardCharsets.UTF_8); + String blockDigest = getSha1DigestWithAlg(responseHeaderBytes); + /* + * HTTP 304 not-modified responses do not have a payload, should not add a + * digest for it according to the WARC specification: + * "The WARC-Payload-Digest field ... shall not be used on records without + * a well-defined payload." + */ + String payloadDigest = null; + responseId = writer.writeWarcRevisitRecord(targetUri, ip, httpStatusCode, date, infoId, + requestId, WarcWriter.PROFILE_REVISIT_NOT_MODIFIED, lastModifiedDate, + payloadDigest, blockDigest, protocolVersions, cipherSuites, + responseHeaderBytes, value.content); + } else { + StringBuilder responsesb = new StringBuilder(4096); + responsesb.append(responseHeaders); + + byte[] responseHeaderBytes = responsesb.toString() + .getBytes(StandardCharsets.UTF_8); + byte[] responseBytes = new byte[responseHeaderBytes.length + + value.content.getContent().length]; + System.arraycopy(responseHeaderBytes, 0, responseBytes, 0, + responseHeaderBytes.length); + System.arraycopy(value.content.getContent(), 0, responseBytes, + responseHeaderBytes.length, value.content.getContent().length); + + String payloadDigest = getSha1DigestWithAlg(value.content.getContent()); + String blockDigest = getSha1DigestWithAlg(responseBytes); + responseId = writer.writeWarcResponseRecord(targetUri, ip, + httpStatusCode, date, infoId, requestId, payloadDigest, blockDigest, + truncatedReason, protocolVersions, cipherSuites, responseBytes, value.content); + } + + // Write metadata record + StringBuilder metadatasb = new StringBuilder(4096); + Map metadata = new LinkedHashMap(); + + if (fetchDuration != null) { + metadata.put("fetchTimeMs", 
fetchDuration); + } + if (ldres != null) { + if (ldres.charset != null) { + metadata.put("charset-detected", ldres.charset.name()); + } + if (ldres.languages != null) { + metadata.put("languages-cld2", ldres.languages.toJSON()); + } + } + if (metadata.size() > 0) { + for (Map.Entry entry : metadata.entrySet()) { + metadatasb.append(entry.getKey()).append(COLONSP) + .append(entry.getValue()).append(CRLF); + } + metadatasb.append(CRLF); + + writer.writeWarcMetadataRecord(targetUri, date, infoId, responseId, null, + metadatasb.toString().getBytes(StandardCharsets.UTF_8)); + } + } + + @Override + public synchronized void close(TaskAttemptContext context) + throws IOException { + context.setStatus("closing WARC output writers"); + warcOut.close(); + if (generateCrawlDiagnostics) { + crawlDiagnosticsWarcOut.close(); + } + if (generateRobotsTxt) { + robotsTxtWarcOut.close(); + } + if (generateCdx) { + cdxOut.close(); + if (generateCrawlDiagnostics) { + crawlDiagnosticsCdxOut.close(); + } + if (generateRobotsTxt) { + robotsTxtCdxOut.close(); + } + } + } +} diff --git a/src/java/org/commoncrawl/util/WarcWriter.java b/src/java/org/commoncrawl/util/WarcWriter.java new file mode 100644 index 0000000000..aa9b20ba3b --- /dev/null +++ b/src/java/org/commoncrawl/util/WarcWriter.java @@ -0,0 +1,485 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.commoncrawl.util; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.lang.invoke.MethodHandles; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.TimeZone; +import java.util.UUID; +import java.util.regex.Pattern; +import java.util.zip.GZIPOutputStream; + +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.protocol.Content; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.collect.LinkedListMultimap; +import com.google.common.collect.Multimap; + +public class WarcWriter { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + protected OutputStream out = null; + protected OutputStream origOut = null; + + private static final String WARC_VERSION = "WARC/1.0"; + + // Record types + private static final String WARC_INFO = "warcinfo"; + private static final String WARC_RESPONSE = "response"; + private static final String WARC_REQUEST = "request"; + private static final String WARC_REVISIT = "revisit"; + private static final String WARC_CONVERSION = "conversion"; + private static final String WARC_METADATA = "metadata"; + + // Defined fields + private static final String WARC_TYPE = "WARC-Type"; + private static final String WARC_DATE = "WARC-Date"; 
+ private static final String WARC_RECORD_ID = "WARC-Record-ID"; + private static final String CONTENT_LENGTH = "Content-Length"; + private static final String CONTENT_TYPE = "Content-Type"; + private static final String WARC_IP_ADDRESS = "WARC-IP-Address"; + private static final String WARC_WARCINFO_ID = "WARC-Warcinfo-ID"; + private static final String WARC_TARGET_URI = "WARC-Target-URI"; + private static final String WARC_CONCURRENT_TO = "WARC-Concurrent-To"; + private static final String WARC_REFERS_TO = "WARC-Refers-To"; + private static final String WARC_REFERS_TO_TARGET_URI = "WARC-Refers-To-Target-URI"; + private static final String WARC_REFERS_TO_DATE = "WARC-Refers-To-Date"; + private static final String WARC_BLOCK_DIGEST = "WARC-Block-Digest"; + private static final String WARC_PAYLOAD_DIGEST = "WARC-Payload-Digest"; + private static final String WARC_TRUNCATED = "WARC-Truncated"; + private static final String WARC_IDENTIFIED_PAYLOAD_TYPE = "WARC-Identified-Payload-Type"; + private static final String WARC_PROFILE = "WARC-Profile"; + private static final String WARC_FILENAME = "WARC-Filename"; + /** WARC-Protocol, see https://github.com/iipc/warc-specifications/issues/42 */ + private static final String WARC_PROTOCOL = "WARC-Protocol"; + /** WARC-Cipher-Suite, see https://github.com/iipc/warc-specifications/issues/94 */ + private static final String WARC_CIPHER_SUITE = "WARC-Cipher-Suite"; + + public static final String PROFILE_REVISIT_IDENTICAL_DIGEST = "http://netpreserve.org/warc/1.1/revisit/identical-payload-digest"; + public static final String PROFILE_REVISIT_NOT_MODIFIED = "http://netpreserve.org/warc/1.1/revisit/server-not-modified"; + + private static final String CRLF = "\r\n"; + private static final String COLONSP = ": "; + + /* Metadata names to pass from WARC to CDX */ + protected static final String HTTP_STATUS_CODE = "HTTP-Status-Code"; + protected static final String DETECTED_CHARSET = "Detected-Charset"; + protected static final String 
DETECTED_LANGUAGE = "Detected-Language"; + + public static final String CONTENT_TYPE_RESPONSE = "application/http; msgtype=response"; + public static final String CONTENT_TYPE_METADATA = "application/warc-fields"; + + private SimpleDateFormat isoDate; + + public static class CompressedOutputStream extends GZIPOutputStream { + public CompressedOutputStream(OutputStream out) throws IOException { + super(out); + } + + public void end() { + def.end(); + } + } + + public WarcWriter(final OutputStream out) { + this.origOut = this.out = out; + isoDate = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.ROOT); + isoDate.setTimeZone(TimeZone.getTimeZone("UTC")); + } + + /** + * Class to hold HTTP and SSL/TLS protocol versions to fill the + * WARC-Protocol field. Protocol names require normalization, see + * https://github.com/iipc/warc-specifications/issues/42 + */ + public static class WarcProtocol { + public static Set protocols = Set.of("dns", "ftp", "gemini", + "gopher", "http/0.9", "http/1.0", "http/1.1", "h2", "h2c", "spdy/1", + "spdy/2", "spdy/3", "ssl/2", "ssl/3", "tls/1.0", "tls/1.1", "tls/1.2", + "tls/1.3"); + public static Pattern vPattern = Pattern.compile("^(?:ssl|tls)v[0-9]", + Pattern.CASE_INSENSITIVE); + private String name; + + public WarcProtocol(final String protocol) { + name = protocol.toLowerCase(Locale.ROOT); + if (vPattern.matcher(name).find()) { + name = name.substring(0, 3) + '/' + name.substring(4); + } + if (!protocols.contains(name)) { + LOG.warn("Unknown protocol name or version: {}", name); + } + } + + @Override + public String toString() { + return name; + } + } + + /** + * + * @return record id for the warcinfo record + * @throws IOException + */ + public URI writeWarcinfoRecord(String filename, String hostname, + String publisher, String operator, String software, String isPartOf, + String description, Date date) + throws IOException { + Multimap extra = LinkedListMultimap.create(); + extra.put(WARC_FILENAME, filename); + + 
StringBuilder sb = new StringBuilder(); + Multimap settings = LinkedListMultimap.create(); + + if (isPartOf != null) { + settings.put("isPartOf", isPartOf); + } + + if (publisher != null) { + settings.put("publisher", publisher); + } + + if (description != null) { + settings.put("description", description); + } + + if (operator != null) { + settings.put("operator", operator); + } + + if (hostname != null) { + settings.put("hostname", hostname); + } + + if (software != null) { + settings.put("software", software); + } + + String robotsTxtParser = String.format(Locale.ROOT, + "checked via crawler-commons %s (https://github.com/crawler-commons/crawler-commons)", + crawlercommons.CrawlerCommons.getVersion()); + settings.put("robots", robotsTxtParser); + + settings.put("format", "WARC File Format 1.1"); + settings.put("conformsTo", + "https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/"); + + writeWarcKeyValue(sb, settings); + + byte[] ba = sb.toString().getBytes(StandardCharsets.UTF_8); + URI recordId = getRecordId(); + + writeRecord(WARC_INFO, date, CONTENT_TYPE_METADATA, recordId, extra, + new ByteArrayInputStream(ba), ba.length); + return recordId; + } + + public URI writeWarcRequestRecord(final URI targetUri, final String ip, + final Date date, final URI warcinfoId, String[] protocolVersions, + String[] cipherSuites, final byte[] block) throws IOException { + Multimap extra = LinkedListMultimap.create(); + extra.put(WARC_WARCINFO_ID, "<" + warcinfoId.toString() + ">"); + extra.put(WARC_IP_ADDRESS, ip); + extra.put(WARC_TARGET_URI, targetUri.toASCIIString()); + if (protocolVersions != null) { + for (String pVersion : protocolVersions) { + extra.put(WARC_PROTOCOL, new WarcProtocol(pVersion).toString()); + } + } + if (cipherSuites != null) { + for (String cipher : cipherSuites) { + extra.put(WARC_CIPHER_SUITE, cipher); + } + } + + URI recordId = getRecordId(); + writeRecord(WARC_REQUEST, date, "application/http; msgtype=request", + 
recordId, extra, block); + return recordId; + } + + public URI writeWarcResponseRecord(final URI targetUri, final String ip, + final int httpStatusCode, final Date date, final URI warcinfoId, + final URI relatedId, final String payloadDigest, final String blockDigest, + final String truncated, String[] protocolVersions, String[] cipherSuites, + final byte[] block, Content content) throws IOException { + Multimap extra = LinkedListMultimap.create(); + extra.put(WARC_WARCINFO_ID, "<" + warcinfoId.toString() + ">"); + if (relatedId != null) { + extra.put(WARC_CONCURRENT_TO, "<" + relatedId.toString() + ">"); + } + extra.put(WARC_IP_ADDRESS, ip); + extra.put(WARC_TARGET_URI, targetUri.toASCIIString()); + if (protocolVersions != null) { + for (String pVersion : protocolVersions) { + extra.put(WARC_PROTOCOL, new WarcProtocol(pVersion).toString()); + } + } + if (cipherSuites != null) { + for (String cipher : cipherSuites) { + extra.put(WARC_CIPHER_SUITE, cipher); + } + } + + if (payloadDigest != null) { + extra.put(WARC_PAYLOAD_DIGEST, payloadDigest); + } + + if (blockDigest != null) { + extra.put(WARC_BLOCK_DIGEST, blockDigest); + } + + if (truncated != null) { + extra.put(WARC_TRUNCATED, truncated); + } + + extra.put(WARC_IDENTIFIED_PAYLOAD_TYPE, content.getContentType()); + + URI recordId = getRecordId(); + writeRecord(WARC_RESPONSE, date, CONTENT_TYPE_RESPONSE, recordId, extra, block); + return recordId; + } + + public URI writeWarcRevisitRecord(final URI targetUri, final String ip, + final int httpStatusCode, final Date date, final URI warcinfoId, + final URI relatedId, final String warcProfile, final Date refersToDate, + final String payloadDigest, final String blockDigest, + String[] protocolVersions, String[] cipherSuites, byte[] block, + Content content) throws IOException { + Multimap extra = LinkedListMultimap.create(); + extra.put(WARC_WARCINFO_ID, "<" + warcinfoId.toString() + ">"); + extra.put(WARC_REFERS_TO, "<" + relatedId.toString() + ">"); + 
extra.put(WARC_IP_ADDRESS, ip); + extra.put(WARC_TARGET_URI, targetUri.toASCIIString()); + if (protocolVersions != null) { + for (String pVersion : protocolVersions) { + extra.put(WARC_PROTOCOL, new WarcProtocol(pVersion).toString()); + } + } + if (cipherSuites != null) { + for (String cipher : cipherSuites) { + extra.put(WARC_CIPHER_SUITE, cipher); + } + } + // WARC-Refers-To-Target-URI only useful for revisit by digest + extra.put(WARC_REFERS_TO_TARGET_URI, targetUri.toASCIIString()); + if (refersToDate != null) { + extra.put(WARC_REFERS_TO_DATE, isoDate.format(refersToDate)); + } + extra.put(WARC_PROFILE, warcProfile); + + if (payloadDigest != null) { + extra.put(WARC_PAYLOAD_DIGEST, payloadDigest); + } + if (blockDigest != null) { + extra.put(WARC_BLOCK_DIGEST, blockDigest); + } + + URI recordId = getRecordId(); + writeRecord(WARC_REVISIT, date, CONTENT_TYPE_RESPONSE, recordId, extra, block); + return recordId; + } + + public URI writeWarcMetadataRecord(final URI targetUri, final Date date, + final URI warcinfoId, final URI relatedId, final String blockDigest, + final byte[] block) throws IOException { + Multimap extra = LinkedListMultimap.create(); + extra.put(WARC_WARCINFO_ID, "<" + warcinfoId.toString() + ">"); + extra.put(WARC_CONCURRENT_TO, "<" + relatedId.toString() + ">"); + extra.put(WARC_TARGET_URI, targetUri.toASCIIString()); + + if (blockDigest != null) { + extra.put(WARC_BLOCK_DIGEST, blockDigest); + } + + URI recordId = getRecordId(); + writeRecord(WARC_METADATA, date, CONTENT_TYPE_METADATA, recordId, extra, block); + return recordId; + } + + public URI writeWarcConversionRecord(final URI targetUri, final Date date, + final URI warcinfoId, final URI relatedId, final String blockDigest, + final String contentType, final byte[] block) throws IOException { + Multimap extra = LinkedListMultimap.create(); + extra.put(WARC_WARCINFO_ID, "<" + warcinfoId.toString() + ">"); + extra.put(WARC_REFERS_TO, "<" + relatedId.toString() + ">"); + 
extra.put(WARC_TARGET_URI, targetUri.toASCIIString()); + + if (blockDigest != null) { + extra.put(WARC_BLOCK_DIGEST, blockDigest); + } + + URI recordId = getRecordId(); + writeRecord(WARC_CONVERSION, date, contentType, recordId, extra, block); + return recordId; + } + + protected void writeRecord(final String type, final Date date, + final String contentType, final URI recordId, + Multimap extra, final InputStream content, + final long contentLength) throws IOException { + StringBuilder sb = new StringBuilder(4096); + + sb.append(WARC_VERSION).append(CRLF); + + Multimap header = LinkedListMultimap.create(); + header.put(WARC_TYPE, type); + header.put(WARC_DATE, isoDate.format(date)); + header.put(WARC_RECORD_ID, "<" + recordId.toString() + ">"); + header.put(CONTENT_LENGTH, Long.toString(contentLength)); + header.put(CONTENT_TYPE, contentType); + + writeWarcKeyValue(sb, header); + writeWarcKeyValue(sb, extra); + + sb.append(CRLF); + + startRecord(); + out.write(sb.toString().getBytes(StandardCharsets.UTF_8)); + if (contentLength != 0 && content != null) { + copyStream(content, out, contentLength); + } + + out.write(CRLF.getBytes()); + out.write(CRLF.getBytes()); + endRecord(); + } + + protected void writeRecord(final String type, final Date date, + final String contentType, final URI recordId, + Multimap extra, final byte[] block) throws IOException { + StringBuilder sb = new StringBuilder(4096); + + sb.append(WARC_VERSION).append(CRLF); + + Multimap header = LinkedListMultimap.create(); + header.put(WARC_TYPE, type); + header.put(WARC_DATE, isoDate.format(date)); + header.put(WARC_RECORD_ID, "<" + recordId.toString() + ">"); + header.put(CONTENT_LENGTH, Long.toString(block.length)); + header.put(CONTENT_TYPE, contentType); + + writeWarcKeyValue(sb, header); + writeWarcKeyValue(sb, extra); + + sb.append(CRLF); + + startRecord(); + out.write(sb.toString().getBytes(StandardCharsets.UTF_8)); + out.write(block); + + out.write(CRLF.getBytes()); + 
out.write(CRLF.getBytes()); + endRecord(); + } + + protected void startRecord() throws IOException { + this.out = new CompressedOutputStream(this.origOut); + } + + protected void endRecord() throws IOException { + CompressedOutputStream compressedOut = (CompressedOutputStream) this.out; + compressedOut.finish(); + compressedOut.flush(); + compressedOut.end(); + + this.out = this.origOut; + } + + protected long copyStream(InputStream input, OutputStream output, + long maxBytes) throws IOException { + byte[] buffer = new byte[4096]; + long count = 0L; + int n = 0; + while (-1 != (n = input.read(buffer))) { + if (maxBytes > 0 && maxBytes < n) { + n = (int) maxBytes; + } + + output.write(buffer, 0, n); + count += n; + maxBytes -= n; + + if (maxBytes == 0) { + return count; + } + } + return count; + } + + protected static void writeWarcKeyValue(StringBuilder sb, + Map headers) { + if (headers != null) { + headers.forEach((k, v) -> writeWarcKeyValue(sb, k, v)); + } + } + + protected static void writeWarcKeyValue(StringBuilder sb, + Multimap headers) { + if (headers != null) { + headers.forEach((k, v) -> writeWarcKeyValue(sb, k, v)); + } + } + + protected static void writeWarcKeyValue(StringBuilder sb, String key, + String value) { + sb.append(key).append(COLONSP).append(value).append(CRLF); + } + + private String getUUID() { + return UUID.randomUUID().toString(); + } + + public URI getRecordId() { + try { + return new URI("urn:uuid:" + getUUID()); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + } + + protected static String getMeta(Metadata metadata, String name) { + String value = metadata.get(name); + if (value == null) { + // check for case variants + for (String n : metadata.names()) { + if (n.equalsIgnoreCase(name)) { + value = metadata.get(n); + break; + } + } + } + return value; + } + +} diff --git a/src/plugin/build-plugin.xml b/src/plugin/build-plugin.xml index b0aca71038..3ba9e1b2fc 100755 --- a/src/plugin/build-plugin.xml +++ 
b/src/plugin/build-plugin.xml @@ -84,6 +84,7 @@ + @@ -189,7 +190,7 @@ - + Tests failed! - + + + + + + + + + + + + + + + + + + + + + + + + + + + Indexer integration tests failed! + + + + + + + + + + + + + + + + + + + + + + + + + + + Protocol integration tests failed! + diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 498259a950..24edd14639 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -27,12 +27,12 @@ - + - + - + @@ -42,21 +42,21 @@ - + - + - - - - - + + + + + @@ -68,24 +68,25 @@ - + - + - + - - + + + - - + + @@ -142,6 +143,7 @@ + @@ -172,6 +174,30 @@ + + + + + + + + + + + + + + + + + + + + + + + + @@ -183,28 +209,28 @@ - + - + - - - - - + + + + + - + - + - + @@ -216,24 +242,25 @@ - + - + - + - - + + + - - + + diff --git a/src/plugin/exchange-jexl/src/java/org/apache/nutch/exchange/jexl/JexlExchange.java b/src/plugin/exchange-jexl/src/java/org/apache/nutch/exchange/jexl/JexlExchange.java index a55557595d..be6bf0dbe8 100644 --- a/src/plugin/exchange-jexl/src/java/org/apache/nutch/exchange/jexl/JexlExchange.java +++ b/src/plugin/exchange-jexl/src/java/org/apache/nutch/exchange/jexl/JexlExchange.java @@ -41,7 +41,8 @@ public class JexlExchange implements Exchange { */ @Override public void open(Map parameters) { - expression = JexlUtil.parseExpression(parameters.get(EXPRESSION_KEY)); + expression = JexlUtil.parseExpression(getConf(), + parameters.get(EXPRESSION_KEY)); } /** diff --git a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java index e1fa792951..a89be63826 100644 --- a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java +++ b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java @@ -114,7 +114,7 @@ public void setConf(Configuration conf) { "The property index.jexl.filter must have a value when index-jexl-filter is used. 
You can use 'true' or 'false' to index all/none"); } - expr = JexlUtil.parseExpression(strExpr); + expr = JexlUtil.parseExpression(conf, strExpr); if (expr == null) { LOG.error("Failed parsing JEXL from index.jexl.filter: {}", strExpr); diff --git a/src/plugin/indexer-elastic/ivy.xml b/src/plugin/indexer-elastic/ivy.xml index ee812a225c..04c1a071d0 100644 --- a/src/plugin/indexer-elastic/ivy.xml +++ b/src/plugin/indexer-elastic/ivy.xml @@ -36,7 +36,10 @@ - + + + + diff --git a/src/plugin/indexer-elastic/src/test/org/apache/nutch/indexwriter/elastic/ElasticIndexWriterIT.java b/src/plugin/indexer-elastic/src/test/org/apache/nutch/indexwriter/elastic/ElasticIndexWriterIT.java new file mode 100644 index 0000000000..0479213c3f --- /dev/null +++ b/src/plugin/indexer-elastic/src/test/org/apache/nutch/indexwriter/elastic/ElasticIndexWriterIT.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.indexwriter.elastic; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.http.HttpHost; +import org.apache.nutch.indexer.AbstractIndexWriterIT; +import org.apache.nutch.indexer.IndexWriter; +import org.apache.nutch.indexer.IndexWriterParams; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.util.NutchConfiguration; +import org.elasticsearch.action.get.GetRequest; +import org.elasticsearch.action.get.GetResponse; +import org.elasticsearch.client.RequestOptions; +import org.elasticsearch.client.RestClient; +import org.elasticsearch.client.RestHighLevelClient; +import org.junit.jupiter.api.Test; +import org.testcontainers.elasticsearch.ElasticsearchContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Integration tests for ElasticIndexWriter using Testcontainers. 
+ */ +@Testcontainers(disabledWithoutDocker = true) +public class ElasticIndexWriterIT extends AbstractIndexWriterIT { + + private static final String ELASTICSEARCH_IMAGE = + "docker.elastic.co/elasticsearch/elasticsearch:7.10.2"; + + @Container + private static final ElasticsearchContainer elasticsearchContainer = + new ElasticsearchContainer(ELASTICSEARCH_IMAGE) + .withEnv("discovery.type", "single-node") + .withEnv("xpack.security.enabled", "false"); + + private ElasticIndexWriter indexWriter; + private Configuration conf; + + @Override + public void setUpIndexWriter() throws Exception { + conf = NutchConfiguration.create(); + indexWriter = new ElasticIndexWriter(); + indexWriter.setConf(conf); + + Map params = new HashMap<>(); + params.put(ElasticConstants.HOSTS, elasticsearchContainer.getHost()); + params.put(ElasticConstants.PORT, String.valueOf(elasticsearchContainer.getMappedPort(9200))); + params.put(ElasticConstants.INDEX, "test-index"); + params.put(ElasticConstants.SCHEME, "http"); + + IndexWriterParams writerParams = new IndexWriterParams(params); + indexWriter.open(writerParams); + } + + @Override + public void tearDownIndexWriter() throws Exception { + if (indexWriter != null) { + try { + indexWriter.close(); + } catch (Exception e) { + // Ignore if open() failed and close state is invalid + } + indexWriter = null; + } + } + + @Override + public IndexWriter getIndexWriter() { + return indexWriter; + } + + @Override + public boolean supportsDelete() { + return true; + } + + @Override + public void verifyDocumentWritten(String docId, String expectedTitle) throws Exception { + try (RestHighLevelClient client = new RestHighLevelClient( + RestClient.builder( + new HttpHost(elasticsearchContainer.getHost(), + elasticsearchContainer.getMappedPort(9200), + "http")))) { + GetRequest getRequest = new GetRequest("test-index", docId); + GetResponse getResponse = client.get(getRequest, RequestOptions.DEFAULT); + assertTrue(getResponse.isExists(), "Document should 
exist in index"); + assertNotNull(getResponse.getSource()); + assertEquals(expectedTitle, getResponse.getSource().get("title")); + } + } +} diff --git a/src/plugin/indexer-kafka/ivy.xml b/src/plugin/indexer-kafka/ivy.xml index d6157d953e..ffba6746d1 100644 --- a/src/plugin/indexer-kafka/ivy.xml +++ b/src/plugin/indexer-kafka/ivy.xml @@ -37,6 +37,7 @@ + diff --git a/src/plugin/indexer-kafka/src/test/org/apache/nutch/indexwriter/kafka/KafkaIndexWriterIT.java b/src/plugin/indexer-kafka/src/test/org/apache/nutch/indexwriter/kafka/KafkaIndexWriterIT.java new file mode 100644 index 0000000000..4f6a306d46 --- /dev/null +++ b/src/plugin/indexer-kafka/src/test/org/apache/nutch/indexwriter/kafka/KafkaIndexWriterIT.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.indexwriter.kafka; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.indexer.AbstractIndexWriterIT; +import org.apache.nutch.indexer.IndexWriter; +import org.apache.nutch.indexer.IndexWriterParams; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.util.NutchConfiguration; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; +import org.testcontainers.kafka.KafkaContainer; + +/** + * Integration tests for KafkaIndexWriter using Testcontainers. + */ +@Testcontainers(disabledWithoutDocker = true) +public class KafkaIndexWriterIT extends AbstractIndexWriterIT { + + private static final String KAFKA_IMAGE = "apache/kafka-native:3.8.0"; + private static final String TEST_TOPIC = "nutch-indexer-test"; + + @Container + private static final KafkaContainer kafkaContainer = + new KafkaContainer(KAFKA_IMAGE); + + private KafkaIndexWriter indexWriter; + private Configuration conf; + + @Override + public void setUpIndexWriter() throws Exception { + conf = NutchConfiguration.create(); + indexWriter = new KafkaIndexWriter(); + indexWriter.setConf(conf); + + String bootstrapServers = kafkaContainer.getBootstrapServers(); + String hostPort = bootstrapServers.contains("://") + ? 
bootstrapServers.substring(bootstrapServers.indexOf("://") + 3) + : bootstrapServers; + String[] parts = hostPort.split(":"); + String host = parts[0]; + int port = Integer.parseInt(parts[1]); + + Map params = new HashMap<>(); + params.put(KafkaConstants.HOST, host); + params.put(KafkaConstants.PORT, String.valueOf(port)); + params.put(KafkaConstants.TOPIC, TEST_TOPIC); + params.put(KafkaConstants.VALUE_SERIALIZER, + "org.apache.kafka.connect.json.JsonSerializer"); + params.put(KafkaConstants.KEY_SERIALIZER, + "org.apache.kafka.common.serialization.StringSerializer"); + + IndexWriterParams writerParams = new IndexWriterParams(params); + indexWriter.open(writerParams); + } + + @Override + public void tearDownIndexWriter() throws Exception { + if (indexWriter != null) { + try { + indexWriter.close(); + } catch (Exception e) { + // Ignore if open() failed and close state is invalid + } + indexWriter = null; + } + } + + @Override + public IndexWriter getIndexWriter() { + return indexWriter; + } + + @Override + public boolean supportsDelete() { + return false; + } +} diff --git a/src/plugin/indexer-rabbit/ivy.xml b/src/plugin/indexer-rabbit/ivy.xml index 81822a0fb7..54930331cc 100644 --- a/src/plugin/indexer-rabbit/ivy.xml +++ b/src/plugin/indexer-rabbit/ivy.xml @@ -35,5 +35,9 @@ - + + + + + diff --git a/src/plugin/indexer-rabbit/src/test/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriterIT.java b/src/plugin/indexer-rabbit/src/test/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriterIT.java new file mode 100644 index 0000000000..ed7d055350 --- /dev/null +++ b/src/plugin/indexer-rabbit/src/test/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriterIT.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexwriter.rabbit; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.indexer.AbstractIndexWriterIT; +import org.apache.nutch.indexer.IndexWriter; +import org.apache.nutch.indexer.IndexWriterParams; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.util.NutchConfiguration; +import org.testcontainers.containers.RabbitMQContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Integration tests for RabbitIndexWriter using Testcontainers. 
+ */ +@Testcontainers(disabledWithoutDocker = true) +public class RabbitIndexWriterIT extends AbstractIndexWriterIT { + + private static final String RABBITMQ_IMAGE = "rabbitmq:3.13-management"; + + @Container + private static final RabbitMQContainer rabbitContainer = + new RabbitMQContainer(RABBITMQ_IMAGE); + + private RabbitIndexWriter indexWriter; + private Configuration conf; + + @Override + public void setUpIndexWriter() throws Exception { + conf = NutchConfiguration.create(); + indexWriter = new RabbitIndexWriter(); + indexWriter.setConf(conf); + + Map params = new HashMap<>(); + params.put(RabbitMQConstants.SERVER_URI, rabbitContainer.getAmqpUrl()); + params.put(RabbitMQConstants.EXCHANGE_NAME, "nutch-indexer-test"); + params.put(RabbitMQConstants.ROUTING_KEY, "indexer"); + params.put(RabbitMQConstants.COMMIT_MODE, "single"); + params.put(RabbitMQConstants.COMMIT_SIZE, "10"); + params.put(RabbitMQConstants.BINDING, "true"); + params.put(RabbitMQConstants.QUEUE_NAME, "nutch-indexer-queue"); + params.put(RabbitMQConstants.EXCHANGE_OPTIONS, "type=direct,durable=true"); + params.put(RabbitMQConstants.QUEUE_OPTIONS, + "durable=true,exclusive=false,auto-delete=false"); + + IndexWriterParams writerParams = new IndexWriterParams(params); + indexWriter.open(writerParams); + } + + @Override + public void tearDownIndexWriter() throws Exception { + if (indexWriter != null) { + try { + indexWriter.close(); + } catch (Exception e) { + // Ignore if open() failed and close state is invalid + } + indexWriter = null; + } + } + + @Override + public IndexWriter getIndexWriter() { + return indexWriter; + } + + @Override + public boolean supportsDelete() { + return true; + } +} diff --git a/src/plugin/indexer-solr/ivy.xml b/src/plugin/indexer-solr/ivy.xml index 99a713c18b..4d2120955c 100644 --- a/src/plugin/indexer-solr/ivy.xml +++ b/src/plugin/indexer-solr/ivy.xml @@ -38,6 +38,7 @@ + diff --git 
a/src/plugin/indexer-solr/src/test/org/apache/nutch/indexwriter/solr/SolrIndexWriterIT.java b/src/plugin/indexer-solr/src/test/org/apache/nutch/indexwriter/solr/SolrIndexWriterIT.java new file mode 100644 index 0000000000..dcd88bdacb --- /dev/null +++ b/src/plugin/indexer-solr/src/test/org/apache/nutch/indexwriter/solr/SolrIndexWriterIT.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.indexwriter.solr; + +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.indexer.IndexerMapReduce; +import org.apache.nutch.indexer.AbstractIndexWriterIT; +import org.apache.nutch.indexer.IndexWriter; +import org.apache.nutch.indexer.IndexWriterParams; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.impl.Http2SolrClient; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; +import org.testcontainers.solr.SolrContainer; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Integration tests for SolrIndexWriter using Testcontainers. 
+ */ +@Testcontainers(disabledWithoutDocker = true) +public class SolrIndexWriterIT extends AbstractIndexWriterIT { + + private static final String SOLR_IMAGE = "solr:8.11.2"; + private static final String COLLECTION = "nutch-test"; + + @Container + private static final SolrContainer solrContainer = + new SolrContainer(SOLR_IMAGE).withCollection(COLLECTION); + + private SolrIndexWriter indexWriter; + private Configuration conf; + + @Override + public void setUpIndexWriter() throws Exception { + conf = NutchConfiguration.create(); + conf.setBoolean(IndexerMapReduce.INDEXER_DELETE, false); + + indexWriter = new SolrIndexWriter(); + indexWriter.setConf(conf); + + String solrUrl = "http://" + solrContainer.getHost() + ":" + + solrContainer.getSolrPort() + "/solr/" + COLLECTION; + + Map params = new HashMap<>(); + params.put(SolrConstants.SERVER_TYPE, "http"); + params.put(SolrConstants.SERVER_URLS, solrUrl); + params.put(SolrConstants.COLLECTION, COLLECTION); + params.put(SolrConstants.COMMIT_SIZE, "100"); + + IndexWriterParams writerParams = new IndexWriterParams(params); + indexWriter.open(writerParams); + } + + @Override + public void tearDownIndexWriter() throws Exception { + if (indexWriter != null) { + try { + indexWriter.close(); + } catch (Exception e) { + // Ignore if open() failed and close state is invalid + } + indexWriter = null; + } + } + + @Override + public IndexWriter getIndexWriter() { + return indexWriter; + } + + @Override + public boolean supportsDelete() { + return true; + } + + @Override + public void verifyDocumentWritten(String docId, String expectedTitle) throws Exception { + try (SolrClient client = new Http2SolrClient.Builder( + "http://" + solrContainer.getHost() + ":" + + solrContainer.getSolrPort() + "/solr/" + COLLECTION).build()) { + ModifiableSolrParams queryParams = new ModifiableSolrParams(); + queryParams.set("q", "id:" + docId); + QueryResponse response = client.query(queryParams); + assertTrue(response.getResults().getNumFound() 
>= 1, + "Document should exist in Solr"); + Object titleValue = response.getResults().get(0).getFieldValue("title"); + String title = titleValue instanceof Collection + ? ((Collection) titleValue).iterator().next().toString() + : titleValue.toString(); + assertEquals(expectedTitle, title); + } + } + + @Override + public IndexWriter prepareWriterForDeleteTest() throws Exception { + tearDownIndexWriter(); + + Configuration deleteConf = NutchConfiguration.create(); + deleteConf.setBoolean(IndexerMapReduce.INDEXER_DELETE, true); + SolrIndexWriter deleteWriter = new SolrIndexWriter(); + deleteWriter.setConf(deleteConf); + + String solrUrl = "http://" + solrContainer.getHost() + ":" + + solrContainer.getSolrPort() + "/solr/" + COLLECTION; + Map params = new HashMap<>(); + params.put(SolrConstants.SERVER_TYPE, "http"); + params.put(SolrConstants.SERVER_URLS, solrUrl); + params.put(SolrConstants.COLLECTION, COLLECTION); + deleteWriter.open(new IndexWriterParams(params)); + + return deleteWriter; + } +} diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index 79b45882eb..63b9224b0d 100755 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -223,6 +223,12 @@ public void setConf(Configuration conf) { this.timeout = conf.getInt("http.timeout", 10000); this.maxContent = conf.getInt("http.content.limit", 1024 * 1024); this.maxDuration = conf.getInt("http.time.limit", -1); + if (maxDuration >= 0 && (maxDuration * 1000) < timeout) { + LOG.warn( + "The configuration property http.time.limit ({} seconds) is less than http.timeout ({} ms), " + + "the entire request will time out before individual reads are timed out.", + maxDuration, timeout); + } this.partialAsTruncated = conf.getBoolean("http.partial.truncated", false); this.userAgent = 
getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf.get("http.agent.description"), @@ -272,8 +278,8 @@ public void setConf(Configuration conf) { } } catch (Exception e) { - this.logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile, - StringUtils.stringifyException(e)); + this.logger.warn("Failed to read http.agent.rotate.file {}:", + agentsFile, e); this.userAgentNames = null; } finally { if (br != null) { @@ -314,8 +320,8 @@ public void setConf(Configuration conf) { } } } catch (Exception e) { - this.logger.warn("Failed to read http.agent.host.cookie.file {}: {}", - cookieFile, StringUtils.stringifyException(e)); + this.logger.warn("Failed to read http.agent.host.cookie.file {}:", + cookieFile, e); this.hostCookies = null; } finally { if (br != null) { @@ -614,8 +620,9 @@ protected void logConf() { this.logger.info("http.proxy.host = {}", this.proxyHost); this.logger.info("http.proxy.port = {}", this.proxyPort); this.logger.info("http.proxy.exception.list = {}", this.useProxy); - this.logger.info("http.timeout = {}", this.timeout); - this.logger.info("http.content.limit = {}", this.maxContent); + this.logger.info("http.timeout = {} ms", this.timeout); + this.logger.info("http.time.limit = {} seconds", this.maxDuration); + this.logger.info("http.content.limit = {} bytes", this.maxContent); this.logger.info("http.agent = {}", this.userAgent); this.logger.info("http.accept.language = {}", this.acceptLanguage); this.logger.info("http.accept = {}", this.accept); @@ -721,6 +728,12 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum, return this.robots.getRobotRulesSet(this, url, robotsTxtContent); } + @Override + public BaseRobotRules getRobotRules(URL url, CrawlDatum datum, + List robotsTxtContent) { + return this.robots.getRobotRulesSet(this, url, robotsTxtContent); + } + /** * Transforming a String[] into a HashMap for faster searching * diff --git 
a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java index 9da92698fd..4f4bd99774 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java @@ -28,6 +28,7 @@ import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; @@ -299,6 +300,10 @@ protected void addRobotsContent(List robotsTxtContent, robotsUrl.toString(), robotsBytes, robotsResponse.getHeader("Content-Type"), robotsResponse.getHeaders(), getConf()); + content.getMetadata().add(Nutch.FETCH_TIME_KEY, + Long.toString(System.currentTimeMillis())); + content.getMetadata().add(Nutch.FETCH_STATUS_KEY, + Integer.toString(robotsResponse.getCode())); robotsTxtContent.add(content); } diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml index 6d96ed3cd9..75d2b0fe23 100644 --- a/src/plugin/parse-tika/ivy.xml +++ b/src/plugin/parse-tika/ivy.xml @@ -37,7 +37,7 @@ - + diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml index 04afb9faca..9ec410182b 100644 --- a/src/plugin/parse-tika/plugin.xml +++ b/src/plugin/parse-tika/plugin.xml @@ -25,7 +25,7 @@ - + diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java index e4d2010696..877873b64b 100644 --- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java +++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java @@ -232,4 +232,14 @@ public BaseRobotRules 
getRobotRules(Text url, CrawlDatum datum, return RobotRulesParser.EMPTY_RULES; } + /** + * No robots parsing is done for file protocol. So this returns a set of empty + * rules which will allow every url. + */ + @Override + public BaseRobotRules getRobotRules(URL url, CrawlDatum datum, + List robotsTxtContent) { + return RobotRulesParser.EMPTY_RULES; + } + } diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java index c50988c2dd..64186b9035 100644 --- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java +++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java @@ -16,20 +16,18 @@ */ package org.apache.nutch.protocol.file; -import java.net.URL; import java.io.IOException; -import java.io.UnsupportedEncodingException; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import org.apache.hadoop.conf.Configuration; import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.protocol.Content; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.HttpDateFormat; import org.apache.nutch.net.protocols.Response; - +import org.apache.nutch.protocol.Content; import org.apache.tika.Tika; -import org.apache.hadoop.conf.Configuration; - /** * FileResponse.java mimics file replies as http response. It tries its best to * follow http's way for headers, response codes as well as exceptions. @@ -125,11 +123,8 @@ public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf) String path = url.getPath().isEmpty() ? "/" : url.getPath(); - try { - // specify the encoding via the config later? - path = java.net.URLDecoder.decode(path, "UTF-8"); - } catch (UnsupportedEncodingException ex) { - } + // specify the encoding via the config later? 
+ path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8); try { diff --git a/src/plugin/protocol-ftp/ivy.xml b/src/plugin/protocol-ftp/ivy.xml index 7749a873ff..5e6a0d8c72 100644 --- a/src/plugin/protocol-ftp/ivy.xml +++ b/src/plugin/protocol-ftp/ivy.xml @@ -37,7 +37,8 @@ - + + diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java index 2a47b63d61..3570d91188 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java @@ -183,6 +183,7 @@ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { } catch (Exception e) { LOG.error("Could not get protocol output for {}: {}", url, e.getMessage()); + datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, new Text("500")); return new ProtocolOutput(null, new ProtocolStatus(e)); } } @@ -304,6 +305,15 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum, return robots.getRobotRulesSet(this, url, robotsTxtContent); } + /** + * Get the robots rules for a given url + */ + @Override + public BaseRobotRules getRobotRules(URL url, CrawlDatum datum, + List robotsTxtContent) { + return robots.getRobotRulesSet(this, url, robotsTxtContent); + } + public int getBufferSize() { return BUFFER_SIZE; } diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java index d6f7fd64a4..0d7ad1b289 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java @@ -30,6 +30,7 @@ import java.net.InetAddress; import java.net.URL; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.LinkedList; import java.io.ByteArrayOutputStream; @@ -164,7 +165,8 @@ public 
FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf) Ftp.LOG.info("connect to {}", addr); } - ftp.client.connect(addr); + int port = url.getPort(); + ftp.client.connect(addr, port > 0 ? port : FTP.DEFAULT_PORT); if (!FTPReply.isPositiveCompletion(ftp.client.getReplyCode())) { ftp.client.disconnect(); Ftp.LOG.warn("ftp.client.connect() failed: {} {}", addr, @@ -206,6 +208,11 @@ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf) try { ftp.parser = null; String parserKey = ftp.client.getSystemName(); + // strip surrounding quotes that some servers include in SYST reply + if (parserKey.length() > 2 && parserKey.charAt(0) == '"' + && parserKey.charAt(parserKey.length() - 1) == '"') { + parserKey = parserKey.substring(1, parserKey.length() - 1); + } // some server reports as UNKNOWN Type: L8, but in fact UNIX Type: L8 if (parserKey.startsWith("UNKNOWN Type: L8")) parserKey = "UNIX Type: L8"; @@ -239,7 +246,7 @@ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf) this.content = null; - path = java.net.URLDecoder.decode(path, "UTF-8"); + path = java.net.URLDecoder.decode(path, StandardCharsets.UTF_8); if (path.endsWith("/")) { getDirAsHttpResponse(path, datum.getModifiedTime()); @@ -302,6 +309,11 @@ private void getFileAsHttpResponse(String path, long lastModified) list = new LinkedList(); ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser); + if (list.isEmpty()) { + this.code = 404; // file not found (server returned empty listing) + return; + } + FTPFile ftpFile = (FTPFile) list.get(0); this.headers.set(Response.CONTENT_LENGTH, Long.valueOf(ftpFile.getSize()).toString()); diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java index 33d11d5e7a..6f982c1ee6 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java +++ 
b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java @@ -23,6 +23,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolOutput; @@ -107,6 +108,8 @@ public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url, ProtocolStatus status = output.getStatus(); if (robotsTxtContent != null) { + output.getContent().getMetadata().add(Nutch.FETCH_TIME_KEY, + Long.toString(System.currentTimeMillis())); robotsTxtContent.add(output.getContent()); } diff --git a/src/plugin/protocol-ftp/src/test/org/apache/nutch/protocol/ftp/FtpProtocolIT.java b/src/plugin/protocol-ftp/src/test/org/apache/nutch/protocol/ftp/FtpProtocolIT.java new file mode 100644 index 0000000000..ccd3cd1ccb --- /dev/null +++ b/src/plugin/protocol-ftp/src/test/org/apache/nutch/protocol/ftp/FtpProtocolIT.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.ftp; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.nio.charset.StandardCharsets; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolPluginIntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockftpserver.fake.FakeFtpServer; +import org.mockftpserver.fake.UserAccount; +import org.mockftpserver.fake.filesystem.DirectoryEntry; +import org.mockftpserver.fake.filesystem.FileEntry; +import org.mockftpserver.fake.filesystem.UnixFakeFileSystem; + +/** + * Integration tests for protocol-ftp using an in-process FakeFtpServer. + * + *

FTP passive mode with Testcontainers requires that the PASV response IP + * matches the host-visible address of the container, which is not reliable + * across Docker Desktop (macOS/Windows) and Linux Docker environments. An + * in-process {@link FakeFtpServer} from MockFtpServer avoids this constraint + * while still testing the Nutch FTP client against a real FTP protocol + * implementation. + */ +public class FtpProtocolIT implements ProtocolPluginIntegrationTest { + + private static final String FTP_USER = "testuser"; + private static final String FTP_PASS = "testpass"; + private static final String FTP_HOME = "/home/testuser"; + private static final String TEST_FILE = "test.txt"; + private static final String TEST_CONTENT = "FTP integration test content"; + + private static FakeFtpServer fakeFtpServer; + private Ftp protocol; + + @BeforeAll + static void startFtpServer() { + fakeFtpServer = new FakeFtpServer(); + fakeFtpServer.setServerControlPort(0); // bind to a random free port + + UserAccount userAccount = new UserAccount(FTP_USER, FTP_PASS, FTP_HOME); + fakeFtpServer.addUserAccount(userAccount); + + UnixFakeFileSystem fileSystem = new UnixFakeFileSystem(); + fileSystem.add(new DirectoryEntry(FTP_HOME)); + fileSystem.add(new FileEntry(FTP_HOME + "/" + TEST_FILE, TEST_CONTENT)); + fakeFtpServer.setFileSystem(fileSystem); + + fakeFtpServer.start(); + } + + @AfterAll + static void stopFtpServer() { + if (fakeFtpServer != null) { + fakeFtpServer.stop(); + } + } + + @BeforeEach + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", "protocol-ftp|nutch-extensionpoints"); + conf.set("http.agent.name", "NutchFtpProtocolIT"); + conf.set("ftp.username", FTP_USER); + conf.set("ftp.password", FTP_PASS); + conf.setInt("ftp.timeout", 10000); + protocol = new Ftp(); + protocol.setConf(conf); + } + + @AfterEach + @Override + public void tearDownProtocol() { + protocol = null; + } + + 
@Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "ftp://localhost:" + fakeFtpServer.getServerControlPort() + + FTP_HOME + "/" + TEST_FILE; + } + + @Test + void testFtpFileDownload() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput(new Text(getTestUrl()), datum); + + assertNotNull(output, "ProtocolOutput must not be null"); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(200, code, "Expected FTP 200 for file download"); + + assertNotNull(output.getContent(), "Content must not be null"); + String body = new String(output.getContent().getContent(), StandardCharsets.UTF_8); + assertTrue(body.contains(TEST_CONTENT), + "Downloaded content must match the file on the FTP server"); + } + + @Test + void testFtpDirectoryListing() throws Exception { + String dirUrl = "ftp://localhost:" + fakeFtpServer.getServerControlPort() + + FTP_HOME + "/"; + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput(new Text(dirUrl), datum); + + assertNotNull(output, "ProtocolOutput for directory listing must not be null"); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(200, code, "Expected FTP 200 for directory listing"); + } + + @Test + void testFtpMissingFileReturnsError() throws Exception { + String missingUrl = "ftp://localhost:" + fakeFtpServer.getServerControlPort() + + FTP_HOME + "/nonexistent.txt"; + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput(new Text(missingUrl), datum); + assertNotNull(output, "ProtocolOutput must not be null even for missing files"); + // FTP 550 "No such file" maps to a non-200 Nutch status + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertTrue(code != 200, 
"Expected non-200 code for missing FTP file, got: " + code); + } +} diff --git a/src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java b/src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java new file mode 100644 index 0000000000..42c551b72f --- /dev/null +++ b/src/plugin/protocol-htmlunit/src/test/org/apache/nutch/protocol/htmlunit/HtmlUnitProtocolIT.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.htmlunit; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.AbstractProtocolPluginIT; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Integration tests for protocol-htmlunit using a real nginx container. 
+ */ +@Testcontainers(disabledWithoutDocker = true) +public class HtmlUnitProtocolIT extends AbstractProtocolPluginIT { + + @Container + private static final GenericContainer nginx = + new GenericContainer<>("nginx:alpine").withExposedPorts(80); + + private Http protocol; + + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", + "protocol-htmlunit|lib-htmlunit|lib-http|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new Http(); + protocol.setConf(conf); + } + + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/"; + } + + @Test + void testFetchReturnsContent() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput( + new org.apache.hadoop.io.Text(getTestUrl()), datum); + assertNotNull(output.getContent(), + "protocol-htmlunit must return non-null content for a live nginx page"); + } +} diff --git a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java new file mode 100644 index 0000000000..87db32335b --- /dev/null +++ b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/HttpProtocolIT.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.http; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.AbstractProtocolPluginIT; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Integration tests for protocol-http using a real nginx container. 
+ */ +@Testcontainers(disabledWithoutDocker = true) +public class HttpProtocolIT extends AbstractProtocolPluginIT { + + @Container + private static final GenericContainer nginx = + new GenericContainer<>("nginx:alpine").withExposedPorts(80); + + private Http protocol; + + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", "protocol-http|lib-http|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new Http(); + protocol.setConf(conf); + } + + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/"; + } + + @Test + void testFetchRedirect301() throws Exception { + // nginx returns 301 for directory URLs without trailing slash when autoindex + // is off; test a manual redirect via the default nginx welcome page path + String redirectUrl = + "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/index.html"; + CrawlDatum datum = new CrawlDatum(); + protocol.getProtocolOutput(new Text(redirectUrl), datum); + int code = getHttpStatusCode(datum); + // nginx serves index.html directly with 200; the base test covers 200/404 + assertEquals(200, code, "Expected 200 for index.html from nginx"); + } +} diff --git a/src/plugin/protocol-httpclient/ivy.xml b/src/plugin/protocol-httpclient/ivy.xml index 0b3ce0af73..e5987074b8 100644 --- a/src/plugin/protocol-httpclient/ivy.xml +++ b/src/plugin/protocol-httpclient/ivy.xml @@ -38,6 +38,7 @@ + diff --git a/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/HttpClientProtocolIT.java b/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/HttpClientProtocolIT.java new file mode 
100644 index 0000000000..7345e4b029 --- /dev/null +++ b/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/HttpClientProtocolIT.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.httpclient; + +import static com.github.tomakehurst.wiremock.client.WireMock.aResponse; +import static com.github.tomakehurst.wiremock.client.WireMock.get; +import static com.github.tomakehurst.wiremock.client.WireMock.urlEqualTo; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import com.github.tomakehurst.wiremock.WireMockServer; +import com.github.tomakehurst.wiremock.core.WireMockConfiguration; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolPluginIntegrationTest; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import 
org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +/** + * Integration tests for protocol-httpclient using an in-process WireMock + * server. + * + *

WireMock runs in the test JVM so no Docker container is required. The + * Nutch httpclient plugin connects to it over a real TCP socket, exercising + * the full HTTP client stack including header handling and Basic-auth + * challenge/response. + */ +public class HttpClientProtocolIT implements ProtocolPluginIntegrationTest { + + private static WireMockServer wireMock; + private Http protocol; + + @BeforeAll + static void startWireMock() { + wireMock = new WireMockServer(WireMockConfiguration.options().dynamicPort()); + wireMock.start(); + + wireMock.stubFor(get(urlEqualTo("/")) + .willReturn(aResponse() + .withStatus(200) + .withHeader("Content-Type", "text/html") + .withBody("Integration test"))); + + wireMock.stubFor(get(urlEqualTo("/notfound")) + .willReturn(aResponse().withStatus(404))); + + wireMock.stubFor(get(urlEqualTo("/secure")) + .withBasicAuth("testuser", "testpass") + .willReturn(aResponse() + .withStatus(200) + .withHeader("Content-Type", "text/html") + .withBody("Authenticated"))); + + wireMock.stubFor(get(urlEqualTo("/secure")) + .willReturn(aResponse() + .withStatus(401) + .withHeader("WWW-Authenticate", "Basic realm=\"Test\"") + .withBody("Unauthorized"))); + } + + @AfterAll + static void stopWireMock() { + if (wireMock != null) { + wireMock.stop(); + } + } + + @BeforeEach + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", + "protocol-httpclient|lib-http|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new Http(); + protocol.setConf(conf); + } + + @AfterEach + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://localhost:" + wireMock.port() + "/"; + } + + @Test + void testFetch200() throws 
Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput(new Text(getTestUrl()), datum); + assertNotNull(output, "ProtocolOutput must not be null"); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(200, code, "Expected HTTP 200 from WireMock stub"); + } + + @Test + void testFetch404() throws Exception { + String url = "http://localhost:" + wireMock.port() + "/notfound"; + CrawlDatum datum = new CrawlDatum(); + protocol.getProtocolOutput(new Text(url), datum); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(404, code, "Expected HTTP 404 for /notfound stub"); + } + + @Test + void testUnauthenticatedRequestReturns401() throws Exception { + String secureUrl = "http://localhost:" + wireMock.port() + "/secure"; + CrawlDatum datum = new CrawlDatum(); + protocol.getProtocolOutput(new Text(secureUrl), datum); + int code = Integer.parseInt( + datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString()); + assertEquals(401, code, + "Unauthenticated request to /secure should return 401"); + } +} diff --git a/src/plugin/protocol-okhttp/ivy.xml b/src/plugin/protocol-okhttp/ivy.xml index 0768def785..28f355d7b9 100644 --- a/src/plugin/protocol-okhttp/ivy.xml +++ b/src/plugin/protocol-okhttp/ivy.xml @@ -37,8 +37,9 @@ - - + + + - + diff --git a/src/plugin/protocol-okhttp/plugin.xml b/src/plugin/protocol-okhttp/plugin.xml index e2183d2b50..51f65f5d25 100755 --- a/src/plugin/protocol-okhttp/plugin.xml +++ b/src/plugin/protocol-okhttp/plugin.xml @@ -28,13 +28,15 @@ - - - - - - - + + + + + + + + + diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java index 954c3f6df1..b47142cb24 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java +++ 
b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java @@ -52,15 +52,19 @@ import org.slf4j.LoggerFactory; import okhttp3.Authenticator; +import okhttp3.CompressionInterceptor; import okhttp3.Connection; import okhttp3.ConnectionPool; +import okhttp3.Gzip; import okhttp3.Handshake; import okhttp3.Headers; import okhttp3.Interceptor; import okhttp3.OkHttpClient; import okhttp3.Protocol; import okhttp3.Request; -import okhttp3.brotli.BrotliInterceptor; +import okhttp3.brotli.Brotli; +import okhttp3.zstd.Zstd; + public class OkHttp extends HttpBase { @@ -115,6 +119,11 @@ public void setConf(Configuration conf) { .writeTimeout(this.timeout, TimeUnit.MILLISECONDS) .readTimeout(this.timeout, TimeUnit.MILLISECONDS); + if (this.maxDuration >= 0) { + // timeout for the entire request + builder.callTimeout(this.maxDuration, TimeUnit.SECONDS); + } + if (!this.tlsCheckCertificate) { try { SSLContext trustAllSslContext = SSLContext.getInstance("TLS"); @@ -156,13 +165,11 @@ public boolean verify(String hostname, SSLSession session) { String proxyUsername = conf.get("http.proxy.username"); if (proxyUsername == null) { ProxySelector selector = new ProxySelector() { - @SuppressWarnings("serial") private final List noProxyList = new ArrayList() { { add(Proxy.NO_PROXY); } }; - @SuppressWarnings("serial") private final List proxyList = new ArrayList() { { add(proxy); @@ -224,8 +231,9 @@ public Request authenticate(okhttp3.Route route, builder.addNetworkInterceptor(new HTTPHeadersInterceptor()); } - // enable support for Brotli compression (Content-Encoding) - builder.addInterceptor(BrotliInterceptor.INSTANCE); + // enable support for Zstd, Brotli, Gzip Content-Encoding + builder.addInterceptor(new CompressionInterceptor(Zstd.INSTANCE, + Brotli.INSTANCE, Gzip.INSTANCE)); // instantiate connection pool(s), cf. 
// https://square.github.io/okhttp/3.x/okhttp/okhttp3/ConnectionPool.html diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java index 605c03390f..9aa1526157 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java @@ -17,6 +17,7 @@ package org.apache.nutch.protocol.okhttp; import java.io.IOException; +import java.io.InterruptedIOException; import java.lang.invoke.MethodHandles; import java.net.URL; import java.util.Base64; @@ -179,7 +180,12 @@ private final byte[] toByteArray(final ResponseBody responseBody, } catch (IOException e) { if (partialAsTruncated && source.getBuffer().size() > 0) { // treat already fetched content as truncated - truncated.setReason(TruncatedContentReason.DISCONNECT); + if (e instanceof InterruptedIOException) { + // thrown by OkHttp if the call timeout is hit + truncated.setReason(TruncatedContentReason.TIME); + } else { + truncated.setReason(TruncatedContentReason.DISCONNECT); + } LOG.info("Truncated content for {}, partial fetch caused by:", this.url, e); } else { diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java new file mode 100644 index 0000000000..d5342d8309 --- /dev/null +++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/OkHttpProtocolIT.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.okhttp; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.AbstractProtocolPluginIT; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Integration tests for protocol-okhttp using a real nginx container. 
+ */ +@Testcontainers(disabledWithoutDocker = true) +public class OkHttpProtocolIT extends AbstractProtocolPluginIT { + + @Container + private static final GenericContainer nginx = + new GenericContainer<>("nginx:alpine").withExposedPorts(80); + + private OkHttp protocol; + + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", "protocol-okhttp|lib-http|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new OkHttp(); + protocol.setConf(conf); + } + + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/"; + } + + /** OkHttp transparently decompresses gzip; verify content is returned. */ + @Test + void testFetchWithAcceptEncoding() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput( + new org.apache.hadoop.io.Text(getTestUrl()), datum); + assertNotNull(output.getContent(), + "Content must be present even when server uses compression"); + } +} diff --git a/src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java b/src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java new file mode 100644 index 0000000000..ec928df64f --- /dev/null +++ b/src/plugin/protocol-selenium/src/test/org/apache/nutch/protocol/selenium/SeleniumProtocolIT.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.selenium; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.AbstractProtocolPluginIT; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Integration tests for protocol-selenium using a real nginx container. + * + *

Note: protocol-selenium uses raw HTTP sockets (the same underlying + * transport as protocol-http) rather than a Selenium WebDriver. Tests here + * validate that the plugin connects to and fetches content from a live HTTP + * server. Browser-based rendering is covered by protocol-interactiveselenium + * which is excluded from automated integration tests due to its stateful + * handler requirements. + */ +@Testcontainers(disabledWithoutDocker = true) +public class SeleniumProtocolIT extends AbstractProtocolPluginIT { + + @Container + private static final GenericContainer nginx = + new GenericContainer<>("nginx:alpine").withExposedPorts(80); + + private Http protocol; + + @Override + public void setUpProtocol() throws Exception { + Configuration conf = NutchConfiguration.create(); + conf.set("plugin.includes", + "protocol-selenium|lib-http|lib-selenium|nutch-extensionpoints"); + conf.set("http.agent.name", "Nutch-Test"); + conf.setInt("http.timeout", 10000); + conf.setBoolean("store.http.headers", true); + protocol = new Http(); + protocol.setConf(conf); + } + + @Override + public void tearDownProtocol() { + protocol = null; + } + + @Override + public Protocol getProtocol() { + return protocol; + } + + @Override + public String getTestUrl() { + return "http://" + nginx.getHost() + ":" + nginx.getMappedPort(80) + "/"; + } + + @Test + void testFetchReturnsContent() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = protocol.getProtocolOutput( + new org.apache.hadoop.io.Text(getTestUrl()), datum); + assertNotNull(output.getContent(), + "protocol-selenium must return non-null content for a live nginx page"); + } +} diff --git a/src/plugin/scoring-adaptive/build.xml b/src/plugin/scoring-adaptive/build.xml new file mode 100644 index 0000000000..634be707c3 --- /dev/null +++ b/src/plugin/scoring-adaptive/build.xml @@ -0,0 +1,27 @@ + + + + + + + + + + + + diff --git a/src/plugin/scoring-adaptive/ivy.xml b/src/plugin/scoring-adaptive/ivy.xml 
new file mode 100644 index 0000000000..1a86d68030 --- /dev/null +++ b/src/plugin/scoring-adaptive/ivy.xml @@ -0,0 +1,41 @@ + + + + + + + + + + Apache Nutch + + + + + + + + + + + + + + + + diff --git a/src/plugin/scoring-adaptive/plugin.xml b/src/plugin/scoring-adaptive/plugin.xml new file mode 100644 index 0000000000..011c3d1e90 --- /dev/null +++ b/src/plugin/scoring-adaptive/plugin.xml @@ -0,0 +1,39 @@ + + + + + + + + + + + + + + + + + diff --git a/src/plugin/scoring-adaptive/src/java/org/apache/nutch/scoring/adaptive/AdaptiveScoringFilter.java b/src/plugin/scoring-adaptive/src/java/org/apache/nutch/scoring/adaptive/AdaptiveScoringFilter.java new file mode 100644 index 0000000000..52417dfd0b --- /dev/null +++ b/src/plugin/scoring-adaptive/src/java/org/apache/nutch/scoring/adaptive/AdaptiveScoringFilter.java @@ -0,0 +1,511 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.scoring.adaptive; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Reader; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.TreeMap; +import java.util.concurrent.ThreadLocalRandom; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Generator; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.scoring.AbstractScoringFilter; +import org.apache.nutch.scoring.ScoringFilterException; + +/** + * Scoring filter adaptive to page score, fetch status and time. + * + *

+ * The generator score of a page depends in a configurable way on + *

the page score
the crawl status (fetched, not modified, redirect, gone)
the time elapsed since the scheduled fetch time
whether or not a canonical link has been detected on the page and the + * link points to a different URL

+ *

+ * + *

+ * While {@link org.apache.nutch.crawl.FetchSchedule}s set a fixed (re)fetch time + * immediately after a page has been fetched, this scoring plugin allows a more + * dynamic selection of how many and which pages should be generated based on the + * current configuration, independent of previous settings, by adjusting + *

the plugin parameters in accordance with
-topN and generate.min.score
generate.max.count

+ * + *

+ * The plugin is intended for large crawls where there are far more URLs than can + * be fetched and selecting a representative sample is mandatory. Sampling is, + * of course, usually based on the page score - relevant pages with a high score + * are fetched with higher probability. However, a dynamic rotation of generated + * items helps to avoid fetching the same page with a slightly higher score + * again while others are still waiting to be queued. The plugin also + * allows adjusting when gone or not-modified pages are revisited. + *

+ * + * The plugin also includes heuristics to "retire" pages to status + * db_orphan if they fail to fetch or are duplicates and are not seen in seeds + * or via inlinks (cf. the plugin scoring-orphan). + * + */ +public class AdaptiveScoringFilter extends AbstractScoringFilter { + + private final static Logger LOG = LoggerFactory + .getLogger(AdaptiveScoringFilter.class); + + /** + * Generator sort value factor for pages to be (re)fetched based on the time + * (in days) elapsed since the scheduled fetch time: + * + *

+   * generator_sort_value += (factor * days_elapsed)
+   *

+ */ + public static final String ADAPTIVE_FETCH_TIME_SORT_FACTOR = "scoring.adaptive.factor.fetchtime"; + + /** + * Random factor added to the sort value factor for pages to be (re)fetched based on the time, + * see {@link #ADAPTIVE_FETCH_TIME_SORT_FACTOR}: + *

+   * generator_sort_value += (factor * days_elapsed * (1.0 + random_factor * Random.nextGaussian()))
+   *

+ * With a value of 0.0 no random factor is added. + */ + public static final String ADAPTIVE_FETCH_TIME_SORT_FACTOR_RANDOM = "scoring.adaptive.factor.fetchtime.random"; + + /** + * Generator sort value factor for pages to be (re)fetched based on the time + * (in days) elapsed since the time a URL has been seen as seed or link. + * URLs not seen since long are penalized by this factor (opposed to + * scoring.adaptive.factor.fetchtime which prefers pages not revisited + * for a longer period of time): + * + *

+   * generator_sort_value -= (factor * days_since_last_seen)
+   *

+ */ + public static final String ADAPTIVE_LAST_SEEN_TIME_SORT_FACTOR = "scoring.adaptive.factor.lastseentime"; + + /** + * Configuration file to adjust the generator sort value by fetch status. + * + * Format: + *

+   * status \t sortvalue
+   *

+ * + * For example: + *

+   * db_unfetched .1
+   * db_gone -.5
+   *

+ * The sort value is added to other sort values (score, fetch time). + * It may be negative to penalize fetch items. + */ + public static final String ADAPTIVE_STATUS_SORT_FACTOR_FILE = "scoring.adaptive.sort.by_status.file"; + + /** + * Factor penalizing pages not successfully fetched for each failed fetch + * trial: + * + *

+   * generator_sort_value -= (penalty * retries_since_fetch)
+   *

+ */ + public static final String ADAPTIVE_FETCH_RETRY_PENALTY = "scoring.adaptive.penalty.fetch_retry"; + + /** + * Boost recently injected URLs (injected within the last 7 days): + * + *

+   * generator_sort_value += injected_boost
+   *

+ */ + public static final String ADAPTIVE_INJECTED_BOOST = "scoring.adaptive.boost.injected"; + + /** + * Penalty for pages with a canonical link different than the page URL. + * + * Revisits are delayed by subtracting this penalty from the generator sort + * value. + * + * Note: In order to avoid that pages without a canonical link are preferred, + * the penalty shouldn't be too high. The default is + * 7 * scoring.adaptive.factor.fetchtime, that is a revisit can + * be delayed by up to 7 days, in comparison to a page where the canonical + * link equals the page URL, or a page without a canonical link. + */ + public static final String ADAPTIVE_NON_CANONICAL_PENALTY = "scoring.adaptive.penalty.non_canonical"; + + /* + * Time span (in minutes) after which a page not seen anymore by inlink or + * seed is marked as orphaned. + */ + public static final String ADAPTIVE_ORPHAN_TIME = "scoring.adaptive.mark.orphan.after"; + + /* + * Time span (in minutes) after which a "redirected" page not seen anymore + * by inlink or seed is marked as orphaned. + */ + public static final String ADAPTIVE_ORPHAN_TIME_REDIRECT = "scoring.adaptive.mark.redirect.orphan.after"; + + /* + * Time span (in minutes) after which a "unfetched" page not seen anymore + * by inlink or seed is marked as orphaned. + */ + public static final String ADAPTIVE_ORPHAN_TIME_UNFETCHED = "scoring.adaptive.mark.unfetched.orphan.after"; + + /* + * Time span (in minutes) after which a "gone" page not seen anymore + * by inlink or seed is marked as orphaned. Also duplicates and unfetched pages + * with a retry count >= 3 are considered as "gone". + */ + public static final String ADAPTIVE_ORPHAN_TIME_GONE = "scoring.adaptive.mark.gone.orphan.after"; + + /* + * Date when a page is considered to be seen last time by default / as + * fall-back if there is no last seen time tracked in the CrawlDatum. 
+ */ + public static final String ADAPTIVE_ORPHAN_TIME_LAST_SEEN_DEFAULT = "scoring.adaptive.mark.orphan.last.seen.default.date"; + + /** + * Time stamp (in minutes) when a page has been "seen" the last + * time, either as link or as seed URL. + */ + public static final String LAST_SEEN_TIME = "_lst_"; + public static final Text WRITABLE_LAST_SEEN_TIME = new Text(LAST_SEEN_TIME); + + /** + * Time stamp (in minutes) when a page has been successfully fetched. + */ + public static final String SUCCESSFUL_FETCH_TIME = "_sft_"; + public static final Text WRITABLE_SUCCESSFUL_FETCH_TIME = new Text(SUCCESSFUL_FETCH_TIME); + + private static final Writable EMPTY_VALUE = NullWritable.get(); + + private Configuration conf; + + /** + * Current time in milliseconds used to calculate time elapsed since a page + * should have been (re)fetched while generating fetch lists. Can be + * set/overwritten from {@link Generator} by option -adddays (internally set + * via @{link Generator.GENERATOR_CUR_TIME}. + */ + private long curTime; + + private float adaptiveFetchTimeSort; + private float adaptiveFetchTimeSortRandom; + private float adaptiveLastSeenTimeSort; + private float adaptiveFetchRetryPenalty; + private float adaptiveBoostInjected; + private float nonCanonicalPenalty; + + private Map statusSortMap = new TreeMap(); + private Map contentTypeSortMap = new HashMap(); + + int nowMinutes; + int orphanTimeGone; + int orphanTimeRedirect; + int orphanTimeUnfetched; + int orphanTimeAny; + int orphanTimeLastSeenDefault; + + @Override + public Configuration getConf() { + return conf; + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + curTime = conf.getLong(Generator.GENERATOR_CUR_TIME, + System.currentTimeMillis()); + adaptiveFetchTimeSort = conf.getFloat(ADAPTIVE_FETCH_TIME_SORT_FACTOR, + .01f); + adaptiveFetchTimeSortRandom = conf + .getFloat(ADAPTIVE_FETCH_TIME_SORT_FACTOR_RANDOM, .0f); + adaptiveLastSeenTimeSort = 
conf.getFloat(ADAPTIVE_LAST_SEEN_TIME_SORT_FACTOR, + .005f); + adaptiveFetchRetryPenalty = conf.getFloat(ADAPTIVE_FETCH_RETRY_PENALTY, + .1f); + adaptiveBoostInjected = conf.getFloat(ADAPTIVE_INJECTED_BOOST, .2f); + String adaptiveStatusSortFile = conf.get(ADAPTIVE_STATUS_SORT_FACTOR_FILE, + "adaptive-scoring.txt"); + Reader adaptiveStatusSortReader = conf + .getConfResourceAsReader(adaptiveStatusSortFile); + try { + readSortFile(adaptiveStatusSortReader); + } catch (IOException e) { + LOG.error("Failed to read adaptive scoring file {}: {}", + adaptiveStatusSortFile, StringUtils.stringifyException(e)); + } + + // orphan detection + nowMinutes = (int) (System.currentTimeMillis() / (60000)); + int orphanTimeSpanAny = conf.getInt(ADAPTIVE_ORPHAN_TIME, + 60 * 24 * 30 * 12); + orphanTimeAny = nowMinutes - orphanTimeSpanAny; + int orphanTimeSpanRedirect = conf.getInt(ADAPTIVE_ORPHAN_TIME_REDIRECT, + 60 * 24 * 30 * 4); + orphanTimeRedirect = nowMinutes - orphanTimeSpanRedirect; + int orphanTimeSpanUnfetched = conf.getInt(ADAPTIVE_ORPHAN_TIME_UNFETCHED, orphanTimeSpanAny); + orphanTimeUnfetched = nowMinutes - orphanTimeSpanUnfetched; + int orphanTimeSpanGone = conf.getInt(ADAPTIVE_ORPHAN_TIME_GONE, + 60 * 24 * 30 * 4); + orphanTimeGone = nowMinutes - orphanTimeSpanGone; + String lastSeenDefaultDate = conf + .get(ADAPTIVE_ORPHAN_TIME_LAST_SEEN_DEFAULT); + if (lastSeenDefaultDate != null) { + orphanTimeLastSeenDefault = (int) LocalDateTime.parse(lastSeenDefaultDate) + .toEpochSecond(ZoneOffset.UTC) / 60; + } else { + // choose a time that will never trigger that a page/CrawlDatum + // is marked as orphaned when it's last seen time is not given + orphanTimeLastSeenDefault = nowMinutes; + } + + /* Penalize non-canonical pages: default is to delay revisits by 7 days */ + nonCanonicalPenalty = conf.getFloat(ADAPTIVE_NON_CANONICAL_PENALTY, + 7 * adaptiveFetchTimeSort); + } + + private void readSortFile(Reader sortFileReader) throws IOException { + BufferedReader reader = new 
BufferedReader(sortFileReader); + String line = null; + String[] splits = null; + while ((line = reader.readLine()) != null) { + if (line.matches("^\\s*$") || line.startsWith("#")) + continue; // skip empty lines and comments + splits = line.split("\t"); + if (splits.length < 2) { + LOG.warn("Invalid line (expected format \t ): {}", + line); + continue; + } + float value; + try { + value = Float.parseFloat(splits[1]); + } catch (NumberFormatException e) { + LOG.warn("Invalid sort value `{}' in line: {}", splits[1], line); + continue; + } + if (splits[0].startsWith("Content-Type:")) { + contentTypeSortMap.put(splits[0].substring("Content-Type:".length()), value); + continue; + } + byte status = -1; + for (Entry entry : CrawlDatum.statNames.entrySet()) { + if (entry.getValue().equals(splits[0])) { + status = entry.getKey(); + statusSortMap.put(status, value); + break; + } + } + if (status == -1) { + LOG.warn("Invalid status `{}' in line: {}", splits[0], line); + } + } + } + + /** Add injected timestamp to metadata */ + @Override + public void injectedScore(Text url, CrawlDatum datum) + throws ScoringFilterException { + datum.getMetaData().put(WRITABLE_LAST_SEEN_TIME, + new IntWritable(nowMinutes)); + } + + /** Add detection timestamp to metadata (URL found as outlink) */ + @Override + public void initialScore(Text url, CrawlDatum datum) + throws ScoringFilterException { + datum.getMetaData().put(WRITABLE_LAST_SEEN_TIME, + new IntWritable(nowMinutes)); + } + + /** + * Use {@link CrawlDatum#getScore()} but be adaptive to page status and + * fetch time. 
+ */ + @Override + public float generatorSortValue(Text url, CrawlDatum datum, float initSort) + throws ScoringFilterException { + initSort *= datum.getScore(); + long fetchTime = datum.getFetchTime(); + byte status = datum.getStatus(); + long daysSinceScheduledFetch = (curTime - fetchTime) / 86400000; + String contentType = null; + if (datum.getMetaData().containsKey(new Text("Content-Type"))) { + contentType = datum.getMetaData().get(new Text("Content-Type")).toString(); + } + if (adaptiveFetchTimeSort > 0.0f) { + // boost/penalize by time elapsed since the scheduled fetch time + if (adaptiveFetchTimeSortRandom > .0f) { + initSort += adaptiveFetchTimeSort * daysSinceScheduledFetch + * (1.0 + adaptiveFetchTimeSortRandom + * ThreadLocalRandom.current().nextGaussian()); + } else { + initSort += adaptiveFetchTimeSort * daysSinceScheduledFetch; + } + } + if (statusSortMap.containsKey(status)) { + // boost/penalize by fetch status + initSort += statusSortMap.get(status); + } + if (contentType != null && contentTypeSortMap.containsKey(contentType)) { + // boost/penalize by Content-Type / MIME type + initSort += contentTypeSortMap.get(contentType); + } + if (status == CrawlDatum.STATUS_DB_UNFETCHED) { + if (datum.getRetriesSinceFetch() > 0) { + // penalize by fetch retry count + initSort -= datum.getRetriesSinceFetch() * adaptiveFetchRetryPenalty; + } else if (daysSinceScheduledFetch <= 7) { + // boost recently injected URLs + // - status unfetched + // - retry count == 0 + // - scheduled fetch within the last 7 days + initSort += adaptiveBoostInjected; + } + } + if (adaptiveLastSeenTimeSort > 0.0 + && datum.getMetaData().containsKey(WRITABLE_LAST_SEEN_TIME)) { + IntWritable writable = (IntWritable) datum.getMetaData() + .get(WRITABLE_LAST_SEEN_TIME); + int lastSeenMinutes = writable.get(); + int daysSinceLastSeen = (nowMinutes - lastSeenMinutes) / (60 * 24); + if (daysSinceLastSeen> 0) { + initSort -= adaptiveLastSeenTimeSort * daysSinceLastSeen; + } + } + if 
(pageIsNotCanonical(url, datum)) { + // penalize for not being the canonical page + initSort -= nonCanonicalPenalty; + } + return initSort; + } + + @Override + public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, + List inlinks) throws ScoringFilterException { + + // Are there inlinks for this record? + if (inlinks.size() > 0) { + // If yes, set the last time we have seen this page to now + // (assuming the link has been found recently) + datum.getMetaData().put(WRITABLE_LAST_SEEN_TIME, + new IntWritable(nowMinutes)); + } else { + orphanedScore(url, datum, old); + } + } + + @Override + public void orphanedScore(Text url, CrawlDatum datum) { + orphanedScore(url, datum, null); + } + + private void orphanedScore(Text url, CrawlDatum datum, CrawlDatum old) { + int lastSeenMinutes = orphanTimeLastSeenDefault; + if (datum.getMetaData().containsKey(WRITABLE_LAST_SEEN_TIME)) { + IntWritable writable = (IntWritable) datum.getMetaData() + .get(WRITABLE_LAST_SEEN_TIME); + lastSeenMinutes = writable.get(); + } + if (lastSeenMinutes < orphanTimeAny) { + // last seen time before mark-any-as-orphan time) + datum.setStatus(CrawlDatum.STATUS_DB_ORPHAN); + } else if (pageIsRedirect(datum)) { + if (lastSeenMinutes < orphanTimeRedirect) { + // last seen time before mark-as-orphan-if-redirect time + datum.setStatus(CrawlDatum.STATUS_DB_ORPHAN); + } + } else if (pageIsGone(datum)) { + if (lastSeenMinutes < orphanTimeGone) { + // last seen time before mark-as-orphan-if-gone time + datum.setStatus(CrawlDatum.STATUS_DB_ORPHAN); + } + } else if (datum.getStatus() == CrawlDatum.STATUS_DB_UNFETCHED) { + if (lastSeenMinutes < orphanTimeUnfetched) { + // last seen time before mark-as-orphan-if-still-unfetched time + // Note: unfetched pages with high retry count are considered as "gone" + datum.setStatus(CrawlDatum.STATUS_DB_ORPHAN); + } + } + } + + private static boolean pageIsGone(CrawlDatum datum) { + byte status = datum.getStatus(); + if (status == CrawlDatum.STATUS_DB_GONE 
+ || status == CrawlDatum.STATUS_DB_DUPLICATE + || (status == CrawlDatum.STATUS_DB_UNFETCHED + && datum.getRetriesSinceFetch() >= 3)) { + return true; + } + return false; + } + + private static boolean pageIsRedirect(CrawlDatum datum) { + byte status = datum.getStatus(); + if (status == CrawlDatum.STATUS_DB_REDIR_PERM + || status == CrawlDatum.STATUS_DB_REDIR_TEMP) { + return true; + } + return false; + } + + private static boolean pageIsNotCanonical(Text url, CrawlDatum datum) { + if (datum.getStatus() != CrawlDatum.STATUS_DB_FETCHED) { + // If not successfully fetched, there's no canonical link + return false; + } + Writable canonicalUrl = datum.getMetaData().get(Nutch.CANONICAL_LINK_KEY); + if (canonicalUrl != null && !canonicalUrl.equals(EMPTY_VALUE) + && !url.equals(canonicalUrl)) { + /* + * If there is a canonical link and it's different from the URL, it's not + * the canonical page. + */ + return true; + } + // Otherwise, it's the canonical page or no canonical link was detected. + return false; + } + +} diff --git a/src/plugin/scoring-adaptive/src/test/org/apache/nutch/scoring/adaptive/TestAdaptiveScoringFilter.java b/src/plugin/scoring-adaptive/src/test/org/apache/nutch/scoring/adaptive/TestAdaptiveScoringFilter.java new file mode 100644 index 0000000000..450fbd0946 --- /dev/null +++ b/src/plugin/scoring-adaptive/src/test/org/apache/nutch/scoring/adaptive/TestAdaptiveScoringFilter.java @@ -0,0 +1,85 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.scoring.adaptive; + +import static org.junit.jupiter.api.Assertions.*; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.scoring.ScoringFilter; +import org.apache.nutch.scoring.ScoringFilterException; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.jupiter.api.Test; + +public class TestAdaptiveScoringFilter { + + public ScoringFilter getFilter(float nonCanonicalPenalty) { + Configuration conf = NutchConfiguration.create(); + conf.setFloat(AdaptiveScoringFilter.ADAPTIVE_NON_CANONICAL_PENALTY, nonCanonicalPenalty); + + ScoringFilter filter = new AdaptiveScoringFilter(); + filter.setConf(conf); + return filter; + } + + @Test + public void testPenalizeNonCanonical() throws ScoringFilterException { + String canonicalUrl = "https://example.org/"; + String nonCanonicalUrl = "https://www.example.org/"; + float score = .5f; + float initSort = 1.0f; + Text url1 = new Text(canonicalUrl); + Text url2 = new Text(nonCanonicalUrl); + CrawlDatum datum = new CrawlDatum(); + datum.setStatus(CrawlDatum.STATUS_DB_FETCHED); + datum.setScore(.5f); + datum.getMetaData().put(Nutch.CANONICAL_LINK_KEY, url1); + + // test with zero penalty configured + ScoringFilter filter = getFilter(.0f); + assertEquals(score, filter.generatorSortValue(url1, datum, initSort), + "With zero penalty, generator sort value should be equal to score"); + 
assertEquals(score, filter.generatorSortValue(url2, datum, initSort), + "With zero penalty, generator sort value should be equal to score"); + + // using a penalty, the canonical page should get a higher value + float penalty = .07f; + filter = getFilter(penalty); + float valCanonical = filter.generatorSortValue(url1, datum, initSort); + float valNonCanonical = filter.generatorSortValue(url2, datum, initSort); + assertEquals(score, valCanonical, + "For canonical pages, generator sort value should be equal to score"); + assertNotEquals(score, valNonCanonical, + "For non-canonical pages, generator sort value should *not* be equal to score"); + assertTrue(score > valNonCanonical, + "For non-canonical pages, generator sort value should be lower than score"); + assertEquals((score - penalty), valNonCanonical); + + // test with empty canonical link + datum.getMetaData().put(Nutch.CANONICAL_LINK_KEY, NullWritable.get()); + assertEquals(score, filter.generatorSortValue(url1, datum, initSort), + "Without canonical link, generator sort value should be equal to score"); + datum.getMetaData().remove(Nutch.CANONICAL_LINK_KEY); + assertEquals(score, filter.generatorSortValue(url1, datum, initSort), + "Without canonical link, generator sort value should be equal to score"); + } + +} diff --git a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java index 54e2fe5beb..23a4327496 100644 --- a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java +++ b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java @@ -84,7 +84,8 @@ public void injectedScore(Text url, CrawlDatum datum) @Override public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException { - datum.setScore(0.0f); + // TODO: allow to pass score to redirects + // datum.setScore(0.0f); } /** Use {@link CrawlDatum#getScore()}. 
*/ diff --git a/src/plugin/urlfilter-fast/sample/test.urls b/src/plugin/urlfilter-fast/sample/test.urls index 3aa4354a63..8b7b7ec577 100644 --- a/src/plugin/urlfilter-fast/sample/test.urls +++ b/src/plugin/urlfilter-fast/sample/test.urls @@ -18,4 +18,5 @@ +ftp://ftp.example.org/file1.txt +file:/path/file1.txt +file:///path/file1.txt --file:/abcd/foo/bar/xyz/foo/bar/foo/ \ No newline at end of file +-file:/abcd/foo/bar/xyz/foo/bar/foo/ + diff --git a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java index 2342ced68f..5518e39544 100644 --- a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java +++ b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java @@ -16,19 +16,18 @@ */ package org.apache.nutch.net.urlnormalizer.ajax; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; import java.net.URL; import java.net.URLDecoder; -import java.net.MalformedURLException; -import java.nio.charset.Charset; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; import org.apache.nutch.net.URLNormalizer; import org.apache.nutch.net.URLNormalizers; -import org.apache.hadoop.conf.Configuration; - +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * URLNormalizer capable of dealing with AJAX URL's. * @@ -43,13 +42,11 @@ public class AjaxURLNormalizer implements URLNormalizer { public static String ESCAPED_URL_PART = "_escaped_fragment_="; private Configuration conf; - private Charset utf8; /** * Default constructor. 
*/ public AjaxURLNormalizer() { - utf8 = Charset.forName("UTF-8"); } /** @@ -195,7 +192,7 @@ protected String escape(String fragmentPart) { String hex = null; StringBuilder sb = new StringBuilder(fragmentPart.length()); - for (byte b : fragmentPart.getBytes(utf8)) { + for (byte b : fragmentPart.getBytes(UTF_8)) { if (b < 33) { sb.append('%'); diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java index 2123d8fa9f..4ff9fc64a6 100644 --- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java @@ -16,12 +16,10 @@ */ package org.apache.nutch.net.urlnormalizer.basic; -import java.lang.invoke.MethodHandles; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; -import java.io.UnsupportedEncodingException; -import java.net.IDN; +import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URISyntaxException; import java.net.URL; @@ -36,6 +34,7 @@ import org.apache.nutch.net.URLNormalizer; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.URLUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -47,6 +46,12 @@ *

normalize * percent-encoding in URL paths

+ *

normalize the host name if it is an Internationalized Domain Name (IDN) + * to ASCII or Unicode, depending on the configuration properties + * urlnormalizer.basic.host.idn and + * urlnormalizer.basic.host.idna2008

+ *

remove a trailing dot in the host name (if the property + * urlnormalizer.basic.host.trim-trailing-dot is true)

*/ public class BasicURLNormalizer implements URLNormalizer { @@ -54,6 +59,7 @@ public class BasicURLNormalizer implements URLNormalizer { .getLogger(MethodHandles.lookup().lookupClass()); public final static String NORM_HOST_IDN = "urlnormalizer.basic.host.idn"; + public final static String NORM_HOST_IDNA_2008 = "urlnormalizer.basic.host.idna2008"; public final static String NORM_HOST_TRIM_TRAILING_DOT = "urlnormalizer.basic.host.trim-trailing-dot"; /** @@ -70,7 +76,7 @@ public class BasicURLNormalizer implements URLNormalizer { .compile("%([0-9A-Fa-f]{2})"); // charset used for encoding URLs before escaping - private final static Charset utf8 = StandardCharsets.UTF_8; + private final static Charset UTF_8 = StandardCharsets.UTF_8; /** look-up table for characters which should not be escaped in URL paths */ private final static boolean[] unescapedCharacters = new boolean[128]; @@ -132,20 +138,11 @@ private static boolean isHexCharacter(int c) { || (0x30 <= c && c <= 0x39); } - private static boolean isAscii(String str) { - char[] chars = str.toCharArray(); - for (char c : chars) { - if (c > 127) { - return false; - } - } - return true; - } - private Configuration conf; private boolean hostIDNtoASCII; private boolean hostASCIItoIDN; + private boolean hostIDNA2008; private boolean hostTrimTrailingDot; @Override @@ -159,9 +156,12 @@ public void setConf(Configuration conf) { String normIdn = conf.get(NORM_HOST_IDN, ""); if (normIdn.equalsIgnoreCase("toAscii")) { hostIDNtoASCII = true; + hostASCIItoIDN = false; } else if (normIdn.equalsIgnoreCase("toUnicode")) { + hostIDNtoASCII = false; hostASCIItoIDN = true; } + hostIDNA2008 = conf.getBoolean(NORM_HOST_IDNA_2008, false); hostTrimTrailingDot = conf.getBoolean(NORM_HOST_TRIM_TRAILING_DOT, false); } @@ -364,7 +364,7 @@ private String escapePath(String path) { StringBuilder sb = new StringBuilder(path.length()); // Traverse over all bytes in this URL - byte[] bytes = path.getBytes(utf8); + byte[] bytes = 
path.getBytes(UTF_8); for (int i = 0; i < bytes.length; i++) { byte b = bytes[i]; // Is this a control character? @@ -415,8 +415,8 @@ private String normalizeHostName(String host) throws MalformedURLException { // 1. unescape percent-encoded characters in host name if (host.indexOf('%') != -1) { try { - host = URLDecoder.decode(host, StandardCharsets.UTF_8.toString()); - } catch (UnsupportedEncodingException | IllegalArgumentException e) { + host = URLDecoder.decode(host, UTF_8); + } catch (IllegalArgumentException e) { LOG.debug("Failed to convert percent-encoded host name {}: ", host, e); throw (MalformedURLException) new MalformedURLException( "Invalid percent-encoded host name " + host + ": " + e.getMessage()) @@ -429,21 +429,18 @@ private String normalizeHostName(String host) throws MalformedURLException { // 3. if configured: convert between Unicode and ASCII forms // for Internationalized Domain Names (IDNs) - if (hostIDNtoASCII && !isAscii(host)) { - try { - host = IDN.toASCII(host); - } catch (IllegalArgumentException | IndexOutOfBoundsException e) { - // IllegalArgumentException: thrown if the input string contains - // non-convertible Unicode codepoints - // IndexOutOfBoundsException: thrown (undocumented) if one "label" - // (non-ASCII dot-separated segment) is longer than 256 characters, - // cf. https://bugs.openjdk.java.net/browse/JDK-6806873 - LOG.debug("Failed to convert IDN host {}: ", host, e); - throw (MalformedURLException) new MalformedURLException( - "Invalid IDN " + host + ": " + e.getMessage()).initCause(e); + if (hostIDNtoASCII && !URLUtil.isAscii(host)) { + if (hostIDNA2008) { + host = URLUtil.convertIDNA2008(host, true); + } else { + host = URLUtil.convertIDNA2003(host, true, false); } } else if (hostASCIItoIDN && host.contains("xn--")) { - host = IDN.toUnicode(host); + if (hostIDNA2008) { + host = URLUtil.convertIDNA2008(host, false); + } else { + host = URLUtil.convertIDNA2003(host, false, false); + } } // 4. 
optionally trim a trailing dot @@ -466,7 +463,7 @@ public static void main(String args[]) throws IOException { } String line, normUrl; BufferedReader in = new BufferedReader( - new InputStreamReader(System.in, utf8)); + new InputStreamReader(System.in, UTF_8)); while ((line = in.readLine()) != null) { try { normUrl = normalizer.normalize(line, scope); diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java index a6bad41f2e..090c25f2da 100644 --- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java @@ -258,15 +258,24 @@ public void testHostName() throws Exception { // test Internationalized Domain Names BasicURLNormalizer norm = new BasicURLNormalizer(); conf = NutchConfiguration.create(); + + // to ASCII normalization conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii"); norm.setConf(conf); normalizeTest(norm, "https://нэб.рф/", "https://xn--90ax2c.xn--p1ai/"); // verify escaping of percent-encoded characters in IDNs (NUTCH-2824) normalizeTest(norm, "https://www.0251-sachverst%c3%a4ndiger.de/", "https://www.xn--0251-sachverstndiger-ozb.de/"); + // verify that host names with uppercase characters are normalized + normalizeTest(norm, "https://нЭб.РФ/", "https://xn--90ax2c.xn--p1ai/"); + + // to Unicode normalization conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toUnicode"); norm.setConf(conf); normalizeTest(norm, "https://xn--90ax2c.xn--p1ai/", "https://нэб.рф/"); + // verify that host names with uppercase characters are normalized + normalizeTest(norm, "https://Xn--90Ax2c.xN--P1ai/", "https://нэб.рф/"); + // test removal of trailing dot conf.setBoolean(BasicURLNormalizer.NORM_HOST_TRIM_TRAILING_DOT, true); 
norm.setConf(conf); @@ -274,6 +283,63 @@ public void testHostName() throws Exception { "https://www.example.org/"); } + /** + * Test for IDNA2008 and IDNA2003 compatibility. + */ + @Test + public void testHostNameIDNA2008() throws Exception { + // IDNA2008 (https://www.rfc-editor.org/rfc/rfc5890.html#section-1.1) + BasicURLNormalizer norm = new BasicURLNormalizer(); + conf = NutchConfiguration.create(); + conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii"); + norm.setConf(conf); + + // IDNA2003 / RFC 3490 + // Note: IDNA2008 and IDNA2003 deviate for this example + normalizeTest(norm, "https://straße.de/", "https://strasse.de/"); + + // Verify that characters not in Unicode 3.2 do not fail the normalization + normalizeTest(norm, "https://example.ᬩᬮᬶ.id/", "https://example.xn--9tfky.id/"); + + // IDNA2008 / RFC 5890 + conf.setBoolean(BasicURLNormalizer.NORM_HOST_IDNA_2008, true); + norm.setConf(conf); + // Note: this is different from IDNA2003 + normalizeTest(norm, "https://straße.de/", "https://xn--strae-oqa.de/"); + + // Verify that characters not in Unicode 3.2 do not fail the normalization + normalizeTest(norm, "https://example.ᬩᬮᬶ.id/", "https://example.xn--9tfky.id/"); + + // mixed encodings (Unicode, Punycode, percent encoding) + normalizeTest(norm, "https://xn--p1ai.%D1%80%D1%84/", + "https://xn--p1ai.xn--p1ai/"); + normalizeTest(norm, "https://xn--p1ai.рф/", "https://xn--p1ai.xn--p1ai/"); + + // test conversion to Unicode (IDNA2008) + conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toUnicode"); + norm.setConf(conf); + normalizeTest(norm, "https://xn--strae-oqa.de/", "https://straße.de/"); + normalizeTest(norm, "https://example.xn--9tfky.id/", "https://example.ᬩᬮᬶ.id/"); + + // mixed encodings (Unicode, Punycode, percent encoding), mixed case + normalizeTest(norm, "https://xN--p1aI.Xn--P1ai/", "https://рф.рф/"); + normalizeTest(norm, "https://xN--p1Ai.%D1%80%d1%84/", "https://рф.рф/"); + normalizeTest(norm, "https://булГаков.xN--p1Ai.%D1%80%d1%84/", 
"https://булгаков.рф.рф/"); + normalizeTest(norm, "https://гоГоль.%d1%80%D1%84.Рф/", "https://гоголь.рф.рф/"); + + // test conversion to Unicode (IDNA2003) + conf.setBoolean(BasicURLNormalizer.NORM_HOST_IDNA_2008, false); + norm.setConf(conf); + normalizeTest(norm, "https://xn--strae-oqa.de/", "https://xn--strae-oqa.de/"); + normalizeTest(norm, "https://example.xn--9tfky.id/", "https://example.ᬩᬮᬶ.id/"); + + // mixed encodings (Unicode, Punycode, percent encoding), mixed case + normalizeTest(norm, "https://xN--p1aI.Xn--P1ai/", "https://рф.рф/"); + normalizeTest(norm, "https://xN--p1Ai.%D1%80%d1%84/", "https://рф.рф/"); + normalizeTest(norm, "https://булГаков.xN--p1Ai.%D1%80%d1%84/", "https://булгаков.рф.рф/"); + normalizeTest(norm, "https://гоГоль.%d1%80%D1%84.Рф/", "https://гоголь.рф.рф/"); + } + /** * Test that normalizer throws MalformedURLException for invalid URLs */ diff --git a/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java b/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java index 9e96071a0e..f429bff020 100644 --- a/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java +++ b/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java @@ -360,6 +360,7 @@ public String getUser() { public Path getWorkingDirectory() throws IOException { return null; } + } /** * For now we need to manually construct our Configuration, because we need to diff --git a/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java b/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java index 377d49ec81..c06ae30076 100644 --- a/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java +++ b/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java @@ -24,6 +24,12 @@ import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.time.Duration; +import java.time.Instant; +import java.util.Date; +import java.util.Properties; /** * Test cases for 
AdaptiveFetchSchedule. @@ -117,5 +123,91 @@ private void validateFetchInterval(int changed, int getInterval) { } } + + /** + * Test https://issues.apache.org/jira/browse/NUTCH-1564 + */ + @Test + public void testSetFetchSchedule1() { + // db.fetch.schedule.adaptive.sync_delta_rate = 0.3 (default) + // db.fetch.interval.default = 172800 (2 days) + // db.fetch.schedule.adaptive.min_interval = 86400 (1 day) + // db.fetch.schedule.adaptive.max_interval = 604800 (7 days) + // db.fetch.interval.max = 604800 (7 days) + // 3-days cycle + // 30 days since last modified + doTestSetFetchSchedule(0.3, 2, 1, 7, 7, 3, 30); + } + + @Test + public void testSetFetchSchedule2() { + // db.fetch.schedule.adaptive.sync_delta_rate = 0.3 (default) + // db.fetch.interval.default = 86400 (1 day) + // db.fetch.schedule.adaptive.min_interval = 86400 (1 day) + // db.fetch.schedule.adaptive.max_interval = 172800 (2 days) + // db.fetch.interval.max = 604800 (7 days) + // 1-day cycle + // 10 days since last modified + doTestSetFetchSchedule(0.3, 1, 1, 2, 7, 1, 10); + } + + @Test + public void testSetFetchSchedule3() { + // db.fetch.schedule.adaptive.sync_delta_rate = 0.3 (default) + // db.fetch.interval.default = 172800 (2 days) + // db.fetch.schedule.adaptive.min_interval = 86400 (1 day) + // db.fetch.schedule.adaptive.max_interval = 864000 (10 days) + // db.fetch.interval.max = 864000 (10 days) + // 3-days cycle + // 180 days since last modified + doTestSetFetchSchedule(0.3, 2, 1, 10, 10, 3, 180); + } + + private void doTestSetFetchSchedule(double deltaRate, int intervalDefaultDays, + int minIntervalDays, int maxIntervalDays, int intervalMaxDays, + int previousFetchTimeDays, int modifiedTimeDays) { + // need to properly override defaults + Properties props = new Properties(); + props.setProperty("db.fetch.schedule.class", "org.apache.nutch.crawl.AdaptiveFetchSchedule"); + props.setProperty("db.fetch.schedule.adaptive.sync_delta", "true"); // default + 
props.setProperty("db.fetch.schedule.adaptive.sync_delta_rate", String.valueOf(deltaRate)); + props.setProperty("db.fetch.interval.default", String.valueOf(FetchSchedule.SECONDS_PER_DAY * intervalDefaultDays)); + props.setProperty("db.fetch.schedule.adaptive.min_interval", String.valueOf(FetchSchedule.SECONDS_PER_DAY * minIntervalDays)); + props.setProperty("db.fetch.schedule.adaptive.max_interval", String.valueOf(FetchSchedule.SECONDS_PER_DAY * maxIntervalDays)); + props.setProperty("db.fetch.interval.max", String.valueOf(FetchSchedule.SECONDS_PER_DAY * intervalMaxDays)); + + conf = NutchConfiguration.create(true, props); + inc_rate = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f); // default + dec_rate = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f); // default + + // ignore adaptive-host-specific-intervals.txt + Text url = new Text("http://www.example2.com"); + + AdaptiveFetchSchedule fs = new AdaptiveFetchSchedule(); + fs.setConf(conf); + + CrawlDatum datum = prepareCrawlDatum(); + Date fetchTime = Date.from(Instant.now()); + Date previousFetchTime = Date.from(Instant.now().minus(Duration.ofDays(previousFetchTimeDays))); + Date modifiedTime = Date.from(Instant.now().minus(Duration.ofDays(modifiedTimeDays))); + datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS); + datum.setRetriesSinceFetch(0); + datum.setModifiedTime(modifiedTime.getTime()); + datum.setFetchTime(fetchTime.getTime()); + + System.out.println("CrawlDatum fetchTime: " + fetchTime + "; modifiedTime: " + modifiedTime); + + fs.setFetchSchedule(url, datum, previousFetchTime.getTime(), modifiedTime.getTime(), + fetchTime.getTime(), modifiedTime.getTime(), CrawlDatum.STATUS_DB_NOTMODIFIED); + + Date nextFetchTime = new Date(datum.getFetchTime()); + System.out.println("CrawlDatum next fetchTime: " + nextFetchTime); + + assertTrue(nextFetchTime.after(fetchTime)); + // adapt milliseconds to seconds + long fetchTimeDiff = (nextFetchTime.getTime() - fetchTime.getTime()) / 1000L ; + 
assertTrue(fetchTimeDiff >= FetchSchedule.SECONDS_PER_DAY * minIntervalDays); + assertTrue(fetchTimeDiff <= FetchSchedule.SECONDS_PER_DAY * maxIntervalDays); + } } diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbDeduplication.java b/src/test/org/apache/nutch/crawl/TestCrawlDbDeduplication.java index 646749db5e..75fb5d7ca8 100644 --- a/src/test/org/apache/nutch/crawl/TestCrawlDbDeduplication.java +++ b/src/test/org/apache/nutch/crawl/TestCrawlDbDeduplication.java @@ -106,6 +106,31 @@ public void testDeduplicationHttpsOverHttp() throws Exception { checkStatus(url2, CrawlDatum.STATUS_DB_FETCHED); } + @Test + public void testDedupRedirects() throws Exception { + String[] args = new String[3]; + args[0] = testCrawlDb.toString(); + args[1] = "-compareOrder"; + args[2] = "fetchTime,urlLength,score"; + int result = ToolRunner.run(conf, new DedupRedirectsJob(), args); + assertEquals(0, result, "DedupRedirectsJob did not succeed"); + // url3 was fetched with status 200, so it should "survive" as "db_fetched" + // while url1 and url2 both redirect to url3 and should be duplicates + String url1 = "http://en.wikipedia.org/wiki/URL_redirection"; + String url2 = "https://www.wikipedia.org/wiki/URL_redirection"; + String url3 = "https://en.wikipedia.org/wiki/URL_redirection"; + checkStatus(url1, CrawlDatum.STATUS_DB_DUPLICATE); + checkStatus(url2, CrawlDatum.STATUS_DB_DUPLICATE); + checkStatus(url3, CrawlDatum.STATUS_DB_FETCHED); + // url4: redirect points to a redirect -> mark as duplicate + String url4 = "https://wikipedia.org/wiki/URL_redirection"; + checkStatus(url4, CrawlDatum.STATUS_DB_DUPLICATE); + // url5: points to redirect not in CrawlDb + // => leave as redirect in CrawlDb for now + String url5 = "https:/wikipedia.org/wiki/URL_forwarding"; + checkStatus(url5, CrawlDatum.STATUS_DB_REDIR_PERM); + } + private void checkStatus(String url, byte status) throws IOException { CrawlDatum datum = reader.get(testCrawlDb.toString(), url, conf); assertNotNull(datum, "No 
CrawlDatum found in CrawlDb for " + url); diff --git a/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java b/src/test/org/apache/nutch/crawl/TestCrawlDbStatesExtended.java similarity index 99% rename from src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java rename to src/test/org/apache/nutch/crawl/TestCrawlDbStatesExtended.java index dfad393512..2e6ea55af1 100644 --- a/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java +++ b/src/test/org/apache/nutch/crawl/TestCrawlDbStatesExtended.java @@ -29,7 +29,7 @@ import static org.apache.nutch.crawl.CrawlDatum.*; import static org.junit.jupiter.api.Assertions.fail; -public class TODOTestCrawlDbStates extends TestCrawlDbStates { +public class TestCrawlDbStatesExtended extends TestCrawlDbStates { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); diff --git a/src/test/org/apache/nutch/crawl/TestSitemapInjector.java b/src/test/org/apache/nutch/crawl/TestSitemapInjector.java new file mode 100644 index 0000000000..8aaec3f959 --- /dev/null +++ b/src/test/org/apache/nutch/crawl/TestSitemapInjector.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.crawl; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.SequenceFile.Reader.Option; +import org.apache.hadoop.io.Text; +import org.apache.logging.log4j.core.config.Configurator; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * End-to-end test for {@link SitemapInjector}: points the injector at a local + * {@code file://} sitemap, runs the full two-step MapReduce pipeline, then + * reads the resulting CrawlDb and asserts the expected URLs (including + * hreflang alternates) are present. 
+ */ +public class TestSitemapInjector { + + private Configuration conf; + private FileSystem fs; + private final static Path testdir = new Path("build/test/sitemap-inject-test"); + private Path crawldbPath; + private Path urlPath; + private String sitemapUrl; + + @BeforeEach + public void setUp() throws Exception { + Configurator.setLevel( + "org.apache.hadoop.mapred", org.apache.logging.log4j.Level.DEBUG); + + conf = CrawlDBTestUtil.createContext().getConfiguration(); + + conf.set("plugin.folders", new File("build/plugins").getAbsolutePath()); + conf.set("plugin.includes", + "protocol-file|urlfilter-regex|index-basic"); + conf.setInt("http.time.limit", 120); + conf.setBoolean("mime.type.magic", false); + conf.setBoolean("db.injector.sitemap.check-cross-submits", false); + conf.setInt("mapreduce.mapper.multithreadedmapper.threads", 1); + conf.set("http.filter.ipaddress.exclude", ""); + + conf.set("urlfilter.regex.rules", "+."); +// conf.set("urlfilter.regex.file", +// new File("src/testresources/regex-urlfilter-test.txt").getAbsoluteFile().toURI().toString()); + + urlPath = new Path(testdir, "urls"); + crawldbPath = new Path(testdir, "crawldb"); + fs = FileSystem.get(conf); + fs.delete(testdir, true); + } + + @AfterEach + public void tearDown() throws IOException { + fs.delete(testdir, true); + } + + @Test + public void injectsUrlsFromLocalSitemap1() throws Exception { + sitemapUrl = resolveFixture("sitemaps/sitemap.example.1.xml"); + + List seeds = new ArrayList<>(); + seeds.add(sitemapUrl); + CrawlDBTestUtil.generateSeedList(fs, urlPath, seeds); + + SitemapInjector sitemapInjector = new SitemapInjector(); + sitemapInjector.setConf(conf); + sitemapInjector.inject(crawldbPath, urlPath); + + List injected = readCrawldb(); + + assertFalse(injected.isEmpty(), + "SitemapInjector produced an empty CrawlDb"); + + // Primary URL from sitemap.example.1.xml + assertTrue( + injected.contains("https://example.com/sitemap.html"), + "Primary URL missing from CrawlDb"); + + // 
hreflang alternate from the same block - exercises the + // sitemap-localized-links extraction path. + assertTrue( + injected.contains("https://example.com/tr/en/sitemap.html"), + "hreflang alternate missing from CrawlDb (localized-links extraction failed)"); + + assertThat(injected.size(), is(6)); + } + + + @Test + public void injectsUrlsFromLocalSitemap2() throws Exception { + sitemapUrl = resolveFixture("sitemaps/sitemap.example.2.xml"); + + List seeds = new ArrayList<>(); + seeds.add(sitemapUrl); + CrawlDBTestUtil.generateSeedList(fs, urlPath, seeds); + + SitemapInjector sitemapInjector = new SitemapInjector(); + sitemapInjector.setConf(conf); + sitemapInjector.inject(crawldbPath, urlPath); + + List injected = readCrawldb(); + + assertFalse(injected.isEmpty(), + "SitemapInjector produced an empty CrawlDb"); + + assertThat(injected.size(), is(3)); + } + + /** + * Resolve a fixture path either from {@code build/test/data} (where + * {@code ant test-core} copies {@code src/testresources}) or directly from + * {@code src/testresources} when running from an IDE. 
+ */ + private String resolveFixture(String relative) { + String testData = System.getProperty("test.build.data"); + if (testData != null) { + File f = new File(testData, relative); + if (f.exists()) { + return f.toURI().toString(); + } + } + File src = new File("src/testresources", relative); + if (!src.exists()) { + src = new File("../src/testresources", relative); + } + return src.getAbsoluteFile().toURI().toString(); + } + + private List readCrawldb() throws IOException { + Path dbfile = new Path(crawldbPath, + CrawlDb.CURRENT_NAME + "/part-r-00000/data"); + Option rFile = SequenceFile.Reader.file(dbfile); + @SuppressWarnings("resource") + SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile); + List read = new ArrayList<>(); + try { + while (true) { + Text key = new Text(); + CrawlDatum value = new CrawlDatum(); + if (!reader.next(key, value)) { + break; + } + read.add(key.toString()); + } + } finally { + reader.close(); + } + Collections.sort(read); + return read; + } +} diff --git a/src/test/org/apache/nutch/indexer/AbstractIndexWriterIT.java b/src/test/org/apache/nutch/indexer/AbstractIndexWriterIT.java new file mode 100644 index 0000000000..b0bf6e0239 --- /dev/null +++ b/src/test/org/apache/nutch/indexer/AbstractIndexWriterIT.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; + +import org.apache.nutch.indexer.NutchDocument; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Abstract base for IndexWriter integration tests. Provides common test logic + * for write/commit and delete operations. + */ +@Testcontainers(disabledWithoutDocker = true) +public abstract class AbstractIndexWriterIT implements IndexWriterIntegrationTest { + + /** Opens the index writer under test before each test method. */ + @BeforeEach + void setUp() throws Exception { + setUpIndexWriter(); + } + + /** Closes the index writer under test after each test method. */ + @AfterEach + void tearDown() throws Exception { + tearDownIndexWriter(); + } + + /** + * Writes and commits one document, closes the writer, then lets the + * implementation verify that the document reached the backend. + */ + @Test + void testWriteAndCommitDocument() throws Exception { + NutchDocument doc = createTestDocument("test-doc-1", "Test Document", + "This is a test document for integration testing."); + assertDoesNotThrow(() -> getIndexWriter().write(doc)); + assertDoesNotThrow(() -> getIndexWriter().commit()); + /* NOTE(review): tearDown() calls tearDownIndexWriter() a second time after this test; assumes implementations tolerate that - confirm */ + tearDownIndexWriter(); + verifyDocumentWritten("test-doc-1", "Test Document"); + } + + /** + * Writes and commits a document, then deletes it and commits again; passes + * when neither the delete nor the commit throws. Writers that do not + * support deletion are reported as skipped instead of silently passing. + */ + @Test + void testDeleteDocument() throws Exception { + org.junit.jupiter.api.Assumptions.assumeTrue(supportsDelete(), + "IndexWriter under test does not support delete"); + String docId = "test-doc-to-delete"; + NutchDocument doc = createTestDocument(docId, "Document to Delete", ""); + + IndexWriter writer = getIndexWriter(); + writer.write(doc); + writer.commit(); + + /* Use the dedicated delete writer when the implementation provides one */ + IndexWriter deleteWriter = prepareWriterForDeleteTest(); + final IndexWriter writerForDelete = (deleteWriter != null) ? deleteWriter : writer; + try { + assertDoesNotThrow(() -> writerForDelete.delete(docId)); + assertDoesNotThrow(() -> writerForDelete.commit()); + } finally { + /* Close the extra writer even when an assertion above fails; the main writer is closed by tearDown() */ + if (writerForDelete != writer) { + try { + writerForDelete.close(); + } catch (Exception e) { + /* Best-effort cleanup: a close failure must not mask the test result */ + } + } + } + } + + /** Create a NutchDocument with id, title, 
and content. */ + protected NutchDocument createTestDocument(String id, String title, String content) { + NutchDocument doc = new NutchDocument(); + doc.add("id", id); + doc.add("title", title); + doc.add("content", content); + return doc; + } +} diff --git a/src/test/org/apache/nutch/indexer/IndexWriterIntegrationTest.java b/src/test/org/apache/nutch/indexer/IndexWriterIntegrationTest.java new file mode 100644 index 0000000000..c6f1027da9 --- /dev/null +++ b/src/test/org/apache/nutch/indexer/IndexWriterIntegrationTest.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer; + +/** + * Contract for IndexWriter integration tests. Implementations run against + * real backends via Testcontainers. + */ +public interface IndexWriterIntegrationTest { + + /** Open the index writer before tests. */ + void setUpIndexWriter() throws Exception; + + /** Close the index writer after tests. */ + void tearDownIndexWriter() throws Exception; + + /** The IndexWriter under test. */ + IndexWriter getIndexWriter(); + + /** Whether this writer supports document deletion (e.g. Kafka does not). */ + boolean supportsDelete(); + + /** + * Optional verification that a document was indexed. 
+ * Default no-op; override for Elastic, Solr. + */ + default void verifyDocumentWritten(String docId, String expectedTitle) throws Exception { + // no-op + } + + /** + * Optional writer configured for delete operations. Used when the main + * writer has delete disabled (e.g. Solr requires INDEXER_DELETE=true). + * Default returns null to use {@link #getIndexWriter()}. + */ + default IndexWriter prepareWriterForDeleteTest() throws Exception { + return null; + } +} diff --git a/src/test/org/apache/nutch/metrics/TestErrorTracker.java b/src/test/org/apache/nutch/metrics/TestErrorTracker.java new file mode 100644 index 0000000000..5caa3e3a71 --- /dev/null +++ b/src/test/org/apache/nutch/metrics/TestErrorTracker.java @@ -0,0 +1,514 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.metrics; + +import java.io.IOException; +import java.net.ConnectException; +import java.net.MalformedURLException; +import java.net.SocketException; +import java.net.SocketTimeoutException; +import java.net.URISyntaxException; +import java.net.UnknownHostException; + +import org.apache.hadoop.mapreduce.Counter; +import org.apache.hadoop.mapreduce.TaskInputOutputContext; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.net.URLFilterException; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParserNotFound; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.ProtocolNotFound; +import org.apache.nutch.scoring.ScoringFilterException; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.xml.sax.SAXException; + +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.*; + +import org.apache.nutch.metrics.ErrorTracker.ErrorType; + +/** + * Unit tests for {@link ErrorTracker} categorization, counting, and Hadoop + * counter integration. 
+ */ +@ExtendWith(MockitoExtension.class) +public class TestErrorTracker { + + @Mock + private TaskInputOutputContext mockContext; + + @Mock + private Counter mockCounter; + + @BeforeEach + void setUp() { + // Configure mock context to return mock counter for any counter request + lenient().when(mockContext.getCounter(anyString(), anyString())).thenReturn(mockCounter); + } + + // ========================================================================= + // Network Error Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeNetworkErrors() { + // Test IOException + assertEquals(ErrorType.NETWORK, + ErrorTracker.categorize(new IOException("Connection failed"))); + + // Test SocketException + assertEquals(ErrorType.NETWORK, + ErrorTracker.categorize(new SocketException("Socket closed"))); + + // Test UnknownHostException + assertEquals(ErrorType.NETWORK, + ErrorTracker.categorize(new UnknownHostException("example.com"))); + + // Test ConnectException + assertEquals(ErrorType.NETWORK, + ErrorTracker.categorize(new ConnectException("Connection refused"))); + } + + // ========================================================================= + // Timeout Error Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeTimeoutErrors() { + // Test SocketTimeoutException + assertEquals(ErrorType.TIMEOUT, + ErrorTracker.categorize(new SocketTimeoutException("Read timed out"))); + } + + @Test + public void testCategorizeTimeoutByClassName() { + // Test custom exception with "Timeout" in class name + // The categorize method checks className.contains("Timeout") + Exception customTimeout = new CustomTimeoutException("Custom timeout"); + assertEquals(ErrorType.TIMEOUT, ErrorTracker.categorize(customTimeout)); + } + + // Custom exception class for testing class name-based detection + private static class 
CustomTimeoutException extends Exception { + CustomTimeoutException(String message) { + super(message); + } + } + + // ========================================================================= + // URL Error Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeUrlErrors() { + // Test MalformedURLException + assertEquals(ErrorType.URL, + ErrorTracker.categorize(new MalformedURLException("Invalid URL"))); + + // Test URISyntaxException + assertEquals(ErrorType.URL, + ErrorTracker.categorize(new URISyntaxException("bad uri", "Invalid syntax"))); + } + + @Test + public void testCategorizeUrlFilterException() { + // Test URLFilterException (Nutch-specific) + assertEquals(ErrorType.URL, + ErrorTracker.categorize(new URLFilterException("URL filtered"))); + } + + // ========================================================================= + // Protocol Error Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeProtocolErrors() { + // Test ProtocolException (Nutch-specific) + assertEquals(ErrorType.PROTOCOL, + ErrorTracker.categorize(new ProtocolException("Protocol error"))); + + // Test ProtocolNotFound (Nutch-specific) + assertEquals(ErrorType.PROTOCOL, + ErrorTracker.categorize(new ProtocolNotFound("ftp"))); + } + + // ========================================================================= + // Parsing Error Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeParsingErrors() { + // Test ParseException (Nutch-specific) + assertEquals(ErrorType.PARSING, + ErrorTracker.categorize(new ParseException("Parse failed"))); + + // Test ParserNotFound (Nutch-specific) + assertEquals(ErrorType.PARSING, + ErrorTracker.categorize(new ParserNotFound("text/unknown"))); + + // Test SAXException + assertEquals(ErrorType.PARSING, + 
ErrorTracker.categorize(new SAXException("XML parse error"))); + } + + // ========================================================================= + // Scoring Error Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeScoringErrors() { + // Test ScoringFilterException (Nutch-specific) + assertEquals(ErrorType.SCORING, + ErrorTracker.categorize(new ScoringFilterException("Scoring failed"))); + } + + // ========================================================================= + // Indexing Error Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeIndexingErrors() { + // Test IndexingException (Nutch-specific) + assertEquals(ErrorType.INDEXING, + ErrorTracker.categorize(new IndexingException("Indexing failed"))); + } + + // ========================================================================= + // Other/Fallback Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeNullThrowable() { + // Null should return OTHER + assertEquals(ErrorType.OTHER, ErrorTracker.categorize(null)); + } + + @Test + public void testCategorizeGenericException() { + // Generic Exception should return OTHER + assertEquals(ErrorType.OTHER, + ErrorTracker.categorize(new Exception("Generic error"))); + + // RuntimeException should return OTHER + assertEquals(ErrorType.OTHER, + ErrorTracker.categorize(new RuntimeException("Runtime error"))); + } + + // ========================================================================= + // Cause Chain Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeCauseChain() { + // Exception with a network cause should be categorized as NETWORK + IOException cause = new IOException("Root cause"); + Exception wrapper = new 
Exception("Wrapper", cause); + assertEquals(ErrorType.NETWORK, ErrorTracker.categorize(wrapper)); + + // Exception with a timeout cause should be categorized as TIMEOUT + SocketTimeoutException timeoutCause = new SocketTimeoutException("Timeout"); + Exception timeoutWrapper = new Exception("Wrapper", timeoutCause); + assertEquals(ErrorType.TIMEOUT, ErrorTracker.categorize(timeoutWrapper)); + } + + @Test + public void testCategorizeNestedCauseChain() { + // Deep nested cause chain: RuntimeException -> Exception -> IOException + IOException rootCause = new IOException("Root cause"); + Exception middleWrapper = new Exception("Middle", rootCause); + RuntimeException outerWrapper = new RuntimeException("Outer", middleWrapper); + assertEquals(ErrorType.NETWORK, ErrorTracker.categorize(outerWrapper)); + + // Deep nested with Nutch-specific exception + ScoringFilterException scoringCause = new ScoringFilterException("Scoring error"); + Exception wrapper1 = new Exception("Wrapper 1", scoringCause); + Exception wrapper2 = new Exception("Wrapper 2", wrapper1); + assertEquals(ErrorType.SCORING, ErrorTracker.categorize(wrapper2)); + } + + // ========================================================================= + // Record Error Tests (Local Accumulation) + // ========================================================================= + + @Test + public void testRecordErrorByType() { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER); + + // Initially all counts should be 0 + assertEquals(0, tracker.getTotalCount()); + assertEquals(0, tracker.getCount(ErrorType.NETWORK)); + + // Record a NETWORK error + tracker.recordError(ErrorType.NETWORK); + assertEquals(1, tracker.getTotalCount()); + assertEquals(1, tracker.getCount(ErrorType.NETWORK)); + assertEquals(0, tracker.getCount(ErrorType.TIMEOUT)); + + // Record another NETWORK error + tracker.recordError(ErrorType.NETWORK); + assertEquals(2, tracker.getTotalCount()); + assertEquals(2, 
tracker.getCount(ErrorType.NETWORK)); + + // Record a TIMEOUT error + tracker.recordError(ErrorType.TIMEOUT); + assertEquals(3, tracker.getTotalCount()); + assertEquals(2, tracker.getCount(ErrorType.NETWORK)); + assertEquals(1, tracker.getCount(ErrorType.TIMEOUT)); + } + + @Test + public void testRecordErrorByThrowable() { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER); + + // Record an IOException (should be categorized as NETWORK) + tracker.recordError(new IOException("Test")); + assertEquals(1, tracker.getTotalCount()); + assertEquals(1, tracker.getCount(ErrorType.NETWORK)); + + // Record a SocketTimeoutException (should be categorized as TIMEOUT) + tracker.recordError(new SocketTimeoutException("Test")); + assertEquals(2, tracker.getTotalCount()); + assertEquals(1, tracker.getCount(ErrorType.TIMEOUT)); + + // Record a MalformedURLException (should be categorized as URL) + tracker.recordError(new MalformedURLException("Test")); + assertEquals(3, tracker.getTotalCount()); + assertEquals(1, tracker.getCount(ErrorType.URL)); + } + + // ========================================================================= + // Counter Name Mapping Tests + // ========================================================================= + + @Test + public void testGetCounterName() { + // Test counter name mapping + assertEquals(NutchMetrics.ERROR_NETWORK_TOTAL, + ErrorTracker.getCounterName(ErrorType.NETWORK)); + assertEquals(NutchMetrics.ERROR_PROTOCOL_TOTAL, + ErrorTracker.getCounterName(ErrorType.PROTOCOL)); + assertEquals(NutchMetrics.ERROR_PARSING_TOTAL, + ErrorTracker.getCounterName(ErrorType.PARSING)); + assertEquals(NutchMetrics.ERROR_URL_TOTAL, + ErrorTracker.getCounterName(ErrorType.URL)); + assertEquals(NutchMetrics.ERROR_SCORING_TOTAL, + ErrorTracker.getCounterName(ErrorType.SCORING)); + assertEquals(NutchMetrics.ERROR_INDEXING_TOTAL, + ErrorTracker.getCounterName(ErrorType.INDEXING)); + assertEquals(NutchMetrics.ERROR_TIMEOUT_TOTAL, + 
ErrorTracker.getCounterName(ErrorType.TIMEOUT)); + assertEquals(NutchMetrics.ERROR_OTHER_TOTAL, + ErrorTracker.getCounterName(ErrorType.OTHER)); + } + + @Test + public void testGetCounterNameForThrowable() { + // Test getting counter name directly from throwable + assertEquals(NutchMetrics.ERROR_NETWORK_TOTAL, + ErrorTracker.getCounterName(new IOException("Test"))); + assertEquals(NutchMetrics.ERROR_TIMEOUT_TOTAL, + ErrorTracker.getCounterName(new SocketTimeoutException("Test"))); + assertEquals(NutchMetrics.ERROR_URL_TOTAL, + ErrorTracker.getCounterName(new MalformedURLException("Test"))); + assertEquals(NutchMetrics.ERROR_OTHER_TOTAL, + ErrorTracker.getCounterName(new RuntimeException("Test"))); + + // Test Nutch-specific exceptions + assertEquals(NutchMetrics.ERROR_PROTOCOL_TOTAL, + ErrorTracker.getCounterName(new ProtocolException("Test"))); + assertEquals(NutchMetrics.ERROR_PARSING_TOTAL, + ErrorTracker.getCounterName(new ParseException("Test"))); + assertEquals(NutchMetrics.ERROR_SCORING_TOTAL, + ErrorTracker.getCounterName(new ScoringFilterException("Test"))); + assertEquals(NutchMetrics.ERROR_INDEXING_TOTAL, + ErrorTracker.getCounterName(new IndexingException("Test"))); + } + + // ========================================================================= + // Hadoop Context Integration Tests (Using Mocks) + // ========================================================================= + + @Test + public void testConstructorWithContext() { + // Create ErrorTracker with context - should initialize counters + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER, mockContext); + + // Verify counters were requested from context + // Total counter + 8 error type counters = 9 calls + verify(mockContext, atLeast(9)).getCounter(anyString(), anyString()); + } + + @Test + public void testInitCounters() { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER); + + // Initialize counters + tracker.initCounters(mockContext); + + // Verify 
counters were requested + verify(mockContext).getCounter(NutchMetrics.GROUP_FETCHER, NutchMetrics.ERROR_TOTAL); + verify(mockContext).getCounter(NutchMetrics.GROUP_FETCHER, NutchMetrics.ERROR_NETWORK_TOTAL); + verify(mockContext).getCounter(NutchMetrics.GROUP_FETCHER, NutchMetrics.ERROR_TIMEOUT_TOTAL); + } + + @Test + public void testIncrementCountersWithType() { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER, mockContext); + + // Increment counters directly + tracker.incrementCounters(ErrorType.NETWORK); + + // Verify counter was incremented (total + specific type) + verify(mockCounter, times(2)).increment(1); + } + + @Test + public void testIncrementCountersWithThrowable() { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER, mockContext); + + // Increment counters with throwable + tracker.incrementCounters(new IOException("Test")); + + // Verify counter was incremented (total + NETWORK type) + verify(mockCounter, times(2)).increment(1); + } + + @Test + public void testIncrementCountersWithoutInit() { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER); + + // Should throw IllegalStateException when counters not initialized + assertThrows(IllegalStateException.class, () -> { + tracker.incrementCounters(ErrorType.NETWORK); + }); + } + + @Test + public void testEmitCounters() { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER); + + // Record some errors locally + tracker.recordError(ErrorType.NETWORK); + tracker.recordError(ErrorType.NETWORK); + tracker.recordError(ErrorType.TIMEOUT); + + // Emit counters (without cached counters - uses fallback) + tracker.emitCounters(mockContext); + + // Verify counters were requested and incremented + verify(mockContext).getCounter(NutchMetrics.GROUP_FETCHER, NutchMetrics.ERROR_TOTAL); + verify(mockContext).getCounter(NutchMetrics.GROUP_FETCHER, NutchMetrics.ERROR_NETWORK_TOTAL); + verify(mockContext).getCounter(NutchMetrics.GROUP_FETCHER, 
NutchMetrics.ERROR_TIMEOUT_TOTAL); + } + + @Test + public void testEmitCountersWithCachedCounters() { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER, mockContext); + + // Reset mock to clear constructor calls + reset(mockCounter); + + // Record some errors locally + tracker.recordError(ErrorType.NETWORK); + tracker.recordError(ErrorType.NETWORK); + tracker.recordError(ErrorType.TIMEOUT); + + // Emit counters (with cached counters) + tracker.emitCounters(mockContext); + + // Verify cached counters were used (increment called with accumulated values) + verify(mockCounter).increment(3L); // total count + verify(mockCounter).increment(2L); // NETWORK count + verify(mockCounter).increment(1L); // TIMEOUT count + } + + // ========================================================================= + // Thread Safety Tests + // ========================================================================= + + @Test + public void testThreadSafety() throws InterruptedException { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER); + + // Create multiple threads that record errors concurrently + Thread[] threads = new Thread[10]; + for (int i = 0; i < threads.length; i++) { + threads[i] = new Thread(() -> { + for (int j = 0; j < 100; j++) { + tracker.recordError(ErrorType.NETWORK); + } + }); + } + + // Start all threads + for (Thread thread : threads) { + thread.start(); + } + + // Wait for all threads to complete + for (Thread thread : threads) { + thread.join(); + } + + // Verify counts + assertEquals(1000, tracker.getTotalCount()); + assertEquals(1000, tracker.getCount(ErrorType.NETWORK)); + } + + @Test + public void testThreadSafetyMixedErrorTypes() throws InterruptedException { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER); + + // Create threads that record different error types concurrently + Thread networkThread = new Thread(() -> { + for (int i = 0; i < 500; i++) { + tracker.recordError(ErrorType.NETWORK); + } + 
}); + + Thread timeoutThread = new Thread(() -> { + for (int i = 0; i < 300; i++) { + tracker.recordError(ErrorType.TIMEOUT); + } + }); + + Thread urlThread = new Thread(() -> { + for (int i = 0; i < 200; i++) { + tracker.recordError(ErrorType.URL); + } + }); + + networkThread.start(); + timeoutThread.start(); + urlThread.start(); + + networkThread.join(); + timeoutThread.join(); + urlThread.join(); + + // Verify counts + assertEquals(1000, tracker.getTotalCount()); + assertEquals(500, tracker.getCount(ErrorType.NETWORK)); + assertEquals(300, tracker.getCount(ErrorType.TIMEOUT)); + assertEquals(200, tracker.getCount(ErrorType.URL)); + } +} diff --git a/src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java b/src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java new file mode 100644 index 0000000000..9469b168fb --- /dev/null +++ b/src/test/org/apache/nutch/protocol/AbstractProtocolPluginIT.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Nutch; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.testcontainers.junit.jupiter.Testcontainers; + +/** + * Abstract base for Protocol plugin integration tests using Testcontainers. + * Provides common test logic for fetching URLs and verifying status codes. + * + *

Subclasses declare a static {@code @Container} field for the server + * container, implement {@link ProtocolPluginIntegrationTest}, and may add + * protocol-specific tests (e.g., redirect handling, authentication). + */ +@Testcontainers(disabledWithoutDocker = true) +public abstract class AbstractProtocolPluginIT implements ProtocolPluginIntegrationTest { + + /** Sets up the protocol plugin under test before each test method. */ + @BeforeEach + void setUp() throws Exception { + setUpProtocol(); + } + + /** Tears down the protocol plugin under test after each test method. */ + @AfterEach + void tearDown() throws Exception { + tearDownProtocol(); + } + + /** Fetch the test URL and assert an HTTP 200 response. */ + @Test + void testFetch200() throws Exception { + CrawlDatum datum = new CrawlDatum(); + ProtocolOutput output = getProtocol() + .getProtocolOutput(new Text(getTestUrl()), datum); + assertNotNull(output, "ProtocolOutput must not be null"); + assertEquals(200, getHttpStatusCode(datum), + "Expected HTTP 200 for " + getTestUrl()); + verifyFetchedContent(output, datum); + } + + /** Fetch a non-existent path and assert an HTTP 404 response. */ + @Test + void testFetch404() throws Exception { + String url = get404Url(); + CrawlDatum datum = new CrawlDatum(); + getProtocol().getProtocolOutput(new Text(url), datum); + assertEquals(404, getHttpStatusCode(datum), + "Expected HTTP 404 for " + url); + } + + /** + * Returns a URL expected to produce a 404. Default appends the fixed path + * segment {@code nonexistent-path-xyz} to {@link #getTestUrl()}, inserting a + * {@code /} separator when the test URL does not already end with one; + * override if the server needs a specific path. + */ + protected String get404Url() { + String base = getTestUrl(); + String separator = base.endsWith("/") ? "" : "/"; + return base + separator + "nonexistent-path-xyz"; + } + + /** + * Reads the HTTP status code stored in the CrawlDatum metadata under + * {@link Nutch#PROTOCOL_STATUS_CODE_KEY} and parses it as an integer. + * + * @param datum the datum that was passed to the protocol's fetch call + * @return the parsed status code, or -1 if no status code was stored + * @throws NumberFormatException if the stored value is not an integer 
+ */ + protected static int getHttpStatusCode(CrawlDatum datum) { + /* Single lookup: an absent key yields null, so no containsKey() probe is needed */ + Object statusCode = datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY); + if (statusCode == null) { + return -1; + } + return Integer.parseInt(statusCode.toString()); + } +} diff --git a/src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java b/src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java new file mode 100644 index 0000000000..b3778077d9 --- /dev/null +++ b/src/test/org/apache/nutch/protocol/ProtocolPluginIntegrationTest.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol; + +import org.apache.nutch.crawl.CrawlDatum; + +/** + * Contract for Protocol plugin integration tests. Implementations run against + * real server backends (via Testcontainers or embedded servers). + */ +public interface ProtocolPluginIntegrationTest { + + /** Set up the protocol plugin and its backing server before tests. */ + void setUpProtocol() throws Exception; + + /** Shut down the protocol plugin after tests. */ + void tearDownProtocol() throws Exception; + + /** The Protocol under test. 
*/ + Protocol getProtocol(); + + /** + * A URL that the backing server will serve with a 200/success response. + * Must point into the container or embedded server started by this test. + */ + String getTestUrl(); + + /** + * Optional extra verification after a successful fetch. + * Default is a no-op; override to inspect content, headers, etc. + */ + default void verifyFetchedContent(ProtocolOutput output, CrawlDatum datum) + throws Exception { + // no-op + } +} diff --git a/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java b/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java index fee72b65a5..8124fe20b1 100644 --- a/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java +++ b/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java @@ -17,14 +17,19 @@ package org.apache.nutch.tools; import java.io.File; +import java.lang.reflect.Field; import java.nio.file.Files; import java.util.Collection; import org.apache.commons.io.FileUtils; import org.apache.commons.io.filefilter.FileFilterUtils; +import org.apache.nutch.metrics.ErrorTracker; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; /** * @@ -106,6 +111,22 @@ public void testDump() throws Exception { } + @Test + public void testDumpWithNoSegmentDirectoriesRecordsOtherError() throws Exception { + File emptySegmentDir = Files.createTempDirectory("empty-segments").toFile(); + File outputDir = Files.createTempDirectory("dump-output").toFile(); + + ErrorTracker mockErrorTracker = mock(ErrorTracker.class); + CommonCrawlDataDumper dumper = new CommonCrawlDataDumper(); + Field errorTrackerField = CommonCrawlDataDumper.class.getDeclaredField("errorTracker"); + errorTrackerField.setAccessible(true); + errorTrackerField.set(dumper, mockErrorTracker); + + dumper.dump(outputDir, emptySegmentDir, null, false, null, false, 
"", false); + + verify(mockErrorTracker, times(1)).recordError(ErrorTracker.ErrorType.OTHER); + } + private boolean hasFile(String fileName, Collection files) { for (File f : files) { if (f.getName().equals(fileName)) { diff --git a/src/test/org/apache/nutch/util/TestJexlUtil.java b/src/test/org/apache/nutch/util/TestJexlUtil.java new file mode 100644 index 0000000000..221fffea22 --- /dev/null +++ b/src/test/org/apache/nutch/util/TestJexlUtil.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.util; + +import org.apache.commons.jexl3.JexlScript; +import org.apache.commons.jexl3.MapContext; +import org.apache.hadoop.conf.Configuration; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Unit tests for {@link JexlUtil} sandboxing. 
+ */ +public class TestJexlUtil { + + @Test + public void testSandboxAllowsDocFieldCompare() throws Exception { + JexlScript script = JexlUtil.parseExpression("doc.lang == 'en'"); + assertNotNull(script); + MapContext doc = new MapContext(); + doc.set("lang", "en"); + MapContext root = new MapContext(); + root.set("doc", doc); + assertTrue(Boolean.TRUE.equals(script.execute(root))); + } + + @Test + public void testSandboxAllowsScoreCompare() throws Exception { + JexlScript script = JexlUtil.parseExpression("score > 0.5"); + assertNotNull(script); + MapContext root = new MapContext(); + root.set("score", 0.9f); + assertTrue(Boolean.TRUE.equals(script.execute(root))); + } + + @Test + public void testNewInstanceIoBlocked() { + assertNull(JexlUtil.parseExpression("new java.io.File('/')")); + } + + @Test + public void testNewInstanceFileOutputStreamBlocked() { + assertNull(JexlUtil.parseExpression( + "new java.io.FileOutputStream('/tmp/nutch-jexl-poc')")); + } + + @Test + public void testDisableSandboxAllowsNewExpressionParse() { + Configuration conf = new Configuration(); + conf.setBoolean(JexlUtil.DISABLE_SANDBOX_KEY, true); + JexlScript script = JexlUtil.parseExpression(conf, + "new java.io.File('/')"); + assertNotNull(script); + } + + @Test + public void testArithmeticAllowed() throws Exception { + JexlScript script = JexlUtil.parseExpression("2 * 3 + 1 == 7"); + assertNotNull(script); + assertTrue(Boolean.TRUE.equals(script.execute(new MapContext()))); + } + + @Test + public void testStringMethodsAllowed() throws Exception { + JexlScript script = JexlUtil.parseExpression( + "url.startsWith('http://')"); + assertNotNull(script); + MapContext root = new MapContext(); + root.set("url", "http://example.org/"); + assertTrue(Boolean.TRUE.equals(script.execute(root))); + } + + @Test + public void testDateRewriteStillParses() { + JexlScript script = JexlUtil.parseExpression( + "fetchTime > 2016-03-20T00:00:00Z"); + assertNotNull(script); + } + + @Test + public void 
testNullExpression() { + assertNull(JexlUtil.parseExpression(null)); + assertNull(JexlUtil.parseExpression(new Configuration(), null)); + } + + @Test + public void testInvalidSyntaxReturnsNull() { + assertNull(JexlUtil.parseExpression("doc.lang=<>:='en'")); + } + + @Test + public void testListSize() throws Exception { + JexlScript script = JexlUtil.parseExpression("doc.tags.size() == 2"); + assertNotNull(script); + MapContext doc = new MapContext(); + java.util.List tags = new java.util.ArrayList<>(); + tags.add("a"); + tags.add("b"); + doc.set("tags", tags); + MapContext root = new MapContext(); + root.set("doc", doc); + assertTrue(Boolean.TRUE.equals(script.execute(root))); + } + + @Test + public void testGeneratorStyleMetadata() throws Exception { + JexlScript script = JexlUtil.parseExpression( + "warc_import_time > 0 && score > 0"); + assertNotNull(script); + MapContext root = new MapContext(); + root.set("warc_import_time", 1); + root.set("score", 1.0f); + assertTrue(Boolean.TRUE.equals(script.execute(root))); + } + + @Test + public void testEqualsIgnoreCase() throws Exception { + JexlScript script = JexlUtil.parseExpression( + "status.equalsIgnoreCase('FETCHED')"); + assertNotNull(script); + MapContext root = new MapContext(); + root.set("status", "fetched"); + assertTrue(Boolean.TRUE.equals(script.execute(root))); + } + + @Test + public void testRegex() throws Exception { + JexlScript script = JexlUtil.parseExpression( + "url =~ 'https?://.*\\.example\\.org/.*'"); + assertNotNull(script); + MapContext root = new MapContext(); + root.set("url", "http://foo.example.org/bar"); + assertTrue(Boolean.TRUE.equals(script.execute(root))); + } + + @Test + public void testTernary() throws Exception { + JexlScript script = JexlUtil.parseExpression("true ? 
1 : 0"); + assertNotNull(script); + assertEquals(1, script.execute(new MapContext())); + } +} diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java index 59e486d696..573af66430 100644 --- a/src/test/org/apache/nutch/util/TestURLUtil.java +++ b/src/test/org/apache/nutch/util/TestURLUtil.java @@ -16,11 +16,18 @@ */ package org.apache.nutch.util; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.net.MalformedURLException; import java.net.URL; import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.*; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; /** Test class for URLUtil */ public class TestURLUtil { @@ -78,8 +85,13 @@ public void testGetDomainName() throws Exception { assertEquals("example.2000.hu", URLUtil.getDomainName(url)); // test non-ascii - url = new URL("http://www.example.商業.tw"); - assertEquals("example.商業.tw", URLUtil.getDomainName(url)); + url = new URL("http://www.example.flå.no"); + assertEquals("example.flå.no", URLUtil.getDomainName(url)); + url = new URL("http://www.example.栃木.jp"); + assertEquals("example.栃木.jp", URLUtil.getDomainName(url)); + // broken by https://github.com/publicsuffix/list/commit/408a7b0bdec993884865baaa2f0d14cc9a060885 + // url = new URL("http://www.example.商業.tw"); + // assertEquals("example.商業.tw", URLUtil.getDomainName(url)); // test URL without host/authority url = new URL("file:/path/index.html"); @@ -141,9 +153,16 @@ public void testGetDomainSuffix() throws Exception { url = new URL("http://www.example.2000.hu"); assertEquals("2000.hu", URLUtil.getDomainSuffix(url)); - // test non-ascii - url = new 
URL("http://www.example.商業.tw"); - assertEquals("xn--czrw28b.tw", URLUtil.getDomainSuffix(url)); + // test non-ASCII + url = new URL("https://www.taiuru.māori.nz/"); + assertEquals("xn--mori-qsa.nz", URLUtil.getDomainSuffix(url)); + url = new URL("http://www.example.flå.no"); + assertEquals("xn--fl-zia.no", URLUtil.getDomainSuffix(url)); + url = new URL("http://www.example.栃木.jp"); + assertEquals("xn--4pvxs.jp", URLUtil.getDomainSuffix(url)); + // broken by https://github.com/publicsuffix/list/commit/408a7b0bdec993884865baaa2f0d14cc9a060885 + // url = new URL("http://www.example.商業.tw"); + // assertEquals("xn--czrw28b.tw", URLUtil.getDomainSuffix(url)); } @Test @@ -300,7 +319,20 @@ public void testToUNICODE() throws Exception { assertEquals("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1", URLUtil .toUNICODE("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1")); - + // do not fail on characters not in Unicode 3.2 + assertEquals("https://example.ᬩᬮᬶ.id/", + URLUtil.toUNICODE("https://example.xn--9tfky.id/")); + // IDNA2008 + assertEquals("http://straße.de/", + URLUtil.toUNICODE("http://xn--strae-oqa.de/")); + // host names with uppercase characters + assertEquals("https://googie.com/", + URLUtil.toUNICODE("https://googIe.com/")); + assertEquals("https://googie.com/", URLUtil.toASCII("https://googIe.com/")); + assertEquals("https://xn--90ax2c.xn--p1ai/", + URLUtil.toASCII("https://нЭб.РФ/")); + assertEquals("https://нэб.рф/", + URLUtil.toUNICODE("https://Xn--90Ax2c.xN--P1ai/")); } @Test @@ -312,6 +344,106 @@ public void testToASCII() throws Exception { assertEquals("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1", URLUtil .toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1")); + // IDNA2003 + // assertEquals("http://strasse.de/", + // URLUtil.toASCII("http://straße.de/")); + // do not fail on characters not in Unicode 3.2 + assertEquals("https://example.xn--9tfky.id/", + 
URLUtil.toASCII("https://example.ᬩᬮᬶ.id/")); + // IDNA2008 + assertEquals("http://xn--strae-oqa.de/", + URLUtil.toASCII("http://straße.de/")); + } + + @ParameterizedTest + @CsvSource({ // + "www.xn--evir-zoa.com,www.çevir.com,IDNA2003,true", // + "xn--uni-tbingen-xhb.de,uni-tübingen.de,IDNA2003,true", // + "example.xn--9tfky.id,example.ᬩᬮᬶ.id,IDNA2008,true", // + // Test examples from whatwg-url + "xn--53h.example,☕.example,IDNA2008,true", // + "xn--0ca.xn--ssa73l,à.א̈,IDNA2008,true", // + "xn--mgba3gch31f060k.com,\u0646\u0627\u0645\u0647\u200c\u0627\u06cc.com,IDNA2008,true", // + /* Note: IDNA2008 and IDNA2003 deviate for the following examples, + * cf. https://www.unicode.org/reports/tr46/#IDNA2003-Section */ + "xn--strae-oqa.de,straße.de,IDNA2008,true", // + "strasse.de,straße.de,IDNA2003,false", // + "strasse.de,strasse.de,IDNA2003,true", // + "xn--fa-hia.de,faß.de,IDNA2008,true", // + "fass.de,faß.de,IDNA2003,false", // + "fass.de,fass.de,IDNA2003,true", // + "xn--nxasmm1c.com,βόλος.com,IDNA2008,true", // + "xn--nxasmq6b.com,βόλος.com,IDNA2003,false", // + "xn--nxasmq6b.com,βόλοσ.com,IDNA2003,true", // + "xn--10cl1a0b660p.com,ශ්‍රී.com,IDNA2008,true", // + "xn--10cl1a0b.com,ශ්‍රී.com,IDNA2003,false", // + "xn--10cl1a0b.com,ශ්රී.com,IDNA2003,true", // + "xn--mgba3gch31f060k.com,نامه‌ای.com,IDNA2008,true", // + "xn--mgba3gch31f.com,نامه‌ای.com,IDNA2003,false", // + "xn--mgba3gch31f.com,نامهای.com,IDNA2003,true", // + // mixed lowercase/uppercase: no round trip conversion + "xn--bb-eka.at,ÖBB.at,IDNA2003,false", // + "xn--bb-eka.at,öbb.at,IDNA2003,true", // + // mixed encoding (Punycode and Unicode) + "xn--p1ai.xn--p1ai,рф.xn--p1ai,IDNA2003,false", // + "xn--p1ai.xn--p1ai,xn--p1ai.рф,IDNA2003,false", // + // percent-encoding is not supported + // "xn--p1ai.xn--p1ai,xn--p1ai.%D1%80%D1%84,IDNA2003,false", // + }) + public final void testConvertHost(String ascii, String unicode, String type, + boolean roundTrip) throws Exception { + System.out.println(ascii + " <> " 
+ unicode); + if ("IDNA2008".equals(type)) { + assertEquals(ascii, URLUtil.convertIDNA2008(unicode, true)); + assertEquals(unicode, URLUtil.convertIDNA2008(ascii, false)); + try { + assertNotNull(URLUtil.convertIDNA2003(unicode, true, false)); + } catch (MalformedURLException e) { + /* + * Ok. A IDNA2008 input may raise an exception when using the IDNA2003 + * method + */ + } + } else if ("IDNA2003".equals(type)) { + assertEquals(ascii, URLUtil.convertIDNA2003(unicode, true, true)); + assertEquals(ascii, URLUtil.convertIDNA2003(unicode, true, false)); + if (roundTrip) { + assertEquals(unicode, URLUtil.convertIDNA2003(ascii, false, true)); + assertEquals(unicode, URLUtil.convertIDNA2003(ascii, false, false)); + } + } + } + + @Test + public final void testConvertHostInvalid() { + // broken Punycode + assertDoesNotThrow(() -> assertEquals("xn--xn--bss-7z6ccid.com", + URLUtil.convertIDNA2003("xn--xn--bss-7z6ccid.com", false, true))); + + // invalid Punycode + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("xn--0.pt", false)); + + // IDNA2003 not allowing characters not in Unicode 3.2 + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2003("☕.example", true, true)); + assertDoesNotThrow(() -> assertEquals("xn--53h.example", + URLUtil.convertIDNA2003("xn--53h.example", false, true))); + + // IDNA2008 invalid, + // cf. https://www.unicode.org/reports/tr46/#Implementation_Notes + // cf. 
https://www.unicode.org/Public/17.0.0/idna/IdnaTestV2.txt + // disallowed character: ⒈ (U+2488 - DIGIT ONE FULL STOP) + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("\u2488com", true)); + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("xn--acom-0w1b", false)); + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("xn--xn--a--gua.pt", false)); + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("xn--a-ä.pt", false)); + assertThrows(MalformedURLException.class, + () -> URLUtil.convertIDNA2008("xn--a-ä.pt", true)); } @Test diff --git a/src/test/org/commoncrawl/util/TestCanonicalLinkDetector.java b/src/test/org/commoncrawl/util/TestCanonicalLinkDetector.java new file mode 100644 index 0000000000..60843e5a6f --- /dev/null +++ b/src/test/org/commoncrawl/util/TestCanonicalLinkDetector.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.commoncrawl.util; + +import static org.junit.jupiter.api.Assertions.*; + +import java.net.URI; +import java.nio.charset.StandardCharsets; +import java.util.List; + +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +class TestCanonicalLinkDetector { + + @ParameterizedTest + @ValueSource(strings = { + "", // + "", // + "", // + "", // + "", // + "", // + "", // + "", // + "", // + }) + void testDetectHTML(String htmlSnippet) { + String canonicalLink = "https://www.example.org/canonical/"; + List canonicalLinks = CanonicalLinkDetector + .detectCanonicalLinksHTML(htmlSnippet.getBytes(StandardCharsets.UTF_8), + 1024, 1); + assertFalse(canonicalLinks.isEmpty()); + URI baseUri = URI.create("https://www.example.org/"); + assertEquals(canonicalLink, + baseUri.resolve(canonicalLinks.get(0)).toASCIIString()); + } + + @ParameterizedTest + @ValueSource(strings = { + "; rel=\"canonical\"", // + "; rel=\"canonical\",; rel=\"shortlink\"", // + "; rel='canonical'", // + "; rel=\"canonical\",; rel=\"shortlink\",; rel=\"shortcut icon\"", // + }) + void testDetectHTTP(String httpHeader) { + String canonicalLink = "https://www.example.org/canonical/"; + String[] linkHeaderValues = List.of(httpHeader).toArray(new String[0]); + List canonicalLinks = CanonicalLinkDetector + .detectCanonicalLinksHttpHeader(linkHeaderValues, 1); + assertFalse(canonicalLinks.isEmpty()); + assertEquals(canonicalLink, canonicalLinks.get(0)); + } +} diff --git a/src/test/org/commoncrawl/util/TestLanguageDetector.java b/src/test/org/commoncrawl/util/TestLanguageDetector.java new file mode 100644 index 0000000000..970d74ea09 --- /dev/null +++ b/src/test/org/commoncrawl/util/TestLanguageDetector.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.commoncrawl.util; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +import org.apache.nutch.protocol.Content; +import org.junit.jupiter.api.Test; + +public class TestLanguageDetector { + + private String[][] languageData = { // test data from https://tatoeba.org/ + { "zho", "我不知道應該說什麼才好。", "我只是不知道應該說什麼而已…… 我法语说得不够好！" }, // + { "deu", "Ich finde keine Worte.", + "Ich weiß einfach nicht, was ich sagen soll. Mein Französisch ist nicht gut genug." }, // + { "eng", "I have no words.", + "I just don't know what to say.
I simply don't know what to tell... I don't speak French well enough!" }, // + { "fra", "J'en perds mes mots.", + "Je ne sais simplement pas quoi dire... Je ne parle pas assez bien français !" }, // + { "jpn", "何と言ったら良いか分かりません。", "何と言ったらいいか・・・。私はフランス語がそんなにきちんとは話せない。" }, // + { "nld", "Ik heb er geen woorden voor.", + "Ik weet gewoon niet wat ik moet zeggen..." }, // + { "rus", "У меня нет слов.", + "Просто не знаю, что и сказать... Я не говорю по-французски настолько хорошо." }, // + { "spa", "No tengo palabras.", + "Simplemente no sé qué decir... ¡No hablo francés lo suficientemente bien!" }, // + { "vie", "Tôi hết lời để nói.", "Tôi không biết nên nói gì cả..." }, // + }; + + protected void testHtml(LanguageDetector detector, + String language, String title, String body, Charset charset, + boolean addCharsetHint, boolean addLanguageHint) { + String headerStart = "\n\n"; + String headerEnd = "\n\n

"; + String footer = "

\n"; + URI uri = URI.create("http://www.example.com/"); + + Content content = new Content(); + content.setContentType("text/html"); + String doc = headerStart + "" + title + ""; + if (addLanguageHint) { + doc = doc.replace("\"\"", "\"" + language + "\""); + } + if (addCharsetHint) { + doc += "\n"; + } + doc += headerEnd + body + footer; + content.setContent(doc.getBytes(charset)); + LanguageDetector.Result res = detector.detectLanguage(uri, content); + assertEquals(charset, res.charset, "charset detection failed for " + language); + org.commoncrawl.langdetect.cld2.Result lr = res.languages; + assertEquals(language, lr.getLanguageCodeISO639_3(), "language detection failed for " + language); + } + + @Test + public void testLanguageDetector() throws IOException, URISyntaxException { + LanguageDetector langDetect = new LanguageDetector(); + langDetect.setBestEffort(true); + for (String[] data : languageData) { + testHtml(langDetect, data[0], data[1], data[2], StandardCharsets.UTF_8, true, true); + } + } + + @Test + public void testCharsetDetector() throws IOException { + LanguageDetector langDetect = new LanguageDetector(); + langDetect.setBestEffort(true); + for (String[] data : languageData) { + Charset charset = StandardCharsets.UTF_8; + boolean needHint = false; + switch (data[0]) { + case "eng": + charset = StandardCharsets.US_ASCII; + needHint = true; // subset of UTF-8 or ISO-8859-* + break; + case "fra": + charset = StandardCharsets.ISO_8859_1; + break; + case "deu": + case "nld": + charset = StandardCharsets.ISO_8859_1; + break; + case "spa": + needHint = true; // subset of UTF-8 or ISO-8859-* + charset = Charset.forName("x-MacRoman"); + break; + case "rus": + charset = Charset.forName("KOI8-R"); + break; + case "jpn": + charset = Charset.forName("SHIFT_JIS"); + break; + case "zho": + charset = Charset.forName("GB2312"); + needHint = true; // could be also GB18030 + break; + } + testHtml(langDetect, data[0], data[1], data[2], charset, needHint, true); + } + } 
+} diff --git a/src/test/org/commoncrawl/util/TestWarcRecordWriter.java b/src/test/org/commoncrawl/util/TestWarcRecordWriter.java new file mode 100644 index 0000000000..f3c8384c00 --- /dev/null +++ b/src/test/org/commoncrawl/util/TestWarcRecordWriter.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.commoncrawl.util; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Arrays; + +import org.junit.jupiter.api.Test; + +public class TestWarcRecordWriter { + + public final static String statusLine1 = "HTTP/1.1 200 OK"; + public final static String testHeaders1[] = { // + "Content-Type", "text/html", // + "Accept-Ranges", "bytes", // + "Content-Encoding", "gzip", // + "Vary", "Accept-Encoding", "Server", + "Apache/2.0.63 (Unix) PHP/4.4.7 mod_ssl/2.0.63 OpenSSL/0.9.7e mod_fastcgi/2.4.2 DAV/2 SVN/1.4.2", + "Last-Modified", "Thu, 15 Jan 2009 00:02:29 GMT", "ETag", + "\"1262d9e-3ffa-2c19af40\"", // + "Date", "Mon, 26 Jan 2009 10:00:40 GMT", // + "Content-Length", "16378", // + "Connection", "close" }; + public final static String testHeaderString1; + static { + StringBuilder headers = new StringBuilder(); + headers.append(statusLine1).append(WarcRecordWriter.CRLF); + for (int i = 0; i < testHeaders1.length; i += 2) { + headers.append(testHeaders1[i]).append(WarcRecordWriter.COLONSP); + headers.append(testHeaders1[i+1]).append(WarcRecordWriter.CRLF); + } + headers.append(WarcRecordWriter.CRLF); + testHeaderString1 = headers.toString(); + } + + @Test + public void testFormatHttpHeaders() { + assertEquals(testHeaderString1, WarcRecordWriter + .formatHttpHeaders(statusLine1, Arrays.asList(testHeaders1)), + "Formatting HTTP header failed"); + } + + @Test + public void testFixHttpHeaders() { + StringBuilder headers = new StringBuilder(); + headers.append(statusLine1).append(WarcRecordWriter.CRLF); + for (int i = 0; i < testHeaders1.length; i += 2) { + headers.append(testHeaders1[i]).append(WarcRecordWriter.COLONSP); + headers.append(testHeaders1[i+1]).append(WarcRecordWriter.CRLF); + } + String headerStr = WarcRecordWriter.formatHttpHeaders(statusLine1, + Arrays.asList(testHeaders1)); + String fixed = 
WarcRecordWriter.fixHttpHeaders(headerStr, 50000); + assertFalse(fixed.contains("\r\nContent-Encoding:"), + "Content-Encoding should be removed"); + assertTrue(fixed.contains("\r\nX-Crawler-Content-Encoding:"), + "Prefixed original Content-Encoding not found"); +// assertFalse(fixed.contains("\r\nTransfer-Encoding:"), +// "Transfer-Encoding should be removed"); + assertFalse(fixed.contains("\r\nContent-Length: 16378\r\n"), + "Content-Length to be replaced"); + assertTrue(fixed.contains("\r\nX-Crawler-Content-Length: 16378\r\n"), + "Prefixed original Content-Length not found"); + assertTrue(fixed.contains("\r\nContent-Length: 50000\r\n"), + "Correct/fixed Content-Length not found"); + + fixed = WarcRecordWriter + .fixHttpHeaders(headerStr.replaceAll("[\r\n]+$", ""), 50000); + assertTrue(fixed.endsWith("\r\n\r\n"), + "No trailing \\r\\n\\r\\n in HTTP headers"); + } + + @Test + public void testCanonicalizeIPv6() { + assertEquals("::", WarcRecordWriter.canonicalizeIP("0:0:0:0:0:0:0:0")); + assertEquals("::1", WarcRecordWriter.canonicalizeIP("0:0:0:0:0:0:0:1")); + assertEquals("2001:db8::1", + WarcRecordWriter.canonicalizeIP("2001:db8:0:0:0:0:0:1")); + assertEquals("2001:db8:1:1:1:1:1:1", + WarcRecordWriter.canonicalizeIP("2001:db8:1:1:1:1:1:1")); + assertEquals("2001:0:0:1::1", + WarcRecordWriter.canonicalizeIP("2001:0:0:1:0:0:0:1")); + assertEquals("2001:db8:f::1", + WarcRecordWriter.canonicalizeIP("2001:db8:000f:0:0:0:0:1")); + assertEquals("2001:db8::1:0:0:1", WarcRecordWriter + .canonicalizeIP("2001:0db8:0000:0000:0001:0000:0000:0001")); + assertEquals("ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", WarcRecordWriter + .canonicalizeIP("ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff")); + assertEquals("2001:200f::1", + WarcRecordWriter.canonicalizeIP("2001:200f:0:0:0:0:0:1")); + // https://datatracker.ietf.org/doc/html/rfc5952#section-4.2.2 + // "The symbol "::" MUST NOT be used to shorten just one 16-bit 0 field." 
+ assertEquals("2001:0:3:4:5:6:7:8", + WarcRecordWriter.canonicalizeIP("2001:0:3:4:5:6:7:8")); + // shorten first of same-length consecutive 0 fields, also in initial + // position + assertEquals("::4:0:0:0:ffff", + WarcRecordWriter.canonicalizeIP("0:0:0:4:0:0:0:ffff")); + } +} diff --git a/src/test/org/commoncrawl/util/TestWarcWriter.java b/src/test/org/commoncrawl/util/TestWarcWriter.java new file mode 100644 index 0000000000..4f7344010d --- /dev/null +++ b/src/test/org/commoncrawl/util/TestWarcWriter.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.commoncrawl.util; + +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.protocol.Content; +import org.commoncrawl.util.test.SegmenterRecordReader; +import org.junit.jupiter.api.Test; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.net.URI; +import java.util.Date; +import java.util.zip.GZIPInputStream; + +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestWarcWriter { + + @Test + public void testWriteRevisitRecordContentType() throws Exception { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + WarcWriter writer = new WarcWriter(bos); + + File segmentDir = new File(System.getProperty("test.build.data", "."), "test-segments/20260224170658-revisit"); + assertNotNull(segmentDir, "Missing segment resource"); + String segmentPath = segmentDir.getAbsolutePath(); + String url = "https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"; + + Content content = SegmenterRecordReader.retrieveContent(segmentPath, url); + URI targetUri = new URI(content.getUrl()); + + Metadata metadata = content.getMetadata(); + String ip = content.getMetadata().get("_ip_"); + int httpStatusCode = 304; + + Date date = HttpDateFormat.toDate(metadata.get("date")); + URI warcinfoId = writer.getRecordId(); + URI relatedId = writer.getRecordId(); + String warcProfile = WarcWriter.PROFILE_REVISIT_IDENTICAL_DIGEST; + Date refersToDate = new Date(System.currentTimeMillis() - 3600000); + String payloadDigest = "sha1:abc123"; + String blockDigest = "sha1:def456"; + + writer.writeWarcRevisitRecord(targetUri, ip, httpStatusCode, date, + warcinfoId, relatedId, warcProfile, refersToDate, payloadDigest, + blockDigest, null, null, content.getContent(), content); + + byte[] compressed = bos.toByteArray(); + ByteArrayInputStream bis = new 
ByteArrayInputStream(compressed); + GZIPInputStream gis = new GZIPInputStream(bis); + ByteArrayOutputStream decompressed = new ByteArrayOutputStream(); + gis.transferTo(decompressed); + + String warcOutput = decompressed.toString(); + + assertTrue(warcOutput.contains("WARC-Type: revisit"), + "WARC record should have WARC-Type: revisit"); + assertTrue(warcOutput.contains("Content-Type: application/http; msgtype=response"), + "WARC revisit record should have Content-Type: application/http; msgtype=response"); + assertTrue(warcOutput.contains("WARC-Refers-To-Target-URI: https://de.wikipedia.org/wiki/Wikipedia:WikiCon_2025"), + "WARC record should have WARC-Refers-To-Target-URI header"); + assertTrue(warcOutput.contains("WARC-Profile: " + warcProfile), + "WARC record should have WARC-Profile header"); + } +} diff --git a/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java b/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java new file mode 100644 index 0000000000..62057f4e17 --- /dev/null +++ b/src/test/org/commoncrawl/util/test/SegmenterRecordReader.java @@ -0,0 +1,52 @@ +package org.commoncrawl.util.test; + +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; + +import java.util.Arrays; + +public class SegmenterRecordReader extends Configured implements Tool { + + private Content content; + + @Override + public int run(String[] args) throws Exception { + return run(args[0], args[1]); + } + + private int run(String path, String url) throws Exception { + Path p = new Path(path, Content.DIR_NAME); + Text k = new Text(url); + MapFile.Reader[] readers = MapFileOutputFormat.getReaders(p, getConf()); + Content c = new 
Content(); + readers[0].get(k, c); + assert (c.getUrl().equals(url)); + assert (c.getContent() == null || c.getContent().length == 0); + this.content = c; + + return 0; + } + + public static Content retrieveContent(String segmentPath, String url) throws Exception { + SegmenterRecordReader reader = new SegmenterRecordReader(); + ToolRunner.run(NutchConfiguration.create(), + reader, Arrays.asList(segmentPath, url).toArray(new String[0])); + + return reader.getContent(); + } + + public Content getContent() { + return content; + } + + public void setContent(Content content) { + this.content = content; + } +} diff --git a/src/testresources/deduplication-crawldb/old/part-r-00000/.data.crc b/src/testresources/deduplication-crawldb/old/part-r-00000/.data.crc new file mode 100644 index 0000000000..b7c4a93f62 Binary files /dev/null and b/src/testresources/deduplication-crawldb/old/part-r-00000/.data.crc differ diff --git a/src/testresources/deduplication-crawldb/old/part-r-00000/.index.crc b/src/testresources/deduplication-crawldb/old/part-r-00000/.index.crc new file mode 100644 index 0000000000..8318e4b7ac Binary files /dev/null and b/src/testresources/deduplication-crawldb/old/part-r-00000/.index.crc differ diff --git a/src/testresources/deduplication-crawldb/old/part-r-00000/data b/src/testresources/deduplication-crawldb/old/part-r-00000/data new file mode 100644 index 0000000000..f3a6d6a55b Binary files /dev/null and b/src/testresources/deduplication-crawldb/old/part-r-00000/data differ diff --git a/src/testresources/deduplication-crawldb/old/part-r-00000/index b/src/testresources/deduplication-crawldb/old/part-r-00000/index new file mode 100644 index 0000000000..78d1ba2bef Binary files /dev/null and b/src/testresources/deduplication-crawldb/old/part-r-00000/index differ diff --git a/src/testresources/sitemaps/sitemap.example.1.txt b/src/testresources/sitemaps/sitemap.example.1.txt new file mode 100644 index 0000000000..920a0d04ff --- /dev/null +++ 
b/src/testresources/sitemaps/sitemap.example.1.txt @@ -0,0 +1 @@ +file:src/testresources/sitemaps/sitemap.example.1.xml \ No newline at end of file diff --git a/src/testresources/sitemaps/sitemap.example.1.xml b/src/testresources/sitemaps/sitemap.example.1.xml new file mode 100644 index 0000000000..2e83b082a2 --- /dev/null +++ b/src/testresources/sitemaps/sitemap.example.1.xml @@ -0,0 +1,26 @@ + + + + https://example.com/sitemap.html + 2020-12-12T07:00:11.833Z + weekly + 0.2 + + + + + https://example.com/help.html + 2017-01-27T12:54:30.733Z + monthly + 0.2 + + https://example.com/logo.png + + + + + + \ No newline at end of file diff --git a/src/testresources/sitemaps/sitemap.example.2.txt b/src/testresources/sitemaps/sitemap.example.2.txt new file mode 100644 index 0000000000..58ff085c34 --- /dev/null +++ b/src/testresources/sitemaps/sitemap.example.2.txt @@ -0,0 +1 @@ +file:src/testresources/sitemaps/sitemap.example.2.xml \ No newline at end of file diff --git a/src/testresources/sitemaps/sitemap.example.2.xml b/src/testresources/sitemaps/sitemap.example.2.xml new file mode 100644 index 0000000000..fc93077ebc --- /dev/null +++ b/src/testresources/sitemaps/sitemap.example.2.xml @@ -0,0 +1,11 @@ + + + + https://www.example.org/example/ + + + + + https://www.example.org/example/ + + \ No newline at end of file diff --git a/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.data.crc b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.data.crc new file mode 100644 index 0000000000..4685c926b8 Binary files /dev/null and b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.data.crc differ diff --git a/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.index.crc b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.index.crc new file mode 100644 index 0000000000..9f5864f594 Binary files /dev/null and 
b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/.index.crc differ diff --git a/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/data b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/data new file mode 100644 index 0000000000..526d1fd871 Binary files /dev/null and b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/data differ diff --git a/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/index b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/index new file mode 100644 index 0000000000..6aca7f9506 Binary files /dev/null and b/src/testresources/test-segments/20260224170658-revisit/content/part-r-00000/index differ