From ca8ae7f58b99a411a7e4e7157b4a014c0a4d6f4b Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Thu, 11 Dec 2025 01:57:59 -0800 Subject: [PATCH 01/27] NUTCH-3126 Report JUnit test results in GitHub pull request thread (#868) --- .github/workflows/junit-report.yml | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/.github/workflows/junit-report.yml b/.github/workflows/junit-report.yml index ead3e5b325..e7658ffea6 100644 --- a/.github/workflows/junit-report.yml +++ b/.github/workflows/junit-report.yml @@ -37,18 +37,21 @@ jobs: report_paths: |- ./test/TEST-*.xml ./**/test/TEST-*.xml + check_name: |- + JUnit Test Report + JUnit Test Report Plugins commit: ${{ github.event.workflow_run.head_sha }} - comment: true - pr_id: ${{ github.event.workflow_run.pull_requests[0].number }} - fail_on_failure: true - job_summary: true - detailed_summary: true - truncate_stack_traces: false - fail_on_parse_error: false # temporary while debugging TestMimeUtil + fail_on_failure: false + fail_on_parse_error: false # temporary while debugging missing result for TestMimeUtil require_tests: true + require_passed_tests: true + include_passed: false + include_skipped: true + check_annotations: true + job_summary: true + skip_success_summary: true include_time_in_summary: true - include_passed: true + comment: true job_name: tests - check_name: |- - JUnit Test Report Core - JUnit Test Report Plugins + truncate_stack_traces: false + pr_id: ${{ github.event.workflow_run.pull_requests[0].number }} From 1d8106c5d554d7f9426cbc3b19d2b70437baeca0 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Thu, 11 Dec 2025 08:47:30 -0800 Subject: [PATCH 02/27] NUTCH-3132 Standardize existing Nutch metrics naming and implementation (#871) --- .../org/apache/nutch/crawl/CrawlDbFilter.java | 12 +- .../apache/nutch/crawl/CrawlDbReducer.java | 7 +- .../apache/nutch/crawl/DeduplicationJob.java | 5 +- .../org/apache/nutch/crawl/Generator.java | 37 +- 
src/java/org/apache/nutch/crawl/Injector.java | 37 +- .../org/apache/nutch/fetcher/Fetcher.java | 21 +- .../apache/nutch/fetcher/FetcherThread.java | 75 +++- .../org/apache/nutch/fetcher/QueueFeeder.java | 15 +- .../apache/nutch/hostdb/ResolverThread.java | 26 +- .../nutch/hostdb/UpdateHostDbMapper.java | 13 +- .../nutch/hostdb/UpdateHostDbReducer.java | 10 +- .../org/apache/nutch/indexer/CleaningJob.java | 4 +- .../nutch/indexer/IndexerMapReduce.java | 31 +- .../apache/nutch/metrics/NutchMetrics.java | 371 ++++++++++++++++++ .../apache/nutch/metrics/package-info.java | 32 ++ .../org/apache/nutch/parse/ParseSegment.java | 4 +- .../nutch/scoring/webgraph/WebGraph.java | 7 +- .../apache/nutch/tools/warc/WARCExporter.java | 40 +- .../apache/nutch/util/SitemapProcessor.java | 26 +- 19 files changed, 651 insertions(+), 122 deletions(-) create mode 100644 src/java/org/apache/nutch/metrics/NutchMetrics.java create mode 100644 src/java/org/apache/nutch/metrics/package-info.java diff --git a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java index d9ab0d3cc0..7f28a3a85a 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java @@ -24,6 +24,7 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; @@ -80,15 +81,15 @@ public void map(Text key, CrawlDatum value, // https://issues.apache.org/jira/browse/NUTCH-1101 check status first, // cheaper than normalizing or filtering if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) { - context.getCounter("CrawlDB filter", - "Gone records removed").increment(1); + context.getCounter(NutchMetrics.GROUP_CRAWLDB_FILTER, + NutchMetrics.CRAWLDB_GONE_RECORDS_REMOVED_TOTAL).increment(1); return; } // Whether to 
remove orphaned pages // https://issues.apache.org/jira/browse/NUTCH-1932 if (purgeOrphans && CrawlDatum.STATUS_DB_ORPHAN == value.getStatus()) { - context.getCounter("CrawlDB filter", - "Orphan records removed").increment(1); + context.getCounter(NutchMetrics.GROUP_CRAWLDB_FILTER, + NutchMetrics.CRAWLDB_ORPHAN_RECORDS_REMOVED_TOTAL).increment(1); return; } if (url != null && urlNormalizers) { @@ -108,7 +109,8 @@ public void map(Text key, CrawlDatum value, } } if (url == null) { - context.getCounter("CrawlDB filter", "URLs filtered").increment(1); + context.getCounter(NutchMetrics.GROUP_CRAWLDB_FILTER, + NutchMetrics.CRAWLDB_URLS_FILTERED_TOTAL).increment(1); } else { // URL has passed filters newKey.set(url); // collect it diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java index deb266af61..e263f8463c 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java @@ -31,6 +31,7 @@ import org.apache.hadoop.io.Writable; import org.apache.hadoop.util.PriorityQueue; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.StringUtil; @@ -163,7 +164,8 @@ public void reduce(Text key, Iterable values, LOG.warn("Couldn't update orphaned score, key={}: {}", key, e); } context.write(key, old); - context.getCounter("CrawlDB status", + // Dynamic counter based on status name + context.getCounter(NutchMetrics.GROUP_CRAWLDB, CrawlDatum.getStatusName(old.getStatus())).increment(1); } else { LOG.warn("Missing fetch and old value, signature={}", @@ -319,7 +321,8 @@ public void reduce(Text key, Iterable values, // remove generation time, if any result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY); context.write(key, result); - context.getCounter("CrawlDB status", + // Dynamic counter 
based on status name + context.getCounter(NutchMetrics.GROUP_CRAWLDB, CrawlDatum.getStatusName(result.getStatus())).increment(1); } diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java index 3e12d4598c..cdb291fe85 100644 --- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java +++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java @@ -45,6 +45,7 @@ import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; @@ -139,8 +140,8 @@ protected void writeOutAsDuplicate(CrawlDatum datum, throws IOException, InterruptedException { datum.setStatus(CrawlDatum.STATUS_DB_DUPLICATE); Text key = (Text) datum.getMetaData().remove(urlKey); - context.getCounter("DeduplicationJobStatus", - "Documents marked as duplicate").increment(1); + context.getCounter(NutchMetrics.GROUP_DEDUP, + NutchMetrics.DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL).increment(1); context.write(key, datum); } diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index 82475af5b8..db15f0426e 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -67,6 +67,7 @@ import org.apache.hadoop.io.WritableComparator; import org.apache.nutch.hostdb.HostDatum; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilterException; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; @@ -225,11 +226,13 @@ public void map(Text key, CrawlDatum value, Context context) // URLFilters try { if (filters.filter(url.toString()) == null) { - context.getCounter("Generator", "URL_FILTERS_REJECTED").increment(1); + 
context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_URL_FILTERS_REJECTED_TOTAL).increment(1); return; } } catch (URLFilterException e) { - context.getCounter("Generator", "URL_FILTER_EXCEPTION").increment(1); + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_URL_FILTER_EXCEPTION_TOTAL).increment(1); LOG.warn("Couldn't filter url: {} ({})", url, e.getMessage()); } } @@ -239,7 +242,8 @@ public void map(Text key, CrawlDatum value, Context context) if (!schedule.shouldFetch(url, crawlDatum, curTime)) { LOG.debug("-shouldFetch rejected '{}', fetchTime={}, curTime={}", url, crawlDatum.getFetchTime(), curTime); - context.getCounter("Generator", "SCHEDULE_REJECTED").increment(1); + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_SCHEDULE_REJECTED_TOTAL).increment(1); return; } @@ -248,7 +252,8 @@ public void map(Text key, CrawlDatum value, Context context) if (oldGenTime != null) { // awaiting fetch & update if (oldGenTime.get() + genDelay > curTime) { // still wait for // update - context.getCounter("Generator", "WAIT_FOR_UPDATE").increment(1); + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_WAIT_FOR_UPDATE_TOTAL).increment(1); return; } } @@ -262,19 +267,22 @@ public void map(Text key, CrawlDatum value, Context context) // check expr if (expr != null) { if (!crawlDatum.execute(expr, key.toString())) { - context.getCounter("Generator", "EXPR_REJECTED").increment(1); + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_EXPR_REJECTED_TOTAL).increment(1); return; } } if (restrictStatus != -1 && restrictStatus != crawlDatum.getStatus()) { - context.getCounter("Generator", "STATUS_REJECTED").increment(1); + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_STATUS_REJECTED_TOTAL).increment(1); return; } // consider only entries with a score superior to the threshold if (!Float.isNaN(scoreThreshold) && sort < scoreThreshold) { - 
context.getCounter("Generator", "SCORE_TOO_LOW").increment(1); + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_SCORE_TOO_LOW_TOTAL).increment(1); return; } @@ -282,7 +290,8 @@ public void map(Text key, CrawlDatum value, Context context) // threshold if (intervalThreshold != -1 && crawlDatum.getFetchInterval() > intervalThreshold) { - context.getCounter("Generator", "INTERVAL_REJECTED").increment(1); + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_INTERVAL_REJECTED_TOTAL).increment(1); return; } @@ -507,7 +516,8 @@ public void reduce(FloatWritable key, Iterable values, } catch (MalformedURLException e) { LOG.warn("Malformed URL: '{}', skipping ({})", urlString, StringUtils.stringifyException(e)); - context.getCounter("Generator", "MALFORMED_URL").increment(1); + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_MALFORMED_URL_TOTAL).increment(1); continue; } @@ -539,16 +549,15 @@ public void reduce(FloatWritable key, Iterable values, hostCount[1] = 1; } else { if (hostCount[1] == (maxCount+1)) { - context - .getCounter("Generator", "HOSTS_AFFECTED_PER_HOST_OVERFLOW") - .increment(1); + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL).increment(1); LOG.info( "Host or domain {} has more than {} URLs for all {} segments. 
Additional URLs won't be included in the fetchlist.", hostordomain, maxCount, maxNumSegments); } // skip this entry - context.getCounter("Generator", "URLS_SKIPPED_PER_HOST_OVERFLOW") - .increment(1); + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL).increment(1); continue; } } diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index 3e03f9ea8e..4845e4363d 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -36,6 +36,7 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.scoring.ScoringFilterException; @@ -218,7 +219,8 @@ public void map(Text key, Writable value, Context context) url = filterNormalize(url); if (url == null) { - context.getCounter("injector", "urls_filtered").increment(1); + context.getCounter(NutchMetrics.GROUP_INJECTOR, + NutchMetrics.INJECTOR_URLS_FILTERED_TOTAL).increment(1); } else { CrawlDatum datum = new CrawlDatum(); datum.setStatus(CrawlDatum.STATUS_INJECTED); @@ -238,7 +240,8 @@ public void map(Text key, Writable value, Context context) "Cannot filter injected score for url {}, using default ({})", url, e.getMessage()); } - context.getCounter("injector", "urls_injected").increment(1); + context.getCounter(NutchMetrics.GROUP_INJECTOR, + NutchMetrics.INJECTOR_URLS_INJECTED_TOTAL).increment(1); context.write(key, datum); } } else if (value instanceof CrawlDatum) { @@ -248,14 +251,16 @@ public void map(Text key, Writable value, Context context) // remove 404 urls if (url404Purging && CrawlDatum.STATUS_DB_GONE == datum.getStatus()) { - context.getCounter("injector", "urls_purged_404").increment(1); + context.getCounter(NutchMetrics.GROUP_INJECTOR, + 
NutchMetrics.INJECTOR_URLS_PURGED_404_TOTAL).increment(1); return; } if (filterNormalizeAll) { String url = filterNormalize(key.toString()); if (url == null) { - context.getCounter("injector", "urls_purged_filter").increment(1); + context.getCounter(NutchMetrics.GROUP_INJECTOR, + NutchMetrics.INJECTOR_URLS_PURGED_FILTER_TOTAL).increment(1); } else { key.set(url); context.write(key, datum); @@ -341,9 +346,11 @@ public void reduce(Text key, Iterable values, Context context) } } if (injectedSet) { - context.getCounter("injector", "urls_injected_unique").increment(1); + context.getCounter(NutchMetrics.GROUP_INJECTOR, + NutchMetrics.INJECTOR_URLS_INJECTED_UNIQUE_TOTAL).increment(1); if (oldSet) { - context.getCounter("injector", "urls_merged").increment(1); + context.getCounter(NutchMetrics.GROUP_INJECTOR, + NutchMetrics.INJECTOR_URLS_MERGED_TOTAL).increment(1); } } context.write(key, result); @@ -454,17 +461,23 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, if (LOG.isInfoEnabled()) { long urlsInjected = job.getCounters() - .findCounter("injector", "urls_injected").getValue(); + .findCounter(NutchMetrics.GROUP_INJECTOR, + NutchMetrics.INJECTOR_URLS_INJECTED_TOTAL).getValue(); long urlsInjectedUniq = job.getCounters() - .findCounter("injector", "urls_injected_unique").getValue(); + .findCounter(NutchMetrics.GROUP_INJECTOR, + NutchMetrics.INJECTOR_URLS_INJECTED_UNIQUE_TOTAL).getValue(); long urlsFiltered = job.getCounters() - .findCounter("injector", "urls_filtered").getValue(); + .findCounter(NutchMetrics.GROUP_INJECTOR, + NutchMetrics.INJECTOR_URLS_FILTERED_TOTAL).getValue(); long urlsMerged = job.getCounters() - .findCounter("injector", "urls_merged").getValue(); + .findCounter(NutchMetrics.GROUP_INJECTOR, + NutchMetrics.INJECTOR_URLS_MERGED_TOTAL).getValue(); long urlsPurged404 = job.getCounters() - .findCounter("injector", "urls_purged_404").getValue(); + .findCounter(NutchMetrics.GROUP_INJECTOR, + 
NutchMetrics.INJECTOR_URLS_PURGED_404_TOTAL).getValue(); long urlsPurgedFilter = job.getCounters() - .findCounter("injector", "urls_purged_filter").getValue(); + .findCounter(NutchMetrics.GROUP_INJECTOR, + NutchMetrics.INJECTOR_URLS_PURGED_FILTER_TOTAL).getValue(); LOG.info("Injector: Total urls rejected by filters: {}", urlsFiltered); LOG.info( "Injector: Total urls injected after normalization and filtering: {} (unique URLs: {})", diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java index f6518be761..4a139f5d08 100644 --- a/src/java/org/apache/nutch/fetcher/Fetcher.java +++ b/src/java/org/apache/nutch/fetcher/Fetcher.java @@ -48,6 +48,7 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.util.MimeUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -295,8 +296,8 @@ public void run(Context innerContext) pagesLastSec = pages.get() - pagesLastSec; bytesLastSec = (int) bytes.get() - bytesLastSec; - innerContext.getCounter("FetcherStatus", "bytes_downloaded") - .increment(bytesLastSec); + innerContext.getCounter(NutchMetrics.GROUP_FETCHER, + NutchMetrics.FETCHER_BYTES_DOWNLOADED_TOTAL).increment(bytesLastSec); reportStatus(innerContext, fetchQueues, pagesLastSec, bytesLastSec); @@ -334,8 +335,8 @@ public void run(Context innerContext) int hitByThrougputThreshold = fetchQueues.emptyQueues(); if (hitByThrougputThreshold != 0) - innerContext - .getCounter("FetcherStatus", "hitByThrougputThreshold") + innerContext.getCounter(NutchMetrics.GROUP_FETCHER, + NutchMetrics.FETCHER_HIT_BY_THROUGHPUT_THRESHOLD_TOTAL) .increment(hitByThrougputThreshold); } } @@ -417,8 +418,8 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) { if (!feeder.isAlive()) { int hitByTimeLimit = fetchQueues.checkTimelimit(); if 
(hitByTimeLimit != 0) - innerContext.getCounter("FetcherStatus", "hitByTimeLimit") - .increment(hitByTimeLimit); + innerContext.getCounter(NutchMetrics.GROUP_FETCHER, + NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL).increment(hitByTimeLimit); } /* @@ -434,8 +435,8 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) { timeout); LOG.warn("Aborting with {} hung threads{}.", activeThreads, feeder.isAlive() ? " (queue feeder still alive)" : ""); - innerContext.getCounter("FetcherStatus", "hungThreads") - .increment(activeThreads.get()); + innerContext.getCounter(NutchMetrics.GROUP_FETCHER, + NutchMetrics.FETCHER_HUNG_THREADS_TOTAL).increment(activeThreads.get()); for (int i = 0; i < fetcherThreads.size(); i++) { FetcherThread thread = fetcherThreads.get(i); if (thread.isAlive()) { @@ -470,8 +471,8 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) { fetchQueues.getTotalSize(), fetchQueues.getQueueCount(), feeder.isAlive() ? " (queue feeder still alive)" : ""); int hitByTimeout = fetchQueues.emptyQueues(); - innerContext.getCounter("FetcherStatus", "hitByTimeout") - .increment(hitByTimeout); + innerContext.getCounter(NutchMetrics.GROUP_FETCHER, + NutchMetrics.FETCHER_HIT_BY_TIMEOUT_TOTAL).increment(hitByTimeout); return; } diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index 75ae606cb4..66e560af64 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -34,12 +34,14 @@ import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.util.StringUtils; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.crawl.SignatureFactory; import org.apache.nutch.fetcher.Fetcher.FetcherRun; import 
org.apache.nutch.fetcher.FetcherThreadEvent.PublishEventType; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.net.URLExemptionFilters; @@ -172,6 +174,18 @@ public class FetcherThread extends Thread { private ProtocolLogUtil logUtil = new ProtocolLogUtil(); + // Cached counters for performance (avoid repeated lookups in hot paths) + private Counter robotsDeniedCounter; + private Counter robotsDeniedMaxCrawlDelayCounter; + private Counter robotsDeferVisitsDroppedCounter; + private Counter redirectCountExceededCounter; + private Counter redirectDeduplicatedCounter; + private Counter redirectNotCreatedCounter; + private Counter hitByTimeLimitCounter; + private Counter aboveExceptionThresholdCounter; + private Counter outlinksDetectedCounter; + private Counter outlinksFollowingCounter; + public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQueues fetchQueues, QueueFeeder feeder, AtomicInteger spinWaiting, AtomicLong lastRequestStart, FetcherRun.Context context, AtomicInteger errors, String segmentName, boolean parsing, boolean storingContent, @@ -279,6 +293,35 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ getName(), Thread.currentThread().getId()); } } + + // Initialize cached counters for performance + initCounters(); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. 
+ */ + private void initCounters() { + robotsDeniedCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_ROBOTS_DENIED_TOTAL); + robotsDeniedMaxCrawlDelayCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_ROBOTS_DENIED_MAXCRAWLDELAY_TOTAL); + robotsDeferVisitsDroppedCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_ROBOTS_DEFER_VISITS_DROPPED_TOTAL); + redirectCountExceededCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_REDIRECT_COUNT_EXCEEDED_TOTAL); + redirectDeduplicatedCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_REDIRECT_DEDUPLICATED_TOTAL); + redirectNotCreatedCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_REDIRECT_NOT_CREATED_TOTAL); + hitByTimeLimitCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL); + aboveExceptionThresholdCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_ABOVE_EXCEPTION_THRESHOLD_TOTAL); + outlinksDetectedCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER_OUTLINKS, NutchMetrics.FETCHER_OUTLINKS_DETECTED_TOTAL); + outlinksFollowingCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER_OUTLINKS, NutchMetrics.FETCHER_OUTLINKS_FOLLOWING_TOTAL); } @Override @@ -372,9 +415,7 @@ public void run() { fit.getQueueID(), this.robotsDeferVisitsRetries + 1, this.robotsDeferVisitsDelay); if (killedURLs != 0) { - context - .getCounter("FetcherStatus", "robots_defer_visits_dropped") - .increment(killedURLs); + robotsDeferVisitsDroppedCounter.increment(killedURLs); } continue; } @@ -385,7 +426,7 @@ public void run() { output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE); - context.getCounter("FetcherStatus", "robots_denied").increment(1); + robotsDeniedCounter.increment(1); continue; } if (rules.getCrawlDelay() > 0) { @@ -397,8 
+438,7 @@ public void run() { output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE); - context.getCounter("FetcherStatus", - "robots_denied_maxcrawldelay").increment(1); + robotsDeniedMaxCrawlDelayCounter.increment(1); continue; } else { FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID); @@ -436,7 +476,8 @@ public void run() { endEvent.addEventData("status", status.getName()); publisher.publish(endEvent, conf); } - context.getCounter("FetcherStatus", status.getName()).increment(1); + // Dynamic counter for protocol status - can't cache as status varies + context.getCounter(NutchMetrics.GROUP_FETCHER, status.getName()).increment(1); if (storingProtocolVersions && content != null) { countProtocolVersions(content.getMetadata()); @@ -489,8 +530,7 @@ public void run() { int killedURLs = fetchQueues .checkExceptionThreshold(fit.getQueueID()); if (killedURLs != 0) - context.getCounter("FetcherStatus", - "AboveExceptionThresholdInQueue").increment(killedURLs); + aboveExceptionThresholdCounter.increment(killedURLs); /* FALLTHROUGH */ case ProtocolStatus.RETRY: // retry @@ -520,8 +560,7 @@ public void run() { if (redirecting && redirectCount > maxRedirect) { fetchQueues.finishFetchItem(fit); - context.getCounter("FetcherStatus", "redirect_count_exceeded") - .increment(1); + redirectCountExceededCounter.increment(1); LOG.info("{} {} - redirect count exceeded {} ({})", getName(), Thread.currentThread().getId(), fit.url, maxRedirectExceededSkip ? 
"skipped" : "linked"); @@ -655,13 +694,13 @@ private FetchItem queueRedirect(Text redirUrl, FetchItem fit) throws ScoringFilterException { if (fetchQueues.redirectIsQueuedRecently(redirUrl)) { redirecting = false; - context.getCounter("FetcherStatus", "redirect_deduplicated").increment(1); + redirectDeduplicatedCounter.increment(1); LOG.debug(" - ignoring redirect from {} to {} as duplicate", fit.url, redirUrl); return null; } else if (fetchQueues.timelimitExceeded()) { redirecting = false; - context.getCounter("FetcherStatus", "hitByTimeLimit").increment(1); + hitByTimeLimitCounter.increment(1); LOG.debug(" - ignoring redirect from {} to {} - timelimit reached", fit.url, redirUrl); return null; @@ -674,7 +713,7 @@ private FetchItem queueRedirect(Text redirUrl, FetchItem fit) } else { // stop redirecting redirecting = false; - context.getCounter("FetcherStatus", "FetchItem.notCreated.redirect").increment(1); + redirectNotCreatedCounter.increment(1); } return fit; } @@ -885,8 +924,7 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content, FetchItemQueue queue = fetchQueues.getFetchItemQueue(ft.queueID); queue.alreadyFetched.add(url.toString().hashCode()); - context.getCounter("FetcherOutlinks", "outlinks_detected").increment( - outlinks.size()); + outlinksDetectedCounter.increment(outlinks.size()); // Counter to limit num outlinks to follow per page int outlinkCounter = 0; @@ -918,7 +956,7 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content, new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1); - context.getCounter("FetcherOutlinks", "outlinks_following").increment(1); + outlinksFollowingCounter.increment(1); fetchQueues.addFetchItem(fit); @@ -944,7 +982,8 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content, if (parseResult != null && !parseResult.isEmpty()) { Parse p = parseResult.get(content.getUrl()); if (p != null) { - context.getCounter("ParserStatus", 
ParseStatus.majorCodes[p + // Dynamic counter for parse status - can't cache as status varies + context.getCounter(NutchMetrics.GROUP_PARSER, ParseStatus.majorCodes[p .getData().getStatus().getMajorCode()]).increment(1); return p.getData().getStatus(); } diff --git a/src/java/org/apache/nutch/fetcher/QueueFeeder.java b/src/java/org/apache/nutch/fetcher/QueueFeeder.java index c48c4b8f31..6ee973dd3b 100644 --- a/src/java/org/apache/nutch/fetcher/QueueFeeder.java +++ b/src/java/org/apache/nutch/fetcher/QueueFeeder.java @@ -25,6 +25,7 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.fetcher.FetchItemQueues.QueuingStatus; import org.apache.nutch.fetcher.Fetcher.FetcherRun; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilterException; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; @@ -94,14 +95,16 @@ public void run() { LOG.info("QueueFeeder stopping, timeout reached."); } queuingStatus[qstatus]++; - context.getCounter("FetcherStatus", "hitByTimeout").increment(1); + context.getCounter(NutchMetrics.GROUP_FETCHER, + NutchMetrics.FETCHER_HIT_BY_TIMEOUT_TOTAL).increment(1); } else { int qstatus = QueuingStatus.HIT_BY_TIMELIMIT.ordinal(); if (queuingStatus[qstatus] == 0) { LOG.info("QueueFeeder stopping, timelimit exceeded."); } queuingStatus[qstatus]++; - context.getCounter("FetcherStatus", "hitByTimeLimit").increment(1); + context.getCounter(NutchMetrics.GROUP_FETCHER, + NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL).increment(1); } try { hasMore = context.nextKeyValue(); @@ -133,7 +136,8 @@ public void run() { String u = filterNormalize(url.toString()); if (u == null) { // filtered or failed to normalize - context.getCounter("FetcherStatus", "filtered").increment(1); + context.getCounter(NutchMetrics.GROUP_FETCHER, + NutchMetrics.FETCHER_FILTERED_TOTAL).increment(1); continue; } url = new Text(u); @@ -150,9 +154,8 @@ public void run() { QueuingStatus status = queues.addFetchItem(url, 
datum); queuingStatus[status.ordinal()]++; if (status == QueuingStatus.ABOVE_EXCEPTION_THRESHOLD) { - context - .getCounter("FetcherStatus", "AboveExceptionThresholdInQueue") - .increment(1); + context.getCounter(NutchMetrics.GROUP_FETCHER, + NutchMetrics.FETCHER_ABOVE_EXCEPTION_THRESHOLD_TOTAL).increment(1); } cnt++; feed--; diff --git a/src/java/org/apache/nutch/hostdb/ResolverThread.java b/src/java/org/apache/nutch/hostdb/ResolverThread.java index 2140ea52d1..2690a73fad 100644 --- a/src/java/org/apache/nutch/hostdb/ResolverThread.java +++ b/src/java/org/apache/nutch/hostdb/ResolverThread.java @@ -24,6 +24,8 @@ import org.apache.hadoop.mapreduce.Reducer.Context; import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.metrics.NutchMetrics; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -72,16 +74,19 @@ public void run() { InetAddress inetAddr = InetAddress.getByName(host); if (datum.isEmpty()) { - context.getCounter("UpdateHostDb", "new_known_host").increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.HOSTDB_NEW_KNOWN_HOST_TOTAL).increment(1); datum.setLastCheck(); LOG.info("{}: new_known_host {}", host, datum); } else if (datum.getDnsFailures() > 0) { - context.getCounter("UpdateHostDb", "rediscovered_host").increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.HOSTDB_REDISCOVERED_HOST_TOTAL).increment(1); datum.setLastCheck(); datum.setDnsFailures(0l); LOG.info("{}: rediscovered_host {}", host, datum); } else { - context.getCounter("UpdateHostDb", "existing_known_host").increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.HOSTDB_EXISTING_KNOWN_HOST_TOTAL).increment(1); datum.setLastCheck(); LOG.info("{}: existing_known_host {}", host, datum); } @@ -95,7 +100,8 @@ public void run() { datum.setLastCheck(); datum.setDnsFailures(1l); context.write(hostText, datum); - context.getCounter("UpdateHostDb", "new_unknown_host").increment(1); + 
context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.HOSTDB_NEW_UNKNOWN_HOST_TOTAL).increment(1); LOG.info("{}: new_unknown_host {}", host, datum); } else { datum.setLastCheck(); @@ -106,15 +112,18 @@ public void run() { purgeFailedHostsThreshold < datum.getDnsFailures()) { context.write(hostText, datum); - context.getCounter("UpdateHostDb", "existing_unknown_host").increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.HOSTDB_EXISTING_UNKNOWN_HOST_TOTAL).increment(1); LOG.info("{}: existing_unknown_host {}", host, datum); } else { - context.getCounter("UpdateHostDb", "purged_unknown_host").increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.HOSTDB_PURGED_UNKNOWN_HOST_TOTAL).increment(1); LOG.info("{}: purged_unknown_host {}", host, datum); } } - context.getCounter("UpdateHostDb", createFailureCounterLabel(datum)).increment(1); + // Dynamic counter based on failure count - can't cache + context.getCounter(NutchMetrics.GROUP_HOSTDB, createFailureCounterLabel(datum)).increment(1); } catch (Exception ioe) { LOG.warn(StringUtils.stringifyException(ioe)); } @@ -122,7 +131,8 @@ public void run() { LOG.warn(StringUtils.stringifyException(e)); } - context.getCounter("UpdateHostDb", "checked_hosts").increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.HOSTDB_CHECKED_HOSTS_TOTAL).increment(1); } private String createFailureCounterLabel(HostDatum datum) { diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java index ca6797ac0a..1495f74914 100644 --- a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java +++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java @@ -30,6 +30,7 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilters; import 
org.apache.nutch.net.URLNormalizers; import org.apache.nutch.protocol.ProtocolStatus; @@ -136,7 +137,8 @@ public void map(Text key, Writable value, try { url = new URL(keyStr); } catch (MalformedURLException e) { - context.getCounter("UpdateHostDb", "malformed_url").increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.HOSTDB_MALFORMED_URL_TOTAL).increment(1); return; } String hostName = URLUtil.getHost(url); @@ -146,7 +148,8 @@ public void map(Text key, Writable value, // Filtered out? if (buffer == null) { - context.getCounter("UpdateHostDb", "filtered_records").increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL).increment(1); LOG.debug("UpdateHostDb: {} crawldatum has been filtered", hostName); return; } @@ -219,7 +222,8 @@ public void map(Text key, Writable value, // Filtered out? if (buffer == null) { - context.getCounter("UpdateHostDb", "filtered_records").increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL).increment(1); LOG.debug("UpdateHostDb: {} hostdatum has been filtered", keyStr); return; } @@ -243,7 +247,8 @@ public void map(Text key, Writable value, // Filtered out? 
if (buffer == null) { - context.getCounter("UpdateHostDb", "filtered_records").increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL).increment(1); LOG.debug("UpdateHostDb: {} score has been filtered", keyStr); return; } diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java index 1431b56365..039fa5ba13 100644 --- a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java +++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java @@ -36,6 +36,7 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.metrics.NutchMetrics; import com.tdunning.math.stats.TDigest; @@ -379,12 +380,14 @@ else if (value instanceof FloatWritable) { // Impose limits on minimum number of URLs? if (urlLimit > -1l) { if (hostDatum.numRecords() < urlLimit) { - context.getCounter("UpdateHostDb", "url_limit_not_reached").increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.HOSTDB_URL_LIMIT_NOT_REACHED_TOTAL).increment(1); return; } } - context.getCounter("UpdateHostDb", "total_hosts").increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.HOSTDB_TOTAL_HOSTS_TOTAL).increment(1); // See if this record is to be checked if (shouldCheck(hostDatum)) { @@ -401,7 +404,8 @@ else if (value instanceof FloatWritable) { // Do not progress, the datum will be written in the resolver thread return; } else if (checkAny) { - context.getCounter("UpdateHostDb", "skipped_not_eligible").increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.HOSTDB_SKIPPED_NOT_ELIGIBLE_TOTAL).increment(1); LOG.debug("UpdateHostDb: {}: skipped_not_eligible", key); } diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java b/src/java/org/apache/nutch/indexer/CleaningJob.java index cedee8e34c..ae01e4b0d1 100644 --- a/src/java/org/apache/nutch/indexer/CleaningJob.java +++ 
b/src/java/org/apache/nutch/indexer/CleaningJob.java @@ -36,6 +36,7 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.CrawlDb; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.slf4j.Logger; @@ -118,7 +119,8 @@ public void reduce(ByteWritable key, Iterable values, for (Text document : values) { writers.delete(document.toString()); totalDeleted++; - context.getCounter("CleaningJobStatus", "Deleted documents").increment(1); + context.getCounter(NutchMetrics.GROUP_CLEANING, + NutchMetrics.CLEANING_DELETED_DOCUMENTS_TOTAL).increment(1); } } } diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java index 9fb8007715..33f2f244a6 100644 --- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java +++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java @@ -40,6 +40,7 @@ import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.crawl.LinkDb; import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.net.URLFilters; @@ -283,7 +284,8 @@ public void reduce(Text key, Iterable values, .indexOf("noindex") != -1) { // Delete it! 
context.write(key, DELETE_ACTION); - context.getCounter("IndexerStatus", "deleted (robots=noindex)").increment(1); + context.getCounter(NutchMetrics.GROUP_INDEXER, + NutchMetrics.INDEXER_DELETED_ROBOTS_NOINDEX_TOTAL).increment(1); return; } } @@ -300,7 +302,8 @@ public void reduce(Text key, Iterable values, if (delete && fetchDatum != null) { if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) { - context.getCounter("IndexerStatus", "deleted (gone)").increment(1); + context.getCounter(NutchMetrics.GROUP_INDEXER, + NutchMetrics.INDEXER_DELETED_GONE_TOTAL).increment(1); context.write(key, DELETE_ACTION); return; } @@ -309,7 +312,8 @@ public void reduce(Text key, Iterable values, || fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) { - context.getCounter("IndexerStatus", "deleted (redirects)").increment(1); + context.getCounter(NutchMetrics.GROUP_INDEXER, + NutchMetrics.INDEXER_DELETED_REDIRECTS_TOTAL).increment(1); context.write(key, DELETE_ACTION); return; } @@ -321,14 +325,16 @@ public void reduce(Text key, Iterable values, // Whether to delete pages marked as duplicates if (delete && dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) { - context.getCounter("IndexerStatus", "deleted (duplicates)").increment(1); + context.getCounter(NutchMetrics.GROUP_INDEXER, + NutchMetrics.INDEXER_DELETED_DUPLICATES_TOTAL).increment(1); context.write(key, DELETE_ACTION); return; } // Whether to skip DB_NOTMODIFIED pages if (skip && dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) { - context.getCounter("IndexerStatus", "skipped (not modified)").increment(1); + context.getCounter(NutchMetrics.GROUP_INDEXER, + NutchMetrics.INDEXER_SKIPPED_NOT_MODIFIED_TOTAL).increment(1); return; } @@ -355,7 
+361,8 @@ public void reduce(Text key, Iterable values, boost = scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse, inlinks, boost); } catch (final ScoringFilterException e) { - context.getCounter("IndexerStatus", "errors (ScoringFilter)").increment(1); + context.getCounter(NutchMetrics.GROUP_INDEXER, + NutchMetrics.INDEXER_ERRORS_SCORING_FILTER_TOTAL).increment(1); LOG.warn("Error calculating score {}: {}", key, e); return; } @@ -390,7 +397,8 @@ public void reduce(Text key, Iterable values, doc = filters.filter(doc, parse, key, fetchDatum, inlinks); } catch (final IndexingException e) { LOG.warn("Error indexing {}: ", key, e); - context.getCounter("IndexerStatus", "errors (IndexingFilter)").increment(1); + context.getCounter(NutchMetrics.GROUP_INDEXER, + NutchMetrics.INDEXER_ERRORS_INDEXING_FILTER_TOTAL).increment(1); return; } @@ -400,9 +408,11 @@ public void reduce(Text key, Iterable values, if (deleteSkippedByIndexingFilter) { NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE); context.write(key, action); - context.getCounter("IndexerStatus", "deleted (IndexingFilter)").increment(1); + context.getCounter(NutchMetrics.GROUP_INDEXER, + NutchMetrics.INDEXER_DELETED_BY_INDEXING_FILTER_TOTAL).increment(1); } else { - context.getCounter("IndexerStatus", "skipped (IndexingFilter)").increment(1); + context.getCounter(NutchMetrics.GROUP_INDEXER, + NutchMetrics.INDEXER_SKIPPED_BY_INDEXING_FILTER_TOTAL).increment(1); } return; } @@ -422,7 +432,8 @@ public void reduce(Text key, Iterable values, doc.add("binaryContent", binary); } - context.getCounter("IndexerStatus", "indexed (add/update)").increment(1); + context.getCounter(NutchMetrics.GROUP_INDEXER, + NutchMetrics.INDEXER_INDEXED_TOTAL).increment(1); NutchIndexAction action = new NutchIndexAction(doc, NutchIndexAction.ADD); context.write(key, action); diff --git a/src/java/org/apache/nutch/metrics/NutchMetrics.java b/src/java/org/apache/nutch/metrics/NutchMetrics.java new file mode 
100644 index 0000000000..e64a8d6d00 --- /dev/null +++ b/src/java/org/apache/nutch/metrics/NutchMetrics.java @@ -0,0 +1,371 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metrics; + +/** + * Centralized constants for Hadoop metrics counter groups and names. + * + *

<p>Follows <a href="https://prometheus.io/docs/practices/naming/">Prometheus + * naming conventions</a>: + * <ul> + * <li>Counter groups use the {@code nutch_} prefix namespace</li> + * <li>Counter names use snake_case</li> + * <li>Accumulating counters use {@code _total} suffix</li> + * <li>Units are included in counter names where applicable (e.g., {@code _bytes})</li> + * </ul>
+ * + * @since 1.22 + */ +public final class NutchMetrics { + + private NutchMetrics() { + // Utility class - prevent instantiation + } + + // ========================================================================= + // Counter Groups (Prometheus namespace style with nutch_ prefix) + // ========================================================================= + + /** Counter group for fetcher operations. */ + public static final String GROUP_FETCHER = "nutch_fetcher"; + + /** Counter group for fetcher outlink processing. */ + public static final String GROUP_FETCHER_OUTLINKS = "nutch_fetcher_outlinks"; + + /** Counter group for generator operations. */ + public static final String GROUP_GENERATOR = "nutch_generator"; + + /** Counter group for indexer operations. */ + public static final String GROUP_INDEXER = "nutch_indexer"; + + /** Counter group for CrawlDb operations. */ + public static final String GROUP_CRAWLDB = "nutch_crawldb"; + + /** Counter group for CrawlDb filter operations. */ + public static final String GROUP_CRAWLDB_FILTER = "nutch_crawldb_filter"; + + /** Counter group for injector operations. */ + public static final String GROUP_INJECTOR = "nutch_injector"; + + /** Counter group for HostDb operations. */ + public static final String GROUP_HOSTDB = "nutch_hostdb"; + + /** Counter group for parser operations. */ + public static final String GROUP_PARSER = "nutch_parser"; + + /** Counter group for deduplication operations. */ + public static final String GROUP_DEDUP = "nutch_dedup"; + + /** Counter group for cleaning job operations. */ + public static final String GROUP_CLEANING = "nutch_cleaning"; + + /** Counter group for WebGraph operations. */ + public static final String GROUP_WEBGRAPH = "nutch_webgraph"; + + /** Counter group for sitemap processing operations. */ + public static final String GROUP_SITEMAP = "nutch_sitemap"; + + /** Counter group for WARC export operations. 
*/ + public static final String GROUP_WARC_EXPORTER = "nutch_warc_exporter"; + + /** Counter group for domain statistics operations. */ + public static final String GROUP_DOMAIN_STATS = "nutch_domain_stats"; + + // ========================================================================= + // Fetcher Counters + // ========================================================================= + + /** Total bytes downloaded by fetcher. */ + public static final String FETCHER_BYTES_DOWNLOADED_TOTAL = "bytes_downloaded_total"; + + /** URLs denied by robots.txt. */ + public static final String FETCHER_ROBOTS_DENIED_TOTAL = "robots_denied_total"; + + /** URLs denied due to crawl delay exceeding maximum. */ + public static final String FETCHER_ROBOTS_DENIED_MAXCRAWLDELAY_TOTAL = "robots_denied_maxcrawldelay_total"; + + /** URLs dropped due to robots.txt deferred visits. */ + public static final String FETCHER_ROBOTS_DEFER_VISITS_DROPPED_TOTAL = "robots_defer_visits_dropped_total"; + + /** Redirects that exceeded maximum redirect count. */ + public static final String FETCHER_REDIRECT_COUNT_EXCEEDED_TOTAL = "redirect_count_exceeded_total"; + + /** Redirects deduplicated (already seen). */ + public static final String FETCHER_REDIRECT_DEDUPLICATED_TOTAL = "redirect_deduplicated_total"; + + /** FetchItems not created for redirects. */ + public static final String FETCHER_REDIRECT_NOT_CREATED_TOTAL = "redirect_not_created_total"; + + /** URLs hit by time limit. */ + public static final String FETCHER_HIT_BY_TIMELIMIT_TOTAL = "hit_by_timelimit_total"; + + /** URLs hit by timeout. */ + public static final String FETCHER_HIT_BY_TIMEOUT_TOTAL = "hit_by_timeout_total"; + + /** URLs hit by throughput threshold. */ + public static final String FETCHER_HIT_BY_THROUGHPUT_THRESHOLD_TOTAL = "hit_by_throughput_threshold_total"; + + /** Threads that hung during fetching. */ + public static final String FETCHER_HUNG_THREADS_TOTAL = "hung_threads_total"; + + /** URLs filtered during fetching. 
*/ + public static final String FETCHER_FILTERED_TOTAL = "filtered_total"; + + /** URLs dropped due to exception threshold in queue. */ + public static final String FETCHER_ABOVE_EXCEPTION_THRESHOLD_TOTAL = "above_exception_threshold_total"; + + // ========================================================================= + // Fetcher Outlinks Counters + // ========================================================================= + + /** Outlinks detected during parsing. */ + public static final String FETCHER_OUTLINKS_DETECTED_TOTAL = "outlinks_detected_total"; + + /** Outlinks being followed. */ + public static final String FETCHER_OUTLINKS_FOLLOWING_TOTAL = "outlinks_following_total"; + + // ========================================================================= + // Generator Counters + // ========================================================================= + + /** URLs rejected by URL filters. */ + public static final String GENERATOR_URL_FILTERS_REJECTED_TOTAL = "url_filters_rejected_total"; + + /** URL filter exceptions. */ + public static final String GENERATOR_URL_FILTER_EXCEPTION_TOTAL = "url_filter_exception_total"; + + /** URLs rejected by fetch schedule. */ + public static final String GENERATOR_SCHEDULE_REJECTED_TOTAL = "schedule_rejected_total"; + + /** URLs waiting for CrawlDb update. */ + public static final String GENERATOR_WAIT_FOR_UPDATE_TOTAL = "wait_for_update_total"; + + /** URLs rejected by JEXL expression. */ + public static final String GENERATOR_EXPR_REJECTED_TOTAL = "expr_rejected_total"; + + /** URLs rejected due to status restriction. */ + public static final String GENERATOR_STATUS_REJECTED_TOTAL = "status_rejected_total"; + + /** URLs rejected due to score below threshold. */ + public static final String GENERATOR_SCORE_TOO_LOW_TOTAL = "score_too_low_total"; + + /** URLs rejected due to fetch interval exceeding threshold. 
*/ + public static final String GENERATOR_INTERVAL_REJECTED_TOTAL = "interval_rejected_total"; + + /** Malformed URLs encountered. */ + public static final String GENERATOR_MALFORMED_URL_TOTAL = "malformed_url_total"; + + /** URLs skipped due to per-host overflow. */ + public static final String GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL = "urls_skipped_per_host_overflow_total"; + + /** Hosts affected by per-host overflow. */ + public static final String GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL = "hosts_affected_per_host_overflow_total"; + + // ========================================================================= + // Indexer Counters + // ========================================================================= + + /** Documents deleted due to robots noindex. */ + public static final String INDEXER_DELETED_ROBOTS_NOINDEX_TOTAL = "deleted_robots_noindex_total"; + + /** Documents deleted because they are gone. */ + public static final String INDEXER_DELETED_GONE_TOTAL = "deleted_gone_total"; + + /** Documents deleted due to redirects. */ + public static final String INDEXER_DELETED_REDIRECTS_TOTAL = "deleted_redirects_total"; + + /** Documents deleted as duplicates. */ + public static final String INDEXER_DELETED_DUPLICATES_TOTAL = "deleted_duplicates_total"; + + /** Documents deleted by indexing filter. */ + public static final String INDEXER_DELETED_BY_INDEXING_FILTER_TOTAL = "deleted_by_indexing_filter_total"; + + /** Documents skipped (not modified). */ + public static final String INDEXER_SKIPPED_NOT_MODIFIED_TOTAL = "skipped_not_modified_total"; + + /** Documents skipped by indexing filter. */ + public static final String INDEXER_SKIPPED_BY_INDEXING_FILTER_TOTAL = "skipped_by_indexing_filter_total"; + + /** Scoring filter errors. */ + public static final String INDEXER_ERRORS_SCORING_FILTER_TOTAL = "errors_scoring_filter_total"; + + /** Indexing filter errors. 
*/ + public static final String INDEXER_ERRORS_INDEXING_FILTER_TOTAL = "errors_indexing_filter_total"; + + /** Documents indexed (added or updated). */ + public static final String INDEXER_INDEXED_TOTAL = "indexed_total"; + + // ========================================================================= + // CrawlDb Counters + // ========================================================================= + + /** URLs filtered during CrawlDb operations. */ + public static final String CRAWLDB_URLS_FILTERED_TOTAL = "urls_filtered_total"; + + /** Gone (404) records removed during CrawlDb operations. */ + public static final String CRAWLDB_GONE_RECORDS_REMOVED_TOTAL = "gone_records_removed_total"; + + /** Orphan records removed during CrawlDb operations. */ + public static final String CRAWLDB_ORPHAN_RECORDS_REMOVED_TOTAL = "orphan_records_removed_total"; + + // ========================================================================= + // Injector Counters + // ========================================================================= + + /** URLs filtered during injection. */ + public static final String INJECTOR_URLS_FILTERED_TOTAL = "urls_filtered_total"; + + /** URLs injected. */ + public static final String INJECTOR_URLS_INJECTED_TOTAL = "urls_injected_total"; + + /** Unique URLs injected. */ + public static final String INJECTOR_URLS_INJECTED_UNIQUE_TOTAL = "urls_injected_unique_total"; + + /** URLs merged with existing CrawlDb entries. */ + public static final String INJECTOR_URLS_MERGED_TOTAL = "urls_merged_total"; + + /** URLs purged due to 404 status. */ + public static final String INJECTOR_URLS_PURGED_404_TOTAL = "urls_purged_404_total"; + + /** URLs purged by filter. 
*/ + public static final String INJECTOR_URLS_PURGED_FILTER_TOTAL = "urls_purged_filter_total"; + + // ========================================================================= + // HostDb Counters + // ========================================================================= + + /** Malformed URLs in HostDb. */ + public static final String HOSTDB_MALFORMED_URL_TOTAL = "malformed_url_total"; + + /** Records filtered in HostDb. */ + public static final String HOSTDB_FILTERED_RECORDS_TOTAL = "filtered_records_total"; + + /** Total hosts processed. */ + public static final String HOSTDB_TOTAL_HOSTS_TOTAL = "total_hosts_total"; + + /** Hosts skipped (not eligible). */ + public static final String HOSTDB_SKIPPED_NOT_ELIGIBLE_TOTAL = "skipped_not_eligible_total"; + + /** Hosts where URL limit was not reached. */ + public static final String HOSTDB_URL_LIMIT_NOT_REACHED_TOTAL = "url_limit_not_reached_total"; + + /** New known hosts discovered. */ + public static final String HOSTDB_NEW_KNOWN_HOST_TOTAL = "new_known_host_total"; + + /** Rediscovered hosts. */ + public static final String HOSTDB_REDISCOVERED_HOST_TOTAL = "rediscovered_host_total"; + + /** Existing known hosts. */ + public static final String HOSTDB_EXISTING_KNOWN_HOST_TOTAL = "existing_known_host_total"; + + /** New unknown hosts. */ + public static final String HOSTDB_NEW_UNKNOWN_HOST_TOTAL = "new_unknown_host_total"; + + /** Existing unknown hosts. */ + public static final String HOSTDB_EXISTING_UNKNOWN_HOST_TOTAL = "existing_unknown_host_total"; + + /** Purged unknown hosts. */ + public static final String HOSTDB_PURGED_UNKNOWN_HOST_TOTAL = "purged_unknown_host_total"; + + /** Hosts checked. */ + public static final String HOSTDB_CHECKED_HOSTS_TOTAL = "checked_hosts_total"; + + // ========================================================================= + // Deduplication Counters + // ========================================================================= + + /** Documents marked as duplicate. 
*/ + public static final String DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL = "documents_marked_duplicate_total"; + + // ========================================================================= + // Cleaning Job Counters + // ========================================================================= + + /** Documents deleted during cleaning. */ + public static final String CLEANING_DELETED_DOCUMENTS_TOTAL = "deleted_documents_total"; + + // ========================================================================= + // WebGraph Counters + // ========================================================================= + + /** Links added to WebGraph. */ + public static final String WEBGRAPH_ADDED_LINKS_TOTAL = "added_links_total"; + + /** Links removed from WebGraph. */ + public static final String WEBGRAPH_REMOVED_LINKS_TOTAL = "removed_links_total"; + + // ========================================================================= + // Sitemap Counters + // ========================================================================= + + /** Filtered records in sitemap processing. */ + public static final String SITEMAP_FILTERED_RECORDS_TOTAL = "filtered_records_total"; + + /** Seeds extracted from sitemaps. */ + public static final String SITEMAP_SEEDS_TOTAL = "sitemap_seeds_total"; + + /** Sitemaps discovered from hostname. */ + public static final String SITEMAP_FROM_HOSTNAME_TOTAL = "sitemaps_from_hostname_total"; + + /** Sitemaps filtered from hostname. */ + public static final String SITEMAP_FILTERED_FROM_HOSTNAME_TOTAL = "filtered_sitemaps_from_hostname_total"; + + /** Failed sitemap fetches. */ + public static final String SITEMAP_FAILED_FETCHES_TOTAL = "failed_fetches_total"; + + /** Existing sitemap entries. */ + public static final String SITEMAP_EXISTING_ENTRIES_TOTAL = "existing_sitemap_entries_total"; + + /** New sitemap entries. 
*/ + public static final String SITEMAP_NEW_ENTRIES_TOTAL = "new_sitemap_entries_total"; + + // ========================================================================= + // WARC Exporter Counters + // ========================================================================= + + /** Missing content in WARC export. */ + public static final String WARC_MISSING_CONTENT_TOTAL = "missing_content_total"; + + /** Missing metadata in WARC export. */ + public static final String WARC_MISSING_METADATA_TOTAL = "missing_metadata_total"; + + /** Omitted empty responses in WARC export. */ + public static final String WARC_OMITTED_EMPTY_RESPONSE_TOTAL = "omitted_empty_response_total"; + + /** Invalid URIs in WARC export. */ + public static final String WARC_INVALID_URI_TOTAL = "invalid_uri_total"; + + /** WARC records generated. */ + public static final String WARC_RECORDS_GENERATED_TOTAL = "records_generated_total"; + + /** Exceptions during WARC export. */ + public static final String WARC_EXCEPTION_TOTAL = "exception_total"; + + // ========================================================================= + // Domain Statistics Counters (enum-based, kept for compatibility) + // ========================================================================= + + /** Fetched URLs in domain statistics. */ + public static final String DOMAIN_STATS_FETCHED_TOTAL = "fetched_total"; + + /** Not fetched URLs in domain statistics. */ + public static final String DOMAIN_STATS_NOT_FETCHED_TOTAL = "not_fetched_total"; + + /** Empty results in domain statistics. 
*/ + public static final String DOMAIN_STATS_EMPTY_RESULT_TOTAL = "empty_result_total"; +} + diff --git a/src/java/org/apache/nutch/metrics/package-info.java b/src/java/org/apache/nutch/metrics/package-info.java new file mode 100644 index 0000000000..376605d043 --- /dev/null +++ b/src/java/org/apache/nutch/metrics/package-info.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Metrics infrastructure for Apache Nutch. + * + *

<p>This package provides centralized constants and utilities for Hadoop + * MapReduce metrics/counters following + * <a href="https://prometheus.io/docs/practices/naming/">Prometheus naming + * conventions</a>. + * + * <p>
The main class is {@link org.apache.nutch.metrics.NutchMetrics} which + * defines all counter group names and counter names as constants. + * + * @since 1.22 + */ +package org.apache.nutch.metrics; + diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java index 6b2fb5cee7..5ec74ea9fe 100644 --- a/src/java/org/apache/nutch/parse/ParseSegment.java +++ b/src/java/org/apache/nutch/parse/ParseSegment.java @@ -37,6 +37,7 @@ import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.Content; import org.apache.nutch.scoring.ScoringFilterException; @@ -129,7 +130,8 @@ public void map(WritableComparable key, Content content, Parse parse = entry.getValue(); ParseStatus parseStatus = parse.getData().getStatus(); - context.getCounter("ParserStatus", + // Dynamic counter based on parse status + context.getCounter(NutchMetrics.GROUP_PARSER, ParseStatus.majorCodes[parseStatus.getMajorCode()]).increment(1); if (!parseStatus.isSuccess()) { diff --git a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java index 4daefcd8f3..0b728a588c 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java +++ b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java @@ -58,6 +58,7 @@ import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.parse.Outlink; @@ -361,14 +362,16 @@ public void reduce(Text key, Iterable values, mostRecent = timestamp; } outlinkList.add(WritableUtils.clone(next, conf)); - 
context.getCounter("WebGraph.outlinks", "added links").increment(1); + context.getCounter(NutchMetrics.GROUP_WEBGRAPH, + NutchMetrics.WEBGRAPH_ADDED_LINKS_TOTAL).increment(1); } else if (value instanceof BooleanWritable) { BooleanWritable delete = (BooleanWritable) value; // Actually, delete is always true, otherwise we don't emit it in the // mapper in the first place if (delete.get() == true) { // This page is gone, do not emit it's outlinks - context.getCounter("WebGraph.outlinks", "removed links").increment(1); + context.getCounter(NutchMetrics.GROUP_WEBGRAPH, + NutchMetrics.WEBGRAPH_REMOVED_LINKS_TOTAL).increment(1); return; } } diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java index bf824f9b3f..df4f6af057 100644 --- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java +++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java @@ -57,6 +57,7 @@ import org.apache.nutch.parse.ParseText; import org.apache.nutch.protocol.Content; import org.apache.nutch.tools.WARCUtils; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -147,13 +148,15 @@ public void reduce(Text key, Iterable values, // check that we have everything we need if (content == null) { LOG.info("Missing content for {}", key); - context.getCounter("WARCExporter", "missing content").increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, + NutchMetrics.WARC_MISSING_CONTENT_TOTAL).increment(1); return; } if (cd == null) { LOG.info("Missing fetch datum for {}", key); - context.getCounter("WARCExporter", "missing metadata").increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, + NutchMetrics.WARC_MISSING_METADATA_TOTAL).increment(1); return; } @@ -161,8 +164,8 @@ public void reduce(Text key, Iterable values, // Empty responses is everything that was not a regular response 
if (!(cd.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS || cd.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED)) { - context.getCounter("WARCExporter", "omitted empty response") - .increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, + NutchMetrics.WARC_OMITTED_EMPTY_RESPONSE_TOTAL).increment(1); return; } } @@ -237,7 +240,8 @@ public void reduce(Text key, Iterable values, .append(uri.toASCIIString()).append(CRLF); } catch (Exception e) { LOG.error("Invalid URI {} ", key); - context.getCounter("WARCExporter", "invalid URI").increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, + NutchMetrics.WARC_INVALID_URI_TOTAL).increment(1); return; } @@ -269,12 +273,14 @@ public void reduce(Text key, Iterable values, new ByteArrayInputStream(bos.toByteArray())); WARCRecord record = new WARCRecord(in); context.write(NullWritable.get(), new WARCWritable(record)); - context.getCounter("WARCExporter", "records generated").increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, + NutchMetrics.WARC_RECORDS_GENERATED_TOTAL).increment(1); } catch (IOException | IllegalStateException exception) { LOG.error( "Exception when generating WARC resource record for {} : {}", key, exception.getMessage()); - context.getCounter("WARCExporter", "exception").increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, + NutchMetrics.WARC_EXCEPTION_TOTAL).increment(1); } // Do we need to emit a metadata record too? 
@@ -316,7 +322,8 @@ public void reduce(Text key, Iterable values, .append(uri.toASCIIString()).append(CRLF); } catch (Exception e) { LOG.error("Invalid URI {} ", key); - context.getCounter("WARCExporter", "invalid URI").increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, + NutchMetrics.WARC_INVALID_URI_TOTAL).increment(1); return; } @@ -332,13 +339,14 @@ public void reduce(Text key, Iterable values, new ByteArrayInputStream(bos.toByteArray())); WARCRecord record = new WARCRecord(in); context.write(NullWritable.get(), new WARCWritable(record)); - context.getCounter("WARCExporter", "records generated") - .increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, + NutchMetrics.WARC_RECORDS_GENERATED_TOTAL).increment(1); } catch (IOException | IllegalStateException exception) { LOG.error( "Exception when generating WARC metadata record for {} : {}", key, exception.getMessage(), exception); - context.getCounter("WARCExporter", "exception").increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, + NutchMetrics.WARC_EXCEPTION_TOTAL).increment(1); } } @@ -376,7 +384,8 @@ public void reduce(Text key, Iterable values, .append(uri.toASCIIString()).append(CRLF); } catch (Exception e) { LOG.error("Invalid URI {} ", key); - context.getCounter("WARCExporter", "invalid URI").increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, + NutchMetrics.WARC_INVALID_URI_TOTAL).increment(1); return; } @@ -392,13 +401,14 @@ public void reduce(Text key, Iterable values, new ByteArrayInputStream(bos.toByteArray())); WARCRecord record = new WARCRecord(in); context.write(NullWritable.get(), new WARCWritable(record)); - context.getCounter("WARCExporter", "records generated") - .increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, + NutchMetrics.WARC_RECORDS_GENERATED_TOTAL).increment(1); } catch (IOException | IllegalStateException exception) { LOG.error( "Exception when generating WARC metadata record for {} : {}", key, 
exception.getMessage(), exception); - context.getCounter("WARCExporter", "exception").increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, + NutchMetrics.WARC_EXCEPTION_TOTAL).increment(1); } } } diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java index d83a6e358c..7055a6d86a 100644 --- a/src/java/org/apache/nutch/util/SitemapProcessor.java +++ b/src/java/org/apache/nutch/util/SitemapProcessor.java @@ -45,6 +45,7 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.hostdb.HostDatum; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.protocol.Content; @@ -161,11 +162,13 @@ else if (value instanceof Text) { url.startsWith("file:/")) { // For entry from sitemap urls file, fetch the sitemap, extract urls and emit those if((url = filterNormalize(url)) == null) { - context.getCounter("Sitemap", "filtered_records").increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP, + NutchMetrics.SITEMAP_FILTERED_RECORDS_TOTAL).increment(1); return; } - context.getCounter("Sitemap", "sitemap_seeds").increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP, + NutchMetrics.SITEMAP_SEEDS_TOTAL).increment(1); generateSitemapUrlDatum(protocolFactory.getProtocol(url), url, context); } else { LOG.info("generateSitemapsFromHostname: {}", key.toString()); @@ -203,7 +206,8 @@ private void generateSitemapsFromHostname(String host, Context context) { (url = filterNormalize("https://" + host + "/")) == null && (url = filterNormalize("ftp://" + host + "/")) == null && (url = filterNormalize("file:/" + host + "/")) == null) { - context.getCounter("Sitemap", "filtered_records").increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP, + NutchMetrics.SITEMAP_FILTERED_RECORDS_TOTAL).increment(1); return; } // We may wish to use the robots.txt 
content as the third parameter for .getRobotRules @@ -214,11 +218,12 @@ private void generateSitemapsFromHostname(String host, Context context) { sitemaps.add(url + "sitemap.xml"); } for (String sitemap : sitemaps) { - context.getCounter("Sitemap", "sitemaps_from_hostname").increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP, + NutchMetrics.SITEMAP_FROM_HOSTNAME_TOTAL).increment(1); sitemap = filterNormalize(sitemap); if (sitemap == null) { - context.getCounter("Sitemap", "filtered_sitemaps_from_hostname") - .increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP, + NutchMetrics.SITEMAP_FILTERED_FROM_HOSTNAME_TOTAL).increment(1); } else { generateSitemapUrlDatum(protocolFactory.getProtocol(sitemap), sitemap, context); @@ -254,7 +259,8 @@ private void generateSitemapUrlDatum(Protocol protocol, String url, Context cont if(status.getCode() != ProtocolStatus.SUCCESS) { // If there were any problems fetching the sitemap, log the error and let it go. Not sure how often // sitemaps are redirected. In future we might have to handle redirects. - context.getCounter("Sitemap", "failed_fetches").increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP, + NutchMetrics.SITEMAP_FAILED_FETCHES_TOTAL).increment(1); LOG.error("Error while fetching the sitemap. 
Status code: {} for {}", status.getCode(), url); return; } @@ -373,12 +379,14 @@ public void reduce(Text key, Iterable values, Context context) originalDatum.setModifiedTime(sitemapDatum.getModifiedTime()); } - context.getCounter("Sitemap", "existing_sitemap_entries").increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP, + NutchMetrics.SITEMAP_EXISTING_ENTRIES_TOTAL).increment(1); context.write(key, originalDatum); } else if(sitemapDatum != null) { // For the newly discovered links via sitemap, set the status as unfetched and emit - context.getCounter("Sitemap", "new_sitemap_entries").increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP, + NutchMetrics.SITEMAP_NEW_ENTRIES_TOTAL).increment(1); sitemapDatum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED); context.write(key, sitemapDatum); } From 595cf6c1c7c9a27f7ff4087450b840e5506ecf5c Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 25 Feb 2026 21:15:20 +0100 Subject: [PATCH 03/27] NUTCH-3132 Standardize existing Nutch metrics naming and implementation Apply metrics naming conventions to CCF-specific classes and extensions. 
--- .../apache/nutch/crawl/DedupRedirectsJob.java | 25 +- .../org/apache/nutch/crawl/Generator2.java | 50 ++-- .../apache/nutch/crawl/SitemapInjector.java | 212 ++++++++------- .../apache/nutch/fetcher/FetcherThread.java | 41 ++- .../apache/nutch/metrics/NutchMetrics.java | 246 ++++++++++++++++++ .../org/commoncrawl/tools/UrlCleaner.java | 25 +- .../org/commoncrawl/tools/UrlSampler.java | 23 +- .../org/commoncrawl/tools/UrlSamplerHost.java | 69 +++-- 8 files changed, 516 insertions(+), 175 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/DedupRedirectsJob.java b/src/java/org/apache/nutch/crawl/DedupRedirectsJob.java index 5c82b6d6b2..3b77878211 100644 --- a/src/java/org/apache/nutch/crawl/DedupRedirectsJob.java +++ b/src/java/org/apache/nutch/crawl/DedupRedirectsJob.java @@ -36,6 +36,7 @@ import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.protocol.ProtocolStatus; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -154,12 +155,14 @@ public void map(Text key, CrawlDatum value, Context context) // value.getMetaData().put(urlKey, key); Text redirKey = new Text(redirTarget); - context.getCounter("DeduplicationJobStatus", "Redirects in CrawlDb") - .increment(1); + context.getCounter(NutchMetrics.GROUP_DEDUP, + NutchMetrics.DEDUP_REDIRECTS_IN_CRAWLDB_TOTAL).increment(1); if (redirKey.equals(key)) { // exclude self-referential redirects - context.getCounter("DeduplicationJobStatus", - "Self-referential redirects in CrawlDb").increment(1); + context + .getCounter(NutchMetrics.GROUP_DEDUP, + NutchMetrics.DEDUP_REDIRECTS_SELF_REFERENTIAL_TOTAL) + .increment(1); } else { context.write(redirKey, value); } @@ -219,16 +222,15 @@ public void reduce(Text key, Iterable values, Context context) // duplicate! 
unsetDuplicateStatus(existingDoc); context.write(origURL, existingDoc); - context.getCounter("DeduplicationJobStatus", - "Redirects kept as non-duplicates").increment(1); + context.getCounter(NutchMetrics.GROUP_DEDUP, + NutchMetrics.DEDUP_REDIRECTS_NOT_DUPLICATES_TOTAL).increment(1); } else { // (c) it is a self-referential redirect String targetURL = getTargetURL(existingDoc); if (key.toString().equals(targetURL)) { context.write(key, existingDoc); - context - .getCounter("DeduplicationJobStatus", - "Self-referential redirects kept as non-duplicates") + context.getCounter(NutchMetrics.GROUP_DEDUP, + NutchMetrics.DEDUP_REDIRECTS_SELF_REFERENTIAL_NOT_DUPLICATES_TOTAL) .increment(1); } // else: ignore redirects emitted under original URL because they are @@ -306,9 +308,10 @@ public int run(String[] args) throws IOException { fs.delete(tempDir, true); throw new RuntimeException(message); } - CounterGroup g = job.getCounters().getGroup("DeduplicationJobStatus"); + CounterGroup g = job.getCounters().getGroup(NutchMetrics.GROUP_DEDUP); if (g != null) { - Counter counter = g.findCounter("Documents marked as duplicate"); + Counter counter = g + .findCounter(NutchMetrics.DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL); numDuplicates = counter.getValue(); LOG.info("Deduplication: {} documents marked as duplicates", numDuplicates); diff --git a/src/java/org/apache/nutch/crawl/Generator2.java b/src/java/org/apache/nutch/crawl/Generator2.java index 6de2adab81..0e678a7330 100644 --- a/src/java/org/apache/nutch/crawl/Generator2.java +++ b/src/java/org/apache/nutch/crawl/Generator2.java @@ -65,6 +65,7 @@ import org.apache.hadoop.util.hash.MurmurHash; import org.apache.nutch.crawl.Generator2.SelectorReducer.DomainLimits; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilterException; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; @@ -375,12 +376,18 @@ public void map(Text key, 
CrawlDatum value, Context context) // URLFilters try { if (filters.filter(urlString) == null) { - context.getCounter("Generator", "URL_FILTERS_REJECTED").increment(1); + context + .getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_URL_FILTERS_REJECTED_TOTAL) + .increment(1); return; } } catch (URLFilterException e) { LOG.warn("Couldn't filter url {}: {}", key, e.getMessage()); - context.getCounter("Generator", "URL_FILTER_EXCEPTION").increment(1); + context + .getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_URL_FILTER_EXCEPTION_TOTAL) + .increment(1); } } @@ -388,7 +395,8 @@ public void map(Text key, CrawlDatum value, Context context) if (!schedule.shouldFetch(key, value, curTime)) { LOG.debug("-shouldFetch rejected '{}', fetchTime={}, curTime={}", key, value.getFetchTime(), curTime); - context.getCounter("Schedule rejected by status", + context.getCounter( + NutchMetrics.GROUP_GENERATOR_SCHEDULE_REJECTED_BY_STATUS, CrawlDatum.getStatusName(value.getStatus())).increment(1); return; } @@ -413,8 +421,10 @@ public void map(Text key, CrawlDatum value, Context context) // consider only entries with a score superior to the threshold if (!Float.isNaN(scoreThreshold) && sort < scoreThreshold) { - context.getCounter("Score below threshold by status", - CrawlDatum.getStatusName(value.getStatus())).increment(1); + context + .getCounter(NutchMetrics.GROUP_GENERATOR_SCORE_REJECTED_BY_STATUS, + CrawlDatum.getStatusName(value.getStatus())) + .increment(1); return; } @@ -440,7 +450,8 @@ public void map(Text key, CrawlDatum value, Context context) } catch (Exception e) { LOG.warn("Malformed URL: '{}', skipping ({})", urlString, e.getMessage()); - context.getCounter("Generator", "MALFORMED_URL").increment(1); + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_MALFORMED_URL_TOTAL).increment(1); return; } @@ -738,7 +749,8 @@ public void reduce(DomainScorePair key, Iterable values, LOG.info( "Host or domain {} has more than {} 
URLs for all {} segments. Additional URLs won't be included in the fetchlist.", key.getDomain(), maxCountTotal, maxNumSegments); - context.getCounter("Generator", "SKIPPED_DOMAINS_OVERFLOW") + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_DOMAINS_AFFECTED_PER_DOMAIN_OVERFLOW_TOTAL) .increment(1); maxUrlsOverflow = true; break; @@ -784,11 +796,14 @@ public void reduce(DomainScorePair key, Iterable values, LOG.info( "Host {}{} (domain: {}) has more than {} URLs for all {} segments. Additional URLs won't be included in the fetchlist.", host, domain, domain, maxCountPerHostTotal, maxNumSegments); - context.getCounter("Generator", "SKIPPED_HOSTS_NUM_URLS_OVERFLOW") - .increment(1); + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL) + .increment(1); } - context.getCounter("Generator", "SKIPPED_URLS_HOST_OVERFLOW") - .increment(1); + context + .getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL) + .increment(1); maxUrlsPerHostOverflowCount++; counts[0]++; continue; @@ -819,17 +834,19 @@ public void reduce(DomainScorePair key, Iterable values, } } - context.getCounter("Selected by status", + context.getCounter(NutchMetrics.GROUP_GENERATOR_SELECTED_BY_STATUS, CrawlDatum.getStatusName(entry.datum.getStatus())).increment(1); context.write(key.getScore(), entry); } if (maxHostsOverflowCount > 0) { - context.getCounter("Generator", "SKIPPED_DOMAINS_NUM_HOSTS_OVERFLOW") + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_DOMAINS_AFFECTED_PER_MAX_NUM_HOSTS_OVERFLOW_TOTAL) .increment(1); - context.getCounter("Generator", "SKIPPED_URLS_NUM_HOSTS_OVERFLOW") - .increment(maxHostsOverflowCount); + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_URLS_SKIPPED_PER_MAX_NUM_HOSTS_OVERFLOW_TOTAL) + .increment(maxHostsOverflowCount); LOG.info( "Domain {} has more than {} hosts, skipped {} URLs from 
remaining hosts", key.getDomain(), maxHosts, maxHostsOverflowCount); @@ -1022,7 +1039,8 @@ public void reduce(SegmenterKey key, Iterable values, if (count < maxPerSegment) { mos.write("sequenceFiles", entry.url, entry, fileName); } else { - context.getCounter("Generator", "SKIPPED_RECORDS_SEGMENT_OVERFLOW") + context.getCounter(NutchMetrics.GROUP_GENERATOR, + NutchMetrics.GENERATOR_URLS_SKIPPED_PER_SEGMENT_OVERFLOW_TOTAL) .increment(1); if (count == maxPerSegment) { LOG.info( diff --git a/src/java/org/apache/nutch/crawl/SitemapInjector.java b/src/java/org/apache/nutch/crawl/SitemapInjector.java index f8e108874a..7dff68cf73 100644 --- a/src/java/org/apache/nutch/crawl/SitemapInjector.java +++ b/src/java/org/apache/nutch/crawl/SitemapInjector.java @@ -53,6 +53,7 @@ import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolFactory; @@ -343,10 +344,8 @@ public ProtocolOutput call() throws Exception { BaseRobotRules rules = protocol.getRobotRules(turl, null, null); if (!rules.isAllowed(url)) { LOG.info("Fetch of sitemap forbidden by robots.txt: {}", url); - context - .getCounter("SitemapInjector", - "failed to fetch sitemap content, robots.txt disallow") - .increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_ROBOTSTXT_DISALLOW_TOTAL).increment(1); return null; } } @@ -444,15 +443,16 @@ public void process(String url) { try { sitemap = parseSitemap(content, url); } catch (Exception e) { - context.getCounter("SitemapInjector", "sitemaps failed to parse") - .increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_FAILED_TO_PARSE_TOTAL).increment(1); LOG.warn("failed to parse sitemap {}: {}", url, StringUtils.stringifyException(e)); return; } 
LOG.info("parsed sitemap {} ({})", url, sitemap.getType()); context - .getCounter("SitemapInjector", "sitemap type: " + sitemap.getType()) + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_TYPE_PREFIX + sitemap.getType()) .increment(1); if (checkCrossSubmits) { @@ -519,14 +519,16 @@ public void processSitemap(AbstractSiteMap sitemap, return; } - context.getCounter("SitemapInjector", "sitemaps processed") - .increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_PROCESSED_TOTAL).increment(1); injectURLs((SiteMap) sitemap); if (totalUrls >= maxUrls) { - LOG.warn("URL limit reached, skipped remaining urls of {}", + LOG.warn( + "Sitemap index URL limit reached, skipped remaining urls of {}", sitemap.getUrl()); context - .getCounter("SitemapInjector", "sitemap index: URL limit reached") + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_INDEX_AFFECTED_BY_URL_LIMIT_TOTAL) .increment(1); } sitemap.setProcessed(true); @@ -543,8 +545,10 @@ private void processSitemapIndex(SiteMapIndex sitemapIndex, LOG.warn( "Depth limit reached recursively processing sitemap index {}", sitemapIndex.getUrl()); - context.getCounter("SitemapInjector", - "sitemap index: depth limit reached").increment(1); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_INDEX_AFFECTED_BY_DEPTH_LIMIT_TOTAL) + .increment(1); return; } @@ -557,10 +561,8 @@ private void processSitemapIndex(SiteMapIndex sitemapIndex, double publishScore = 0.3; if (s.getLastModified() != null) { double elapsedMonthsSincePublished = (System.currentTimeMillis() - - s.getLastModified().getTime()) - / (1000.0 * 60 * 60 * 24 * 30); - publishScore = (1.0 - / Math.log(1.0 + elapsedMonthsSincePublished)); + - s.getLastModified().getTime()) / (1000.0 * 60 * 60 * 24 * 30); + publishScore = (1.0 / Math.log(1.0 + elapsedMonthsSincePublished)); } double score = (1.0 / subSitemaps) + publishScore + Math.random(); 
sitemaps.add(new ScoredSitemap(score, s)); @@ -574,18 +576,18 @@ private void processSitemapIndex(SiteMapIndex sitemapIndex, LOG.warn( "Max. processing time reached, skipped remaining sitemaps of sitemap index {}", sitemapIndex.getUrl()); - context.getCounter("SitemapInjector", - "sitemap index: time limit reached").increment(1); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_INDEX_AFFECTED_BY_TIME_LIMIT_TOTAL) + .increment(1); return; } - if ((totalUrls == 0) - && (elapsed > (maxSitemapProcessingTime / 2))) { + if ((totalUrls == 0) && (elapsed > (maxSitemapProcessingTime / 2))) { LOG.warn( "Half of processing time elapsed and no URLs injected, skipped remaining sitemaps of sitemap index {}", sitemapIndex.getUrl()); - context - .getCounter("SitemapInjector", - "sitemap index: no URLs after 50% of time limit") + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_INDEX_NO_URLS_AFTER_50_PERCENT_OF_TIME_LIMIT_TOTAL) .increment(1); return; } @@ -594,29 +596,34 @@ private void processSitemapIndex(SiteMapIndex sitemapIndex, LOG.warn( "Too many failures, skipped remaining sitemaps of sitemap index {}", sitemapIndex.getUrl()); - context.getCounter("SitemapInjector", - "sitemap index: too many failures").increment(1); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_INDEX_TOO_MANY_FAILURES_TOTAL) + .increment(1); return; } AbstractSiteMap nextSitemap = sitemaps.poll().sitemap; - context.getCounter("SitemapInjector", "sitemap index: processed sitemaps") + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_INDEX_PROCESSED_SITEMAPS_TOTAL) .increment(1); String url = nextSitemap.getUrl().toString(); if (processedSitemaps.contains(url)) { LOG.warn("skipped duplicated or recursive sitemap URL {}", url); - context.getCounter("SitemapInjector", - "skipped duplicated or recursive sitemap URLs").increment(1); + 
context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_SKIPPED_DUPLICATE_OR_RECURSIVE_URL_TOTAL) + .increment(1); nextSitemap.setProcessed(true); continue; } if (processedSitemaps.size() > maxRecursiveSitemaps) { - LOG.warn( - "{} sitemaps processed for {}, skipped remaining sitemaps", + LOG.warn("{} sitemaps processed for {}, skipped remaining sitemaps", processedSitemaps.size(), sitemapIndex.getUrl()); context - .getCounter("SitemapInjector", "sitemap index limit reached") + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_INDEX_MAX_SITEMAPS_LIMIT_TOTAL) .increment(1); return; } @@ -624,8 +631,10 @@ private void processSitemapIndex(SiteMapIndex sitemapIndex, LOG.warn( "URL limit reached, skipped remaining sitemaps of sitemap index {}", sitemapIndex.getUrl()); - context.getCounter("SitemapInjector", - "sitemap index: URL limit reached").increment(1); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_INDEX_AFFECTED_BY_URL_LIMIT_TOTAL) + .increment(1); return; } @@ -634,21 +643,20 @@ private void processSitemapIndex(SiteMapIndex sitemapIndex, Content content = getContent(url); if (content == null) { nextSitemap.setProcessed(true); - context.getCounter("SitemapInjector", "sitemaps failed to fetch") - .increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_FAILED_TO_FETCH_TOTAL).increment(1); failedSubSitemaps++; continue; } try { - AbstractSiteMap parsedSitemap = parseSitemap(content, - nextSitemap); + AbstractSiteMap parsedSitemap = parseSitemap(content, nextSitemap); processSitemap(parsedSitemap, processedSitemaps, depth); } catch (Exception e) { LOG.warn("failed to parse sitemap {}: {}", nextSitemap.getUrl(), StringUtils.stringifyException(e)); - context.getCounter("SitemapInjector", "sitemaps failed to parse") - .increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_FAILED_TO_PARSE_TOTAL).increment(1); 
failedSubSitemaps++; } nextSitemap.setProcessed(true); @@ -661,8 +669,8 @@ private Content getContent(String url) { LOG.warn( "Not fetching sitemap with overlong URL: {} ... (truncated, length = {} characters)", url.substring(0, maxUrlLength), url.length()); - context.getCounter("SitemapInjector", "sitemap overlong URL") - .increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_SKIPPED_OVERLONG_URL_TOTAL).increment(1); return null; } String origUrl = url; @@ -670,7 +678,8 @@ private Content getContent(String url) { if (url == null) { LOG.warn("Sitemap rejected by URL filters: {}", origUrl); context - .getCounter("SitemapInjector", "sitemap rejected by URL filters") + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_REJECTED_BY_URL_FILTERS_TOTAL) .increment(1); return null; } @@ -683,8 +692,10 @@ private Content getContent(String url) { if (failuresPerHost.containsKey(hostName) && failuresPerHost.get(hostName) > maxFailuresPerHost) { LOG.info("Skipped, too many failures per host: {}", url); - context.getCounter("SitemapInjector", - "skipped, too many failures per host").increment(1); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_SKIPPED_TOO_MANY_FAILURES_PER_HOST_TOTAL) + .increment(1); return null; } Protocol protocol = null; @@ -693,8 +704,8 @@ private Content getContent(String url) { } catch (ProtocolNotFound e) { LOG.error("Protocol not found: {}", url); context - .getCounter("SitemapInjector", - "failed to fetch sitemap content, protocol not found") + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_PROTOCOL_NOT_SUPPORTED_TOTAL) .increment(1); return null; } @@ -715,14 +726,16 @@ private Content getContent(String url) { } catch (Exception e) { if (e instanceof TimeoutException) { LOG.error("fetch of sitemap {} timed out", url); - context.getCounter("SitemapInjector", - "failed to fetch sitemap content, timeout").increment(1); + context + 
.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_FAILED_TO_FETCH_TIMEOUT_TOTAL) + .increment(1); } else { LOG.error("fetch of sitemap {} failed with: {}", url, StringUtils.stringifyException(e)); context - .getCounter("SitemapInjector", - "failed to fetch sitemap content, exception") + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_FAILED_TO_FETCH_EXCEPTION_TOTAL) .increment(1); } task.cancel(true); @@ -737,17 +750,16 @@ private Content getContent(String url) { } if (protocolOutput.getStatus().isRedirect()) { - context.getCounter("SitemapInjector", "sitemap redirect") - .increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_REDIRECT_TOTAL).increment(1); String redirUrl = protocolOutput.getStatus().getArgs()[0]; url = filterNormalize(redirUrl); if (url == null) { LOG.info( "Redirect target of sitemap {} rejected by URL filters: {}", origUrl, redirUrl); - context - .getCounter("SitemapInjector", - "sitemap (redirect target) rejected by URL filters") + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_REDIRECT_TARGET_REJECTED_BY_URL_FILTERS_TOTAL) .increment(1); return null; } @@ -766,8 +778,10 @@ private Content getContent(String url) { redirects++; if (redirects >= maxRedirect) { LOG.warn("sitemap redirect limit exceeded: {}", origUrl); - context.getCounter("SitemapInjector", - "sitemap redirect limit exceeded").increment(1); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_REDIRECT_LIMIT_EXCEEDED_TOTAL) + .increment(1); // return to avoid that exceeded redirects are counted twice // (also as non-success fetch status) return null; @@ -779,9 +793,8 @@ private Content getContent(String url) { if (!protocolOutput.getStatus().isSuccess()) { LOG.error("fetch of sitemap {} failed with status code {}", url, protocolOutput.getStatus().getCode()); - context - .getCounter("SitemapInjector", - "failed to fetch sitemap content, 
HTTP status != 200") + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_FAILED_TO_FETCH_CONTENT_HTTP_STATUS_CODE_NOT_200_TOTAL) .increment(1); incrementFailuresPerHost(hostName); return null; @@ -791,10 +804,8 @@ private Content getContent(String url) { if (content == null) { LOG.error("No content for {}, status: {}", url, protocolOutput.getStatus().getMessage()); - context - .getCounter("SitemapInjector", - "failed to fetch sitemap content, empty content") - .increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_EMPTY_CONTENT_TOTAL).increment(1); incrementFailuresPerHost(hostName); return null; } @@ -826,7 +837,8 @@ public void injectURLs(SiteMap sitemap) Collection sitemapURLs = sitemap.getSiteMapUrls(); if (sitemapURLs.size() == 0) { LOG.info("No URLs in sitemap {}", sitemap.getUrl()); - context.getCounter("SitemapInjector", "empty sitemap").increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_EMPTY_TOTAL).increment(1); return; } LOG.info("Found {} URLs in {}", sitemapURLs.size(), sitemap.getUrl()); @@ -852,8 +864,8 @@ public void injectURLs(SiteMap sitemap) for (SiteMapURL siteMapURL : sitemapURLs) { if (totalUrls >= maxUrls) { - context.getCounter("SitemapInjector", "sitemap URL limit reached") - .increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_URL_LIMIT_REACHED_TOTAL).increment(1); LOG.info("URL limit ({}) reached for {}", maxUrls, sitemap.getUrl()); break; @@ -861,7 +873,8 @@ public void injectURLs(SiteMap sitemap) if (random != null) { if (randomSelect > random.nextFloat()) { - context.getCounter("SitemapInjector", "random skip").increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_RANDOM_SKIP_TOTAL).increment(1); continue; } } @@ -889,8 +902,8 @@ public void injectURLs(SiteMap sitemap) && !injectedHosts.contains(host)) { hostLimitRejected++; context - 
.getCounter("SitemapInjector", - "urls from sitemaps rejected, host limit reached") + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_URLS_SKIPPED_HOST_LIMIT_REACHED_TOTAL) .increment(1); continue; } @@ -905,8 +918,8 @@ public void injectURLs(SiteMap sitemap) } if (crossSubmit == null || !crossSubmits.contains(crossSubmit)) { crossSubmitsRejected++; - context.getCounter("SitemapInjector", - "urls from sitemaps rejected, target not allowed by cross-submits") + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_URLS_SKIPPED_NOT_ALLOWED_BY_CROSS_SUBMITS_TOTAL) .increment(1); continue; } @@ -918,8 +931,10 @@ public void injectURLs(SiteMap sitemap) url = null; } if (url == null) { - context.getCounter("SitemapInjector", - "urls from sitemaps rejected by URL filters").increment(1); + context + .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_URLS_FROM_REJECTED_BY_URL_FILTERS) + .increment(1); } else { // URL passed normalizers and filters totalUrls++; @@ -939,8 +954,8 @@ public void injectURLs(SiteMap sitemap) url, e.getMessage()); } - context.getCounter("SitemapInjector", "urls from sitemaps injected") - .increment(1); + context.getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, + NutchMetrics.SITEMAP_URLS_INJECTED).increment(1); context.write(value, datum); injectedHosts.add(host); } @@ -1089,7 +1104,7 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, } for (Counter counter : sitemapJob.getCounters() - .getGroup("SitemapInjector")) { + .getGroup(NutchMetrics.GROUP_SITEMAP_INJECTOR)) { LOG.info(String.format("SitemapInjector: %8d %s", counter.getValue(), counter.getName())); } @@ -1171,7 +1186,8 @@ public void usage() { "Usage: SitemapInjector [-D...] 
[-threads ] [-overwrite|-update] [-noFilter] [-noNormalize] [-filterNormalizeAll]\n"); System.err.println("\nFor sitemap URLs listed in seed input files:"); System.err.println("\t- fetch and parse the sitemap (step 1)"); - System.err.println("\t- inject URLs from sitemaps into the CrawlDb (step 2)"); + System.err + .println("\t- inject URLs from sitemaps into the CrawlDb (step 2)"); System.err.println( "\t- using fetch intervals and scores from sitemaps if applicable"); System.err.println("Options and properties of SitemapInjector"); @@ -1206,25 +1222,25 @@ public int run(String[] args) throws Exception { continue; } switch (args[i]) { - case "-threads": - i++; - if (i == args.length) { - usage("Argument -threads requires parameter"); - return -1; - } - threads = Integer.parseInt(args[i]); - break; - case "-keepTemp": - keepTemp = true; - break; - case "-step1": - runStepOneOnly = true; - break; - case "-step2": - runStepTwoOnly = true; - break; - default: - superArguments.add(args[i]); + case "-threads": + i++; + if (i == args.length) { + usage("Argument -threads requires parameter"); + return -1; + } + threads = Integer.parseInt(args[i]); + break; + case "-keepTemp": + keepTemp = true; + break; + case "-step1": + runStepOneOnly = true; + break; + case "-step2": + runStepTwoOnly = true; + break; + default: + superArguments.add(args[i]); } } if (runStepOneOnly && runStepTwoOnly) { diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index 66e560af64..26b3913622 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -185,6 +185,11 @@ public class FetcherThread extends Thread { private Counter aboveExceptionThresholdCounter; private Counter outlinksDetectedCounter; private Counter outlinksFollowingCounter; + private Counter robotsTxtArchivingFilteredCounter; + private Counter ipv4Counter; + private Counter ipv6Counter; + private 
Counter robotsTxtArchivingFilteredMimeCounter; + private Counter robotsTxtArchivingRobotsDeniedCounter; public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQueues fetchQueues, QueueFeeder feeder, AtomicInteger spinWaiting, AtomicLong lastRequestStart, FetcherRun.Context context, @@ -322,6 +327,21 @@ private void initCounters() { NutchMetrics.GROUP_FETCHER_OUTLINKS, NutchMetrics.FETCHER_OUTLINKS_DETECTED_TOTAL); outlinksFollowingCounter = context.getCounter( NutchMetrics.GROUP_FETCHER_OUTLINKS, NutchMetrics.FETCHER_OUTLINKS_FOLLOWING_TOTAL); + ipv4Counter = context.getCounter( + NutchMetrics.FETCHER_IP_ADDRESS_VERSION_GROUP, + NutchMetrics.FETCHER_IPV4_TOTAL); + ipv6Counter = context.getCounter( + NutchMetrics.FETCHER_IP_ADDRESS_VERSION_GROUP, + NutchMetrics.FETCHER_IPV6_TOTAL); + robotsTxtArchivingFilteredCounter = context.getCounter( + NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_GROUP, + NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_FILTERED_TOTAL); + robotsTxtArchivingFilteredMimeCounter = context.getCounter( + NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_GROUP, + NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_FILTERED_MIME_TOTAL); + robotsTxtArchivingRobotsDeniedCounter = context.getCounter( + NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_GROUP, + NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_ROBOTS_DENIED_TOTAL); } @Override @@ -732,21 +752,24 @@ private void countProtocolVersions(Metadata contentMetadata) { if (versionStr != null) { String[] versions = versionStr.split(","); if (versions.length >= 1) { - context.getCounter("HttpProtocolVersion", versions[0]).increment(1); + context.getCounter(NutchMetrics.FETCHER_HTTP_PROTOCOL_VERSION_GROUP, + versions[0]).increment(1); } else { - context.getCounter("HttpProtocolVersion", "unknown").increment(1); + context.getCounter(NutchMetrics.FETCHER_HTTP_PROTOCOL_VERSION_GROUP, + NutchMetrics.FETCHER_HTTP_PROTOCOL_UNKNOWN).increment(1); } for (int i = 1; i < versions.length; i++) { - 
context.getCounter("TlsProtocolVersion", versions[i]).increment(1); + context.getCounter(NutchMetrics.FETCHER_TLS_PROTOCOL_VERSION_GROUP, + versions[i]).increment(1); } } String ipaddress = contentMetadata.get(Response.IP_ADDRESS); if (ipaddress == null) { // IP address is not recorded } else if (ipaddress.indexOf(':') != -1) { - context.getCounter("IPaddressVersion", "IPv6").increment(1); + ipv6Counter.increment(1); } else { - context.getCounter("IPaddressVersion", "IPv4").increment(1); + ipv4Counter.increment(1); } } @@ -1051,7 +1074,7 @@ private boolean robotsTxtArchivingIsAllowed(Content robotsTxt) { if (robotsTxtArchivingFilterUrlAlways || !u.getFile().equals("/robots.txt")) { LOG.info("Archiving of robots.txt {} skipped by URL filters", url); - context.getCounter("RobotsTxtArchiving", "filtered").increment(1); + robotsTxtArchivingFilteredCounter.increment(1); return false; } @@ -1075,8 +1098,7 @@ private boolean robotsTxtArchivingIsAllowed(Content robotsTxt) { if (!robotsTxtArchivingAcceptedMimeTypes.contains(contentType)) { LOG.info("Archiving of robots.txt {} ({}) skipped by MIME filter", url, contentType); - context.getCounter("RobotsTxtArchiving", "filtered_mime") - .increment(1); + robotsTxtArchivingFilteredMimeCounter.increment(1); return false; } } @@ -1096,8 +1118,7 @@ private boolean robotsTxtArchivingIsAllowed(Content robotsTxt) { LOG.info( "Archiving of redirected robots.txt {} ({}) not allowed by robots.txt", url, robotsTxt.getContentType()); - context.getCounter("RobotsTxtArchiving", "robots_denied") - .increment(1); + robotsTxtArchivingRobotsDeniedCounter.increment(1); return false; } } diff --git a/src/java/org/apache/nutch/metrics/NutchMetrics.java b/src/java/org/apache/nutch/metrics/NutchMetrics.java index e64a8d6d00..658675d27b 100644 --- a/src/java/org/apache/nutch/metrics/NutchMetrics.java +++ b/src/java/org/apache/nutch/metrics/NutchMetrics.java @@ -16,6 +16,10 @@ */ package org.apache.nutch.metrics; +import org.apache.hadoop.io.Text; 
+import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.FetchSchedule; + /** * Centralized constants for Hadoop metrics counter groups and names. * @@ -138,6 +142,42 @@ private NutchMetrics() { /** Outlinks being followed. */ public static final String FETCHER_OUTLINKS_FOLLOWING_TOTAL = "outlinks_following_total"; + // ========================================================================= + // Fetcher Common Crawl extensions + // ========================================================================= + + /** HTTP protocol version group with dynamic counters. */ + public static final String FETCHER_HTTP_PROTOCOL_VERSION_GROUP = "http_protocol_version"; + + public static final String FETCHER_HTTP_PROTOCOL_UNKNOWN = "unknown"; + + /** SSL/TLS protocol version group with dynamic counters. */ + public static final String FETCHER_TLS_PROTOCOL_VERSION_GROUP = "tls_protocol_version"; + + /** IP address version group with two counters: ipv4 and ipv6. */ + public static final String FETCHER_IP_ADDRESS_VERSION_GROUP = "ip_address_version"; + + /** Number of fetches over IPv4. */ + public static final String FETCHER_IPV4_TOTAL = "ipv4"; + + /** Number of fetches over IPv6. */ + public static final String FETCHER_IPV6_TOTAL = "ipv6"; + + /** Archiving of robots.txt captures. */ + public static final String FETCHER_ROBOTSTXT_ARCHIVING_GROUP = "robotstxt_archiving"; + + /** Robots.txt not archived: URL rejected by URL filters. */ + public static final String FETCHER_ROBOTSTXT_ARCHIVING_FILTERED_TOTAL = "filtered"; + + /** Robots.txt not archived: MIME type rejected. */ + public static final String FETCHER_ROBOTSTXT_ARCHIVING_FILTERED_MIME_TOTAL = "filtered_mime"; + + /** + * Robots.txt not archived: URL path not /robots.txt and + * disallowed by robots.txt. 
+ */ + public static final String FETCHER_ROBOTSTXT_ARCHIVING_ROBOTS_DENIED_TOTAL = "robots_denied"; + // ========================================================================= // Generator Counters // ========================================================================= @@ -175,6 +215,37 @@ private NutchMetrics() { /** Hosts affected by per-host overflow. */ public static final String GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL = "hosts_affected_per_host_overflow_total"; + // ========================================================================= + // Generator2-specific Counters + // ========================================================================= + + /** Domains affected by per-domain overflow. All remaining URLs of this domain have been skipped, but were not counted. */ + public static final String GENERATOR_DOMAINS_AFFECTED_PER_DOMAIN_OVERFLOW_TOTAL = "domains_affected_per_domain_overflow_total"; + + /** Domains affected by max. number of hosts per domain overflow. URLs from further hosts below this domain have been skipped. */ + public static final String GENERATOR_DOMAINS_AFFECTED_PER_MAX_NUM_HOSTS_OVERFLOW_TOTAL = "domains_affected_num_hosts_overflow_total"; + + /** URLs skipped due to the max. number of hosts per domain overflow. */ + public static final String GENERATOR_URLS_SKIPPED_PER_MAX_NUM_HOSTS_OVERFLOW_TOTAL = "urls_skipped_per_max_num_host_overflow_total"; + + /** URLs skipped due to per-segment overflow. */ + public static final String GENERATOR_URLS_SKIPPED_PER_SEGMENT_OVERFLOW_TOTAL = "urls_skipped_per_segment_overflow_total"; + + /** + * Counter group for items by status, rejected by the fetch schedule. See + * {@link FetchSchedule#shouldFetch(Text, CrawlDatum, long)}. 
+ */ + public static final String GROUP_GENERATOR_SCHEDULE_REJECTED_BY_STATUS = "schedule_rejected_by_status"; + + /** + * Counter group for items by status, rejected because the generator score is + * lower than the minimum score defined per generate.min.score. + */ + public static final String GROUP_GENERATOR_SCORE_REJECTED_BY_STATUS = "score_rejected_by_status"; + + /** Counter group for items by status, selected for fetch. */ + public static final String GROUP_GENERATOR_SELECTED_BY_STATUS = "selected_by_status"; + // ========================================================================= // Indexer Counters // ========================================================================= @@ -291,6 +362,22 @@ private NutchMetrics() { /** Documents marked as duplicate. */ public static final String DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL = "documents_marked_duplicate_total"; + // ========================================================================= + // Redirect Deduplication Counters + // ========================================================================= + + /** Redirects kept as non-duplicates. */ + public static final String DEDUP_REDIRECTS_NOT_DUPLICATES_TOTAL = "redirects_marked_not_duplicate_total"; + + /** Redirects in CrawlDb. */ + public static final String DEDUP_REDIRECTS_IN_CRAWLDB_TOTAL = "redirects_in_crawldb_total"; + + /** Self-referential redirects in CrawlDb. */ + public static final String DEDUP_REDIRECTS_SELF_REFERENTIAL_TOTAL = "redirects_self_referential_total"; + + /** Self-referential redirects kept as non-duplicates. */ + public static final String DEDUP_REDIRECTS_SELF_REFERENTIAL_NOT_DUPLICATES_TOTAL = "redirects_self_referential_marked_not_duplicate_total"; + // ========================================================================= // Cleaning Job Counters // ========================================================================= @@ -333,6 +420,106 @@ private NutchMetrics() { /** New sitemap entries. 
*/ public static final String SITEMAP_NEW_ENTRIES_TOTAL = "new_sitemap_entries_total"; + // ========================================================================= + // SitemapInjector Counters + // ========================================================================= + + /** SitemapInjector counter group. */ + public static final String GROUP_SITEMAP_INJECTOR = "sitemap_injector"; + + /** Failed to fetch sitemap content, disallowed per robots.txt. */ + public static final String SITEMAP_ROBOTSTXT_DISALLOW_TOTAL = "sitemap_robotstxt_disallow"; + + /** Sitemap failed to parse. */ + public static final String SITEMAP_FAILED_TO_PARSE_TOTAL = "sitemaps_failed_to_parse"; + + /** Prefix for sitemap type counter. */ + public static final String SITEMAP_TYPE_PREFIX = "sitemap_type_"; + + /** Sitemaps processed total. */ + public static final String SITEMAP_PROCESSED_TOTAL = "sitemaps_processed"; + + /** Sitemap index: affected by URL limit. */ + public static final String SITEMAP_INDEX_AFFECTED_BY_URL_LIMIT_TOTAL = "sitemap_index_url_limit"; + + /** Sitemap index: affected by depth limit. */ + public static final String SITEMAP_INDEX_AFFECTED_BY_DEPTH_LIMIT_TOTAL = "sitemap_index_depth_limit"; + + /** Sitemap index: affected by time limit. */ + public static final String SITEMAP_INDEX_AFFECTED_BY_TIME_LIMIT_TOTAL = "sitemap_index_time_limit"; + + /** Sitemap index: skipped because no URLs found after 50% of time limit. */ + public static final String SITEMAP_INDEX_NO_URLS_AFTER_50_PERCENT_OF_TIME_LIMIT_TOTAL = "sitemap_index_no_urls_after_50_percent_of_time_limit"; + + /** Sitemap index: skipped because of too many fetch failures. */ + public static final String SITEMAP_INDEX_TOO_MANY_FAILURES_TOTAL = "sitemap_index_too_many_failures"; + + /** Sitemap index: processed sitemaps. */ + public static final String SITEMAP_INDEX_PROCESSED_SITEMAPS_TOTAL = "sitemap_index_processed_sitemaps"; + + /** Skipped duplicated or recursive sitemap URLs. 
*/ + public static final String SITEMAP_SKIPPED_DUPLICATE_OR_RECURSIVE_URL_TOTAL = "sitemap_skipped_duplicate_or_recursive_sitemap_url"; + + /** Sitemap index: affected by max. number of sitemaps in index. */ + public static final String SITEMAP_INDEX_MAX_SITEMAPS_LIMIT_TOTAL = "sitemap_index_max_sitemaps_limit"; + + /** Sitemap failed to fetch. */ + public static final String SITEMAP_FAILED_TO_FETCH_TOTAL = "sitemap_failed_to_fetch"; + + /** Sitemap skipped because of overlong URL. */ + public static final String SITEMAP_SKIPPED_OVERLONG_URL_TOTAL = "sitemap_skipped_overlong_url"; + + /** Sitemap rejected by URL filters */ + public static final String SITEMAP_REJECTED_BY_URL_FILTERS_TOTAL = "sitemap_rejected_by_url_filters"; + + /** Sitemap skipped, too many failures per host. */ + public static final String SITEMAP_SKIPPED_TOO_MANY_FAILURES_PER_HOST_TOTAL = "sitemap_skipped_too_many_failures_per_host"; + + /** Could not fetch sitemap content, protocol not supported. */ + public static final String SITEMAP_PROTOCOL_NOT_SUPPORTED_TOTAL = "sitemap_protocol_not_supported"; + + /** Failed to fetch sitemap content because of timeout. */ + public static final String SITEMAP_FAILED_TO_FETCH_TIMEOUT_TOTAL = "sitemap_failed_to_fetch_timeout"; + + /** Failed to fetch sitemap content because of exception. */ + public static final String SITEMAP_FAILED_TO_FETCH_EXCEPTION_TOTAL = "sitemap_failed_to_fetch_exception"; + + /** Sitemap redirect. */ + public static final String SITEMAP_REDIRECT_TOTAL = "sitemap_redirect"; + + /** Sitemap redirect target rejected by URL filters */ + public static final String SITEMAP_REDIRECT_TARGET_REJECTED_BY_URL_FILTERS_TOTAL = "sitemap_redirect_target_rejected_by_url_filters"; + + /** Sitemap redirect limit exceeded (max. number of redirects followed). */ + public static final String SITEMAP_REDIRECT_LIMIT_EXCEEDED_TOTAL = "sitemap_redirect_limit_exceeded"; + + /** Failed to fetch sitemap content, HTTP status != 200. 
*/ + public static final String SITEMAP_FAILED_TO_FETCH_CONTENT_HTTP_STATUS_CODE_NOT_200_TOTAL = "sitemap_failed_to_fetch_http_status_code_not_200"; + + /** Failed to fetch sitemap content, empty content. */ + public static final String SITEMAP_EMPTY_CONTENT_TOTAL = "sitemap_empty_content"; + + /** Empty sitemap. */ + public static final String SITEMAP_EMPTY_TOTAL = "sitemap_empty"; + + /** Sitemap URL limit reached. */ + public static final String SITEMAP_URL_LIMIT_REACHED_TOTAL = "sitemap_url_limit_reached"; + + /** URLs randomly skipped. */ + public static final String SITEMAP_RANDOM_SKIP_TOTAL = "urls_random_skip"; + + /** URLs from sitemaps rejected, host limit reached. */ + public static final String SITEMAP_URLS_SKIPPED_HOST_LIMIT_REACHED_TOTAL = "urls_skipped_host_limit_reached"; + + /** URLs from sitemaps rejected, target not allowed by cross-submit. */ + public static final String SITEMAP_URLS_SKIPPED_NOT_ALLOWED_BY_CROSS_SUBMITS_TOTAL = "urls_skipped_not_allowed_by_cross_submits"; + + /** URLs from sitemaps rejected by URL filters. */ + public static final String SITEMAP_URLS_FROM_REJECTED_BY_URL_FILTERS = "urls_from_sitemaps_rejected_by_url_filters"; + + /** URLs from sitemaps injected. */ + public static final String SITEMAP_URLS_INJECTED = "urls_from_sitemaps_injected"; + // ========================================================================= // WARC Exporter Counters // ========================================================================= @@ -367,5 +554,64 @@ private NutchMetrics() { /** Empty results in domain statistics. 
*/ public static final String DOMAIN_STATS_EMPTY_RESULT_TOTAL = "empty_result_total"; + + // ========================================================================= + // UrlCleaner + // ========================================================================= + + public static final String GROUP_URLCLEANER = "urlcleaner"; + + public static final String URLCLEANER_REJECTED_TOTAL = "urls_rejected"; + + public static final String URLCLEANER_REJECTED_INVALID_DOMAIN_TOTAL = "urls_rejected_invalid_domain"; + + public static final String URLCLEANER_ACCEPTED_UNCHANGED_TOTAL = "urls_accepted_unchanged"; + + public static final String URLCLEANER_ACCEPTED_NORMALIZED_TOTAL = "urls_accepted_normalized"; + + // ========================================================================= + // UrlSampler and UrlSamplerHost + // ========================================================================= + + public static final String GROUP_URLSAMPLER = "urlsampler"; + + public static final String GROUP_URLSAMPLER_HOST = "urlsamplerhost"; + + public static final String URLSAMPLER_MALFORMED_URL_TOTAL = "malformed_url"; + + public static final String URLSAMPLER_SKIPPED_MAX_URLS_TOTAL = "skipped_max_urls"; + + public static final String URLSAMPLER_SKIPPED_MAX_URLS_PER_HOST_TOTAL = "skipped_max_urls_per_host"; + + public static final String URLSAMPLER_SKIPPED_MAX_HOSTS_TOTAL = "skipped_max_hosts"; + + public static final String URLSAMPLER_HOSTS = "hosts"; + + public static final String URLSAMPLER_URLS = "urls"; + + public static final String URLSAMPLER_HOSTS_WITH_LIMIT = "hosts_with_limit"; + + public static final String URLSAMPLER_URLS_HOST_WITH_LIMIT = "urls_host_with_limit"; + + public static final String URLSAMPLER_HOSTS_WITHOUT_LIMIT = "hosts_without_limit"; + + public static final String URLSAMPLER_URLS_HOST_WITHOUT_LIMIT = "urls_host_without_limit"; + + public static final String URLSAMPLER_URLS_SAMPLED = "urls_sampled"; + + public static final String URLSAMPLER_HOSTS_SAMPLED = 
"hosts_sampled"; + + public static final String URLSAMPLER_HOSTS_WITH_LIMIT_SAMPLED = "hosts_with_limit_sampled"; + + public static final String URLSAMPLER_URLS_HOST_WITH_LIMIT_SAMPLED = "urls_host_with_limit_sampled"; + + public static final String URLSAMPLER_HOSTS_WITHOUT_LIMIT_SAMPLED = "hosts_without_limit_sampled"; + + public static final String URLSAMPLER_URLS_HOST_WITHOUT_LIMIT_SAMPLED = "urls_host_without_limit_sampled"; + + public static final String URLSAMPLER_SKIPPED_MAX_URLS_PER_HOST = "skipped_max_urls_per_host"; + + public static final String URLSAMPLER_SKIPPED_RANDOM = "skipped_random"; + } diff --git a/src/java/org/commoncrawl/tools/UrlCleaner.java b/src/java/org/commoncrawl/tools/UrlCleaner.java index a3f26b126b..c4d92ca669 100644 --- a/src/java/org/commoncrawl/tools/UrlCleaner.java +++ b/src/java/org/commoncrawl/tools/UrlCleaner.java @@ -40,6 +40,7 @@ import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilterException; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; @@ -134,18 +135,21 @@ public void map(Text key, Text value, Context context) try { url = urlNormalizers.normalize(url, scope); } catch (MalformedURLException e) { - context.getCounter("urlcleaner", "urls_rejected").increment(1); - return; + context.getCounter(NutchMetrics.GROUP_URLCLEANER, + NutchMetrics.URLCLEANER_REJECTED_TOTAL).increment(1); + return; } try { url = filters.filter(url); } catch (URLFilterException e) { - context.getCounter("urlcleaner", "urls_rejected").increment(1); + context.getCounter(NutchMetrics.GROUP_URLCLEANER, + NutchMetrics.URLCLEANER_REJECTED_TOTAL).increment(1); return; } if (url == null) { - context.getCounter("urlcleaner", "urls_rejected").increment(1); + context.getCounter(NutchMetrics.GROUP_URLCLEANER, + 
NutchMetrics.URLCLEANER_REJECTED_TOTAL).increment(1); return; } @@ -157,21 +161,26 @@ public void map(Text key, Text value, Context context) if (needDomain) { domain = EffectiveTldFinder.getAssignedDomain(host, true, true); if (checkDomain && domain == null) { - context.getCounter("urlcleaner", "urls_rejected_invalid_domain") + context + .getCounter(NutchMetrics.GROUP_URLCLEANER, + NutchMetrics.URLCLEANER_REJECTED_INVALID_DOMAIN_TOTAL) .increment(1); return; } } } catch (MalformedURLException e) { - context.getCounter("urlcleaner", "urls_rejected").increment(1); + context.getCounter(NutchMetrics.GROUP_URLCLEANER, + NutchMetrics.URLCLEANER_REJECTED_TOTAL).increment(1); return; } } if (url.equals(urlOrig)) { - context.getCounter("urlcleaner", "urls_accepted_unchanged").increment(1); + context.getCounter(NutchMetrics.GROUP_URLCLEANER, + NutchMetrics.URLCLEANER_ACCEPTED_UNCHANGED_TOTAL).increment(1); } else { - context.getCounter("urlcleaner", "urls_accepted_normalized").increment(1); + context.getCounter(NutchMetrics.GROUP_URLCLEANER, + NutchMetrics.URLCLEANER_ACCEPTED_NORMALIZED_TOTAL).increment(1); key.set(url); } diff --git a/src/java/org/commoncrawl/tools/UrlSampler.java b/src/java/org/commoncrawl/tools/UrlSampler.java index f28447a4cf..e2060e1f47 100644 --- a/src/java/org/commoncrawl/tools/UrlSampler.java +++ b/src/java/org/commoncrawl/tools/UrlSampler.java @@ -48,6 +48,7 @@ import org.apache.nutch.crawl.Generator2; import org.apache.nutch.crawl.Generator2.DomainScorePair; import org.apache.nutch.crawl.URLPartitioner; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.util.NutchConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -150,7 +151,8 @@ public void map(Text key, Text value, Context context) domain = URLPartitioner.getDomainName(u.getHost()); } catch (Exception e) { LOG.warn("Malformed URL: '{}', skipping ({})", url, e.getMessage()); - context.getCounter("UrlSampler", "MALFORMED_URL").increment(1); + 
context.getCounter(NutchMetrics.GROUP_URLSAMPLER, + NutchMetrics.URLSAMPLER_MALFORMED_URL_TOTAL).increment(1); return; } @@ -242,7 +244,8 @@ public void reduce(DomainScorePair key, Iterable values, domain); } } catch (MalformedURLException e) { - context.getCounter("UrlSampler", "MALFORMED_URL").increment(1); + context.getCounter(NutchMetrics.GROUP_URLSAMPLER, + NutchMetrics.URLSAMPLER_MALFORMED_URL_TOTAL).increment(1); continue; } nUrls++; @@ -271,12 +274,14 @@ public void reduce(DomainScorePair key, Iterable values, } if (nUrls == 0) return; - context.getCounter("UrlSampler", "SKIPPED_MAX_URLS") - .increment(skippedMaxUrls); - context.getCounter("UrlSampler", "SKIPPED_MAX_URLS_PER_HOST") + context.getCounter(NutchMetrics.GROUP_URLSAMPLER, + NutchMetrics.URLSAMPLER_SKIPPED_MAX_URLS_TOTAL).increment(skippedMaxUrls); + context + .getCounter(NutchMetrics.GROUP_URLSAMPLER, + NutchMetrics.URLSAMPLER_SKIPPED_MAX_URLS_PER_HOST_TOTAL) .increment(skippedMaxUrlsPerHost); - context.getCounter("UrlSampler", "SKIPPED_MAX_HOSTS") - .increment(skippedMaxHosts); + context.getCounter(NutchMetrics.GROUP_URLSAMPLER, + NutchMetrics.URLSAMPLER_SKIPPED_MAX_HOSTS_TOTAL).increment(skippedMaxHosts); LOG.info( "Sampled for domain {} : {} hosts, {} URLs ({} skipped: {} max. URLs, {} max. per host, {} max. hosts), sum of scores = {}", domain, hosts.size(), nUrlsSampled, (nUrls - nUrlsSampled), @@ -336,8 +341,8 @@ private void sample(Path[] inputs, Path output) throws Exception { } public void usage() { - System.err - .println("Usage: UrlSampler [-D...] ... \n"); + System.err.println( + "Usage: UrlSampler [-D...] ... 
\n"); } @Override diff --git a/src/java/org/commoncrawl/tools/UrlSamplerHost.java b/src/java/org/commoncrawl/tools/UrlSamplerHost.java index e296ffa90b..bce68ad50f 100644 --- a/src/java/org/commoncrawl/tools/UrlSamplerHost.java +++ b/src/java/org/commoncrawl/tools/UrlSamplerHost.java @@ -44,6 +44,7 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.Generator2; import org.apache.nutch.crawl.Generator2.DomainScorePair; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.util.NutchConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -60,7 +61,8 @@ * * * - *

  • host name (leading www. may be stripped), limits and default score + *
  • host name (leading www. may be stripped), limits and default + * score * *
      * <host_name> \t <rank> \t <max_urls> \t <default_score>
    @@ -180,7 +182,8 @@ public void map(Text key, Text value, Context context)
             }
           } catch (Exception e) {
             LOG.warn("Malformed URL: '{}', skipping ({})", url, e.getMessage());
    -        context.getCounter("UrlSampler", "MALFORMED_URL").increment(1);
    +        context.getCounter(NutchMetrics.GROUP_URLSAMPLER,
    +            NutchMetrics.URLSAMPLER_MALFORMED_URL_TOTAL).increment(1);
             return;
           }
     
    @@ -270,40 +273,59 @@ public void reduce(DomainScorePair key, Iterable values,
             context.write(text, meta);
           }
           // hosts == reduce input groups
    -      context.getCounter("UrlSamplerHost", "HOSTS").increment(1);
    +      context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
    +          NutchMetrics.URLSAMPLER_HOSTS).increment(1);
           // URLs == map output records, reduce input records
    -      context.getCounter("UrlSamplerHost", "URLS").increment(nUrls);
    +      context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
    +          NutchMetrics.URLSAMPLER_URLS).increment(nUrls);
           if (nUrls > 0) {
             if (maxUrls > -1) {
    -          context.getCounter("UrlSamplerHost", "HOSTS_WITH_LIMIT").increment(1);
    -          context.getCounter("UrlSamplerHost", "URLS_HOST_WITH_LIMIT")
    +          context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
    +              NutchMetrics.URLSAMPLER_HOSTS_WITH_LIMIT).increment(1);
    +          context
    +              .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
    +                  NutchMetrics.URLSAMPLER_URLS_HOST_WITH_LIMIT)
                   .increment(nUrls);
             } else {
    -          context.getCounter("UrlSamplerHost", "HOSTS_WITHOUT_LIMIT")
    -              .increment(1);
    -          context.getCounter("UrlSamplerHost", "URLS_HOST_WITHOUT_LIMIT")
    +          context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
    +              NutchMetrics.URLSAMPLER_HOSTS_WITHOUT_LIMIT).increment(1);
    +          context
    +              .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
    +                  NutchMetrics.URLSAMPLER_URLS_HOST_WITHOUT_LIMIT)
                   .increment(nUrls);
             }
             if (nUrlsSampled > 0) {
    -          context.getCounter("UrlSamplerHost", "URLS_SAMPLED")
    -              .increment(nUrlsSampled);
    -          context.getCounter("UrlSamplerHost", "HOSTS_SAMPLED").increment(1);
    +          context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
    +              NutchMetrics.URLSAMPLER_URLS_SAMPLED).increment(nUrlsSampled);
    +          context.getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
    +              NutchMetrics.URLSAMPLER_HOSTS_SAMPLED).increment(1);
               if (maxUrls > -1) {
    -            context.getCounter("UrlSamplerHost", "HOSTS_WITH_LIMIT_SAMPLED")
    +            context
    +                .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
    +                    NutchMetrics.URLSAMPLER_HOSTS_WITH_LIMIT_SAMPLED)
                     .increment(1);
    -            context.getCounter("UrlSamplerHost", "URLS_HOST_WITH_LIMIT_SAMPLED")
    +            context
    +                .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
    +                    NutchMetrics.URLSAMPLER_URLS_HOST_WITH_LIMIT_SAMPLED)
                     .increment(nUrlsSampled);
               } else {
    -            context.getCounter("UrlSamplerHost", "HOSTS_WITHOUT_LIMIT_SAMPLED")
    +            context
    +                .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
    +                    NutchMetrics.URLSAMPLER_HOSTS_WITHOUT_LIMIT_SAMPLED)
                     .increment(1);
                 context
    -                .getCounter("UrlSamplerHost", "URLS_HOST_WITHOUT_LIMIT_SAMPLED")
    +                .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
    +                    NutchMetrics.URLSAMPLER_URLS_HOST_WITHOUT_LIMIT_SAMPLED)
                     .increment(nUrlsSampled);
               }
             }
    -        context.getCounter("UrlSamplerHost", "SKIPPED_MAX_URLS_PER_HOST")
    +        context
    +            .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
    +                NutchMetrics.URLSAMPLER_SKIPPED_MAX_URLS_PER_HOST)
                 .increment(skippedMaxUrlsPerHost);
    -        context.getCounter("UrlSamplerHost", "SKIPPED_RANDOM")
    +        context
    +            .getCounter(NutchMetrics.GROUP_URLSAMPLER_HOST,
    +                NutchMetrics.URLSAMPLER_SKIPPED_RANDOM)
                 .increment(skippedRandom);
             LOG.info(
                 "Sampled for host {} : {} URLs ({} skipped: {} max. per host, {} random), sum of scores = {}",
    @@ -365,8 +387,8 @@ private void sample(Path[] inputs, Path output) throws Exception {
       }
     
       public void usage() {
    -    System.err
    -      .println("Usage: UrlSamplerHost [-D...]  ... \n");
    +    System.err.println(
    +        "Usage: UrlSamplerHost [-D...]  ... \n");
         System.err.println(
             "\nThe host_limits file defines the maximum number of URLs to sample per host.");
         System.err.println("\nProperties:");
    @@ -374,11 +396,12 @@ public void usage() {
             "\t-Durlsample.host.strip.www=(true|false)\tstrip leading www. from host names");
         System.err.println(
             "\t\t\t(depending on whether the limits file uses stripped host names)");
    -    System.err.println("Properties to configure defaults, if host is not in the limits file:");
         System.err.println(
    -        "\t-Durlsample.urls.per.host\tmax. number of URLs to sample per host");
    +        "Properties to configure defaults, if host is not in the limits file:");
         System.err.println(
    -        "\t\t\t-1 : sample randomly with low probability (default)");
    +        "\t-Durlsample.urls.per.host\tmax. number of URLs to sample per host");
    +    System.err
    +        .println("\t\t\t-1 : sample randomly with low probability (default)");
         System.err.println(
             "\t-Durlsample.default.score\tdefault score for sampled URLs (default: 0.001)");
       }
    
    From d9571d31e1c7542b3e71610f803e8361002d6f4f Mon Sep 17 00:00:00 2001
    From: Lewis John McGibbney 
    Date: Wed, 17 Dec 2025 19:28:41 -0800
    Subject: [PATCH 04/27] NUTCH-3134 Add latency metrics with percentile support
     to Fetcher, Parser, and Indexer (#876)
    
    ---
     .../apache/nutch/fetcher/FetcherThread.java   |  15 ++
     .../nutch/indexer/IndexerMapReduce.java       |  21 +++
     .../apache/nutch/metrics/LatencyTracker.java  | 144 ++++++++++++++++++
     .../apache/nutch/metrics/NutchMetrics.java    |  21 +++
     .../org/apache/nutch/parse/ParseSegment.java  |  15 +-
     5 files changed, 215 insertions(+), 1 deletion(-)
     create mode 100644 src/java/org/apache/nutch/metrics/LatencyTracker.java
    
    diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
    index 26b3913622..baac1ac05f 100644
    --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
    +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
    @@ -41,6 +41,7 @@
     import org.apache.nutch.crawl.SignatureFactory;
     import org.apache.nutch.fetcher.Fetcher.FetcherRun;
     import org.apache.nutch.fetcher.FetcherThreadEvent.PublishEventType;
    +import org.apache.nutch.metrics.LatencyTracker;
     import org.apache.nutch.metrics.NutchMetrics;
     import org.apache.nutch.metadata.Metadata;
     import org.apache.nutch.metadata.Nutch;
    @@ -191,6 +192,9 @@ public class FetcherThread extends Thread {
       private Counter robotsTxtArchivingFilteredMimeCounter;
       private Counter robotsTxtArchivingRobotsDeniedCounter;
     
    +  // Latency tracker for fetch timing metrics
    +  private LatencyTracker fetchLatencyTracker;
    +
       public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQueues fetchQueues, 
           QueueFeeder feeder, AtomicInteger spinWaiting, AtomicLong lastRequestStart, FetcherRun.Context context,
           AtomicInteger errors, String segmentName, boolean parsing, boolean storingContent, 
    @@ -327,6 +331,8 @@ private void initCounters() {
             NutchMetrics.GROUP_FETCHER_OUTLINKS, NutchMetrics.FETCHER_OUTLINKS_DETECTED_TOTAL);
         outlinksFollowingCounter = context.getCounter(
             NutchMetrics.GROUP_FETCHER_OUTLINKS, NutchMetrics.FETCHER_OUTLINKS_FOLLOWING_TOTAL);
    +
    +    // Common Crawl specific counters
         ipv4Counter = context.getCounter(
             NutchMetrics.FETCHER_IP_ADDRESS_VERSION_GROUP,
             NutchMetrics.FETCHER_IPV4_TOTAL);
    @@ -342,6 +348,10 @@ private void initCounters() {
         robotsTxtArchivingRobotsDeniedCounter = context.getCounter(
             NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_GROUP,
             NutchMetrics.FETCHER_ROBOTSTXT_ARCHIVING_ROBOTS_DENIED_TOTAL);
    +    
    +    // Initialize latency tracker for fetch timing
    +    fetchLatencyTracker = new LatencyTracker(
    +        NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_LATENCY);
       }
     
       @Override
    @@ -475,8 +485,11 @@ public void run() {
                         fit.queueID, fiq.crawlDelay, fit.url);
                   }
                 }
    +            // Track fetch latency
    +            long fetchStart = System.currentTimeMillis();
                 ProtocolOutput output = protocol.getProtocolOutput(fit.url,
                     fit.datum);
    +            fetchLatencyTracker.record(System.currentTimeMillis() - fetchStart);
                 ProtocolStatus status = output.getStatus();
                 Content content = output.getContent();
                 ParseStatus pstatus = null;
    @@ -619,6 +632,8 @@ public void run() {
           if (fit != null) {
             fetchQueues.finishFetchItem(fit);
           }
    +      // Emit fetch latency metrics. NOTE(review): setValue() in emitCounters() lets the last thread overwrite others - confirm
    +      fetchLatencyTracker.emitCounters(context);
           activeThreads.decrementAndGet(); // count threads
           LOG.info("{} {} -finishing thread {}, activeThreads={}", getName(),
               Thread.currentThread().getId(), getName(), activeThreads);
    diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
    index 33f2f244a6..9086a19839 100644
    --- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
    +++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
    @@ -40,6 +40,7 @@
     import org.apache.nutch.crawl.Inlinks;
     import org.apache.nutch.crawl.LinkDb;
     import org.apache.nutch.crawl.NutchWritable;
    +import org.apache.nutch.metrics.LatencyTracker;
     import org.apache.nutch.metrics.NutchMetrics;
     import org.apache.nutch.metadata.Metadata;
     import org.apache.nutch.metadata.Nutch;
    @@ -215,6 +216,9 @@ public static class IndexerReducer extends
         private URLNormalizers urlNormalizers;
         private URLFilters urlFilters;
     
    +    // Latency tracker for indexing timing metrics
    +    private LatencyTracker indexLatencyTracker;
    +
         @Override
         public void setup(Reducer.Context context) {
           Configuration conf = context.getConfiguration();
    @@ -239,6 +243,17 @@ public void setup(Reducer.Context c
           if (filter) {
             urlFilters = new URLFilters(conf);
           }
    +
    +      // Initialize latency tracker for indexing timing
    +      indexLatencyTracker = new LatencyTracker(
    +          NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_LATENCY);
    +    }
    +
    +    @Override
    +    public void cleanup(Reducer.Context context)
    +        throws IOException, InterruptedException {
    +      // Emit indexing latency metrics
    +      indexLatencyTracker.emitCounters(context);
         }
     
         @Override
    @@ -343,6 +358,9 @@ public void reduce(Text key, Iterable values,
             return;
           }
     
    +      // Start timing document indexing
    +      long indexStart = System.currentTimeMillis();
    +
           NutchDocument doc = new NutchDocument();
           doc.add("id", key.toString());
     
    @@ -432,6 +450,9 @@ public void reduce(Text key, Iterable values,
             doc.add("binaryContent", binary);
           }
     
    +      // Record indexing latency
    +      indexLatencyTracker.record(System.currentTimeMillis() - indexStart);
    +
           context.getCounter(NutchMetrics.GROUP_INDEXER,
               NutchMetrics.INDEXER_INDEXED_TOTAL).increment(1);
     
    diff --git a/src/java/org/apache/nutch/metrics/LatencyTracker.java b/src/java/org/apache/nutch/metrics/LatencyTracker.java
    new file mode 100644
    index 0000000000..3777bb29e3
    --- /dev/null
    +++ b/src/java/org/apache/nutch/metrics/LatencyTracker.java
    @@ -0,0 +1,144 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.nutch.metrics;
    +
    +import org.apache.hadoop.mapreduce.TaskInputOutputContext;
    +
    +import com.tdunning.math.stats.TDigest;
    +
    +/**
    + * A utility class for tracking latency metrics using TDigest for percentile
    + * calculation.
    + * 
    + * <p>This class wraps a TDigest data structure to collect latency samples and
    + * emit Hadoop counters with count, sum, and percentile values (p50, p95, p99).
    + * 
    + * <p>Usage:
    + * <pre>
    + * // In mapper/reducer setup
    + * latencyTracker = new LatencyTracker(NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_LATENCY);
    + * 
    + * // During processing
    + * long start = System.currentTimeMillis();
    + * // ... operation ...
    + * latencyTracker.record(System.currentTimeMillis() - start);
    + * 
    + * // In cleanup
    + * latencyTracker.emitCounters(context);
    + * </pre>
    + * 
    + * <p>Emits the following counters:
    + * <ul>
    + *   <li>{prefix}_count_total - total number of samples</li>
    + *   <li>{prefix}_sum_ms - sum of all latencies in milliseconds</li>
    + *   <li>{prefix}_p50_ms - 50th percentile (median) latency</li>
    + *   <li>{prefix}_p95_ms - 95th percentile latency</li>
    + *   <li>{prefix}_p99_ms - 99th percentile latency</li>
    + * </ul>
    + * 
    + * @since 1.22
    + */
    +public class LatencyTracker {
    +
    +  /** Default compression factor for TDigest (controls accuracy vs memory). */
    +  private static final double DEFAULT_COMPRESSION = 100.0;
    +
    +  private final TDigest digest;
    +  private final String group;
    +  private final String prefix;
    +  private long count = 0;
    +  private long sum = 0;
    +
    +  /**
    +   * Creates a new LatencyTracker.
    +   *
    +   * @param group the Hadoop counter group name
    +   * @param prefix the prefix for counter names (e.g., "fetch_latency")
    +   */
    +  public LatencyTracker(String group, String prefix) {
    +    this.digest = TDigest.createDigest(DEFAULT_COMPRESSION);
    +    this.group = group;
    +    this.prefix = prefix;
    +  }
    +
    +  /**
    +   * Records a latency sample.
    +   *
    +   * @param latencyMs the latency in milliseconds
    +   */
    +  public void record(long latencyMs) {
    +    digest.add(latencyMs);
    +    count++;
    +    sum += latencyMs;
    +  }
    +
    +  /**
    +   * Returns the number of recorded samples.
    +   *
    +   * @return the count of recorded latency samples
    +   */
    +  public long getCount() {
    +    return count;
    +  }
    +
    +  /**
    +   * Returns the sum of all recorded latencies.
    +   *
    +   * @return the sum of latencies in milliseconds
    +   */
    +  public long getSum() {
    +    return sum;
    +  }
    +
    +  /**
    +   * Returns the percentile value for the given quantile.
    +   *
    +   * @param quantile the quantile (0.0 to 1.0)
    +   * @return the percentile value in milliseconds
    +   */
    +  public long getPercentile(double quantile) {
    +    if (count == 0) {
    +      return 0;
    +    }
    +    return (long) digest.quantile(quantile);
    +  }
    +
    +  /**
    +   * Emits all latency counters to the Hadoop context.
    +   *
    +   * <p>Should be called once during cleanup to emit aggregated metrics.
    +   *
    +   * @param context the Hadoop task context
    +   */
    +  public void emitCounters(TaskInputOutputContext context) {
    +    context.getCounter(group, prefix + "_count_total").setValue(count);
    +    context.getCounter(group, prefix + "_sum_ms").setValue(sum);
    +
    +    if (count > 0) {
    +      context.getCounter(group, prefix + "_p50_ms").setValue((long) digest.quantile(0.50));
    +      context.getCounter(group, prefix + "_p95_ms").setValue((long) digest.quantile(0.95));
    +      context.getCounter(group, prefix + "_p99_ms").setValue((long) digest.quantile(0.99));
    +    } else {
    +      // Set to 0 if no samples recorded
    +      context.getCounter(group, prefix + "_p50_ms").setValue(0);
    +      context.getCounter(group, prefix + "_p95_ms").setValue(0);
    +      context.getCounter(group, prefix + "_p99_ms").setValue(0);
    +    }
    +  }
    +}
    +
    +
    diff --git a/src/java/org/apache/nutch/metrics/NutchMetrics.java b/src/java/org/apache/nutch/metrics/NutchMetrics.java index 658675d27b..8b187cf3fb 100644 --- a/src/java/org/apache/nutch/metrics/NutchMetrics.java +++ b/src/java/org/apache/nutch/metrics/NutchMetrics.java @@ -613,5 +613,26 @@ private NutchMetrics() { public static final String URLSAMPLER_SKIPPED_RANDOM = "skipped_random"; + // ========================================================================= + // Latency Metric Prefixes (used with LatencyTracker) + // ========================================================================= + + /** + * Prefix for fetch latency metrics. + * Used with {@link LatencyTracker} to emit fetch timing counters. + */ + public static final String FETCHER_LATENCY = "fetch_latency"; + + /** + * Prefix for parse latency metrics. + * Used with {@link LatencyTracker} to emit parse timing counters. + */ + public static final String PARSER_LATENCY = "parse_latency"; + + /** + * Prefix for indexer latency metrics. + * Used with {@link LatencyTracker} to emit indexing timing counters.
+ */ + public static final String INDEXER_LATENCY = "index_latency"; } diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java index 5ec74ea9fe..a7fbe066ce 100644 --- a/src/java/org/apache/nutch/parse/ParseSegment.java +++ b/src/java/org/apache/nutch/parse/ParseSegment.java @@ -37,6 +37,7 @@ import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.LatencyTracker; import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.Content; @@ -81,12 +82,22 @@ public static class ParseSegmentMapper extends private Text newKey = new Text(); private ScoringFilters scfilters; private boolean skipTruncated; + private LatencyTracker parseLatencyTracker; @Override public void setup(Mapper, Content, Text, ParseImpl>.Context context) { Configuration conf = context.getConfiguration(); scfilters = new ScoringFilters(conf); skipTruncated = conf.getBoolean(SKIP_TRUNCATED, true); + parseLatencyTracker = new LatencyTracker( + NutchMetrics.GROUP_PARSER, NutchMetrics.PARSER_LATENCY); + } + + @Override + public void cleanup(Mapper, Content, Text, ParseImpl>.Context context) + throws IOException, InterruptedException { + // Emit parse latency metrics + parseLatencyTracker.emitCounters(context); } @Override @@ -156,7 +167,9 @@ public void map(WritableComparable key, Content content, } long end = System.currentTimeMillis(); - LOG.info("Parsed ({}ms): {}", (end - start), url); + long parseTime = end - start; + parseLatencyTracker.record(parseTime); + LOG.info("Parsed ({}ms): {}", parseTime, url); context.write( url, From d989f769d527637bd82aaa99af07125ffb91286d Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 11 Dec 2025 18:28:20 +0100 Subject: [PATCH 05/27] NUTCH-3133 Upgrade GitHub workflows to JDK 17 --- 
.github/workflows/master-build.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index f7265e5b52..aa9219d280 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -24,7 +24,7 @@ jobs: javadoc: strategy: matrix: - java: ['11'] + java: ['17'] os: [ubuntu-latest] runs-on: ${{ matrix.os }} steps: @@ -39,7 +39,7 @@ jobs: rat: strategy: matrix: - java: ['11'] + java: ['17'] os: [ubuntu-latest] runs-on: ${{ matrix.os }} steps: @@ -62,7 +62,7 @@ jobs: tests: strategy: matrix: - java: ['11'] + java: ['17'] os: [ubuntu-latest, macos-latest] runs-on: ${{ matrix.os }} timeout-minutes: 30 From e8645686aea9bab0eeea30e633e504e432f0cacb Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 12 Dec 2025 08:32:48 +0100 Subject: [PATCH 06/27] NUTCH-3135 Cache downloaded ant-eclipse.jar --- build.xml | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/build.xml b/build.xml index a4530c40f1..092bb6ae0e 100644 --- a/build.xml +++ b/build.xml @@ -48,6 +48,8 @@ + + @@ -1110,19 +1112,6 @@ - - - - - - - - - - - - - @@ -1132,7 +1121,6 @@ - @@ -1143,18 +1131,24 @@ - + + + + + + + dest="${ivy.dir}/ant-eclipse-1.0.bin.tar.bz2" usetimestamp="false" /> - + - + @@ -1169,7 +1163,7 @@ + classpath="${ant-eclipse.jar}" /> From 1c835c17279ea3c08c02a151ec7b157e85d82d95 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 12 Dec 2025 09:32:14 +0100 Subject: [PATCH 07/27] NUTCH-3136 Upgrade crawler-commons dependency Robots.txt parser: use URL objects in newly introduced methods to avoid the unnecessary parsing of URLs. 
--- .../apache/nutch/fetcher/FetcherThread.java | 4 ++-- .../org/apache/nutch/protocol/Protocol.java | 21 +++++++++++++++++++ src/java/org/apache/nutch/util/URLUtil.java | 2 +- .../nutch/protocol/http/api/HttpBase.java | 6 ++++++ .../org/apache/nutch/protocol/file/File.java | 10 +++++++++ .../org/apache/nutch/protocol/ftp/Ftp.java | 9 ++++++++ 6 files changed, 49 insertions(+), 3 deletions(-) diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index baac1ac05f..297126e1bf 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -426,7 +426,7 @@ public void run() { LOG.debug("redirectCount={}", redirectCount); redirecting = false; Protocol protocol = this.protocolFactory.getProtocol(fit.u); - BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum, + BaseRobotRules rules = protocol.getRobotRules(fit.u, fit.datum, robotsTxtContent); if (robotsTxtContent != null) { outputRobotsTxt(robotsTxtContent); @@ -449,7 +449,7 @@ public void run() { } continue; } - if (!rules.isAllowed(fit.url.toString())) { + if (!rules.isAllowed(fit.u)) { // unblock fetchQueues.finishFetchItem(fit, true); LOG.info("Denied by robots.txt: {}", fit.url); diff --git a/src/java/org/apache/nutch/protocol/Protocol.java b/src/java/org/apache/nutch/protocol/Protocol.java index ab4162c87f..2514eae33e 100644 --- a/src/java/org/apache/nutch/protocol/Protocol.java +++ b/src/java/org/apache/nutch/protocol/Protocol.java @@ -16,6 +16,7 @@ */ package org.apache.nutch.protocol; +import java.net.URL; import java.util.List; import org.apache.hadoop.conf.Configurable; @@ -57,4 +58,24 @@ public interface Protocol extends Pluggable, Configurable { BaseRobotRules getRobotRules(Text url, CrawlDatum datum, List robotsTxtContent); + /** + * Retrieve robot rules applicable for this URL. 
+ * + * @param url + * URL to check + * @param datum + * page datum + * @param robotsTxtContent + * container to store responses when fetching the robots.txt file for + * debugging or archival purposes. Instead of a robots.txt file, it + * may include redirects or an error page (404, etc.). Response + * {@link Content} is appended to the passed list. If null is passed + * nothing is stored. + * @return robot rules (specific for this URL or default), never null + */ + default BaseRobotRules getRobotRules(URL url, CrawlDatum datum, + List robotsTxtContent) { + return getRobotRules(new Text(url.toString()), datum, robotsTxtContent); + } + } diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java index 158125999e..44c6309d2a 100644 --- a/src/java/org/apache/nutch/util/URLUtil.java +++ b/src/java/org/apache/nutch/util/URLUtil.java @@ -103,7 +103,7 @@ static URL fixPureQueryTargets(URL base, String target) * https://publicsuffix.org/list/public_suffix_list.dat and are compared * using + * "https://crawler-commons.github.io/crawler-commons/1.6/crawlercommons/domains/EffectiveTldFinder.html"> * crawler-commons' EffectiveTldFinder. Only ICANN domain suffixes are * used. 
Because EffectiveTldFinder loads the public suffix list as file * "effective_tld_names.dat" from the Java classpath, it's possible to use the diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index 79b45882eb..caa3f861ea 100755 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -721,6 +721,12 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum, return this.robots.getRobotRulesSet(this, url, robotsTxtContent); } + @Override + public BaseRobotRules getRobotRules(URL url, CrawlDatum datum, + List robotsTxtContent) { + return this.robots.getRobotRulesSet(this, url, robotsTxtContent); + } + /** * Transforming a String[] into a HashMap for faster searching * diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java index e4d2010696..877873b64b 100644 --- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java +++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java @@ -232,4 +232,14 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum, return RobotRulesParser.EMPTY_RULES; } + /** + * No robots parsing is done for file protocol. So this returns a set of empty + * rules which will allow every url. 
+ */ + @Override + public BaseRobotRules getRobotRules(URL url, CrawlDatum datum, + List robotsTxtContent) { + return RobotRulesParser.EMPTY_RULES; + } + } diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java index 2a47b63d61..8cf58f75e7 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java @@ -304,6 +304,15 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum, return robots.getRobotRulesSet(this, url, robotsTxtContent); } + /** + * Get the robots rules for a given url + */ + @Override + public BaseRobotRules getRobotRules(URL url, CrawlDatum datum, + List robotsTxtContent) { + return robots.getRobotRulesSet(this, url, robotsTxtContent); + } + public int getBufferSize() { return BUFFER_SIZE; } From bdbc89772d5faa1c48ef7a208c7ff93456c534dd Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 12 Dec 2025 15:14:04 +0100 Subject: [PATCH 08/27] NUTCH-3136 Upgrade crawler-commons dependency Update URLUtil test to adapt to a change in the public suffix list --- src/test/org/apache/nutch/util/TestURLUtil.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java index b14b55af09..9c89590a2e 100644 --- a/src/test/org/apache/nutch/util/TestURLUtil.java +++ b/src/test/org/apache/nutch/util/TestURLUtil.java @@ -146,7 +146,7 @@ public void testGetDomainSuffix() throws Exception { url = new URL("http://www.example.2000.hu"); assertEquals("2000.hu", URLUtil.getDomainSuffix(url)); - // test non-ascii + // test non-ASCII url = new URL("http://www.example.flå.no"); assertEquals("xn--fl-zia.no", URLUtil.getDomainSuffix(url)); url = new URL("http://www.example.栃木.jp"); From 488eacb5c5849a0ee62f41ccb98fbc0d4ee9cfe4 Mon Sep 17 00:00:00 2001 From: 
Sebastian Nagel Date: Mon, 15 Dec 2025 23:30:29 +0100 Subject: [PATCH 09/27] NUTCH-3139 protocol-okhttp: add support for zstd content-encoding - upgrade to OkHttp 5.3.2 - enable support for zstd content-encoding --- src/plugin/protocol-okhttp/ivy.xml | 7 ++++--- src/plugin/protocol-okhttp/plugin.xml | 16 +++++++++------- .../org/apache/nutch/protocol/okhttp/OkHttp.java | 13 ++++++++----- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/src/plugin/protocol-okhttp/ivy.xml b/src/plugin/protocol-okhttp/ivy.xml index 0768def785..28f355d7b9 100644 --- a/src/plugin/protocol-okhttp/ivy.xml +++ b/src/plugin/protocol-okhttp/ivy.xml @@ -37,8 +37,9 @@ - - + + + - + diff --git a/src/plugin/protocol-okhttp/plugin.xml b/src/plugin/protocol-okhttp/plugin.xml index e2183d2b50..51f65f5d25 100755 --- a/src/plugin/protocol-okhttp/plugin.xml +++ b/src/plugin/protocol-okhttp/plugin.xml @@ -28,13 +28,15 @@ - - - - - - - + + + + + + + + + diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java index 954c3f6df1..a9d2b14d42 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java @@ -52,15 +52,19 @@ import org.slf4j.LoggerFactory; import okhttp3.Authenticator; +import okhttp3.CompressionInterceptor; import okhttp3.Connection; import okhttp3.ConnectionPool; +import okhttp3.Gzip; import okhttp3.Handshake; import okhttp3.Headers; import okhttp3.Interceptor; import okhttp3.OkHttpClient; import okhttp3.Protocol; import okhttp3.Request; -import okhttp3.brotli.BrotliInterceptor; +import okhttp3.brotli.Brotli; +import okhttp3.zstd.Zstd; + public class OkHttp extends HttpBase { @@ -156,13 +160,11 @@ public boolean verify(String hostname, SSLSession session) { String proxyUsername = conf.get("http.proxy.username"); if (proxyUsername == null) { 
ProxySelector selector = new ProxySelector() { - @SuppressWarnings("serial") private final List noProxyList = new ArrayList() { { add(Proxy.NO_PROXY); } }; - @SuppressWarnings("serial") private final List proxyList = new ArrayList() { { add(proxy); @@ -224,8 +226,9 @@ public Request authenticate(okhttp3.Route route, builder.addNetworkInterceptor(new HTTPHeadersInterceptor()); } - // enable support for Brotli compression (Content-Encoding) - builder.addInterceptor(BrotliInterceptor.INSTANCE); + // enable support for Zstd, Brotli, Gzip Content-Encoding + builder.addInterceptor(new CompressionInterceptor(Zstd.INSTANCE, + Brotli.INSTANCE, Gzip.INSTANCE)); // instantiate connection pool(s), cf. // https://square.github.io/okhttp/3.x/okhttp/okhttp3/ConnectionPool.html From 2df25d171639a1c0f33fe32cb832c25268a1fddc Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Thu, 8 Jan 2026 09:33:06 -0800 Subject: [PATCH 10/27] NUTCH-3141 Cache Hadoop Counter References in Hot Paths (#878) --- src/java/org/apache/nutch/crawl/CrawlDb.java | 3 +- .../apache/nutch/crawl/DeduplicationJob.java | 10 ++- .../org/apache/nutch/fetcher/QueueFeeder.java | 34 ++++++--- .../nutch/hostdb/UpdateHostDbMapper.java | 23 +++--- .../nutch/hostdb/UpdateHostDbReducer.java | 23 ++++-- .../nutch/indexer/IndexerMapReduce.java | 72 +++++++++++++------ .../apache/nutch/tools/warc/WARCExporter.java | 62 +++++++++------- .../apache/nutch/util/SitemapProcessor.java | 64 +++++++++++------ 8 files changed, 197 insertions(+), 94 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java index 01598a5f18..32081e1d61 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDb.java +++ b/src/java/org/apache/nutch/crawl/CrawlDb.java @@ -43,6 +43,7 @@ import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.util.FSUtils; import 
org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.LockUtil; @@ -145,7 +146,7 @@ public void update(Path crawlDb, Path[] segments, boolean normalize, if (filter) { long urlsFiltered = job.getCounters() - .findCounter("CrawlDB filter", "URLs filtered").getValue(); + .findCounter(NutchMetrics.GROUP_CRAWLDB_FILTER, NutchMetrics.CRAWLDB_URLS_FILTERED_TOTAL).getValue(); LOG.info( "CrawlDb update: Total number of existing URLs in CrawlDb rejected by URL filters: {}", urlsFiltered); diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java index cdb291fe85..d5f983a273 100644 --- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java +++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java @@ -335,12 +335,10 @@ public int run(String[] args) throws IOException { fs.delete(tempDir, true); throw new RuntimeException(message); } - CounterGroup g = job.getCounters().getGroup("DeduplicationJobStatus"); - if (g != null) { - Counter counter = g.findCounter("Documents marked as duplicate"); - long dups = counter.getValue(); - LOG.info("Deduplication: {} documents marked as duplicates", dups); - } + long dups = job.getCounters() + .findCounter(NutchMetrics.GROUP_DEDUP, NutchMetrics.DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL) + .getValue(); + LOG.info("Deduplication: {} documents marked as duplicates", dups); } catch (IOException | InterruptedException | ClassNotFoundException e) { LOG.error("DeduplicationJob:", e); fs.delete(tempDir, true); diff --git a/src/java/org/apache/nutch/fetcher/QueueFeeder.java b/src/java/org/apache/nutch/fetcher/QueueFeeder.java index 6ee973dd3b..5dfa24fd06 100644 --- a/src/java/org/apache/nutch/fetcher/QueueFeeder.java +++ b/src/java/org/apache/nutch/fetcher/QueueFeeder.java @@ -22,6 +22,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Counter; import org.apache.nutch.crawl.CrawlDatum; import 
org.apache.nutch.fetcher.FetchItemQueues.QueuingStatus; import org.apache.nutch.fetcher.Fetcher.FetcherRun; @@ -48,6 +49,12 @@ public class QueueFeeder extends Thread { private URLNormalizers urlNormalizers = null; private String urlNormalizerScope = URLNormalizers.SCOPE_DEFAULT; + // Cached counter references to avoid repeated lookups in hot paths + private Counter hitByTimeoutCounter; + private Counter hitByTimelimitCounter; + private Counter filteredCounter; + private Counter aboveExceptionThresholdCounter; + public QueueFeeder(FetcherRun.Context context, FetchItemQueues queues, int size) { this.context = context; @@ -62,6 +69,21 @@ public QueueFeeder(FetcherRun.Context context, if (conf.getBoolean("fetcher.normalize.urls", false)) { urlNormalizers = new URLNormalizers(conf, urlNormalizerScope); } + initCounters(); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initCounters() { + hitByTimeoutCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_TIMEOUT_TOTAL); + hitByTimelimitCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL); + filteredCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_FILTERED_TOTAL); + aboveExceptionThresholdCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_ABOVE_EXCEPTION_THRESHOLD_TOTAL); } /** Filter and normalize the url */ @@ -95,16 +117,14 @@ public void run() { LOG.info("QueueFeeder stopping, timeout reached."); } queuingStatus[qstatus]++; - context.getCounter(NutchMetrics.GROUP_FETCHER, - NutchMetrics.FETCHER_HIT_BY_TIMEOUT_TOTAL).increment(1); + hitByTimeoutCounter.increment(1); } else { int qstatus = QueuingStatus.HIT_BY_TIMELIMIT.ordinal(); if (queuingStatus[qstatus] == 0) { LOG.info("QueueFeeder stopping, timelimit exceeded."); } queuingStatus[qstatus]++; - context.getCounter(NutchMetrics.GROUP_FETCHER, - 
NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL).increment(1); + hitByTimelimitCounter.increment(1); } try { hasMore = context.nextKeyValue(); @@ -136,8 +156,7 @@ public void run() { String u = filterNormalize(url.toString()); if (u == null) { // filtered or failed to normalize - context.getCounter(NutchMetrics.GROUP_FETCHER, - NutchMetrics.FETCHER_FILTERED_TOTAL).increment(1); + filteredCounter.increment(1); continue; } url = new Text(u); @@ -154,8 +173,7 @@ public void run() { QueuingStatus status = queues.addFetchItem(url, datum); queuingStatus[status.ordinal()]++; if (status == QueuingStatus.ABOVE_EXCEPTION_THRESHOLD) { - context.getCounter(NutchMetrics.GROUP_FETCHER, - NutchMetrics.FETCHER_ABOVE_EXCEPTION_THRESHOLD_TOTAL).increment(1); + aboveExceptionThresholdCounter.increment(1); } cnt++; feed--; diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java index 1495f74914..8de2dcdf2c 100644 --- a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java +++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java @@ -24,6 +24,7 @@ import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.conf.Configuration; @@ -61,6 +62,10 @@ public class UpdateHostDbMapper protected URLFilters filters = null; protected URLNormalizers normalizers = null; + // Cached counter references to avoid repeated lookups in hot paths + protected Counter malformedUrlCounter; + protected Counter filteredRecordsCounter; + @Override public void setup(Mapper.Context context) { Configuration conf = context.getConfiguration(); @@ -72,6 +77,12 @@ public void setup(Mapper.Context context) { filters = new URLFilters(conf); if (normalize) normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT); + + // Initialize cached counter references + 
malformedUrlCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_MALFORMED_URL_TOTAL); + filteredRecordsCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL); } /** @@ -137,8 +148,7 @@ public void map(Text key, Writable value, try { url = new URL(keyStr); } catch (MalformedURLException e) { - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.HOSTDB_MALFORMED_URL_TOTAL).increment(1); + malformedUrlCounter.increment(1); return; } String hostName = URLUtil.getHost(url); @@ -148,8 +158,7 @@ public void map(Text key, Writable value, // Filtered out? if (buffer == null) { - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL).increment(1); + filteredRecordsCounter.increment(1); LOG.debug("UpdateHostDb: {} crawldatum has been filtered", hostName); return; } @@ -222,8 +231,7 @@ public void map(Text key, Writable value, // Filtered out? if (buffer == null) { - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL).increment(1); + filteredRecordsCounter.increment(1); LOG.debug("UpdateHostDb: {} hostdatum has been filtered", keyStr); return; } @@ -247,8 +255,7 @@ public void map(Text key, Writable value, // Filtered out? 
if (buffer == null) { - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL).increment(1); + filteredRecordsCounter.increment(1); LOG.debug("UpdateHostDb: {} score has been filtered", keyStr); return; } diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java index 039fa5ba13..6c979f222e 100644 --- a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java +++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java @@ -31,6 +31,7 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.util.StringUtils; @@ -73,6 +74,11 @@ public class UpdateHostDbReducer protected BlockingQueue queue = new SynchronousQueue<>(); protected ThreadPoolExecutor executor = null; + // Cached counter references to avoid repeated lookups in hot paths + protected Counter urlLimitNotReachedCounter; + protected Counter totalHostsCounter; + protected Counter skippedNotEligibleCounter; + /** * Configures the thread pool and prestarts all resolver threads. */ @@ -146,6 +152,14 @@ public void setup(Reducer.Context context) // Run all threads in the pool executor.prestartAllCoreThreads(); } + + // Initialize cached counter references + urlLimitNotReachedCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_URL_LIMIT_NOT_REACHED_TOTAL); + totalHostsCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_TOTAL_HOSTS_TOTAL); + skippedNotEligibleCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_SKIPPED_NOT_ELIGIBLE_TOTAL); } /** @@ -380,14 +394,12 @@ else if (value instanceof FloatWritable) { // Impose limits on minimum number of URLs? 
if (urlLimit > -1l) { if (hostDatum.numRecords() < urlLimit) { - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.HOSTDB_URL_LIMIT_NOT_REACHED_TOTAL).increment(1); + urlLimitNotReachedCounter.increment(1); return; } } - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.HOSTDB_TOTAL_HOSTS_TOTAL).increment(1); + totalHostsCounter.increment(1); // See if this record is to be checked if (shouldCheck(hostDatum)) { @@ -404,8 +416,7 @@ else if (value instanceof FloatWritable) { // Do not progress, the datum will be written in the resolver thread return; } else if (checkAny) { - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.HOSTDB_SKIPPED_NOT_ELIGIBLE_TOTAL).increment(1); + skippedNotEligibleCounter.increment(1); LOG.debug("UpdateHostDb: {}: skipped_not_eligible", key); } diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java index 9086a19839..b61a7f99cd 100644 --- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java +++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java @@ -30,6 +30,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; @@ -219,6 +220,18 @@ public static class IndexerReducer extends // Latency tracker for indexing timing metrics private LatencyTracker indexLatencyTracker; + // Cached counter references to avoid repeated lookups in hot paths + private Counter deletedRobotsNoIndexCounter; + private Counter deletedGoneCounter; + private Counter deletedRedirectsCounter; + private Counter deletedDuplicatesCounter; + private Counter skippedNotModifiedCounter; + private Counter errorsScoringFilterCounter; + private Counter errorsIndexingFilterCounter; + private Counter deletedByIndexingFilterCounter; 
+ private Counter skippedByIndexingFilterCounter; + private Counter indexedCounter; + @Override public void setup(Reducer.Context context) { Configuration conf = context.getConfiguration(); @@ -247,6 +260,35 @@ public void setup(Reducer.Context c // Initialize latency tracker for indexing timing indexLatencyTracker = new LatencyTracker( NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_LATENCY); + + // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initCounters(Reducer.Context context) { + deletedRobotsNoIndexCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_ROBOTS_NOINDEX_TOTAL); + deletedGoneCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_GONE_TOTAL); + deletedRedirectsCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_REDIRECTS_TOTAL); + deletedDuplicatesCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_DUPLICATES_TOTAL); + skippedNotModifiedCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_SKIPPED_NOT_MODIFIED_TOTAL); + errorsScoringFilterCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_ERRORS_SCORING_FILTER_TOTAL); + errorsIndexingFilterCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_ERRORS_INDEXING_FILTER_TOTAL); + deletedByIndexingFilterCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_BY_INDEXING_FILTER_TOTAL); + skippedByIndexingFilterCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_SKIPPED_BY_INDEXING_FILTER_TOTAL); + indexedCounter = context.getCounter( + NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_INDEXED_TOTAL); } @Override @@ -299,8 +341,7 @@ public void reduce(Text key, Iterable values, 
.indexOf("noindex") != -1) { // Delete it! context.write(key, DELETE_ACTION); - context.getCounter(NutchMetrics.GROUP_INDEXER, - NutchMetrics.INDEXER_DELETED_ROBOTS_NOINDEX_TOTAL).increment(1); + deletedRobotsNoIndexCounter.increment(1); return; } } @@ -317,8 +358,7 @@ public void reduce(Text key, Iterable values, if (delete && fetchDatum != null) { if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) { - context.getCounter(NutchMetrics.GROUP_INDEXER, - NutchMetrics.INDEXER_DELETED_GONE_TOTAL).increment(1); + deletedGoneCounter.increment(1); context.write(key, DELETE_ACTION); return; } @@ -327,8 +367,7 @@ public void reduce(Text key, Iterable values, || fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) { - context.getCounter(NutchMetrics.GROUP_INDEXER, - NutchMetrics.INDEXER_DELETED_REDIRECTS_TOTAL).increment(1); + deletedRedirectsCounter.increment(1); context.write(key, DELETE_ACTION); return; } @@ -340,16 +379,14 @@ public void reduce(Text key, Iterable values, // Whether to delete pages marked as duplicates if (delete && dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) { - context.getCounter(NutchMetrics.GROUP_INDEXER, - NutchMetrics.INDEXER_DELETED_DUPLICATES_TOTAL).increment(1); + deletedDuplicatesCounter.increment(1); context.write(key, DELETE_ACTION); return; } // Whether to skip DB_NOTMODIFIED pages if (skip && dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) { - context.getCounter(NutchMetrics.GROUP_INDEXER, - NutchMetrics.INDEXER_SKIPPED_NOT_MODIFIED_TOTAL).increment(1); + skippedNotModifiedCounter.increment(1); return; } @@ -379,8 +416,7 @@ public void reduce(Text key, Iterable values, boost = scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse, inlinks, 
boost); } catch (final ScoringFilterException e) { - context.getCounter(NutchMetrics.GROUP_INDEXER, - NutchMetrics.INDEXER_ERRORS_SCORING_FILTER_TOTAL).increment(1); + errorsScoringFilterCounter.increment(1); LOG.warn("Error calculating score {}: {}", key, e); return; } @@ -415,8 +451,7 @@ public void reduce(Text key, Iterable values, doc = filters.filter(doc, parse, key, fetchDatum, inlinks); } catch (final IndexingException e) { LOG.warn("Error indexing {}: ", key, e); - context.getCounter(NutchMetrics.GROUP_INDEXER, - NutchMetrics.INDEXER_ERRORS_INDEXING_FILTER_TOTAL).increment(1); + errorsIndexingFilterCounter.increment(1); return; } @@ -426,11 +461,9 @@ public void reduce(Text key, Iterable values, if (deleteSkippedByIndexingFilter) { NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE); context.write(key, action); - context.getCounter(NutchMetrics.GROUP_INDEXER, - NutchMetrics.INDEXER_DELETED_BY_INDEXING_FILTER_TOTAL).increment(1); + deletedByIndexingFilterCounter.increment(1); } else { - context.getCounter(NutchMetrics.GROUP_INDEXER, - NutchMetrics.INDEXER_SKIPPED_BY_INDEXING_FILTER_TOTAL).increment(1); + skippedByIndexingFilterCounter.increment(1); } return; } @@ -453,8 +486,7 @@ public void reduce(Text key, Iterable values, // Record indexing latency indexLatencyTracker.record(System.currentTimeMillis() - indexStart); - context.getCounter(NutchMetrics.GROUP_INDEXER, - NutchMetrics.INDEXER_INDEXED_TOTAL).increment(1); + indexedCounter.increment(1); NutchIndexAction action = new NutchIndexAction(doc, NutchIndexAction.ADD); context.write(key, action); diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java index df4f6af057..96e8c5a974 100644 --- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java +++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java @@ -41,6 +41,7 @@ import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import 
org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.Job; @@ -112,6 +113,31 @@ public static class WARCReducer // Metadata to JSON Gson gson = new Gson(); + // Cached counter references to avoid repeated lookups in hot paths + private Counter missingContentCounter; + private Counter missingMetadataCounter; + private Counter omittedEmptyResponseCounter; + private Counter invalidUriCounter; + private Counter recordsGeneratedCounter; + private Counter exceptionCounter; + + @Override + public void setup(Context context) { + // Initialize cached counter references + missingContentCounter = context.getCounter( + NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_MISSING_CONTENT_TOTAL); + missingMetadataCounter = context.getCounter( + NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_MISSING_METADATA_TOTAL); + omittedEmptyResponseCounter = context.getCounter( + NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_OMITTED_EMPTY_RESPONSE_TOTAL); + invalidUriCounter = context.getCounter( + NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_INVALID_URI_TOTAL); + recordsGeneratedCounter = context.getCounter( + NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_RECORDS_GENERATED_TOTAL); + exceptionCounter = context.getCounter( + NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_EXCEPTION_TOTAL); + } + @Override public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { @@ -148,15 +174,13 @@ public void reduce(Text key, Iterable values, // check that we have everything we need if (content == null) { LOG.info("Missing content for {}", key); - context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, - NutchMetrics.WARC_MISSING_CONTENT_TOTAL).increment(1); + missingContentCounter.increment(1); return; } if (cd == null) { LOG.info("Missing fetch datum 
for {}", key); - context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, - NutchMetrics.WARC_MISSING_METADATA_TOTAL).increment(1); + missingMetadataCounter.increment(1); return; } @@ -164,8 +188,7 @@ public void reduce(Text key, Iterable values, // Empty responses is everything that was not a regular response if (!(cd.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS || cd.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED)) { - context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, - NutchMetrics.WARC_OMITTED_EMPTY_RESPONSE_TOTAL).increment(1); + omittedEmptyResponseCounter.increment(1); return; } } @@ -240,8 +263,7 @@ public void reduce(Text key, Iterable values, .append(uri.toASCIIString()).append(CRLF); } catch (Exception e) { LOG.error("Invalid URI {} ", key); - context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, - NutchMetrics.WARC_INVALID_URI_TOTAL).increment(1); + invalidUriCounter.increment(1); return; } @@ -273,14 +295,12 @@ public void reduce(Text key, Iterable values, new ByteArrayInputStream(bos.toByteArray())); WARCRecord record = new WARCRecord(in); context.write(NullWritable.get(), new WARCWritable(record)); - context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, - NutchMetrics.WARC_RECORDS_GENERATED_TOTAL).increment(1); + recordsGeneratedCounter.increment(1); } catch (IOException | IllegalStateException exception) { LOG.error( "Exception when generating WARC resource record for {} : {}", key, exception.getMessage()); - context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, - NutchMetrics.WARC_EXCEPTION_TOTAL).increment(1); + exceptionCounter.increment(1); } // Do we need to emit a metadata record too? 
@@ -322,8 +342,7 @@ public void reduce(Text key, Iterable values, .append(uri.toASCIIString()).append(CRLF); } catch (Exception e) { LOG.error("Invalid URI {} ", key); - context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, - NutchMetrics.WARC_INVALID_URI_TOTAL).increment(1); + invalidUriCounter.increment(1); return; } @@ -339,14 +358,12 @@ public void reduce(Text key, Iterable values, new ByteArrayInputStream(bos.toByteArray())); WARCRecord record = new WARCRecord(in); context.write(NullWritable.get(), new WARCWritable(record)); - context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, - NutchMetrics.WARC_RECORDS_GENERATED_TOTAL).increment(1); + recordsGeneratedCounter.increment(1); } catch (IOException | IllegalStateException exception) { LOG.error( "Exception when generating WARC metadata record for {} : {}", key, exception.getMessage(), exception); - context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, - NutchMetrics.WARC_EXCEPTION_TOTAL).increment(1); + exceptionCounter.increment(1); } } @@ -384,8 +401,7 @@ public void reduce(Text key, Iterable values, .append(uri.toASCIIString()).append(CRLF); } catch (Exception e) { LOG.error("Invalid URI {} ", key); - context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, - NutchMetrics.WARC_INVALID_URI_TOTAL).increment(1); + invalidUriCounter.increment(1); return; } @@ -401,14 +417,12 @@ public void reduce(Text key, Iterable values, new ByteArrayInputStream(bos.toByteArray())); WARCRecord record = new WARCRecord(in); context.write(NullWritable.get(), new WARCWritable(record)); - context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, - NutchMetrics.WARC_RECORDS_GENERATED_TOTAL).increment(1); + recordsGeneratedCounter.increment(1); } catch (IOException | IllegalStateException exception) { LOG.error( "Exception when generating WARC metadata record for {} : {}", key, exception.getMessage(), exception); - context.getCounter(NutchMetrics.GROUP_WARC_EXPORTER, - NutchMetrics.WARC_EXCEPTION_TOTAL).increment(1); + 
exceptionCounter.increment(1); } } } diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java index 7055a6d86a..a0378ec63d 100644 --- a/src/java/org/apache/nutch/util/SitemapProcessor.java +++ b/src/java/org/apache/nutch/util/SitemapProcessor.java @@ -31,6 +31,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; @@ -114,6 +115,13 @@ private static class SitemapMapper extends Mapper values, Context context) originalDatum.setModifiedTime(sitemapDatum.getModifiedTime()); } - context.getCounter(NutchMetrics.GROUP_SITEMAP, - NutchMetrics.SITEMAP_EXISTING_ENTRIES_TOTAL).increment(1); + existingEntriesCounter.increment(1); context.write(key, originalDatum); } else if(sitemapDatum != null) { // For the newly discovered links via sitemap, set the status as unfetched and emit - context.getCounter(NutchMetrics.GROUP_SITEMAP, - NutchMetrics.SITEMAP_NEW_ENTRIES_TOTAL).increment(1); + newEntriesCounter.increment(1); sitemapDatum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED); context.write(key, sitemapDatum); } @@ -465,11 +487,11 @@ public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean stric FSUtils.replace(fs, current, tempCrawlDb, true); LockUtil.removeLockFile(fs, lock); - long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue(); - long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue(); - long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue(); - long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue(); - long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue(); + long filteredRecords = 
job.getCounters().findCounter(NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_FILTERED_RECORDS_TOTAL).getValue(); + long fromHostname = job.getCounters().findCounter(NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_FROM_HOSTNAME_TOTAL).getValue(); + long fromSeeds = job.getCounters().findCounter(NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_SEEDS_TOTAL).getValue(); + long failedFetches = job.getCounters().findCounter(NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_FAILED_FETCHES_TOTAL).getValue(); + long newSitemapEntries = job.getCounters().findCounter(NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_NEW_ENTRIES_TOTAL).getValue(); LOG.info("SitemapProcessor: Total records rejected by filters: {}", filteredRecords); LOG.info("SitemapProcessor: Total sitemaps from host name: {}", fromHostname); From 1a22db333367ab25d88903267a68026319794ba4 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 11 Jan 2026 20:45:20 -0800 Subject: [PATCH 11/27] NUTCH-3143 GitHub workflow does not run all unit tests (#884) --- .github/workflows/junit-report.yml | 30 +++++++++++++++++++++++------- .github/workflows/master-build.yml | 11 +++++++---- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/.github/workflows/junit-report.yml b/.github/workflows/junit-report.yml index e7658ffea6..80958285ce 100644 --- a/.github/workflows/junit-report.yml +++ b/.github/workflows/junit-report.yml @@ -25,33 +25,49 @@ jobs: checks: runs-on: ubuntu-latest steps: - - name: Download Test Report + - name: Download Test Report (Ubuntu) uses: dawidd6/action-download-artifact@v11 with: - name: junit-test-results + name: junit-test-results-ubuntu-latest workflow: master-build.yml run_id: ${{ github.event.workflow_run.id }} + path: ./results-ubuntu + continue-on-error: true + - name: Download Test Report (macOS) + uses: dawidd6/action-download-artifact@v11 + with: + name: junit-test-results-macos-latest + workflow: master-build.yml + run_id: ${{ github.event.workflow_run.id }} + 
path: ./results-macos + continue-on-error: true - name: Publish Test Report uses: mikepenz/action-junit-report@v5 with: report_paths: |- - ./test/TEST-*.xml - ./**/test/TEST-*.xml + ./results-ubuntu/**/TEST-*.xml + ./results-macos/**/TEST-*.xml check_name: |- JUnit Test Report JUnit Test Report Plugins commit: ${{ github.event.workflow_run.head_sha }} fail_on_failure: false - fail_on_parse_error: false # temporary while debugging missing result for TestMimeUtil + fail_on_parse_error: false require_tests: true require_passed_tests: true include_passed: false - include_skipped: true check_annotations: true + annotate_notice: true job_summary: true + detailed_summary: true + flaky_summary: true skip_success_summary: true include_time_in_summary: true + group_suite: true comment: true + updateComment: true + skip_comment_without_tests: true job_name: tests truncate_stack_traces: false - pr_id: ${{ github.event.workflow_run.pull_requests[0].number }} + annotations_limit: 50 + pr_id: ${{ github.event.workflow_run.pull_requests[0].number || '' }} diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index aa9219d280..495c4e3182 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -65,7 +65,7 @@ jobs: java: ['17'] os: [ubuntu-latest, macos-latest] runs-on: ${{ matrix.os }} - timeout-minutes: 30 + timeout-minutes: 45 steps: - uses: actions/checkout@v5 - name: Set up JDK ${{ matrix.java }} @@ -99,13 +99,16 @@ jobs: - name: test plugins if: ${{ steps.filter.outputs.plugins == 'true' && steps.filter.outputs.core == 'false' && steps.filter.outputs.buildconf == 'false' }} run: ant clean test-plugins -buildfile build.xml + # fallback: run all tests if no specific filter matched (e.g., docs-only changes) + - name: test all (fallback) + if: ${{ steps.filter.outputs.buildconf == 'false' && steps.filter.outputs.core == 'false' && steps.filter.outputs.plugins == 'false' }} + run: ant clean test -buildfile build.xml 
- name: Upload Test Report uses: actions/upload-artifact@v4 if: always() with: - name: junit-test-results + name: junit-test-results-${{ matrix.os }} path: | ./build/test/TEST-*.xml ./build/**/test/TEST-*.xml - retention-days: 1 - overwrite: true \ No newline at end of file + retention-days: 1 \ No newline at end of file From e632e551507a8a95346895bd6679509dd35d05cc Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Mon, 12 Jan 2026 13:12:38 -0800 Subject: [PATCH 12/27] NUTCH-3143 GitHub workflow does not run all unit tests (#885) --- .github/workflows/junit-report.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/junit-report.yml b/.github/workflows/junit-report.yml index 80958285ce..23a251a58a 100644 --- a/.github/workflows/junit-report.yml +++ b/.github/workflows/junit-report.yml @@ -41,6 +41,23 @@ jobs: run_id: ${{ github.event.workflow_run.id }} path: ./results-macos continue-on-error: true + - name: Debug XML files + if: always() + run: | + echo "=== Listing downloaded artifacts ===" + find ./results-ubuntu ./results-macos -name "TEST-*.xml" 2>/dev/null | head -20 || echo "No files found" + echo "" + echo "=== TestCommonCrawlDataDumper.xml (macOS) ===" + cat ./results-macos/test/TEST-org.apache.nutch.tools.TestCommonCrawlDataDumper.xml 2>/dev/null || echo "File not found" + echo "" + echo "=== TestCommonCrawlDataDumper.xml (Ubuntu) ===" + cat ./results-ubuntu/test/TEST-org.apache.nutch.tools.TestCommonCrawlDataDumper.xml 2>/dev/null || echo "File not found" + echo "" + echo "=== TestPrefixStringMatcher.xml (Ubuntu) ===" + cat ./results-ubuntu/test/TEST-org.apache.nutch.util.TestPrefixStringMatcher.xml 2>/dev/null || echo "File not found" + echo "" + echo "=== TestPrefixStringMatcher.xml (macOS) ===" + cat ./results-macos/test/TEST-org.apache.nutch.util.TestPrefixStringMatcher.xml 2>/dev/null || echo "File not found" - name: Publish Test Report uses: mikepenz/action-junit-report@v5 with: From 
e3d0af384adf55fc9bced3a42571aad6016abd90 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 7 Jan 2026 23:05:54 +0100 Subject: [PATCH 13/27] NUTCH-3144 URLUtil unit tests fail after upgrade to crawler-commons 1.6 - adapt unit tests to changes introduced in https://github.com/crawler-commons/crawler-commons/pull/478 - test for example given in Javadoc of getDomainSuffix --- src/test/org/apache/nutch/util/TestURLUtil.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java index 9c89590a2e..4d8ae07971 100644 --- a/src/test/org/apache/nutch/util/TestURLUtil.java +++ b/src/test/org/apache/nutch/util/TestURLUtil.java @@ -147,6 +147,8 @@ public void testGetDomainSuffix() throws Exception { assertEquals("2000.hu", URLUtil.getDomainSuffix(url)); // test non-ASCII + url = new URL("https://www.taiuru.māori.nz/"); + assertEquals("xn--mori-qsa.nz", URLUtil.getDomainSuffix(url)); url = new URL("http://www.example.flå.no"); assertEquals("xn--fl-zia.no", URLUtil.getDomainSuffix(url)); url = new URL("http://www.example.栃木.jp"); From 226ac7e8fee061f7dfafb83d9b2f0ce2f2fc4d85 Mon Sep 17 00:00:00 2001 From: Isabelle Giguere Date: Sat, 3 Jan 2026 16:53:59 -0500 Subject: [PATCH 14/27] NUTCH-1564: fix immediate refetch for pages not modified In setFetchSchedule, make sure 'refTime' is not in the past. Add unit test to reproduce the situation described in Jira. 
Unrelated fix in FetcherThread --- .../nutch/crawl/AdaptiveFetchSchedule.java | 15 ++++-- .../apache/nutch/fetcher/FetcherThread.java | 2 +- .../crawl/TestAdaptiveFetchSchedule.java | 52 +++++++++++++++++++ 3 files changed, 63 insertions(+), 6 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index 6575ccb886..38e3162b19 100644 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -219,15 +219,15 @@ private void setHostSpecificIntervals(String fileName, // The custom intervals should respect the boundaries of the default values. if (m < defaultMin) { LOG.error( - "Min. interval out of bounds on line {} in the config. file: `{}`", - lineNo, line); + "Min. interval out of bounds ({}) on line {} in the config. file: `{}`", + defaultMin, lineNo, line); continue; } if (M > defaultMax) { LOG.error( - "Max. interval out of bounds on line {} in the config. file: `{}`", - lineNo, line); + "Max. interval out of bounds ({}) on line {} in the config. file: `{}`", + defaultMax, lineNo, line); continue; } @@ -338,6 +338,10 @@ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, if (delta > interval) interval = delta; refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000); + // make sure we are not in the past + if (refTime < fetchTime) { + refTime = fetchTime; + } } // Ensure the interval does not fall outside of bounds @@ -389,7 +393,8 @@ public static void main(String[] args) throws Exception { (p.getFetchInterval() / SECONDS_PER_DAY), miss); if (p.getFetchTime() <= curTime) { fetchCnt++; - fs.setFetchSchedule(new Text("http://www.example.com"), p, p + // why was "http://www.example.com" hard-coded here? + fs.setFetchSchedule(new Text(""), p, p .getFetchTime(), p.getModifiedTime(), curTime, lastModified, changed ? 
FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED); diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index 297126e1bf..bfdf71d398 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -449,7 +449,7 @@ public void run() { } continue; } - if (!rules.isAllowed(fit.u)) { + if (!rules.isAllowed(fit.u.toString())) { // unblock fetchQueues.finishFetchItem(fit, true); LOG.info("Denied by robots.txt: {}", fit.url); diff --git a/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java b/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java index 377d49ec81..2ae06ecff9 100644 --- a/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java +++ b/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java @@ -24,6 +24,12 @@ import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.time.Duration; +import java.time.Instant; +import java.util.Date; +import java.util.Properties; /** * Test cases for AdaptiveFetchSchedule. 
@@ -36,6 +42,8 @@ public class TestAdaptiveFetchSchedule { private Configuration conf; private long curTime, lastModified; private int changed, interval, calculateInterval; + + private static final long ONE_DAY = 86400; @BeforeEach public void setUp() throws Exception { @@ -117,5 +125,49 @@ private void validateFetchInterval(int changed, int getInterval) { } } + + /** + * Test https://issues.apache.org/jira/browse/NUTCH-1564 + */ + @Test + public void testSetFetchSchedule() { + conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl.AdaptiveFetchSchedule"); + conf.set("db.fetch.schedule.adaptive.sync_delta", "true"); // default + conf.set("db.fetch.schedule.adaptive.sync_delta_rate", "0.3"); // default + conf.set("db.fetch.interval.default", String.valueOf(ONE_DAY * 2)); // 2 days + conf.set("db.fetch.schedule.adaptive.min_interval", String.valueOf(ONE_DAY)); // 1 day + conf.set("db.fetch.schedule.adaptive.max_interval", String.valueOf(ONE_DAY * 7)); // 7 days + conf.set("db.fetch.interval.max", String.valueOf(ONE_DAY * 7)); // 7 days + + // ignore adaptive-host-specific-intervals.txt + Text url = new Text("http://www.example2.com"); + + AdaptiveFetchSchedule fs = new AdaptiveFetchSchedule(); + fs.setConf(conf); + + CrawlDatum datum = prepareCrawlDatum(); + Date fetchTime = Date.from(Instant.now()); + // previous fetch 3 days ago + Date previousFetchTime = Date.from(Instant.now().minus(Duration.ofDays(3))); + // last modified 1 month ago + Date modifiedTime = Date.from(Instant.now().minus(Duration.ofDays(30))); + datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS); + datum.setRetriesSinceFetch(0); + datum.setModifiedTime(modifiedTime.getTime()); + datum.setFetchTime(fetchTime.getTime()); + + System.out.println("CrawlDatum fetchTime: " + fetchTime + "; modifiedTime: " + modifiedTime); + + fs.setFetchSchedule(url, datum, previousFetchTime.getTime(), modifiedTime.getTime(), + fetchTime.getTime(), modifiedTime.getTime(), CrawlDatum.STATUS_DB_NOTMODIFIED); + + Date 
nextFetchTime = new Date(datum.getFetchTime()); + System.out.println("CrawlDatum next fetchTime: " + nextFetchTime); + + assertTrue(nextFetchTime.after(fetchTime)); + // adapt milliseconds to seconds + assertTrue((nextFetchTime.getTime() - fetchTime.getTime()) / 1000 >= ONE_DAY); + assertTrue((nextFetchTime.getTime() - fetchTime.getTime()) / 1000 <= ONE_DAY * 7); + } } From 89e6b87c51d2f917407fb74765fecb1ac01a947a Mon Sep 17 00:00:00 2001 From: Isabelle Giguere Date: Sat, 3 Jan 2026 22:18:37 -0500 Subject: [PATCH 15/27] NUTCH-1564: fix AdaptiveFetchSchedule for unmodified pages Convert the fraction of the delta to a ratio of max interval, to avoid next fetchTime in the past. Add unit tests for different scenarios. --- .../nutch/crawl/AdaptiveFetchSchedule.java | 29 +++++--- .../crawl/TestAdaptiveFetchSchedule.java | 74 ++++++++++++++----- 2 files changed, 76 insertions(+), 27 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index 38e3162b19..aae385174a 100644 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -35,6 +35,7 @@ import java.lang.invoke.MethodHandles; import java.net.URI; import java.net.URISyntaxException; +import java.time.Duration; /** * This class implements an adaptive re-fetch algorithm. This works as follows: @@ -332,21 +333,29 @@ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, case FetchSchedule.STATUS_UNKNOWN: break; } + + // Ensure the interval does not fall outside of bounds + float minInterval = (getCustomMinInterval(url) != null) ? getCustomMinInterval(url) : MIN_INTERVAL; + float maxInterval = (getCustomMaxInterval(url) != null) ? 
getCustomMaxInterval(url) : MAX_INTERVAL; + if (SYNC_DELTA) { // try to synchronize with the time of change - long delta = (fetchTime - modifiedTime) / 1000L; - if (delta > interval) - interval = delta; - refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000); - // make sure we are not in the past - if (refTime < fetchTime) { - refTime = fetchTime; + long delta = (fetchTime - modifiedTime); + if (delta > (interval * 1000)) + interval = delta / 1000L; + // offset: a fraction (sync_delta_rate) of the difference between the last modification time, and the last fetch time. + long offset = Math.round(delta * SYNC_DELTA_RATE); + long maxIntervalMillis = (long) maxInterval * 1000L; + LOG.trace("delta (days): " + Duration.ofMillis(delta).toDays() + + "; offset (days): " + Duration.ofMillis(offset).toDays() + + "; maxInterval (days): " + Duration.ofMillis(maxIntervalMillis).toDays()); + // convert the offset to a ratio of max interval: avoid next fetchTime in the past, and mimic fetches within max interval + if (delta > 0 && offset > maxIntervalMillis) { + offset = offset / delta * maxIntervalMillis; // ex: 9/30*7 = 2.1 } + refTime = fetchTime - offset; } - // Ensure the interval does not fall outside of bounds - float minInterval = (getCustomMinInterval(url) != null) ? getCustomMinInterval(url) : MIN_INTERVAL; - float maxInterval = (getCustomMaxInterval(url) != null) ? 
getCustomMaxInterval(url) : MAX_INTERVAL; if (interval < minInterval) { interval = minInterval; } else if (interval > maxInterval) { diff --git a/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java b/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java index 2ae06ecff9..c06ae30076 100644 --- a/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java +++ b/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java @@ -42,8 +42,6 @@ public class TestAdaptiveFetchSchedule { private Configuration conf; private long curTime, lastModified; private int changed, interval, calculateInterval; - - private static final long ONE_DAY = 86400; @BeforeEach public void setUp() throws Exception { @@ -130,14 +128,57 @@ private void validateFetchInterval(int changed, int getInterval) { * Test https://issues.apache.org/jira/browse/NUTCH-1564 */ @Test - public void testSetFetchSchedule() { - conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl.AdaptiveFetchSchedule"); - conf.set("db.fetch.schedule.adaptive.sync_delta", "true"); // default - conf.set("db.fetch.schedule.adaptive.sync_delta_rate", "0.3"); // default - conf.set("db.fetch.interval.default", String.valueOf(ONE_DAY * 2)); // 2 days - conf.set("db.fetch.schedule.adaptive.min_interval", String.valueOf(ONE_DAY)); // 1 day - conf.set("db.fetch.schedule.adaptive.max_interval", String.valueOf(ONE_DAY * 7)); // 7 days - conf.set("db.fetch.interval.max", String.valueOf(ONE_DAY * 7)); // 7 days + public void testSetFetchSchedule1() { + // db.fetch.schedule.adaptive.sync_delta_rate = 0.3 (default) + // db.fetch.interval.default = 172800 (2 days) + // db.fetch.schedule.adaptive.min_interval = 86400 (1 day) + // db.fetch.schedule.adaptive.max_interval = 604800 (7 days) + // db.fetch.interval.max = 604800 (7 days) + // 3-days cycle + // 30 days since last modified + doTestSetFetchSchedule(0.3, 2, 1, 7, 7, 3, 30); + } + + @Test + public void testSetFetchSchedule2() { + // 
db.fetch.schedule.adaptive.sync_delta_rate = 0.3 (default) + // db.fetch.interval.default = 86400 (1 day) + // db.fetch.schedule.adaptive.min_interval = 86400 (1 day) + // db.fetch.schedule.adaptive.max_interval = 172800 (2 days) + // db.fetch.interval.max = 604800 (7 days) + // 1-day cycle + // 10 days since last modified + doTestSetFetchSchedule(0.3, 1, 1, 2, 7, 1, 10); + } + + @Test + public void testSetFetchSchedule3() { + // db.fetch.schedule.adaptive.sync_delta_rate = 0.3 (default) + // db.fetch.interval.default = 172800 (2 days) + // db.fetch.schedule.adaptive.min_interval = 86400 (1 day) + // db.fetch.schedule.adaptive.max_interval = 864000 (10 days) + // db.fetch.interval.max = 864000 (10 days) + // 3-days cycle + // 180 days since last modified + doTestSetFetchSchedule(0.3, 2, 1, 10, 10, 3, 180); + } + + private void doTestSetFetchSchedule(double deltaRate, int intervalDefaultDays, + int minIntervalDays, int maxIntervalDays, int intervalMaxDays, + int previousFetchTimeDays, int modifiedTimeDays) { + // need to properly override defaults + Properties props = new Properties(); + props.setProperty("db.fetch.schedule.class", "org.apache.nutch.crawl.AdaptiveFetchSchedule"); + props.setProperty("db.fetch.schedule.adaptive.sync_delta", "true"); // default + props.setProperty("db.fetch.schedule.adaptive.sync_delta_rate", String.valueOf(deltaRate)); + props.setProperty("db.fetch.interval.default", String.valueOf(FetchSchedule.SECONDS_PER_DAY * intervalDefaultDays)); + props.setProperty("db.fetch.schedule.adaptive.min_interval", String.valueOf(FetchSchedule.SECONDS_PER_DAY * minIntervalDays)); + props.setProperty("db.fetch.schedule.adaptive.max_interval", String.valueOf(FetchSchedule.SECONDS_PER_DAY * maxIntervalDays)); + props.setProperty("db.fetch.interval.max", String.valueOf(FetchSchedule.SECONDS_PER_DAY * intervalMaxDays)); + + conf = NutchConfiguration.create(true, props); + inc_rate = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f); // default + 
dec_rate = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f); // default // ignore adaptive-host-specific-intervals.txt Text url = new Text("http://www.example2.com"); @@ -147,14 +188,12 @@ public void testSetFetchSchedule() { CrawlDatum datum = prepareCrawlDatum(); Date fetchTime = Date.from(Instant.now()); - // previous fetch 3 days ago - Date previousFetchTime = Date.from(Instant.now().minus(Duration.ofDays(3))); - // last modified 1 month ago - Date modifiedTime = Date.from(Instant.now().minus(Duration.ofDays(30))); + Date previousFetchTime = Date.from(Instant.now().minus(Duration.ofDays(previousFetchTimeDays))); + Date modifiedTime = Date.from(Instant.now().minus(Duration.ofDays(modifiedTimeDays))); datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS); datum.setRetriesSinceFetch(0); datum.setModifiedTime(modifiedTime.getTime()); - datum.setFetchTime(fetchTime.getTime()); + datum.setFetchTime(fetchTime.getTime()); System.out.println("CrawlDatum fetchTime: " + fetchTime + "; modifiedTime: " + modifiedTime); @@ -166,8 +205,9 @@ public void testSetFetchSchedule() { assertTrue(nextFetchTime.after(fetchTime)); // adapt milliseconds to seconds - assertTrue((nextFetchTime.getTime() - fetchTime.getTime()) / 1000 >= ONE_DAY); - assertTrue((nextFetchTime.getTime() - fetchTime.getTime()) / 1000 <= ONE_DAY * 7); + long fetchTimeDiff = (nextFetchTime.getTime() - fetchTime.getTime()) / 1000L ; + assertTrue(fetchTimeDiff >= FetchSchedule.SECONDS_PER_DAY * minIntervalDays); + assertTrue(fetchTimeDiff <= FetchSchedule.SECONDS_PER_DAY * maxIntervalDays); } } From 366a601d273f9bdce75b07351f0e14e8bc97abec Mon Sep 17 00:00:00 2001 From: Isabelle Giguere Date: Thu, 8 Jan 2026 10:38:02 -0500 Subject: [PATCH 16/27] NUTCH-1564: address code review comments. 
Add TestCrawlDbStatesExtended (was TODOTestCrawlDbStates) --- .../org/apache/nutch/crawl/AdaptiveFetchSchedule.java | 11 ++++++----- src/java/org/apache/nutch/fetcher/FetcherThread.java | 2 +- ...wlDbStates.java => TestCrawlDbStatesExtended.java} | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) rename src/test/org/apache/nutch/crawl/{TODOTestCrawlDbStates.java => TestCrawlDbStatesExtended.java} (99%) diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index aae385174a..68d65ba1ad 100644 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -346,9 +346,10 @@ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, // offset: a fraction (sync_delta_rate) of the difference between the last modification time, and the last fetch time. long offset = Math.round(delta * SYNC_DELTA_RATE); long maxIntervalMillis = (long) maxInterval * 1000L; - LOG.trace("delta (days): " + Duration.ofMillis(delta).toDays() - + "; offset (days): " + Duration.ofMillis(offset).toDays() - + "; maxInterval (days): " + Duration.ofMillis(maxIntervalMillis).toDays()); + if (LOG.isTraceEnabled()) { + LOG.trace("delta (days): {}; offset (days): {}; maxInterval (days): {}", + Duration.ofMillis(delta).toDays(), Duration.ofMillis(offset).toDays(), Duration.ofMillis(maxIntervalMillis).toDays()); + } // convert the offset to a ratio of max interval: avoid next fetchTime in the past, and mimic fetches within max interval if (delta > 0 && offset > maxIntervalMillis) { offset = offset / delta * maxIntervalMillis; // ex: 9/30*7 = 2.1 @@ -402,8 +403,8 @@ public static void main(String[] args) throws Exception { (p.getFetchInterval() / SECONDS_PER_DAY), miss); if (p.getFetchTime() <= curTime) { fetchCnt++; - // why was "http://www.example.com" hard-coded here? 
- fs.setFetchSchedule(new Text(""), p, p + // Text (url) required by the API, but not relevant here. + fs.setFetchSchedule(new Text(), p, p .getFetchTime(), p.getModifiedTime(), curTime, lastModified, changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED); diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index bfdf71d398..297126e1bf 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -449,7 +449,7 @@ public void run() { } continue; } - if (!rules.isAllowed(fit.u.toString())) { + if (!rules.isAllowed(fit.u)) { // unblock fetchQueues.finishFetchItem(fit, true); LOG.info("Denied by robots.txt: {}", fit.url); diff --git a/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java b/src/test/org/apache/nutch/crawl/TestCrawlDbStatesExtended.java similarity index 99% rename from src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java rename to src/test/org/apache/nutch/crawl/TestCrawlDbStatesExtended.java index dfad393512..2e6ea55af1 100644 --- a/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java +++ b/src/test/org/apache/nutch/crawl/TestCrawlDbStatesExtended.java @@ -29,7 +29,7 @@ import static org.apache.nutch.crawl.CrawlDatum.*; import static org.junit.jupiter.api.Assertions.fail; -public class TODOTestCrawlDbStates extends TestCrawlDbStates { +public class TestCrawlDbStatesExtended extends TestCrawlDbStates { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); From fb8538bccce9a2b9ad2890710a181119472b7bf0 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Mon, 12 Jan 2026 14:53:56 -0800 Subject: [PATCH 17/27] NUTCH-3148 Cache Ivy dependencies in GitHub CI builds (#886) --- .github/workflows/master-build.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/.github/workflows/master-build.yml 
b/.github/workflows/master-build.yml index 495c4e3182..153c09b936 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -34,6 +34,13 @@ jobs: with: java-version: ${{ matrix.java }} distribution: 'temurin' + - name: Cache Ivy dependencies + uses: actions/cache@v4 + with: + path: ~/.ivy2/cache + key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }} + restore-keys: | + ${{ runner.os }}-ivy- - name: Javadoc run: ant clean javadoc -buildfile build.xml rat: @@ -49,6 +56,13 @@ jobs: with: java-version: ${{ matrix.java }} distribution: 'temurin' + - name: Cache Ivy dependencies + uses: actions/cache@v4 + with: + path: ~/.ivy2/cache + key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }} + restore-keys: | + ${{ runner.os }}-ivy- - name: Run Apache Rat run: ant clean run-rat -buildfile build.xml - name: Cache unknown licenses @@ -73,6 +87,13 @@ jobs: with: java-version: ${{ matrix.java }} distribution: 'temurin' + - name: Cache Ivy dependencies + uses: actions/cache@v4 + with: + path: ~/.ivy2/cache + key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }} + restore-keys: | + ${{ runner.os }}-ivy- - uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 id: filter with: From cc74d716bcc112446958667b39d9bbf5a7694d2e Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 25 Feb 2026 22:41:11 +0100 Subject: [PATCH 18/27] NUTCH-3148 Cache Ivy dependencies in GitHub CI builds Integrate Ivy cache in Common Crawl specific workflow. 
--- .github/workflows/cc-build.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cc-build.yml b/.github/workflows/cc-build.yml index e382c8771a..1e8f23a691 100644 --- a/.github/workflows/cc-build.yml +++ b/.github/workflows/cc-build.yml @@ -29,9 +29,9 @@ jobs: os: [ubuntu-latest] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Set up JDK ${{ matrix.java }} - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: java-version: ${{ matrix.java }} distribution: 'temurin' @@ -53,5 +53,12 @@ jobs: - name: Install recent public suffix list run: | curl https://publicsuffix.org/list/public_suffix_list.dat -o conf/effective_tld_names.dat + - name: Cache Ivy dependencies + uses: actions/cache@v4 + with: + path: ~/.ivy2/cache + key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }} + restore-keys: | + ${{ runner.os }}-ivy- - name: Test run: ant clean test -buildfile build.xml From e742fc5663997baac3a7422b270f438652ab89ea Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Tue, 20 Jan 2026 21:04:49 -0800 Subject: [PATCH 19/27] NUTCH-3143 GitHub workflow does not run all unit tests (#889) --- .github/workflows/junit-report.yml | 33 ++++-------------------------- .github/workflows/master-build.yml | 17 ++++++++++----- 2 files changed, 16 insertions(+), 34 deletions(-) diff --git a/.github/workflows/junit-report.yml b/.github/workflows/junit-report.yml index 23a251a58a..06be656a98 100644 --- a/.github/workflows/junit-report.yml +++ b/.github/workflows/junit-report.yml @@ -31,48 +31,23 @@ jobs: name: junit-test-results-ubuntu-latest workflow: master-build.yml run_id: ${{ github.event.workflow_run.id }} - path: ./results-ubuntu continue-on-error: true - - name: Download Test Report (macOS) - uses: dawidd6/action-download-artifact@v11 - with: - name: junit-test-results-macos-latest - workflow: master-build.yml - run_id: ${{ 
github.event.workflow_run.id }} - path: ./results-macos - continue-on-error: true - - name: Debug XML files - if: always() - run: | - echo "=== Listing downloaded artifacts ===" - find ./results-ubuntu ./results-macos -name "TEST-*.xml" 2>/dev/null | head -20 || echo "No files found" - echo "" - echo "=== TestCommonCrawlDataDumper.xml (macOS) ===" - cat ./results-macos/test/TEST-org.apache.nutch.tools.TestCommonCrawlDataDumper.xml 2>/dev/null || echo "File not found" - echo "" - echo "=== TestCommonCrawlDataDumper.xml (Ubuntu) ===" - cat ./results-ubuntu/test/TEST-org.apache.nutch.tools.TestCommonCrawlDataDumper.xml 2>/dev/null || echo "File not found" - echo "" - echo "=== TestPrefixStringMatcher.xml (Ubuntu) ===" - cat ./results-ubuntu/test/TEST-org.apache.nutch.util.TestPrefixStringMatcher.xml 2>/dev/null || echo "File not found" - echo "" - echo "=== TestPrefixStringMatcher.xml (macOS) ===" - cat ./results-macos/test/TEST-org.apache.nutch.util.TestPrefixStringMatcher.xml 2>/dev/null || echo "File not found" - name: Publish Test Report uses: mikepenz/action-junit-report@v5 with: report_paths: |- - ./results-ubuntu/**/TEST-*.xml - ./results-macos/**/TEST-*.xml + ./test/TEST-*.xml + ./**/test/TEST-*.xml check_name: |- JUnit Test Report JUnit Test Report Plugins commit: ${{ github.event.workflow_run.head_sha }} fail_on_failure: false - fail_on_parse_error: false + fail_on_parse_error: true require_tests: true require_passed_tests: true include_passed: false + include_skipped: true check_annotations: true annotate_notice: true job_summary: true diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index 153c09b936..d73bb3a693 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -120,13 +120,20 @@ jobs: - name: test plugins if: ${{ steps.filter.outputs.plugins == 'true' && steps.filter.outputs.core == 'false' && steps.filter.outputs.buildconf == 'false' }} run: ant clean test-plugins -buildfile 
build.xml - # fallback: run all tests if no specific filter matched (e.g., docs-only changes) - - name: test all (fallback) - if: ${{ steps.filter.outputs.buildconf == 'false' && steps.filter.outputs.core == 'false' && steps.filter.outputs.plugins == 'false' }} - run: ant clean test -buildfile build.xml + - name: Check for test results + id: check_tests + if: always() && matrix.os == 'ubuntu-latest' + run: | + shopt -s globstar nullglob + files=(./build/test/TEST-*.xml ./build/**/test/TEST-*.xml) + if [ ${#files[@]} -gt 0 ]; then + echo "has_results=true" >> $GITHUB_OUTPUT + else + echo "has_results=false" >> $GITHUB_OUTPUT + fi - name: Upload Test Report uses: actions/upload-artifact@v4 - if: always() + if: always() && matrix.os == 'ubuntu-latest' && steps.check_tests.outputs.has_results == 'true' with: name: junit-test-results-${{ matrix.os }} path: | From b8d1fc965f5cfc06c8465381bf0b84e0bd974963 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Tue, 20 Jan 2026 21:56:58 -0800 Subject: [PATCH 20/27] NUTCH-3143 GitHub workflow does not run all unit tests (#890) --- .github/workflows/junit-report.yml | 2 +- build.xml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/junit-report.yml b/.github/workflows/junit-report.yml index 06be656a98..e2359737ba 100644 --- a/.github/workflows/junit-report.yml +++ b/.github/workflows/junit-report.yml @@ -33,7 +33,7 @@ jobs: run_id: ${{ github.event.workflow_run.id }} continue-on-error: true - name: Publish Test Report - uses: mikepenz/action-junit-report@v5 + uses: mikepenz/action-junit-report@v6 with: report_paths: |- ./test/TEST-*.xml diff --git a/build.xml b/build.xml index 092bb6ae0e..d8ee908824 100644 --- a/build.xml +++ b/build.xml @@ -497,7 +497,7 @@ - + @@ -512,7 +512,7 @@ - + From 1db8e7d5c3eb85f2a162835574371e67eff8cd27 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Thu, 5 Feb 2026 14:44:03 -0800 Subject: [PATCH 21/27] NUTCH-3142 Add Error Context to Metrics 
(#882) --- ivy/ivy.xml | 11 +- .../apache/nutch/crawl/CrawlDbReducer.java | 7 + .../org/apache/nutch/crawl/Generator.java | 14 +- .../org/apache/nutch/crawl/Generator2.java | 13 +- src/java/org/apache/nutch/crawl/Injector.java | 5 + .../apache/nutch/fetcher/FetcherThread.java | 28 +- .../apache/nutch/hostdb/ResolverThread.java | 14 + .../nutch/hostdb/UpdateHostDbMapper.java | 9 +- .../nutch/indexer/IndexerMapReduce.java | 16 +- .../apache/nutch/metrics/ErrorTracker.java | 383 +++++++++++++ .../apache/nutch/metrics/NutchMetrics.java | 81 ++- .../org/apache/nutch/parse/ParseSegment.java | 6 + .../apache/nutch/tools/warc/WARCExporter.java | 22 +- .../apache/nutch/util/SitemapProcessor.java | 6 + .../nutch/metrics/TestErrorTracker.java | 514 ++++++++++++++++++ 15 files changed, 1061 insertions(+), 68 deletions(-) create mode 100644 src/java/org/apache/nutch/metrics/ErrorTracker.java create mode 100644 src/test/org/apache/nutch/metrics/TestErrorTracker.java diff --git a/ivy/ivy.xml b/ivy/ivy.xml index a13894110c..9d396ee7b1 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -142,11 +142,14 @@ - + - - - + + + + + + diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java index e263f8463c..3ba1734478 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java @@ -31,6 +31,7 @@ import org.apache.hadoop.io.Writable; import org.apache.hadoop.util.PriorityQueue; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.ErrorTracker; import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; @@ -49,6 +50,7 @@ public class CrawlDbReducer extends private boolean additionsAllowed; private int maxInterval; private FetchSchedule schedule; + private ErrorTracker errorTracker; @Override public void setup(Reducer.Context context) { @@ -60,6 +62,8 @@ public void 
setup(Reducer.Context context) { schedule = FetchScheduleFactory.getFetchSchedule(conf); int maxLinks = conf.getInt("db.update.max.inlinks", 10000); linked = new InlinkPriorityQueue(maxLinks); + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_CRAWLDB, context); } @Override @@ -162,6 +166,7 @@ public void reduce(Text key, Iterable values, scfilters.orphanedScore(key, old); } catch (ScoringFilterException e) { LOG.warn("Couldn't update orphaned score, key={}: {}", key, e); + errorTracker.incrementCounters(e); } context.write(key, old); // Dynamic counter based on status name @@ -208,6 +213,7 @@ public void reduce(Text key, Iterable values, } catch (ScoringFilterException e) { LOG.warn("Cannot filter init score for url {}, using default: {}", key, e.getMessage()); + errorTracker.incrementCounters(e); result.setScore(0.0f); } } @@ -317,6 +323,7 @@ public void reduce(Text key, Iterable values, scfilters.updateDbScore(key, oldSet ? old : null, result, linkList); } catch (Exception e) { LOG.warn("Couldn't update score, key={}: {}", key, e); + errorTracker.incrementCounters(e); } // remove generation time, if any result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY); diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index db15f0426e..456ba689a9 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -67,6 +67,7 @@ import org.apache.hadoop.io.WritableComparator; import org.apache.nutch.hostdb.HostDatum; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.ErrorTracker; import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilterException; import org.apache.nutch.net.URLFilters; @@ -191,6 +192,7 @@ public static class SelectorMapper private int intervalThreshold = -1; private byte restrictStatus = -1; private JexlScript expr = null; + private ErrorTracker 
errorTracker; @Override public void setup( @@ -215,6 +217,8 @@ public void setup( restrictStatus = CrawlDatum.getStatusByName(restrictStatusString); } expr = JexlUtil.parseExpression(conf.get(GENERATOR_EXPR, null)); + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context); } @Override @@ -231,8 +235,7 @@ public void map(Text key, CrawlDatum value, Context context) return; } } catch (URLFilterException e) { - context.getCounter(NutchMetrics.GROUP_GENERATOR, - NutchMetrics.GENERATOR_URL_FILTER_EXCEPTION_TOTAL).increment(1); + errorTracker.incrementCounters(e); LOG.warn("Couldn't filter url: {} ({})", url, e.getMessage()); } } @@ -261,6 +264,7 @@ public void map(Text key, CrawlDatum value, Context context) try { sort = scfilters.generatorSortValue(key, crawlDatum, sort); } catch (ScoringFilterException sfe) { + errorTracker.incrementCounters(sfe); LOG.warn("Couldn't filter generatorSortValue for {}: {}", key, sfe); } @@ -326,6 +330,7 @@ public static class SelectorReducer extends private JexlScript maxCountExpr = null; private JexlScript fetchDelayExpr = null; private Map hostDatumCache = new HashMap<>(); + private ErrorTracker errorTracker; public void readHostDb() throws IOException { if (conf.get(GENERATOR_HOSTDB) == null) { @@ -419,6 +424,8 @@ public void setup(Context context) throws IOException { fetchDelayExpr = JexlUtil .parseExpression(conf.get(GENERATOR_FETCH_DELAY_EXPR, null)); } + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context); readHostDb(); } @@ -516,8 +523,7 @@ public void reduce(FloatWritable key, Iterable values, } catch (MalformedURLException e) { LOG.warn("Malformed URL: '{}', skipping ({})", urlString, StringUtils.stringifyException(e)); - context.getCounter(NutchMetrics.GROUP_GENERATOR, - NutchMetrics.GENERATOR_MALFORMED_URL_TOTAL).increment(1); + errorTracker.incrementCounters(e); continue; } diff --git 
a/src/java/org/apache/nutch/crawl/Generator2.java b/src/java/org/apache/nutch/crawl/Generator2.java index 0e678a7330..6b619445b7 100644 --- a/src/java/org/apache/nutch/crawl/Generator2.java +++ b/src/java/org/apache/nutch/crawl/Generator2.java @@ -65,6 +65,7 @@ import org.apache.hadoop.util.hash.MurmurHash; import org.apache.nutch.crawl.Generator2.SelectorReducer.DomainLimits; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.ErrorTracker; import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilterException; import org.apache.nutch.net.URLFilters; @@ -336,6 +337,7 @@ public static class SelectorMapper private int intervalThreshold = -1; private String restrictStatus = null; private DomainScorePair outputKey = new DomainScorePair(); + private ErrorTracker errorTracker; @Override public void setup( @@ -363,6 +365,9 @@ public void setup( if (GENERATOR_COUNT_VALUE_DOMAIN.equals(conf.get(GENERATOR_COUNT_MODE))) { byDomain = true; } + + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context); } /** Select & invert subset due for fetch. 
*/ @@ -384,10 +389,7 @@ public void map(Text key, CrawlDatum value, Context context) } } catch (URLFilterException e) { LOG.warn("Couldn't filter url {}: {}", key, e.getMessage()); - context - .getCounter(NutchMetrics.GROUP_GENERATOR, - NutchMetrics.GENERATOR_URL_FILTER_EXCEPTION_TOTAL) - .increment(1); + errorTracker.incrementCounters(e); } } @@ -450,8 +452,7 @@ public void map(Text key, CrawlDatum value, Context context) } catch (Exception e) { LOG.warn("Malformed URL: '{}', skipping ({})", urlString, e.getMessage()); - context.getCounter(NutchMetrics.GROUP_GENERATOR, - NutchMetrics.GENERATOR_MALFORMED_URL_TOTAL).increment(1); + errorTracker.incrementCounters(e); return; } diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index 4845e4363d..de963c9530 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -36,6 +36,7 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.ErrorTracker; import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; @@ -127,6 +128,7 @@ public static class InjectMapper private boolean url404Purging; private String scope; private boolean filterNormalizeAll = false; + private ErrorTracker errorTracker; @Override public void setup(Context context) { @@ -147,6 +149,8 @@ public void setup(Context context) { curTime = conf.getLong("injector.current.time", System.currentTimeMillis()); url404Purging = conf.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false); + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_INJECTOR, context); } /* Filter and normalize the input url */ @@ -239,6 +243,7 @@ public void map(Text key, Writable value, Context context) LOG.warn( "Cannot filter injected score for url {}, using default ({})", url, e.getMessage()); + 
errorTracker.incrementCounters(e); } context.getCounter(NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_INJECTED_TOTAL).increment(1); diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index 297126e1bf..23c2e23542 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -41,6 +41,7 @@ import org.apache.nutch.crawl.SignatureFactory; import org.apache.nutch.fetcher.Fetcher.FetcherRun; import org.apache.nutch.fetcher.FetcherThreadEvent.PublishEventType; +import org.apache.nutch.metrics.ErrorTracker; import org.apache.nutch.metrics.LatencyTracker; import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.metadata.Metadata; @@ -195,6 +196,9 @@ public class FetcherThread extends Thread { // Latency tracker for fetch timing metrics private LatencyTracker fetchLatencyTracker; + // Error tracker for categorized error metrics + private ErrorTracker errorTracker; + public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQueues fetchQueues, QueueFeeder feeder, AtomicInteger spinWaiting, AtomicLong lastRequestStart, FetcherRun.Context context, AtomicInteger errors, String segmentName, boolean parsing, boolean storingContent, @@ -352,6 +356,9 @@ private void initCounters() { // Initialize latency tracker for fetch timing fetchLatencyTracker = new LatencyTracker( NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_LATENCY); + + // Initialize error tracker for categorized error metrics + errorTracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER); } @Override @@ -612,15 +619,7 @@ public void run() { } catch (Throwable t) { // unexpected exception // unblock fetchQueues.finishFetchItem(fit); - String message; - if (LOG.isDebugEnabled()) { - message = StringUtils.stringifyException(t); - } else if (logUtil.logShort(t)) { - message = t.getClass().getName(); - } else { - message = 
StringUtils.stringifyException(t); - } - logError(fit.url, message); + logError(fit.url, t); output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY); } @@ -634,6 +633,8 @@ public void run() { } // Emit fetch latency metrics fetchLatencyTracker.emitCounters(context); + // Emit error metrics + errorTracker.emitCounters(context); activeThreads.decrementAndGet(); // count threads LOG.info("{} {} -finishing thread {}, activeThreads={}", getName(), Thread.currentThread().getId(), getName(), activeThreads); @@ -753,10 +754,19 @@ private FetchItem queueRedirect(Text redirUrl, FetchItem fit) return fit; } + private void logError(Text url, Throwable t) { + String message = t.getClass().getName() + ": " + t.getMessage(); + LOG.info("{} {} fetch of {} failed with: {}", getName(), + Thread.currentThread().getId(), url, message); + errors.incrementAndGet(); + errorTracker.recordError(t); + } + private void logError(Text url, String message) { LOG.info("{} {} fetch of {} failed with: {}", getName(), Thread.currentThread().getId(), url, message); errors.incrementAndGet(); + errorTracker.recordError(ErrorTracker.ErrorType.OTHER); } private void countProtocolVersions(Metadata contentMetadata) { diff --git a/src/java/org/apache/nutch/hostdb/ResolverThread.java b/src/java/org/apache/nutch/hostdb/ResolverThread.java index 2690a73fad..4c42c02b4b 100644 --- a/src/java/org/apache/nutch/hostdb/ResolverThread.java +++ b/src/java/org/apache/nutch/hostdb/ResolverThread.java @@ -24,6 +24,7 @@ import org.apache.hadoop.mapreduce.Reducer.Context; import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.metrics.ErrorTracker; import org.apache.nutch.metrics.NutchMetrics; import org.slf4j.Logger; @@ -124,11 +125,24 @@ public void run() { // Dynamic counter based on failure count - can't cache context.getCounter(NutchMetrics.GROUP_HOSTDB, createFailureCounterLabel(datum)).increment(1); + // Common error counters for consistency + 
context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.ERROR_TOTAL).increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.ERROR_NETWORK_TOTAL).increment(1); } catch (Exception ioe) { LOG.warn(StringUtils.stringifyException(ioe)); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.ERROR_TOTAL).increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + ErrorTracker.getCounterName(ioe)).increment(1); } } catch (Exception e) { LOG.warn(StringUtils.stringifyException(e)); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + NutchMetrics.ERROR_TOTAL).increment(1); + context.getCounter(NutchMetrics.GROUP_HOSTDB, + ErrorTracker.getCounterName(e)).increment(1); } context.getCounter(NutchMetrics.GROUP_HOSTDB, diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java index 8de2dcdf2c..10a08d55a0 100644 --- a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java +++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java @@ -31,6 +31,7 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.ErrorTracker; import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; @@ -63,8 +64,8 @@ public class UpdateHostDbMapper protected URLNormalizers normalizers = null; // Cached counter references to avoid repeated lookups in hot paths - protected Counter malformedUrlCounter; protected Counter filteredRecordsCounter; + protected ErrorTracker errorTracker; @Override public void setup(Mapper.Context context) { @@ -79,10 +80,10 @@ public void setup(Mapper.Context context) { normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT); // Initialize cached counter references - malformedUrlCounter = context.getCounter( - NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_MALFORMED_URL_TOTAL); 
filteredRecordsCounter = context.getCounter( NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL); + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_HOSTDB, context); } /** @@ -148,7 +149,7 @@ public void map(Text key, Writable value, try { url = new URL(keyStr); } catch (MalformedURLException e) { - malformedUrlCounter.increment(1); + errorTracker.incrementCounters(e); return; } String hostName = URLUtil.getHost(url); diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java index b61a7f99cd..50da12b8a2 100644 --- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java +++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java @@ -41,6 +41,7 @@ import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.crawl.LinkDb; import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.metrics.ErrorTracker; import org.apache.nutch.metrics.LatencyTracker; import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.metadata.Metadata; @@ -226,11 +227,12 @@ public static class IndexerReducer extends private Counter deletedRedirectsCounter; private Counter deletedDuplicatesCounter; private Counter skippedNotModifiedCounter; - private Counter errorsScoringFilterCounter; - private Counter errorsIndexingFilterCounter; private Counter deletedByIndexingFilterCounter; private Counter skippedByIndexingFilterCounter; private Counter indexedCounter; + + // Error tracker with cached counters + private ErrorTracker errorTracker; @Override public void setup(Reducer.Context context) { @@ -279,16 +281,14 @@ private void initCounters(Reducer.C NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_DUPLICATES_TOTAL); skippedNotModifiedCounter = context.getCounter( NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_SKIPPED_NOT_MODIFIED_TOTAL); - errorsScoringFilterCounter = context.getCounter( - NutchMetrics.GROUP_INDEXER, 
NutchMetrics.INDEXER_ERRORS_SCORING_FILTER_TOTAL); - errorsIndexingFilterCounter = context.getCounter( - NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_ERRORS_INDEXING_FILTER_TOTAL); deletedByIndexingFilterCounter = context.getCounter( NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_BY_INDEXING_FILTER_TOTAL); skippedByIndexingFilterCounter = context.getCounter( NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_SKIPPED_BY_INDEXING_FILTER_TOTAL); indexedCounter = context.getCounter( NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_INDEXED_TOTAL); + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_INDEXER, context); } @Override @@ -416,7 +416,7 @@ public void reduce(Text key, Iterable values, boost = scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse, inlinks, boost); } catch (final ScoringFilterException e) { - errorsScoringFilterCounter.increment(1); + errorTracker.incrementCounters(e); LOG.warn("Error calculating score {}: {}", key, e); return; } @@ -451,7 +451,7 @@ public void reduce(Text key, Iterable values, doc = filters.filter(doc, parse, key, fetchDatum, inlinks); } catch (final IndexingException e) { LOG.warn("Error indexing {}: ", key, e); - errorsIndexingFilterCounter.increment(1); + errorTracker.incrementCounters(e); return; } diff --git a/src/java/org/apache/nutch/metrics/ErrorTracker.java b/src/java/org/apache/nutch/metrics/ErrorTracker.java new file mode 100644 index 0000000000..1921071605 --- /dev/null +++ b/src/java/org/apache/nutch/metrics/ErrorTracker.java @@ -0,0 +1,383 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metrics; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.SocketException; +import java.net.SocketTimeoutException; +import java.net.UnknownHostException; +import java.util.EnumMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.hadoop.mapreduce.TaskInputOutputContext; + +/** + * A utility class for tracking errors by category with automatic classification. + * + *

    This class provides thread-safe error counting with automatic categorization + * based on exception type. It uses a bounded set of error categories to stay within + * Hadoop's counter limits (~120 counters). + * + *

    Usage: + *

    + * // In mapper/reducer setup or thread initialization
    + * errorTracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER);
    + * 
    + * // When catching exceptions
    + * try {
    + *     // ... operation ...
    + * } catch (Exception e) {
    + *     errorTracker.recordError(e);  // Auto-categorizes
    + * }
    + * 
    + * // Or with manual categorization
    + * errorTracker.recordError(ErrorTracker.ErrorType.NETWORK);
    + * 
    + * // In cleanup - emit all error counters
    + * errorTracker.emitCounters(context);
    + * 
    + * + *

    Emits the following counters: + *

      + *
    • errors_total - total number of errors across all categories
    • + *
    • errors_network_total - network-related errors
    • + *
    • errors_protocol_total - protocol errors
    • + *
    • errors_parsing_total - parsing errors
    • + *
    • errors_url_total - URL-related errors
    • + *
    • errors_scoring_total - scoring filter errors
    • + *
    • errors_indexing_total - indexing filter errors
    • + *
    • errors_timeout_total - timeout errors
    • + *
    • errors_other_total - uncategorized errors
    • + *
    + * + * @since 1.22 + */ +public class ErrorTracker { + + /** + * Error type categories for classification. + * Uses a bounded set to stay within Hadoop's counter limits. + */ + public enum ErrorType { + /** Network-related errors (IOException, SocketException, etc.) */ + NETWORK, + /** Protocol errors (ProtocolException, ProtocolNotFound) */ + PROTOCOL, + /** Parsing errors (ParseException, ParserNotFound) */ + PARSING, + /** URL-related errors (MalformedURLException, URLFilterException) */ + URL, + /** Scoring filter errors */ + SCORING, + /** Indexing filter errors */ + INDEXING, + /** Timeout errors (SocketTimeoutException) */ + TIMEOUT, + /** Other uncategorized errors */ + OTHER + } + + private final String group; + private final Map counts; + private final AtomicLong totalCount; + + // Cached counter references for performance (optional - set via initCounters) + private org.apache.hadoop.mapreduce.Counter cachedTotalCounter; + private final Map cachedCounters; + + /** + * Creates a new ErrorTracker for the specified counter group. + * + *

    This constructor creates an ErrorTracker without cached counters. + * Call {@link #initCounters(TaskInputOutputContext)} in setup() to cache + * counter references for better performance. + * + * @param group the Hadoop counter group name (e.g., NutchMetrics.GROUP_FETCHER) + */ + public ErrorTracker(String group) { + this.group = group; + this.counts = new EnumMap<>(ErrorType.class); + this.cachedCounters = new EnumMap<>(ErrorType.class); + this.totalCount = new AtomicLong(0); + + // Initialize all counts to 0 + for (ErrorType type : ErrorType.values()) { + counts.put(type, new AtomicLong(0)); + } + } + + /** + * Creates a new ErrorTracker with cached counter references. + * + *

    This constructor caches all counter references at creation time, + * avoiding repeated counter lookups in hot paths. + * + * @param group the Hadoop counter group name + * @param context the Hadoop task context for caching counters + */ + public ErrorTracker(String group, TaskInputOutputContext context) { + this(group); + initCounters(context); + } + + /** + * Initializes cached counter references from the Hadoop context. + * + *

    Call this method in the mapper/reducer setup() method to cache + * counter references and avoid repeated lookups during processing. + * + * @param context the Hadoop task context + */ + public void initCounters(TaskInputOutputContext context) { + cachedTotalCounter = context.getCounter(group, NutchMetrics.ERROR_TOTAL); + for (ErrorType type : ErrorType.values()) { + cachedCounters.put(type, context.getCounter(group, getCounterName(type))); + } + } + + /** + * Records an error with automatic categorization based on the throwable type. + * + * @param t the throwable to categorize and record + */ + public void recordError(Throwable t) { + recordError(categorize(t)); + } + + /** + * Records an error with explicit category. + * + * @param type the error type category + */ + public void recordError(ErrorType type) { + counts.get(type).incrementAndGet(); + totalCount.incrementAndGet(); + } + + /** + * Returns the count for a specific error type. + * + * @param type the error type + * @return the count for that error type + */ + public long getCount(ErrorType type) { + return counts.get(type).get(); + } + + /** + * Returns the total count of all errors. + * + * @return the total error count + */ + public long getTotalCount() { + return totalCount.get(); + } + + /** + * Emits all error counters to the Hadoop context. + * + *

    Should be called once during cleanup to emit aggregated metrics. + * Only emits counters for error types that have non-zero counts. + * + *

    If counters were cached via {@link #initCounters(TaskInputOutputContext)}, + * uses the cached references for better performance. + * + * @param context the Hadoop task context + */ + public void emitCounters(TaskInputOutputContext context) { + // Use cached counters if available, otherwise look up + if (cachedTotalCounter != null) { + cachedTotalCounter.increment(totalCount.get()); + for (ErrorType type : ErrorType.values()) { + long count = counts.get(type).get(); + if (count > 0) { + cachedCounters.get(type).increment(count); + } + } + } else { + // Fallback to direct lookup + context.getCounter(group, NutchMetrics.ERROR_TOTAL).increment(totalCount.get()); + for (ErrorType type : ErrorType.values()) { + long count = counts.get(type).get(); + if (count > 0) { + context.getCounter(group, getCounterName(type)).increment(count); + } + } + } + } + + /** + * Directly increments cached error counters without local accumulation. + * + *

    Use this method when you want to immediately update Hadoop counters + * rather than accumulating locally and emitting in cleanup. + * Requires {@link #initCounters(TaskInputOutputContext)} to have been called. + * + * @param t the throwable to categorize and count + * @throws IllegalStateException if counters have not been initialized + */ + public void incrementCounters(Throwable t) { + incrementCounters(categorize(t)); + } + + /** + * Directly increments cached error counters without local accumulation. + * + *

    Use this method when you want to immediately update Hadoop counters + * rather than accumulating locally and emitting in cleanup. + * Requires {@link #initCounters(TaskInputOutputContext)} to have been called. + * + * @param type the error type to count + * @throws IllegalStateException if counters have not been initialized + */ + public void incrementCounters(ErrorType type) { + if (cachedTotalCounter == null) { + throw new IllegalStateException( + "Counters not initialized. Call initCounters() first."); + } + cachedTotalCounter.increment(1); + cachedCounters.get(type).increment(1); + } + + /** + * Categorizes a throwable into an error type. + * + *

    The categorization checks the exception class hierarchy to determine + * the most appropriate category. Timeout exceptions are checked first as + * they are a subclass of IOException. + * + * @param t the throwable to categorize + * @return the appropriate ErrorType for the throwable + */ + public static ErrorType categorize(Throwable t) { + if (t == null) { + return ErrorType.OTHER; + } + + String className = t.getClass().getName(); + + // Check for timeout first (before general IOException) + if (t instanceof SocketTimeoutException + || className.contains("TimeoutException") + || className.contains("Timeout")) { + return ErrorType.TIMEOUT; + } + + // Network errors + if (t instanceof SocketException + || t instanceof UnknownHostException + || className.contains("ConnectException") + || className.contains("NoRouteToHostException") + || className.contains("ConnectionRefusedException")) { + return ErrorType.NETWORK; + } + + // URL errors (check before general IOException since MalformedURLException extends IOException) + if (t instanceof MalformedURLException + || className.contains("URLFilterException") + || className.contains("URISyntaxException")) { + return ErrorType.URL; + } + + // General IOException (but not the specific subtypes above) + if (t instanceof IOException) { + return ErrorType.NETWORK; + } + + // Protocol errors + if (className.contains("ProtocolException") + || className.contains("ProtocolNotFound")) { + return ErrorType.PROTOCOL; + } + + // Parsing errors + if (className.contains("ParseException") + || className.contains("ParserNotFound") + || className.contains("SAXException") + || className.contains("ParserConfigurationException")) { + return ErrorType.PARSING; + } + + // Scoring errors + if (className.contains("ScoringFilterException")) { + return ErrorType.SCORING; + } + + // Indexing errors + if (className.contains("IndexingException")) { + return ErrorType.INDEXING; + } + + // Check cause chain for more specific categorization + 
Throwable cause = t.getCause(); + if (cause != null && cause != t) { + ErrorType causeType = categorize(cause); + if (causeType != ErrorType.OTHER) { + return causeType; + } + } + + return ErrorType.OTHER; + } + + /** + * Gets the counter name constant for a given error type. + * + * @param type the error type + * @return the counter name constant from NutchMetrics + */ + public static String getCounterName(ErrorType type) { + switch (type) { + case NETWORK: + return NutchMetrics.ERROR_NETWORK_TOTAL; + case PROTOCOL: + return NutchMetrics.ERROR_PROTOCOL_TOTAL; + case PARSING: + return NutchMetrics.ERROR_PARSING_TOTAL; + case URL: + return NutchMetrics.ERROR_URL_TOTAL; + case SCORING: + return NutchMetrics.ERROR_SCORING_TOTAL; + case INDEXING: + return NutchMetrics.ERROR_INDEXING_TOTAL; + case TIMEOUT: + return NutchMetrics.ERROR_TIMEOUT_TOTAL; + case OTHER: + default: + return NutchMetrics.ERROR_OTHER_TOTAL; + } + } + + /** + * Gets the counter name for a throwable based on its categorization. + * + *

    This is a convenience method for direct use in catch blocks: + *

    +   * } catch (Exception e) {
    +   *     context.getCounter(group, ErrorTracker.getCounterName(e)).increment(1);
    +   * }
    +   * 
    + * + * @param t the throwable to get the counter name for + * @return the counter name constant from NutchMetrics + */ + public static String getCounterName(Throwable t) { + return getCounterName(categorize(t)); + } +} diff --git a/src/java/org/apache/nutch/metrics/NutchMetrics.java b/src/java/org/apache/nutch/metrics/NutchMetrics.java index 8b187cf3fb..1f70db09dd 100644 --- a/src/java/org/apache/nutch/metrics/NutchMetrics.java +++ b/src/java/org/apache/nutch/metrics/NutchMetrics.java @@ -185,9 +185,6 @@ private NutchMetrics() { /** URLs rejected by URL filters. */ public static final String GENERATOR_URL_FILTERS_REJECTED_TOTAL = "url_filters_rejected_total"; - /** URL filter exceptions. */ - public static final String GENERATOR_URL_FILTER_EXCEPTION_TOTAL = "url_filter_exception_total"; - /** URLs rejected by fetch schedule. */ public static final String GENERATOR_SCHEDULE_REJECTED_TOTAL = "schedule_rejected_total"; @@ -206,9 +203,6 @@ private NutchMetrics() { /** URLs rejected due to fetch interval exceeding threshold. */ public static final String GENERATOR_INTERVAL_REJECTED_TOTAL = "interval_rejected_total"; - /** Malformed URLs encountered. */ - public static final String GENERATOR_MALFORMED_URL_TOTAL = "malformed_url_total"; - /** URLs skipped due to per-host overflow. */ public static final String GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL = "urls_skipped_per_host_overflow_total"; @@ -271,12 +265,6 @@ private NutchMetrics() { /** Documents skipped by indexing filter. */ public static final String INDEXER_SKIPPED_BY_INDEXING_FILTER_TOTAL = "skipped_by_indexing_filter_total"; - /** Scoring filter errors. */ - public static final String INDEXER_ERRORS_SCORING_FILTER_TOTAL = "errors_scoring_filter_total"; - - /** Indexing filter errors. */ - public static final String INDEXER_ERRORS_INDEXING_FILTER_TOTAL = "errors_indexing_filter_total"; - /** Documents indexed (added or updated). 
*/ public static final String INDEXER_INDEXED_TOTAL = "indexed_total"; @@ -319,9 +307,6 @@ private NutchMetrics() { // HostDb Counters // ========================================================================= - /** Malformed URLs in HostDb. */ - public static final String HOSTDB_MALFORMED_URL_TOTAL = "malformed_url_total"; - /** Records filtered in HostDb. */ public static final String HOSTDB_FILTERED_RECORDS_TOTAL = "filtered_records_total"; @@ -533,15 +518,9 @@ private NutchMetrics() { /** Omitted empty responses in WARC export. */ public static final String WARC_OMITTED_EMPTY_RESPONSE_TOTAL = "omitted_empty_response_total"; - /** Invalid URIs in WARC export. */ - public static final String WARC_INVALID_URI_TOTAL = "invalid_uri_total"; - /** WARC records generated. */ public static final String WARC_RECORDS_GENERATED_TOTAL = "records_generated_total"; - /** Exceptions during WARC export. */ - public static final String WARC_EXCEPTION_TOTAL = "exception_total"; - // ========================================================================= // Domain Statistics Counters (enum-based, kept for compatibility) // ========================================================================= @@ -634,5 +613,65 @@ private NutchMetrics() { * Used with {@link LatencyTracker} to emit indexing timing counters. */ public static final String INDEXER_LATENCY = "index_latency"; + + // ========================================================================= + // Common Error Counter Names (used with component-specific groups) + // These constants are shared across all components for consistent error + // categorization. Use with ErrorTracker for automatic classification. + // ========================================================================= + + /** + * Total errors across all categories. + * This is incremented alongside any category-specific error counter. + */ + public static final String ERROR_TOTAL = "errors_total"; + + /** + * Network-related errors. 
+ * Includes: IOException, SocketException, ConnectException, UnknownHostException + */ + public static final String ERROR_NETWORK_TOTAL = "errors_network_total"; + + /** + * Protocol errors. + * Includes: ProtocolException, ProtocolNotFound + */ + public static final String ERROR_PROTOCOL_TOTAL = "errors_protocol_total"; + + /** + * Parsing errors. + * Includes: ParseException, ParserNotFound + */ + public static final String ERROR_PARSING_TOTAL = "errors_parsing_total"; + + /** + * URL-related errors. + * Includes: MalformedURLException, URLFilterException + */ + public static final String ERROR_URL_TOTAL = "errors_url_total"; + + /** + * Scoring filter errors. + * Includes: ScoringFilterException + */ + public static final String ERROR_SCORING_TOTAL = "errors_scoring_total"; + + /** + * Indexing filter errors. + * Includes: IndexingException + */ + public static final String ERROR_INDEXING_TOTAL = "errors_indexing_total"; + + /** + * Timeout errors. + * Includes: SocketTimeoutException, connection timeouts + */ + public static final String ERROR_TIMEOUT_TOTAL = "errors_timeout_total"; + + /** + * Other uncategorized errors. + * Used as fallback for exceptions not matching any specific category. 
+ */ + public static final String ERROR_OTHER_TOTAL = "errors_other_total"; } diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java index a7fbe066ce..0b2a6f2290 100644 --- a/src/java/org/apache/nutch/parse/ParseSegment.java +++ b/src/java/org/apache/nutch/parse/ParseSegment.java @@ -37,6 +37,7 @@ import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.ErrorTracker; import org.apache.nutch.metrics.LatencyTracker; import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.protocols.Response; @@ -83,6 +84,7 @@ public static class ParseSegmentMapper extends private ScoringFilters scfilters; private boolean skipTruncated; private LatencyTracker parseLatencyTracker; + private ErrorTracker errorTracker; @Override public void setup(Mapper, Content, Text, ParseImpl>.Context context) { @@ -91,6 +93,8 @@ public void setup(Mapper, Content, Text, ParseImpl>.Contex skipTruncated = conf.getBoolean(SKIP_TRUNCATED, true); parseLatencyTracker = new LatencyTracker( NutchMetrics.GROUP_PARSER, NutchMetrics.PARSER_LATENCY); + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_PARSER, context); } @Override @@ -133,6 +137,7 @@ public void map(WritableComparable key, Content content, parseResult = parseUtil.parse(content); } catch (Exception e) { LOG.warn("Error parsing: {}: {}", key, StringUtils.stringifyException(e)); + errorTracker.incrementCounters(e); return; } @@ -164,6 +169,7 @@ public void map(WritableComparable key, Content content, scfilters.passScoreAfterParsing(url, content, parse); } catch (ScoringFilterException e) { LOG.warn("Error passing score: {}: {}", url, e.getMessage()); + errorTracker.incrementCounters(ErrorTracker.ErrorType.SCORING); } long end = System.currentTimeMillis(); diff --git 
a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java index 96e8c5a974..f271adfe94 100644 --- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java +++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java @@ -58,6 +58,7 @@ import org.apache.nutch.parse.ParseText; import org.apache.nutch.protocol.Content; import org.apache.nutch.tools.WARCUtils; +import org.apache.nutch.metrics.ErrorTracker; import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.NutchConfiguration; @@ -117,9 +118,8 @@ public static class WARCReducer private Counter missingContentCounter; private Counter missingMetadataCounter; private Counter omittedEmptyResponseCounter; - private Counter invalidUriCounter; private Counter recordsGeneratedCounter; - private Counter exceptionCounter; + private ErrorTracker errorTracker; @Override public void setup(Context context) { @@ -130,12 +130,10 @@ public void setup(Context context) { NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_MISSING_METADATA_TOTAL); omittedEmptyResponseCounter = context.getCounter( NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_OMITTED_EMPTY_RESPONSE_TOTAL); - invalidUriCounter = context.getCounter( - NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_INVALID_URI_TOTAL); recordsGeneratedCounter = context.getCounter( NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_RECORDS_GENERATED_TOTAL); - exceptionCounter = context.getCounter( - NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_EXCEPTION_TOTAL); + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_WARC_EXPORTER, context); } @Override @@ -263,7 +261,7 @@ public void reduce(Text key, Iterable values, .append(uri.toASCIIString()).append(CRLF); } catch (Exception e) { LOG.error("Invalid URI {} ", key); - invalidUriCounter.increment(1); + errorTracker.incrementCounters(e); return; } @@ -300,7 
+298,7 @@ public void reduce(Text key, Iterable values, LOG.error( "Exception when generating WARC resource record for {} : {}", key, exception.getMessage()); - exceptionCounter.increment(1); + errorTracker.incrementCounters(exception); } // Do we need to emit a metadata record too? @@ -342,7 +340,7 @@ public void reduce(Text key, Iterable values, .append(uri.toASCIIString()).append(CRLF); } catch (Exception e) { LOG.error("Invalid URI {} ", key); - invalidUriCounter.increment(1); + errorTracker.incrementCounters(e); return; } @@ -363,7 +361,7 @@ public void reduce(Text key, Iterable values, LOG.error( "Exception when generating WARC metadata record for {} : {}", key, exception.getMessage(), exception); - exceptionCounter.increment(1); + errorTracker.incrementCounters(exception); } } @@ -401,7 +399,7 @@ public void reduce(Text key, Iterable values, .append(uri.toASCIIString()).append(CRLF); } catch (Exception e) { LOG.error("Invalid URI {} ", key); - invalidUriCounter.increment(1); + errorTracker.incrementCounters(e); return; } @@ -422,7 +420,7 @@ public void reduce(Text key, Iterable values, LOG.error( "Exception when generating WARC metadata record for {} : {}", key, exception.getMessage(), exception); - exceptionCounter.increment(1); + errorTracker.incrementCounters(exception); } } } diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java index a0378ec63d..4b55a72ebb 100644 --- a/src/java/org/apache/nutch/util/SitemapProcessor.java +++ b/src/java/org/apache/nutch/util/SitemapProcessor.java @@ -46,6 +46,7 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.hostdb.HostDatum; +import org.apache.nutch.metrics.ErrorTracker; import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; @@ -121,6 +122,7 @@ private static class SitemapMapper extends Mapper mockContext; + + @Mock + 
private Counter mockCounter; + + @BeforeEach + void setUp() { + // Configure mock context to return mock counter for any counter request + lenient().when(mockContext.getCounter(anyString(), anyString())).thenReturn(mockCounter); + } + + // ========================================================================= + // Network Error Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeNetworkErrors() { + // Test IOException + assertEquals(ErrorType.NETWORK, + ErrorTracker.categorize(new IOException("Connection failed"))); + + // Test SocketException + assertEquals(ErrorType.NETWORK, + ErrorTracker.categorize(new SocketException("Socket closed"))); + + // Test UnknownHostException + assertEquals(ErrorType.NETWORK, + ErrorTracker.categorize(new UnknownHostException("example.com"))); + + // Test ConnectException + assertEquals(ErrorType.NETWORK, + ErrorTracker.categorize(new ConnectException("Connection refused"))); + } + + // ========================================================================= + // Timeout Error Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeTimeoutErrors() { + // Test SocketTimeoutException + assertEquals(ErrorType.TIMEOUT, + ErrorTracker.categorize(new SocketTimeoutException("Read timed out"))); + } + + @Test + public void testCategorizeTimeoutByClassName() { + // Test custom exception with "Timeout" in class name + // The categorize method checks className.contains("Timeout") + Exception customTimeout = new CustomTimeoutException("Custom timeout"); + assertEquals(ErrorType.TIMEOUT, ErrorTracker.categorize(customTimeout)); + } + + // Custom exception class for testing class name-based detection + private static class CustomTimeoutException extends Exception { + CustomTimeoutException(String message) { + super(message); + } + } + + // 
========================================================================= + // URL Error Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeUrlErrors() { + // Test MalformedURLException + assertEquals(ErrorType.URL, + ErrorTracker.categorize(new MalformedURLException("Invalid URL"))); + + // Test URISyntaxException + assertEquals(ErrorType.URL, + ErrorTracker.categorize(new URISyntaxException("bad uri", "Invalid syntax"))); + } + + @Test + public void testCategorizeUrlFilterException() { + // Test URLFilterException (Nutch-specific) + assertEquals(ErrorType.URL, + ErrorTracker.categorize(new URLFilterException("URL filtered"))); + } + + // ========================================================================= + // Protocol Error Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeProtocolErrors() { + // Test ProtocolException (Nutch-specific) + assertEquals(ErrorType.PROTOCOL, + ErrorTracker.categorize(new ProtocolException("Protocol error"))); + + // Test ProtocolNotFound (Nutch-specific) + assertEquals(ErrorType.PROTOCOL, + ErrorTracker.categorize(new ProtocolNotFound("ftp"))); + } + + // ========================================================================= + // Parsing Error Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeParsingErrors() { + // Test ParseException (Nutch-specific) + assertEquals(ErrorType.PARSING, + ErrorTracker.categorize(new ParseException("Parse failed"))); + + // Test ParserNotFound (Nutch-specific) + assertEquals(ErrorType.PARSING, + ErrorTracker.categorize(new ParserNotFound("text/unknown"))); + + // Test SAXException + assertEquals(ErrorType.PARSING, + ErrorTracker.categorize(new SAXException("XML parse error"))); + } + + // 
========================================================================= + // Scoring Error Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeScoringErrors() { + // Test ScoringFilterException (Nutch-specific) + assertEquals(ErrorType.SCORING, + ErrorTracker.categorize(new ScoringFilterException("Scoring failed"))); + } + + // ========================================================================= + // Indexing Error Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeIndexingErrors() { + // Test IndexingException (Nutch-specific) + assertEquals(ErrorType.INDEXING, + ErrorTracker.categorize(new IndexingException("Indexing failed"))); + } + + // ========================================================================= + // Other/Fallback Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeNullThrowable() { + // Null should return OTHER + assertEquals(ErrorType.OTHER, ErrorTracker.categorize(null)); + } + + @Test + public void testCategorizeGenericException() { + // Generic Exception should return OTHER + assertEquals(ErrorType.OTHER, + ErrorTracker.categorize(new Exception("Generic error"))); + + // RuntimeException should return OTHER + assertEquals(ErrorType.OTHER, + ErrorTracker.categorize(new RuntimeException("Runtime error"))); + } + + // ========================================================================= + // Cause Chain Categorization Tests + // ========================================================================= + + @Test + public void testCategorizeCauseChain() { + // Exception with a network cause should be categorized as NETWORK + IOException cause = new IOException("Root cause"); + Exception wrapper = new Exception("Wrapper", cause); + assertEquals(ErrorType.NETWORK, 
ErrorTracker.categorize(wrapper)); + + // Exception with a timeout cause should be categorized as TIMEOUT + SocketTimeoutException timeoutCause = new SocketTimeoutException("Timeout"); + Exception timeoutWrapper = new Exception("Wrapper", timeoutCause); + assertEquals(ErrorType.TIMEOUT, ErrorTracker.categorize(timeoutWrapper)); + } + + @Test + public void testCategorizeNestedCauseChain() { + // Deep nested cause chain: RuntimeException -> Exception -> IOException + IOException rootCause = new IOException("Root cause"); + Exception middleWrapper = new Exception("Middle", rootCause); + RuntimeException outerWrapper = new RuntimeException("Outer", middleWrapper); + assertEquals(ErrorType.NETWORK, ErrorTracker.categorize(outerWrapper)); + + // Deep nested with Nutch-specific exception + ScoringFilterException scoringCause = new ScoringFilterException("Scoring error"); + Exception wrapper1 = new Exception("Wrapper 1", scoringCause); + Exception wrapper2 = new Exception("Wrapper 2", wrapper1); + assertEquals(ErrorType.SCORING, ErrorTracker.categorize(wrapper2)); + } + + // ========================================================================= + // Record Error Tests (Local Accumulation) + // ========================================================================= + + @Test + public void testRecordErrorByType() { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER); + + // Initially all counts should be 0 + assertEquals(0, tracker.getTotalCount()); + assertEquals(0, tracker.getCount(ErrorType.NETWORK)); + + // Record a NETWORK error + tracker.recordError(ErrorType.NETWORK); + assertEquals(1, tracker.getTotalCount()); + assertEquals(1, tracker.getCount(ErrorType.NETWORK)); + assertEquals(0, tracker.getCount(ErrorType.TIMEOUT)); + + // Record another NETWORK error + tracker.recordError(ErrorType.NETWORK); + assertEquals(2, tracker.getTotalCount()); + assertEquals(2, tracker.getCount(ErrorType.NETWORK)); + + // Record a TIMEOUT error + 
tracker.recordError(ErrorType.TIMEOUT); + assertEquals(3, tracker.getTotalCount()); + assertEquals(2, tracker.getCount(ErrorType.NETWORK)); + assertEquals(1, tracker.getCount(ErrorType.TIMEOUT)); + } + + @Test + public void testRecordErrorByThrowable() { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER); + + // Record an IOException (should be categorized as NETWORK) + tracker.recordError(new IOException("Test")); + assertEquals(1, tracker.getTotalCount()); + assertEquals(1, tracker.getCount(ErrorType.NETWORK)); + + // Record a SocketTimeoutException (should be categorized as TIMEOUT) + tracker.recordError(new SocketTimeoutException("Test")); + assertEquals(2, tracker.getTotalCount()); + assertEquals(1, tracker.getCount(ErrorType.TIMEOUT)); + + // Record a MalformedURLException (should be categorized as URL) + tracker.recordError(new MalformedURLException("Test")); + assertEquals(3, tracker.getTotalCount()); + assertEquals(1, tracker.getCount(ErrorType.URL)); + } + + // ========================================================================= + // Counter Name Mapping Tests + // ========================================================================= + + @Test + public void testGetCounterName() { + // Test counter name mapping + assertEquals(NutchMetrics.ERROR_NETWORK_TOTAL, + ErrorTracker.getCounterName(ErrorType.NETWORK)); + assertEquals(NutchMetrics.ERROR_PROTOCOL_TOTAL, + ErrorTracker.getCounterName(ErrorType.PROTOCOL)); + assertEquals(NutchMetrics.ERROR_PARSING_TOTAL, + ErrorTracker.getCounterName(ErrorType.PARSING)); + assertEquals(NutchMetrics.ERROR_URL_TOTAL, + ErrorTracker.getCounterName(ErrorType.URL)); + assertEquals(NutchMetrics.ERROR_SCORING_TOTAL, + ErrorTracker.getCounterName(ErrorType.SCORING)); + assertEquals(NutchMetrics.ERROR_INDEXING_TOTAL, + ErrorTracker.getCounterName(ErrorType.INDEXING)); + assertEquals(NutchMetrics.ERROR_TIMEOUT_TOTAL, + ErrorTracker.getCounterName(ErrorType.TIMEOUT)); + 
assertEquals(NutchMetrics.ERROR_OTHER_TOTAL, + ErrorTracker.getCounterName(ErrorType.OTHER)); + } + + @Test + public void testGetCounterNameForThrowable() { + // Test getting counter name directly from throwable + assertEquals(NutchMetrics.ERROR_NETWORK_TOTAL, + ErrorTracker.getCounterName(new IOException("Test"))); + assertEquals(NutchMetrics.ERROR_TIMEOUT_TOTAL, + ErrorTracker.getCounterName(new SocketTimeoutException("Test"))); + assertEquals(NutchMetrics.ERROR_URL_TOTAL, + ErrorTracker.getCounterName(new MalformedURLException("Test"))); + assertEquals(NutchMetrics.ERROR_OTHER_TOTAL, + ErrorTracker.getCounterName(new RuntimeException("Test"))); + + // Test Nutch-specific exceptions + assertEquals(NutchMetrics.ERROR_PROTOCOL_TOTAL, + ErrorTracker.getCounterName(new ProtocolException("Test"))); + assertEquals(NutchMetrics.ERROR_PARSING_TOTAL, + ErrorTracker.getCounterName(new ParseException("Test"))); + assertEquals(NutchMetrics.ERROR_SCORING_TOTAL, + ErrorTracker.getCounterName(new ScoringFilterException("Test"))); + assertEquals(NutchMetrics.ERROR_INDEXING_TOTAL, + ErrorTracker.getCounterName(new IndexingException("Test"))); + } + + // ========================================================================= + // Hadoop Context Integration Tests (Using Mocks) + // ========================================================================= + + @Test + public void testConstructorWithContext() { + // Create ErrorTracker with context - should initialize counters + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER, mockContext); + + // Verify counters were requested from context + // Total counter + 8 error type counters = 9 calls + verify(mockContext, atLeast(9)).getCounter(anyString(), anyString()); + } + + @Test + public void testInitCounters() { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER); + + // Initialize counters + tracker.initCounters(mockContext); + + // Verify counters were requested + 
verify(mockContext).getCounter(NutchMetrics.GROUP_FETCHER, NutchMetrics.ERROR_TOTAL); + verify(mockContext).getCounter(NutchMetrics.GROUP_FETCHER, NutchMetrics.ERROR_NETWORK_TOTAL); + verify(mockContext).getCounter(NutchMetrics.GROUP_FETCHER, NutchMetrics.ERROR_TIMEOUT_TOTAL); + } + + @Test + public void testIncrementCountersWithType() { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER, mockContext); + + // Increment counters directly + tracker.incrementCounters(ErrorType.NETWORK); + + // Verify counter was incremented (total + specific type) + verify(mockCounter, times(2)).increment(1); + } + + @Test + public void testIncrementCountersWithThrowable() { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER, mockContext); + + // Increment counters with throwable + tracker.incrementCounters(new IOException("Test")); + + // Verify counter was incremented (total + NETWORK type) + verify(mockCounter, times(2)).increment(1); + } + + @Test + public void testIncrementCountersWithoutInit() { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER); + + // Should throw IllegalStateException when counters not initialized + assertThrows(IllegalStateException.class, () -> { + tracker.incrementCounters(ErrorType.NETWORK); + }); + } + + @Test + public void testEmitCounters() { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER); + + // Record some errors locally + tracker.recordError(ErrorType.NETWORK); + tracker.recordError(ErrorType.NETWORK); + tracker.recordError(ErrorType.TIMEOUT); + + // Emit counters (without cached counters - uses fallback) + tracker.emitCounters(mockContext); + + // Verify counters were requested and incremented + verify(mockContext).getCounter(NutchMetrics.GROUP_FETCHER, NutchMetrics.ERROR_TOTAL); + verify(mockContext).getCounter(NutchMetrics.GROUP_FETCHER, NutchMetrics.ERROR_NETWORK_TOTAL); + verify(mockContext).getCounter(NutchMetrics.GROUP_FETCHER, NutchMetrics.ERROR_TIMEOUT_TOTAL); + } 
+ + @Test + public void testEmitCountersWithCachedCounters() { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER, mockContext); + + // Reset mock to clear constructor calls + reset(mockCounter); + + // Record some errors locally + tracker.recordError(ErrorType.NETWORK); + tracker.recordError(ErrorType.NETWORK); + tracker.recordError(ErrorType.TIMEOUT); + + // Emit counters (with cached counters) + tracker.emitCounters(mockContext); + + // Verify cached counters were used (increment called with accumulated values) + verify(mockCounter).increment(3L); // total count + verify(mockCounter).increment(2L); // NETWORK count + verify(mockCounter).increment(1L); // TIMEOUT count + } + + // ========================================================================= + // Thread Safety Tests + // ========================================================================= + + @Test + public void testThreadSafety() throws InterruptedException { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER); + + // Create multiple threads that record errors concurrently + Thread[] threads = new Thread[10]; + for (int i = 0; i < threads.length; i++) { + threads[i] = new Thread(() -> { + for (int j = 0; j < 100; j++) { + tracker.recordError(ErrorType.NETWORK); + } + }); + } + + // Start all threads + for (Thread thread : threads) { + thread.start(); + } + + // Wait for all threads to complete + for (Thread thread : threads) { + thread.join(); + } + + // Verify counts + assertEquals(1000, tracker.getTotalCount()); + assertEquals(1000, tracker.getCount(ErrorType.NETWORK)); + } + + @Test + public void testThreadSafetyMixedErrorTypes() throws InterruptedException { + ErrorTracker tracker = new ErrorTracker(NutchMetrics.GROUP_FETCHER); + + // Create threads that record different error types concurrently + Thread networkThread = new Thread(() -> { + for (int i = 0; i < 500; i++) { + tracker.recordError(ErrorType.NETWORK); + } + }); + + Thread timeoutThread = new 
Thread(() -> { + for (int i = 0; i < 300; i++) { + tracker.recordError(ErrorType.TIMEOUT); + } + }); + + Thread urlThread = new Thread(() -> { + for (int i = 0; i < 200; i++) { + tracker.recordError(ErrorType.URL); + } + }); + + networkThread.start(); + timeoutThread.start(); + urlThread.start(); + + networkThread.join(); + timeoutThread.join(); + urlThread.join(); + + // Verify counts + assertEquals(1000, tracker.getTotalCount()); + assertEquals(500, tracker.getCount(ErrorType.NETWORK)); + assertEquals(300, tracker.getCount(ErrorType.TIMEOUT)); + assertEquals(200, tracker.getCount(ErrorType.URL)); + } +} From 2e2374daa19b69c5fa0387e0b757cc3f5ee7c4c2 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Tue, 10 Feb 2026 11:09:16 -0800 Subject: [PATCH 22/27] NUTCH-3150 Expand Caching Hadoop Counter References (#892) --- .../org/apache/nutch/crawl/CrawlDbFilter.java | 30 +++++-- .../apache/nutch/crawl/CrawlDbReducer.java | 21 ++++- .../apache/nutch/crawl/DeduplicationJob.java | 17 +++- .../org/apache/nutch/crawl/Generator.java | 80 ++++++++++++++----- src/java/org/apache/nutch/crawl/Injector.java | 58 +++++++++++--- .../org/apache/nutch/fetcher/Fetcher.java | 41 +++++++--- .../apache/nutch/hostdb/ResolverThread.java | 69 +++++++++++----- .../nutch/hostdb/UpdateHostDbMapper.java | 11 ++- .../nutch/hostdb/UpdateHostDbReducer.java | 7 ++ .../org/apache/nutch/indexer/CleaningJob.java | 18 ++++- .../nutch/scoring/webgraph/WebGraph.java | 23 +++++- .../apache/nutch/tools/warc/WARCExporter.java | 11 ++- .../apache/nutch/util/DomainStatistics.java | 31 +++++-- .../apache/nutch/util/SitemapProcessor.java | 18 ++++- 14 files changed, 343 insertions(+), 92 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java index 7f28a3a85a..912c6e4abf 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java @@ -22,6 +22,7 @@ import org.slf4j.Logger; 
import org.slf4j.LoggerFactory; import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.metrics.NutchMetrics; @@ -50,6 +51,11 @@ public class CrawlDbFilter extends private String scope; + // Cached counter references for performance + private Counter goneRecordsRemovedCounter; + private Counter orphanRecordsRemovedCounter; + private Counter urlsFilteredCounter; + private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); @@ -68,6 +74,21 @@ public void setup(Mapper.Context context) { scope = conf.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_CRAWLDB); normalizers = new URLNormalizers(conf, scope); } + + // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initCounters(Context context) { + goneRecordsRemovedCounter = context.getCounter( + NutchMetrics.GROUP_CRAWLDB_FILTER, NutchMetrics.CRAWLDB_GONE_RECORDS_REMOVED_TOTAL); + orphanRecordsRemovedCounter = context.getCounter( + NutchMetrics.GROUP_CRAWLDB_FILTER, NutchMetrics.CRAWLDB_ORPHAN_RECORDS_REMOVED_TOTAL); + urlsFilteredCounter = context.getCounter( + NutchMetrics.GROUP_CRAWLDB_FILTER, NutchMetrics.CRAWLDB_URLS_FILTERED_TOTAL); } private Text newKey = new Text(); @@ -81,15 +102,13 @@ public void map(Text key, CrawlDatum value, // https://issues.apache.org/jira/browse/NUTCH-1101 check status first, // cheaper than normalizing or filtering if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) { - context.getCounter(NutchMetrics.GROUP_CRAWLDB_FILTER, - NutchMetrics.CRAWLDB_GONE_RECORDS_REMOVED_TOTAL).increment(1); + goneRecordsRemovedCounter.increment(1); return; } // Whether to remove orphaned pages // https://issues.apache.org/jira/browse/NUTCH-1932 if (purgeOrphans && CrawlDatum.STATUS_DB_ORPHAN == 
value.getStatus()) { - context.getCounter(NutchMetrics.GROUP_CRAWLDB_FILTER, - NutchMetrics.CRAWLDB_ORPHAN_RECORDS_REMOVED_TOTAL).increment(1); + orphanRecordsRemovedCounter.increment(1); return; } if (url != null && urlNormalizers) { @@ -109,8 +128,7 @@ public void map(Text key, CrawlDatum value, } } if (url == null) { - context.getCounter(NutchMetrics.GROUP_CRAWLDB_FILTER, - NutchMetrics.CRAWLDB_URLS_FILTERED_TOTAL).increment(1); + urlsFilteredCounter.increment(1); } else { // URL has passed filters newKey.set(url); // collect it diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java index 3ba1734478..3454116575 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java @@ -18,13 +18,16 @@ import java.lang.invoke.MethodHandles; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Map.Entry; import java.io.IOException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; @@ -52,6 +55,9 @@ public class CrawlDbReducer extends private FetchSchedule schedule; private ErrorTracker errorTracker; + // Cached counter references for status-based metrics + private Map statusCounters = new HashMap<>(); + @Override public void setup(Reducer.Context context) { Configuration conf = context.getConfiguration(); @@ -66,6 +72,15 @@ public void setup(Reducer.Context context) { errorTracker = new ErrorTracker(NutchMetrics.GROUP_CRAWLDB, context); } + /** + * Get counter for status, caching for subsequent lookups. 
+ */ + private Counter getStatusCounter(byte status, Context context) { + return statusCounters.computeIfAbsent(status, + s -> context.getCounter(NutchMetrics.GROUP_CRAWLDB, + CrawlDatum.getStatusName(s))); + } + @Override public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { @@ -170,8 +185,7 @@ public void reduce(Text key, Iterable values, } context.write(key, old); // Dynamic counter based on status name - context.getCounter(NutchMetrics.GROUP_CRAWLDB, - CrawlDatum.getStatusName(old.getStatus())).increment(1); + getStatusCounter(old.getStatus(), context).increment(1); } else { LOG.warn("Missing fetch and old value, signature={}", StringUtil.toHexString(signature)); @@ -329,8 +343,7 @@ public void reduce(Text key, Iterable values, result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY); context.write(key, result); // Dynamic counter based on status name - context.getCounter(NutchMetrics.GROUP_CRAWLDB, - CrawlDatum.getStatusName(result.getStatus())).increment(1); + getStatusCounter(result.getStatus(), context).increment(1); } } diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java index d5f983a273..50aa4cd7bd 100644 --- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java +++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java @@ -128,11 +128,25 @@ public static class DedupReducer protected String[] compareOrder; + // Cached counter reference for performance + private Counter documentsMarkedDuplicateCounter; + @Override public void setup( Reducer.Context context) { Configuration conf = context.getConfiguration(); compareOrder = conf.get(DEDUPLICATION_COMPARE_ORDER).split(","); + + // Initialize cached counter reference + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. 
+ */ + private void initCounters(Context context) { + documentsMarkedDuplicateCounter = context.getCounter( + NutchMetrics.GROUP_DEDUP, NutchMetrics.DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL); } protected void writeOutAsDuplicate(CrawlDatum datum, @@ -140,8 +154,7 @@ protected void writeOutAsDuplicate(CrawlDatum datum, throws IOException, InterruptedException { datum.setStatus(CrawlDatum.STATUS_DB_DUPLICATE); Text key = (Text) datum.getMetaData().remove(urlKey); - context.getCounter(NutchMetrics.GROUP_DEDUP, - NutchMetrics.DEDUP_DOCUMENTS_MARKED_DUPLICATE_TOTAL).increment(1); + documentsMarkedDuplicateCounter.increment(1); context.write(key, datum); } diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index 456ba689a9..57bf7f4766 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -194,6 +194,17 @@ public static class SelectorMapper private JexlScript expr = null; private ErrorTracker errorTracker; + // Cached counter references for performance + private Counter urlFiltersRejectedCounter; + private Counter scheduleRejectedCounter; + private Counter waitForUpdateCounter; + private Counter exprRejectedCounter; + private Counter statusRejectedCounter; + private Counter scoreTooLowCounter; + private Counter intervalRejectedCounter; + private Counter hostsAffectedPerHostOverflowCounter; + private Counter urlsSkippedPerHostOverflowCounter; + @Override public void setup( Mapper.Context context) @@ -219,6 +230,32 @@ public void setup( expr = JexlUtil.parseExpression(conf.get(GENERATOR_EXPR, null)); // Initialize error tracker with cached counters errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context); + // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. 
+ */ + private void initCounters(Context context) { + urlFiltersRejectedCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_URL_FILTERS_REJECTED_TOTAL); + scheduleRejectedCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_SCHEDULE_REJECTED_TOTAL); + waitForUpdateCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_WAIT_FOR_UPDATE_TOTAL); + exprRejectedCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_EXPR_REJECTED_TOTAL); + statusRejectedCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_STATUS_REJECTED_TOTAL); + scoreTooLowCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_SCORE_TOO_LOW_TOTAL); + intervalRejectedCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_INTERVAL_REJECTED_TOTAL); + hostsAffectedPerHostOverflowCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL); + urlsSkippedPerHostOverflowCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL); } @Override @@ -230,8 +267,7 @@ public void map(Text key, CrawlDatum value, Context context) // URLFilters try { if (filters.filter(url.toString()) == null) { - context.getCounter(NutchMetrics.GROUP_GENERATOR, - NutchMetrics.GENERATOR_URL_FILTERS_REJECTED_TOTAL).increment(1); + urlFiltersRejectedCounter.increment(1); return; } } catch (URLFilterException e) { @@ -245,8 +281,7 @@ public void map(Text key, CrawlDatum value, Context context) if (!schedule.shouldFetch(url, crawlDatum, curTime)) { LOG.debug("-shouldFetch rejected '{}', fetchTime={}, curTime={}", url, crawlDatum.getFetchTime(), curTime); - context.getCounter(NutchMetrics.GROUP_GENERATOR, - NutchMetrics.GENERATOR_SCHEDULE_REJECTED_TOTAL).increment(1); + 
scheduleRejectedCounter.increment(1); return; } @@ -255,8 +290,7 @@ public void map(Text key, CrawlDatum value, Context context) if (oldGenTime != null) { // awaiting fetch & update if (oldGenTime.get() + genDelay > curTime) { // still wait for // update - context.getCounter(NutchMetrics.GROUP_GENERATOR, - NutchMetrics.GENERATOR_WAIT_FOR_UPDATE_TOTAL).increment(1); + waitForUpdateCounter.increment(1); return; } } @@ -271,22 +305,19 @@ public void map(Text key, CrawlDatum value, Context context) // check expr if (expr != null) { if (!crawlDatum.execute(expr, key.toString())) { - context.getCounter(NutchMetrics.GROUP_GENERATOR, - NutchMetrics.GENERATOR_EXPR_REJECTED_TOTAL).increment(1); + exprRejectedCounter.increment(1); return; } } if (restrictStatus != -1 && restrictStatus != crawlDatum.getStatus()) { - context.getCounter(NutchMetrics.GROUP_GENERATOR, - NutchMetrics.GENERATOR_STATUS_REJECTED_TOTAL).increment(1); + statusRejectedCounter.increment(1); return; } // consider only entries with a score superior to the threshold if (!Float.isNaN(scoreThreshold) && sort < scoreThreshold) { - context.getCounter(NutchMetrics.GROUP_GENERATOR, - NutchMetrics.GENERATOR_SCORE_TOO_LOW_TOTAL).increment(1); + scoreTooLowCounter.increment(1); return; } @@ -294,8 +325,7 @@ public void map(Text key, CrawlDatum value, Context context) // threshold if (intervalThreshold != -1 && crawlDatum.getFetchInterval() > intervalThreshold) { - context.getCounter(NutchMetrics.GROUP_GENERATOR, - NutchMetrics.GENERATOR_INTERVAL_REJECTED_TOTAL).increment(1); + intervalRejectedCounter.increment(1); return; } @@ -332,6 +362,10 @@ public static class SelectorReducer extends private Map hostDatumCache = new HashMap<>(); private ErrorTracker errorTracker; + // Cached counter references for performance + private Counter hostsAffectedPerHostOverflowCounter; + private Counter urlsSkippedPerHostOverflowCounter; + public void readHostDb() throws IOException { if (conf.get(GENERATOR_HOSTDB) == null) { return; 
@@ -426,10 +460,22 @@ public void setup(Context context) throws IOException { } // Initialize error tracker with cached counters errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context); + // Initialize cached counter references + initReducerCounters(context); readHostDb(); } + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initReducerCounters(Context context) { + hostsAffectedPerHostOverflowCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL); + urlsSkippedPerHostOverflowCounter = context.getCounter( + NutchMetrics.GROUP_GENERATOR, NutchMetrics.GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL); + } + @Override public void cleanup(Context context) throws IOException, InterruptedException { @@ -555,15 +601,13 @@ public void reduce(FloatWritable key, Iterable values, hostCount[1] = 1; } else { if (hostCount[1] == (maxCount+1)) { - context.getCounter(NutchMetrics.GROUP_GENERATOR, - NutchMetrics.GENERATOR_HOSTS_AFFECTED_PER_HOST_OVERFLOW_TOTAL).increment(1); + hostsAffectedPerHostOverflowCounter.increment(1); LOG.info( "Host or domain {} has more than {} URLs for all {} segments. 
Additional URLs won't be included in the fetchlist.", hostordomain, maxCount, maxNumSegments); } // skip this entry - context.getCounter(NutchMetrics.GROUP_GENERATOR, - NutchMetrics.GENERATOR_URLS_SKIPPED_PER_HOST_OVERFLOW_TOTAL).increment(1); + urlsSkippedPerHostOverflowCounter.increment(1); continue; } } diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index de963c9530..ae154350ef 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -24,6 +24,7 @@ import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; @@ -130,6 +131,12 @@ public static class InjectMapper private boolean filterNormalizeAll = false; private ErrorTracker errorTracker; + // Cached counter references for performance + private Counter urlsFilteredCounter; + private Counter urlsInjectedCounter; + private Counter urlsPurged404Counter; + private Counter urlsPurgedFilterCounter; + @Override public void setup(Context context) { Configuration conf = context.getConfiguration(); @@ -151,6 +158,22 @@ public void setup(Context context) { url404Purging = conf.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false); // Initialize error tracker with cached counters errorTracker = new ErrorTracker(NutchMetrics.GROUP_INJECTOR, context); + // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. 
+ */ + private void initCounters(Context context) { + urlsFilteredCounter = context.getCounter( + NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_FILTERED_TOTAL); + urlsInjectedCounter = context.getCounter( + NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_INJECTED_TOTAL); + urlsPurged404Counter = context.getCounter( + NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_PURGED_404_TOTAL); + urlsPurgedFilterCounter = context.getCounter( + NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_PURGED_FILTER_TOTAL); } /* Filter and normalize the input url */ @@ -223,8 +246,7 @@ public void map(Text key, Writable value, Context context) url = filterNormalize(url); if (url == null) { - context.getCounter(NutchMetrics.GROUP_INJECTOR, - NutchMetrics.INJECTOR_URLS_FILTERED_TOTAL).increment(1); + urlsFilteredCounter.increment(1); } else { CrawlDatum datum = new CrawlDatum(); datum.setStatus(CrawlDatum.STATUS_INJECTED); @@ -245,8 +267,7 @@ public void map(Text key, Writable value, Context context) url, e.getMessage()); errorTracker.incrementCounters(e); } - context.getCounter(NutchMetrics.GROUP_INJECTOR, - NutchMetrics.INJECTOR_URLS_INJECTED_TOTAL).increment(1); + urlsInjectedCounter.increment(1); context.write(key, datum); } } else if (value instanceof CrawlDatum) { @@ -256,16 +277,14 @@ public void map(Text key, Writable value, Context context) // remove 404 urls if (url404Purging && CrawlDatum.STATUS_DB_GONE == datum.getStatus()) { - context.getCounter(NutchMetrics.GROUP_INJECTOR, - NutchMetrics.INJECTOR_URLS_PURGED_404_TOTAL).increment(1); + urlsPurged404Counter.increment(1); return; } if (filterNormalizeAll) { String url = filterNormalize(key.toString()); if (url == null) { - context.getCounter(NutchMetrics.GROUP_INJECTOR, - NutchMetrics.INJECTOR_URLS_PURGED_FILTER_TOTAL).increment(1); + urlsPurgedFilterCounter.increment(1); } else { key.set(url); context.write(key, datum); @@ -285,6 +304,10 @@ public static class InjectReducer private CrawlDatum 
old = new CrawlDatum(); private CrawlDatum injected = new CrawlDatum(); + // Cached counter references for performance + private Counter urlsInjectedUniqueCounter; + private Counter urlsMergedCounter; + @Override public void setup(Context context) { Configuration conf = context.getConfiguration(); @@ -292,6 +315,19 @@ public void setup(Context context) { update = conf.getBoolean("db.injector.update", false); LOG.info("Injector: overwrite: {}", overwrite); LOG.info("Injector: update: {}", update); + + // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initCounters(Context context) { + urlsInjectedUniqueCounter = context.getCounter( + NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_INJECTED_UNIQUE_TOTAL); + urlsMergedCounter = context.getCounter( + NutchMetrics.GROUP_INJECTOR, NutchMetrics.INJECTOR_URLS_MERGED_TOTAL); } /** @@ -351,11 +387,9 @@ public void reduce(Text key, Iterable values, Context context) } } if (injectedSet) { - context.getCounter(NutchMetrics.GROUP_INJECTOR, - NutchMetrics.INJECTOR_URLS_INJECTED_UNIQUE_TOTAL).increment(1); + urlsInjectedUniqueCounter.increment(1); if (oldSet) { - context.getCounter(NutchMetrics.GROUP_INJECTOR, - NutchMetrics.INJECTOR_URLS_MERGED_TOTAL).increment(1); + urlsMergedCounter.increment(1); } } context.write(key, result); diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java index 4a139f5d08..0a08e9da2e 100644 --- a/src/java/org/apache/nutch/fetcher/Fetcher.java +++ b/src/java/org/apache/nutch/fetcher/Fetcher.java @@ -34,6 +34,7 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.JobContext; @@ -159,6 +160,13 @@ 
public static class FetcherRun extends private boolean storingContent; private boolean parsing; + // Cached counter references for performance + private Counter bytesDownloadedCounter; + private Counter hitByThroughputThresholdCounter; + private Counter hitByTimelimitCounter; + private Counter hungThreadsCounter; + private Counter hitByTimeoutCounter; + private AtomicInteger getActiveThreads() { return activeThreads; } @@ -197,11 +205,28 @@ public void setup(Mapper.Context context) parsing = isParsing(conf); } + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initCounters(Context context) { + bytesDownloadedCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_BYTES_DOWNLOADED_TOTAL); + hitByThroughputThresholdCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_THROUGHPUT_THRESHOLD_TOTAL); + hitByTimelimitCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL); + hungThreadsCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HUNG_THREADS_TOTAL); + hitByTimeoutCounter = context.getCounter( + NutchMetrics.GROUP_FETCHER, NutchMetrics.FETCHER_HIT_BY_TIMEOUT_TOTAL); + } + @Override public void run(Context innerContext) throws IOException, InterruptedException { setup(innerContext); + initCounters(innerContext); try { Configuration conf = innerContext.getConfiguration(); LinkedList fetcherThreads = new LinkedList<>(); @@ -296,8 +321,7 @@ public void run(Context innerContext) pagesLastSec = pages.get() - pagesLastSec; bytesLastSec = (int) bytes.get() - bytesLastSec; - innerContext.getCounter(NutchMetrics.GROUP_FETCHER, - NutchMetrics.FETCHER_BYTES_DOWNLOADED_TOTAL).increment(bytesLastSec); + bytesDownloadedCounter.increment(bytesLastSec); reportStatus(innerContext, fetchQueues, pagesLastSec, bytesLastSec); @@ -335,9 +359,7 @@ public void run(Context innerContext) int 
hitByThrougputThreshold = fetchQueues.emptyQueues(); if (hitByThrougputThreshold != 0) - innerContext.getCounter(NutchMetrics.GROUP_FETCHER, - NutchMetrics.FETCHER_HIT_BY_THROUGHPUT_THRESHOLD_TOTAL) - .increment(hitByThrougputThreshold); + hitByThroughputThresholdCounter.increment(hitByThrougputThreshold); } } } @@ -418,8 +440,7 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) { if (!feeder.isAlive()) { int hitByTimeLimit = fetchQueues.checkTimelimit(); if (hitByTimeLimit != 0) - innerContext.getCounter(NutchMetrics.GROUP_FETCHER, - NutchMetrics.FETCHER_HIT_BY_TIMELIMIT_TOTAL).increment(hitByTimeLimit); + hitByTimelimitCounter.increment(hitByTimeLimit); } /* @@ -435,8 +456,7 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) { timeout); LOG.warn("Aborting with {} hung threads{}.", activeThreads, feeder.isAlive() ? " (queue feeder still alive)" : ""); - innerContext.getCounter(NutchMetrics.GROUP_FETCHER, - NutchMetrics.FETCHER_HUNG_THREADS_TOTAL).increment(activeThreads.get()); + hungThreadsCounter.increment(activeThreads.get()); for (int i = 0; i < fetcherThreads.size(); i++) { FetcherThread thread = fetcherThreads.get(i); if (thread.isAlive()) { @@ -471,8 +491,7 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) { fetchQueues.getTotalSize(), fetchQueues.getQueueCount(), feeder.isAlive() ? 
" (queue feeder still alive)" : ""); int hitByTimeout = fetchQueues.emptyQueues(); - innerContext.getCounter(NutchMetrics.GROUP_FETCHER, - NutchMetrics.FETCHER_HIT_BY_TIMEOUT_TOTAL).increment(hitByTimeout); + hitByTimeoutCounter.increment(hitByTimeout); return; } diff --git a/src/java/org/apache/nutch/hostdb/ResolverThread.java b/src/java/org/apache/nutch/hostdb/ResolverThread.java index 4c42c02b4b..05e4a940c8 100644 --- a/src/java/org/apache/nutch/hostdb/ResolverThread.java +++ b/src/java/org/apache/nutch/hostdb/ResolverThread.java @@ -21,6 +21,7 @@ import java.net.UnknownHostException; import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Reducer.Context; import org.apache.hadoop.util.StringUtils; @@ -44,6 +45,17 @@ public class ResolverThread implements Runnable { protected Context context; protected int purgeFailedHostsThreshold; + // Cached counter references for performance + private Counter newKnownHostCounter; + private Counter rediscoveredHostCounter; + private Counter existingKnownHostCounter; + private Counter newUnknownHostCounter; + private Counter existingUnknownHostCounter; + private Counter purgedUnknownHostCounter; + private Counter checkedHostsCounter; + private Counter errorsCounter; + private Counter errorsNetworkCounter; + /** * Overloaded constructor. * @param host name of the host to lookup @@ -61,6 +73,33 @@ public ResolverThread(String host, HostDatum datum, this.datum = datum; this.context = context; this.purgeFailedHostsThreshold = purgeFailedHostsThreshold; + + // Initialize cached counters for performance + initCounters(); + } + + /** + * Initialize cached counter references to avoid repeated lookups. 
+ */ + private void initCounters() { + newKnownHostCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_NEW_KNOWN_HOST_TOTAL); + rediscoveredHostCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_REDISCOVERED_HOST_TOTAL); + existingKnownHostCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_EXISTING_KNOWN_HOST_TOTAL); + newUnknownHostCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_NEW_UNKNOWN_HOST_TOTAL); + existingUnknownHostCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_EXISTING_UNKNOWN_HOST_TOTAL); + purgedUnknownHostCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_PURGED_UNKNOWN_HOST_TOTAL); + checkedHostsCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_CHECKED_HOSTS_TOTAL); + errorsCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.ERROR_TOTAL); + errorsNetworkCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.ERROR_NETWORK_TOTAL); } /** @@ -75,19 +114,16 @@ public void run() { InetAddress inetAddr = InetAddress.getByName(host); if (datum.isEmpty()) { - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.HOSTDB_NEW_KNOWN_HOST_TOTAL).increment(1); + newKnownHostCounter.increment(1); datum.setLastCheck(); LOG.info("{}: new_known_host {}", host, datum); } else if (datum.getDnsFailures() > 0) { - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.HOSTDB_REDISCOVERED_HOST_TOTAL).increment(1); + rediscoveredHostCounter.increment(1); datum.setLastCheck(); datum.setDnsFailures(0l); LOG.info("{}: rediscovered_host {}", host, datum); } else { - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.HOSTDB_EXISTING_KNOWN_HOST_TOTAL).increment(1); + existingKnownHostCounter.increment(1); datum.setLastCheck(); LOG.info("{}: existing_known_host {}", host, datum); } @@ -101,8 +137,7 @@ public void run() { 
datum.setLastCheck(); datum.setDnsFailures(1l); context.write(hostText, datum); - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.HOSTDB_NEW_UNKNOWN_HOST_TOTAL).increment(1); + newUnknownHostCounter.increment(1); LOG.info("{}: new_unknown_host {}", host, datum); } else { datum.setLastCheck(); @@ -113,12 +148,10 @@ public void run() { purgeFailedHostsThreshold < datum.getDnsFailures()) { context.write(hostText, datum); - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.HOSTDB_EXISTING_UNKNOWN_HOST_TOTAL).increment(1); + existingUnknownHostCounter.increment(1); LOG.info("{}: existing_unknown_host {}", host, datum); } else { - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.HOSTDB_PURGED_UNKNOWN_HOST_TOTAL).increment(1); + purgedUnknownHostCounter.increment(1); LOG.info("{}: purged_unknown_host {}", host, datum); } } @@ -126,10 +159,8 @@ public void run() { // Dynamic counter based on failure count - can't cache context.getCounter(NutchMetrics.GROUP_HOSTDB, createFailureCounterLabel(datum)).increment(1); // Common error counters for consistency - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.ERROR_TOTAL).increment(1); - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.ERROR_NETWORK_TOTAL).increment(1); + errorsCounter.increment(1); + errorsNetworkCounter.increment(1); } catch (Exception ioe) { LOG.warn(StringUtils.stringifyException(ioe)); context.getCounter(NutchMetrics.GROUP_HOSTDB, @@ -139,14 +170,12 @@ public void run() { } } catch (Exception e) { LOG.warn(StringUtils.stringifyException(e)); - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.ERROR_TOTAL).increment(1); + errorsCounter.increment(1); context.getCounter(NutchMetrics.GROUP_HOSTDB, ErrorTracker.getCounterName(e)).increment(1); } - context.getCounter(NutchMetrics.GROUP_HOSTDB, - NutchMetrics.HOSTDB_CHECKED_HOSTS_TOTAL).increment(1); + checkedHostsCounter.increment(1); } private String createFailureCounterLabel(HostDatum datum) { 
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java index 10a08d55a0..b1736348b8 100644 --- a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java +++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java @@ -80,12 +80,19 @@ public void setup(Mapper.Context context) { normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT); // Initialize cached counter references - filteredRecordsCounter = context.getCounter( - NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL); + initCounters(context); // Initialize error tracker with cached counters errorTracker = new ErrorTracker(NutchMetrics.GROUP_HOSTDB, context); } + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initCounters(Context context) { + filteredRecordsCounter = context.getCounter( + NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_FILTERED_RECORDS_TOTAL); + } + /** * Filters and or normalizes the input hostname by applying the configured URL * filters and normalizers the URL "http://hostname/". diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java index 6c979f222e..878216b3c6 100644 --- a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java +++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java @@ -154,6 +154,13 @@ public void setup(Reducer.Context context) } // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. 
+ */ + private void initCounters(Reducer.Context context) { urlLimitNotReachedCounter = context.getCounter( NutchMetrics.GROUP_HOSTDB, NutchMetrics.HOSTDB_URL_LIMIT_NOT_REACHED_TOTAL); totalHostsCounter = context.getCounter( diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java b/src/java/org/apache/nutch/indexer/CleaningJob.java index ae01e4b0d1..dc466dad06 100644 --- a/src/java/org/apache/nutch/indexer/CleaningJob.java +++ b/src/java/org/apache/nutch/indexer/CleaningJob.java @@ -26,6 +26,7 @@ import org.apache.hadoop.io.ByteWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; @@ -89,6 +90,9 @@ public static class DeleterReducer extends IndexWriters writers = null; + // Cached counter reference for performance + private Counter deletedDocumentsCounter; + @Override public void setup(Reducer.Context context) { Configuration conf = context.getConfiguration(); @@ -99,6 +103,17 @@ public void setup(Reducer.Context contex throw new RuntimeException(e); } noCommit = conf.getBoolean("noCommit", false); + + // Initialize cached counter reference + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. 
+ */ + private void initCounters(Context context) { + deletedDocumentsCounter = context.getCounter( + NutchMetrics.GROUP_CLEANING, NutchMetrics.CLEANING_DELETED_DOCUMENTS_TOTAL); } @Override @@ -119,8 +134,7 @@ public void reduce(ByteWritable key, Iterable values, for (Text document : values) { writers.delete(document.toString()); totalDeleted++; - context.getCounter(NutchMetrics.GROUP_CLEANING, - NutchMetrics.CLEANING_DELETED_DOCUMENTS_TOTAL).increment(1); + deletedDocumentsCounter.increment(1); } } } diff --git a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java index 0b728a588c..fee0921d0a 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java +++ b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java @@ -48,6 +48,7 @@ import org.apache.hadoop.io.WritableUtils; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; import org.apache.hadoop.mapreduce.Mapper; @@ -328,6 +329,10 @@ public static class OutlinkDbReducer extends // url normalizers, filters and job configuration private Configuration conf; + // Cached counter references for performance + private Counter addedLinksCounter; + private Counter removedLinksCounter; + /** * Configures the OutlinkDb job reducer. Sets up internal links and link limiting. */ @@ -340,6 +345,18 @@ public void setup(Reducer.Context context) limitPages = conf.getBoolean("link.ignore.limit.page", true); limitDomains = conf.getBoolean("link.ignore.limit.domain", true); + // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. 
+ */ + private void initCounters(Context context) { + addedLinksCounter = context.getCounter( + NutchMetrics.GROUP_WEBGRAPH, NutchMetrics.WEBGRAPH_ADDED_LINKS_TOTAL); + removedLinksCounter = context.getCounter( + NutchMetrics.GROUP_WEBGRAPH, NutchMetrics.WEBGRAPH_REMOVED_LINKS_TOTAL); } @Override @@ -362,16 +379,14 @@ public void reduce(Text key, Iterable values, mostRecent = timestamp; } outlinkList.add(WritableUtils.clone(next, conf)); - context.getCounter(NutchMetrics.GROUP_WEBGRAPH, - NutchMetrics.WEBGRAPH_ADDED_LINKS_TOTAL).increment(1); + addedLinksCounter.increment(1); } else if (value instanceof BooleanWritable) { BooleanWritable delete = (BooleanWritable) value; // Actually, delete is always true, otherwise we don't emit it in the // mapper in the first place if (delete.get() == true) { // This page is gone, do not emit it's outlinks - context.getCounter(NutchMetrics.GROUP_WEBGRAPH, - NutchMetrics.WEBGRAPH_REMOVED_LINKS_TOTAL).increment(1); + removedLinksCounter.increment(1); return; } } diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java index f271adfe94..14b59ac85c 100644 --- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java +++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java @@ -124,6 +124,15 @@ public static class WARCReducer @Override public void setup(Context context) { // Initialize cached counter references + initCounters(context); + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_WARC_EXPORTER, context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. 
+ */ + private void initCounters(Context context) { missingContentCounter = context.getCounter( NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_MISSING_CONTENT_TOTAL); missingMetadataCounter = context.getCounter( @@ -132,8 +141,6 @@ public void setup(Context context) { NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_OMITTED_EMPTY_RESPONSE_TOTAL); recordsGeneratedCounter = context.getCounter( NutchMetrics.GROUP_WARC_EXPORTER, NutchMetrics.WARC_RECORDS_GENERATED_TOTAL); - // Initialize error tracker with cached counters - errorTracker = new ErrorTracker(NutchMetrics.GROUP_WARC_EXPORTER, context); } @Override diff --git a/src/java/org/apache/nutch/util/DomainStatistics.java b/src/java/org/apache/nutch/util/DomainStatistics.java index 5ee09c846a..4057795d52 100644 --- a/src/java/org/apache/nutch/util/DomainStatistics.java +++ b/src/java/org/apache/nutch/util/DomainStatistics.java @@ -28,6 +28,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; @@ -38,6 +39,7 @@ import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metrics.NutchMetrics; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -52,10 +54,6 @@ public class DomainStatistics extends Configured implements Tool { private static final Text FETCHED_TEXT = new Text("FETCHED"); private static final Text NOT_FETCHED_TEXT = new Text("NOT_FETCHED"); - public static enum MyCounter { - FETCHED, NOT_FETCHED, EMPTY_RESULT - }; - private static final int MODE_HOST = 1; private static final int MODE_DOMAIN = 2; private static final int MODE_SUFFIX = 3; @@ -158,10 +156,29 @@ static class DomainStatisticsMapper extends Mapper { int mode = 0; + // Cached counter references for performance + 
private Counter fetchedCounter; + private Counter notFetchedCounter; + private Counter emptyResultCounter; + @Override public void setup(Context context) { mode = context.getConfiguration().getInt("domain.statistics.mode", MODE_DOMAIN); + // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initCounters(Context context) { + fetchedCounter = context.getCounter( + NutchMetrics.GROUP_DOMAIN_STATS, NutchMetrics.DOMAIN_STATS_FETCHED_TOTAL); + notFetchedCounter = context.getCounter( + NutchMetrics.GROUP_DOMAIN_STATS, NutchMetrics.DOMAIN_STATS_NOT_FETCHED_TOTAL); + emptyResultCounter = context.getCounter( + NutchMetrics.GROUP_DOMAIN_STATS, NutchMetrics.DOMAIN_STATS_EMPTY_RESULT_TOTAL); } @Override @@ -197,17 +214,17 @@ public void map(Text urlText, CrawlDatum datum, Context context) } if (out.trim().equals("")) { LOG.info("url : {}", url); - context.getCounter(MyCounter.EMPTY_RESULT).increment(1); + emptyResultCounter.increment(1); } context.write(new Text(out), new LongWritable(1)); } catch (Exception ex) { } - context.getCounter(MyCounter.FETCHED).increment(1); + fetchedCounter.increment(1); context.write(FETCHED_TEXT, new LongWritable(1)); } else { - context.getCounter(MyCounter.NOT_FETCHED).increment(1); + notFetchedCounter.increment(1); context.write(NOT_FETCHED_TEXT, new LongWritable(1)); } } diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java index 4b55a72ebb..21362223cd 100644 --- a/src/java/org/apache/nutch/util/SitemapProcessor.java +++ b/src/java/org/apache/nutch/util/SitemapProcessor.java @@ -151,6 +151,15 @@ public void setup(Context context) { } // Initialize cached counter references + initCounters(context); + // Initialize error tracker with cached counters + errorTracker = new ErrorTracker(NutchMetrics.GROUP_SITEMAP, context); + } + + /** + * Initialize cached 
counter references to avoid repeated lookups in hot paths. + */ + private void initCounters(Context context) { filteredRecordsCounter = context.getCounter( NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_FILTERED_RECORDS_TOTAL); seedsCounter = context.getCounter( @@ -161,8 +170,6 @@ public void setup(Context context) { NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_FILTERED_FROM_HOSTNAME_TOTAL); failedFetchesCounter = context.getCounter( NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_FAILED_FETCHES_TOTAL); - // Initialize error tracker with cached counters - errorTracker = new ErrorTracker(NutchMetrics.GROUP_SITEMAP, context); } @Override @@ -377,6 +384,13 @@ public void setup(Context context) { this.overwriteExisting = conf.getBoolean(SITEMAP_OVERWRITE_EXISTING, false); // Initialize cached counter references + initCounters(context); + } + + /** + * Initialize cached counter references to avoid repeated lookups in hot paths. + */ + private void initCounters(Context context) { existingEntriesCounter = context.getCounter( NutchMetrics.GROUP_SITEMAP, NutchMetrics.SITEMAP_EXISTING_ENTRIES_TOTAL); newEntriesCounter = context.getCounter( From fef49b98d8173b9ad6b175de98c8904b60781a6c Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sun, 8 Feb 2026 23:08:37 +0100 Subject: [PATCH 23/27] NUTCH-3152 Job counters getGroup to use metrics constants --- src/java/org/apache/nutch/crawl/Generator.java | 11 +++++++---- src/java/org/apache/nutch/indexer/IndexingJob.java | 13 +++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index 57bf7f4766..102ce39b94 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -1018,10 +1018,13 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, } LOG.info("Generator: number of items rejected during selection:"); - for (Counter counter : 
job.getCounters().getGroup("Generator")) { - LOG.info("Generator: {} {}", - String.format(Locale.ROOT, "%6d", counter.getValue()), - counter.getName()); + for (Counter counter : job.getCounters() + .getGroup(NutchMetrics.GROUP_GENERATOR)) { + long counterValue = counter.getValue(); + if (counterValue > 0) { + LOG.info("Generator: {} {}", + String.format(Locale.ROOT, "%6d", counterValue), counter.getName()); + } } if (!getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) { /* diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java b/src/java/org/apache/nutch/indexer/IndexingJob.java index fc2c44a064..224b4118e6 100644 --- a/src/java/org/apache/nutch/indexer/IndexingJob.java +++ b/src/java/org/apache/nutch/indexer/IndexingJob.java @@ -30,6 +30,7 @@ import org.apache.commons.lang3.time.StopWatch; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.segment.SegmentChecker; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; @@ -155,10 +156,14 @@ public void index(Path crawlDb, Path linkDb, List segments, throw e; } LOG.info("Indexer: number of documents indexed, deleted, or skipped:"); - for (Counter counter : job.getCounters().getGroup("IndexerStatus")) { - LOG.info("Indexer: {} {}", - String.format(Locale.ROOT, "%6d", counter.getValue()), - counter.getName()); + for (Counter counter : job.getCounters() + .getGroup(NutchMetrics.GROUP_INDEXER)) { + long counterValue = counter.getValue(); + if (counterValue > 0) { + LOG.info("Indexer: {} {}", + String.format(Locale.ROOT, "%6d", counterValue), + counter.getName()); + } } stopWatch.stop(); LOG.info("Indexer: finished, elapsed: {} ms", stopWatch.getTime( From 023010a29b20a244f8a8a30ea0be0f3b21e7a469 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 11 Feb 2026 20:08:23 +0100 Subject: [PATCH 24/27] NUTCH-3153 Update of license and notice files --- LICENSE-binary | 38 ++--- NOTICE-binary | 157 ++++++++++-------- 
licenses-binary/LICENSE-bsd-licence.txt | 39 +++++ ...on-2-gpl2-with-the-classpath-exception.txt | 15 -- ...reme-lab-software-license-vesion-1.1.1.txt | 0 5 files changed, 146 insertions(+), 103 deletions(-) create mode 100644 licenses-binary/LICENSE-bsd-licence.txt delete mode 100644 licenses-binary/LICENSE-gnu-general-public-license-version-2-gpl2-with-the-classpath-exception.txt delete mode 100644 licenses-binary/LICENSE-indiana-university-extreme-lab-software-license-vesion-1.1.1.txt diff --git a/LICENSE-binary b/LICENSE-binary index 538e3baf7c..addc4a2824 100644 --- a/LICENSE-binary +++ b/LICENSE-binary @@ -245,7 +245,6 @@ com.google.inject.extensions:guice-servlet com.google.j2objc:j2objc-annotations com.healthmarketscience.jackcess:jackcess com.healthmarketscience.jackcess:jackcess-encrypt -com.intellij:annotations com.maxmind.db:maxmind-db com.maxmind.geoip2:geoip2 com.nimbusds:nimbus-jose-jwt @@ -257,7 +256,12 @@ com.rometools:rome-utils com.shapesecurity:salvation2 com.squareup.okhttp3:okhttp com.squareup.okhttp3:okhttp-brotli +com.squareup.okhttp3:okhttp-jvm +com.squareup.okhttp3:okhttp-zstd com.squareup.okio:okio +com.squareup.okio:okio-jvm +com.squareup.zstd:zstd-kmp-jvm +com.squareup.zstd:zstd-kmp-okio-jvm com.tdunning:t-digest com.typesafe.netty:netty-reactive-streams com.typesafe.scala-logging:scala-logging_2.12 @@ -275,13 +279,14 @@ commons-lang:commons-lang commons-logging:commons-logging commons-net:commons-net commons-validator:commons-validator +de.l3s.boilerpipe:boilerpipe de.vandermeer:ascii-utf-themes de.vandermeer:asciitable de.vandermeer:char-translation de.vandermeer:skb-interfaces dev.failsafe:failsafe +info.picocli:picocli io.dropwizard.metrics:metrics-core -io.netty:netty io.netty:netty-all io.netty:netty-buffer io.netty:netty-codec @@ -378,7 +383,7 @@ org.apache.hadoop:hadoop-yarn-api org.apache.hadoop:hadoop-yarn-client org.apache.hadoop:hadoop-yarn-common org.apache.hadoop.thirdparty:hadoop-shaded-guava 
-org.apache.hadoop.thirdparty:hadoop-shaded-protobuf_3_7 +org.apache.hadoop.thirdparty:hadoop-shaded-protobuf_3_25 org.apache.httpcomponents:httpasyncclient org.apache.httpcomponents:httpclient org.apache.httpcomponents:httpcore @@ -398,21 +403,13 @@ org.apache.kafka:kafka-storage org.apache.kafka:kafka-storage-api org.apache.kafka:kafka-tools-api org.apache.kafka:kafka_2.12 -org.apache.kerby:kerb-admin -org.apache.kerby:kerb-client -org.apache.kerby:kerb-common org.apache.kerby:kerb-core org.apache.kerby:kerb-crypto -org.apache.kerby:kerb-identity -org.apache.kerby:kerb-server -org.apache.kerby:kerb-simplekdc org.apache.kerby:kerb-util org.apache.kerby:kerby-asn1 org.apache.kerby:kerby-config org.apache.kerby:kerby-pkix org.apache.kerby:kerby-util -org.apache.kerby:kerby-xdr -org.apache.kerby:token-provider org.apache.logging.log4j:log4j-api org.apache.logging.log4j:log4j-core org.apache.logging.log4j:log4j-slf4j2-impl @@ -435,6 +432,7 @@ org.apache.pdfbox:fontbox org.apache.pdfbox:jbig2-imageio org.apache.pdfbox:jempbox org.apache.pdfbox:pdfbox +org.apache.pdfbox:pdfbox-io org.apache.pdfbox:pdfbox-tools org.apache.pdfbox:xmpbox org.apache.poi:poi @@ -443,6 +441,7 @@ org.apache.poi:poi-ooxml-lite org.apache.poi:poi-scratchpad org.apache.solr:solr-solrj org.apache.tika:tika-core +org.apache.tika:tika-handler-boilerpipe org.apache.tika:tika-langdetect-optimaize org.apache.tika:tika-parser-apple-module org.apache.tika:tika-parser-audiovideo-module @@ -476,8 +475,6 @@ org.asynchttpclient:async-http-client org.asynchttpclient:async-http-client-netty-utils org.bitbucket.b_c:jose4j org.ccil.cowan.tagsoup:tagsoup -org.codehaus.jackson:jackson-core-asl -org.codehaus.jackson:jackson-mapper-asl org.codehaus.jettison:jettison org.eclipse.jetty:jetty-alpn-client org.eclipse.jetty:jetty-alpn-java-client @@ -515,9 +512,6 @@ org.gagravarr:vorbis-java-core org.gagravarr:vorbis-java-tika org.jetbrains:annotations org.jetbrains.kotlin:kotlin-stdlib 
-org.jetbrains.kotlin:kotlin-stdlib-common -org.jetbrains.kotlin:kotlin-stdlib-jdk7 -org.jetbrains.kotlin:kotlin-stdlib-jdk8 org.jspecify:jspecify org.littleshoot:littleproxy org.locationtech.spatial4j:spatial4j @@ -595,9 +589,7 @@ BSD 2-Clause com.barchart.udt:barchart-udt-bundle com.github.luben:zstd-jni -com.google.protobuf:protobuf-java dk.brics:automaton -dnsjava:dnsjava org.codehaus.woodstox:stax2-api org.jline:jline @@ -609,6 +601,7 @@ BSD 3-Clause com.adobe.xmp:xmpcore com.github.virtuald:curvesapi +dnsjava:dnsjava org.fusesource.leveldbjni:leveldbjni-all org.ow2.asm:asm @@ -633,7 +626,7 @@ Bouncy Castle Licence (licenses-binary/LICENSE-bouncy-castle-licence.txt) -org.bouncycastle:bcmail-jdk18on +org.bouncycastle:bcjmail-jdk18on org.bouncycastle:bcpkix-jdk18on org.bouncycastle:bcprov-jdk18on org.bouncycastle:bcutil-jdk18on @@ -717,6 +710,8 @@ jakarta.jws:jakarta.jws-api jakarta.xml.bind:jakarta.xml.bind-api jakarta.xml.soap:jakarta.xml.soap-api jakarta.xml.ws:jakarta.xml.ws-api +org.eclipse.angus:angus-activation +org.glassfish.jaxb:jaxb-core org.glassfish.jaxb:jaxb-runtime org.glassfish.jaxb:txw2 @@ -724,6 +719,8 @@ org.glassfish.jaxb:txw2 Eclipse Public License - Version 2.0 ------------------------------------ +(licenses-binary/LICENSE-eclipse-public-license---version-2.0.txt) + org.eclipse.jetty:jetty-http org.eclipse.jetty:jetty-io org.eclipse.jetty:jetty-security @@ -734,6 +731,8 @@ org.eclipse.jetty:jetty-util MIT --- +(licenses-binary/LICENSE-mit-license.txt) + net.sourceforge.argparse4j:argparse4j org.slf4j:slf4j-api @@ -781,7 +780,6 @@ Public Domain (licenses-binary/LICENSE-public-domain.txt) aopalliance:aopalliance -org.tukaani:xz Public Domain, per Creative Commons CC0 diff --git a/NOTICE-binary b/NOTICE-binary index 99fea523a4..412ce7d38e 100644 --- a/NOTICE-binary +++ b/NOTICE-binary @@ -48,7 +48,7 @@ Apache projects # org.apache.avro:avro -Apache Avro (http://avro.apache.org) +Apache Avro (https://avro.apache.org) # 
org.apache.commons:commons-collections4 Apache Commons Collections (https://commons.apache.org/proper/commons-collections/) @@ -60,6 +60,8 @@ Apache Commons Configuration (https://commons.apache.org/proper/commons-configur Apache Commons CSV (https://commons.apache.org/proper/commons-csv/) # org.apache.commons:commons-exec Apache Commons Exec (http://commons.apache.org/proper/commons-exec/) +# org.apache.commons:commons-exec +Apache Commons Exec (https://commons.apache.org/proper/commons-exec/) # org.apache.commons:commons-jexl3 Apache Commons JEXL (https://commons.apache.org/proper/commons-jexl/) # org.apache.commons:commons-lang3 @@ -68,8 +70,6 @@ Apache Commons Lang (https://commons.apache.org/proper/commons-lang/) Apache Commons Lang (http://commons.apache.org/proper/commons-lang/) # org.apache.commons:commons-math3 Apache Commons Math (http://commons.apache.org/proper/commons-math/) -# org.apache.commons:commons-math3 -Apache Commons Math (http://commons.apache.org/math/) # org.apache.commons:commons-text Apache Commons Text (https://commons.apache.org/proper/commons-text) @@ -132,8 +132,8 @@ Apache Hadoop YARN Common # org.apache.hadoop.thirdparty:hadoop-shaded-guava Apache Hadoop shaded Guava -# org.apache.hadoop.thirdparty:hadoop-shaded-protobuf_3_7 -Apache Hadoop shaded Protobuf 3.7 +# org.apache.hadoop.thirdparty:hadoop-shaded-protobuf_3_25 +Apache Hadoop shaded Protobuf # org.apache.httpcomponents:httpasyncclient Apache HttpAsyncClient (http://hc.apache.org/httpcomponents-asyncclient) @@ -146,6 +146,8 @@ Apache HttpCore (http://hc.apache.org/httpcomponents-core-ga) # org.apache.httpcomponents:httpcore-nio Apache HttpCore NIO (http://hc.apache.org/httpcomponents-core-ga) # org.apache.httpcomponents:httpmime +Apache HttpClient Mime (http://hc.apache.org/httpcomponents-client-ga) +# org.apache.httpcomponents:httpmime Apache HttpClient Mime (http://hc.apache.org/httpcomponents-client) # org.apache.james:apache-mime4j-core @@ -178,22 +180,10 @@ Apache Kafka 
(https://kafka.apache.org) # org.apache.kafka:kafka_2.12 Apache Kafka (https://kafka.apache.org) -# org.apache.kerby:kerb-admin -Apache Kerby-kerb Admin -# org.apache.kerby:kerb-client -Apache Kerby-kerb Client -# org.apache.kerby:kerb-common -Apache Kerby-kerb Common # org.apache.kerby:kerb-core Apache Kerby-kerb core # org.apache.kerby:kerb-crypto Apache Kerby-kerb Crypto -# org.apache.kerby:kerb-identity -Apache Kerby-kerb Identity -# org.apache.kerby:kerb-server -Apache Kerby-kerb Server -# org.apache.kerby:kerb-simplekdc -Apache Kerb Simple Kdc # org.apache.kerby:kerb-util Apache Kerby-kerb Util # org.apache.kerby:kerby-asn1 @@ -204,10 +194,6 @@ Apache Kerby Config Apache Kerby PKIX Project # org.apache.kerby:kerby-util Apache Kerby Util -# org.apache.kerby:kerby-xdr -Apache Kerby XDR Project -# org.apache.kerby:token-provider -Apache Token provider # org.apache.logging.log4j:log4j-api Apache Log4j API @@ -258,6 +244,8 @@ Apache PDFBox JBIG2 ImageIO plugin Apache JempBox # org.apache.pdfbox:pdfbox Apache PDFBox +# org.apache.pdfbox:pdfbox-io +Apache PDFBox io # org.apache.pdfbox:pdfbox-tools Apache PDFBox tools # org.apache.pdfbox:xmpbox @@ -277,6 +265,8 @@ Apache Solr Solrj # org.apache.tika:tika-core Apache Tika core (https://tika.apache.org/) +# org.apache.tika:tika-handler-boilerpipe +Apache # org.apache.tika:tika-langdetect-optimaize Apache Tika Optimaize langdetect # org.apache.tika:tika-parser-apple-module @@ -391,10 +381,10 @@ Jackson-annotations (http://github.com/FasterXML/jackson) Jackson-annotations (https://github.com/FasterXML/jackson) - license: The Apache Software License, Version 2.0 # com.fasterxml.jackson.core:jackson-core -Jackson-core (https://github.com/FasterXML/jackson) +Jackson-core (https://github.com/FasterXML/jackson-core) - license: The Apache Software License, Version 2.0 # com.fasterxml.jackson.core:jackson-core -Jackson-core (https://github.com/FasterXML/jackson-core) +Jackson-core (https://github.com/FasterXML/jackson) - 
license: The Apache Software License, Version 2.0 # com.fasterxml.jackson.core:jackson-databind jackson-databind (http://github.com/FasterXML/jackson) @@ -519,10 +509,10 @@ error-prone annotations # com.google.guava:failureaccess Guava InternalFutureFailureAccess and InternalFutures -- license: The Apache Software License, Version 2.0 +- license: Apache License, Version 2.0 # com.google.guava:failureaccess Guava InternalFutureFailureAccess and InternalFutures -- license: Apache License, Version 2.0 +- license: The Apache Software License, Version 2.0 # com.google.guava:guava Guava: Google Core Libraries for Java (https://github.com/google/guava) - license: Apache License, Version 2.0 @@ -548,14 +538,10 @@ J2ObjC Annotations (https://github.com/google/j2objc/) J2ObjC Annotations (https://github.com/google/j2objc/) - license: The Apache Software License, Version 2.0 -# com.google.protobuf:protobuf-java -Protocol Buffer Java API (http://code.google.com/p/protobuf) -- license: New BSD license - (licenses-binary/LICENSE-bsd-2-clause.txt) - # com.google.re2j:re2j re2j (http://github.com/google/re2j) - license: The Go license + (licenses-binary/LICENSE-the-go-license.txt) # com.googlecode.juniversalchardet:juniversalchardet juniversalchardet (http://juniversalchardet.googlecode.com/) @@ -577,10 +563,7 @@ Jackcess Encrypt (http://jackcessencrypt.sf.net) # com.ibm.icu:icu4j ICU4J (https://icu.unicode.org/) - license: Unicode-3.0 - -# com.intellij:annotations -IntelliJ IDEA Annotations (http://www.jetbrains.org) -- license: Apache License 2 + (licenses-binary/LICENSE-unicode-icu-license.txt) # com.jcraft:jsch JSch (http://www.jcraft.com/jsch/) @@ -633,14 +616,30 @@ salvation (http://cspvalidator.org) - license: Apache License, Version 2.0 # com.squareup.okhttp3:okhttp -OkHttp (https://square.github.io/okhttp/) +okhttp (https://square.github.io/okhttp/) - license: The Apache Software License, Version 2.0 # com.squareup.okhttp3:okhttp-brotli okhttp-brotli 
(https://square.github.io/okhttp/) - license: The Apache Software License, Version 2.0 +# com.squareup.okhttp3:okhttp-jvm +okhttp (https://square.github.io/okhttp/) +- license: The Apache Software License, Version 2.0 +# com.squareup.okhttp3:okhttp-zstd +okhttp-zstd (https://square.github.io/okhttp/) +- license: The Apache Software License, Version 2.0 # com.squareup.okio:okio -Okio (https://github.com/square/okio/) +okio (https://github.com/square/okio/) +- license: The Apache Software License, Version 2.0 +# com.squareup.okio:okio-jvm +okio (https://github.com/square/okio/) +- license: The Apache Software License, Version 2.0 + +# com.squareup.zstd:zstd-kmp-jvm +zstd-kmp (https://github.com/square/okio-zstd/) +- license: The Apache Software License, Version 2.0 +# com.squareup.zstd:zstd-kmp-okio-jvm +zstd-kmp-okio (https://github.com/square/okio-zstd/) - license: The Apache Software License, Version 2.0 # com.sun.activation:jakarta.activation @@ -778,6 +777,10 @@ Apache Commons Net (https://commons.apache.org/proper/commons-net/) Apache Commons Validator (http://commons.apache.org/proper/commons-validator/) - license: Apache License, Version 2.0 +# de.l3s.boilerpipe:boilerpipe +Apache License 2.0 (http://code.google.com/p/boilerpipe/) +- license: Apache License 2.0 + # de.vandermeer:ascii-utf-themes ASCII and UTF Themes (https://github.com/vdmeer/ascii-utf-themes) - license: Apache 2 @@ -801,17 +804,18 @@ dk.brics.automaton (https://www.brics.dk/automaton) (licenses-binary/LICENSE-bsd-2-clause.txt) # dnsjava:dnsjava -dnsjava (http://www.dnsjava.org) -- license: BSD 2-Clause license - (licenses-binary/LICENSE-bsd-2-clause.txt) +dnsjava (https://github.com/dnsjava/dnsjava) +- license: BSD-3-Clause + (licenses-binary/LICENSE-bsd-3-clause.txt) + +# info.picocli:picocli +picocli (https://picocli.info) +- license: The Apache Software License, version 2.0 # io.dropwizard.metrics:metrics-core Metrics Core - license: Apache License 2.0 -# io.netty:netty -Netty 
(http://netty.io/) -- license: Apache License, Version 2.0 # io.netty:netty-all Netty/All-in-One (https://netty.io/netty-all/) - license: Apache License, Version 2.0 @@ -969,6 +973,10 @@ Google S2 geometry library (https://github.com/sgr-io/s2-geometry-library-java) # jakarta.activation:jakarta.activation-api Jakarta Activation API jar +- license: EDL 1.0 + (licenses-binary/LICENSE-eclipse-distribution-license-v1.0.txt) +# jakarta.activation:jakarta.activation-api +Jakarta Activation API (https://github.com/jakartaee/jaf-api) - license: EDL 1.0 (licenses-binary/LICENSE-eclipse-distribution-license-v1.0.txt) @@ -1019,7 +1027,7 @@ javax.ws.rs-api (https://github.com/eclipse-ee4j/jaxrs-api) (licenses-binary/LICENSE-epl-2.0.txt) # javax.ws.rs:jsr311-api jsr311-api (https://jsr311.dev.java.net) -- license: CDDL License +- license: CDDL License (licenses-binary/LICENSE-cddl-license.txt) # javax.xml.bind:jaxb-api @@ -1060,6 +1068,7 @@ JOpt Simple (http://jopt-simple.github.io/jopt-simple) # net.sourceforge.argparse4j:argparse4j argparse4j (http://argparse4j.github.io) - license: MIT + (licenses-binary/LICENSE-mit-license.txt) # net.sourceforge.htmlunit:htmlunit HtmlUnit (http://htmlunit.sourceforge.net) @@ -1105,20 +1114,24 @@ Asynchronous Http Client Netty Utils jose4j (https://bitbucket.org/b_c/jose4j/) - license: The Apache Software License, Version 2.0 -# org.bouncycastle:bcmail-jdk18on -Bouncy Castle S/MIME API (https://www.bouncycastle.org/java.html) +# org.bouncycastle:bcjmail-jdk18on +Bouncy Castle JavaMail Jakarta S/MIME APIs (https://www.bouncycastle.org/download/bouncy-castle-java/) - license: Bouncy Castle Licence (licenses-binary/LICENSE-bouncy-castle-licence.txt) # org.bouncycastle:bcpkix-jdk18on -Bouncy Castle PKIX, CMS, EAC, TSP, PKCS, OCSP, CMP, and CRMF APIs (https://www.bouncycastle.org/java.html) +Bouncy Castle PKIX, CMS, EAC, TSP, PKCS, OCSP, CMP, and CRMF APIs (https://www.bouncycastle.org/download/bouncy-castle-java/) - license: Bouncy Castle 
Licence (licenses-binary/LICENSE-bouncy-castle-licence.txt) # org.bouncycastle:bcprov-jdk18on Bouncy Castle Provider (https://www.bouncycastle.org/java.html) +- license: Bouncy Castle Licence + (licenses-binary/LICENSE-bouncy-castle-licence.txt) +# org.bouncycastle:bcprov-jdk18on +Bouncy Castle Provider (https://www.bouncycastle.org/download/bouncy-castle-java/) - license: Bouncy Castle Licence (licenses-binary/LICENSE-bouncy-castle-licence.txt) # org.bouncycastle:bcutil-jdk18on -Bouncy Castle ASN.1 Extension and Utility APIs (https://www.bouncycastle.org/java.html) +Bouncy Castle ASN.1 Extension and Utility APIs (https://www.bouncycastle.org/download/bouncy-castle-java/) - license: Bouncy Castle Licence (licenses-binary/LICENSE-bouncy-castle-licence.txt) @@ -1140,13 +1153,6 @@ Checker Qual (https://checkerframework.org/) - license: The MIT License (licenses-binary/LICENSE-mit-license.txt) -# org.codehaus.jackson:jackson-core-asl -Jackson (http://jackson.codehaus.org) -- license: The Apache Software License, Version 2.0 -# org.codehaus.jackson:jackson-mapper-asl -Data Mapper for Jackson (http://jackson.codehaus.org) -- license: The Apache Software License, Version 2.0 - # org.codehaus.jettison:jettison Jettison (https://github.com/jettison-json/jettison) - license: Apache License, Version 2.0 @@ -1163,7 +1169,12 @@ Stax2 API (http://github.com/FasterXML/stax2-api) # org.codelibs:jhighlight JHighlight (https://github.com/codelibs/jhighlight) - license: CDDL, v1.0 - (licenses-binary/LICENSE-cddl-v1.0.txt) + (licenses-binary/LICENSE-cddl-1.0.txt) + +# org.eclipse.angus:angus-activation +Angus Activation Registries +- license: EDL 1.0 + (licenses-binary/LICENSE-eclipse-distribution-license-v1.0.txt) # org.eclipse.jetty:jetty-alpn-client Jetty :: ALPN :: Client @@ -1180,18 +1191,22 @@ Jetty :: Http Utility # org.eclipse.jetty:jetty-http Jetty :: Http Utility - license: Eclipse Public License - Version 2.0 + (licenses-binary/LICENSE-epl-2.0.txt) # 
org.eclipse.jetty:jetty-io Jetty :: IO Utility - license: Apache Software License - Version 2.0 # org.eclipse.jetty:jetty-io Jetty :: IO Utility - license: Eclipse Public License - Version 2.0 + (licenses-binary/LICENSE-epl-2.0.txt) # org.eclipse.jetty:jetty-security Jetty :: Security - license: Eclipse Public License - Version 2.0 + (licenses-binary/LICENSE-epl-2.0.txt) # org.eclipse.jetty:jetty-server Jetty :: Server Core - license: Eclipse Public License - Version 2.0 + (licenses-binary/LICENSE-epl-2.0.txt) # org.eclipse.jetty:jetty-servlet Jetty :: Servlet Handling - license: Apache Software License - Version 2.0 @@ -1201,6 +1216,7 @@ Jetty :: Utilities # org.eclipse.jetty:jetty-util Jetty :: Utilities - license: Eclipse Public License - Version 2.0 + (licenses-binary/LICENSE-epl-2.0.txt) # org.eclipse.jetty:jetty-util-ajax Jetty :: Utilities :: Ajax(JSON) - license: Apache Software License - Version 2.0 @@ -1295,6 +1311,10 @@ Ogg and Vorbis for Java, Core (https://github.com/Gagravarr/VorbisJava) Apache Tika plugin for Ogg, Vorbis and FLAC (https://github.com/Gagravarr/VorbisJava) - license: The Apache Software License, Version 2.0 +# org.glassfish.jaxb:jaxb-core +JAXB Core (https://eclipse-ee4j.github.io/jaxb-ri/) +- license: Eclipse Distribution License - v 1.0 + (licenses-binary/LICENSE-eclipse-distribution-license-v1.0.txt) # org.glassfish.jaxb:jaxb-runtime JAXB Runtime (https://eclipse-ee4j.github.io/jaxb-ri/) - license: Eclipse Distribution License - v 1.0 @@ -1326,22 +1346,16 @@ JDOM (http://www.jdom.org) JDOM (http://www.jdom.org) - license: Similar to Apache License but with the acknowledgment clause removed +# org.jetbrains:annotations +JetBrains Java Annotations (https://github.com/JetBrains/java-annotations) +- license: The Apache Software License, Version 2.0 # org.jetbrains:annotations IntelliJ IDEA Annotations (http://www.jetbrains.org) - license: The Apache Software License, Version 2.0 # org.jetbrains.kotlin:kotlin-stdlib 
-org.jetbrains.kotlin:kotlin-stdlib (https://kotlinlang.org/) -- license: The Apache License, Version 2.0 -# org.jetbrains.kotlin:kotlin-stdlib-common -org.jetbrains.kotlin:kotlin-stdlib-common (https://kotlinlang.org/) -- license: The Apache License, Version 2.0 -# org.jetbrains.kotlin:kotlin-stdlib-jdk7 -org.jetbrains.kotlin:kotlin-stdlib-jdk7 (https://kotlinlang.org/) -- license: The Apache License, Version 2.0 -# org.jetbrains.kotlin:kotlin-stdlib-jdk8 -org.jetbrains.kotlin:kotlin-stdlib-jdk8 (https://kotlinlang.org/) -- license: The Apache License, Version 2.0 +Kotlin Stdlib (https://kotlinlang.org/) +- license: Apache-2.0 # org.jline:jline JLine Bundle @@ -1349,6 +1363,10 @@ JLine Bundle (licenses-binary/LICENSE-bsd-2-clause.txt) # org.jsoup:jsoup +jsoup Java HTML Parser (https://jsoup.org/) +- license: The MIT License + (licenses-binary/LICENSE-mit-license.txt) +# org.jsoup:jsoup jsoup (http://jsoup.org/) - license: The MIT License (licenses-binary/LICENSE-mit-license.txt) @@ -1517,6 +1535,9 @@ org.seleniumhq.selenium:selenium-support (https://selenium.dev/) # org.slf4j:jcl-over-slf4j JCL 1.2 implemented over SLF4J (http://www.slf4j.org) - license: Apache License, Version 2.0 +# org.slf4j:jcl-over-slf4j +JCL 1.2 implemented over SLF4J (http://www.slf4j.org) +- license: Apache-2.0 # org.slf4j:slf4j-api SLF4J API Module (http://www.slf4j.org) - license: MIT License @@ -1524,6 +1545,7 @@ SLF4J API Module (http://www.slf4j.org) # org.slf4j:slf4j-api SLF4J API Module (http://www.slf4j.org) - license: MIT + (licenses-binary/LICENSE-mit-license.txt) # org.tallison:jmatio JMatIO (https://github.com/tballison/jmatio) @@ -1532,8 +1554,7 @@ JMatIO (https://github.com/tballison/jmatio) # org.tukaani:xz XZ for Java (https://tukaani.org/xz/java.html) -- license: Public Domain - (licenses-binary/LICENSE-public-domain.txt) +- license: Zero-Clause BSD (0BSD) # org.xerial.snappy:snappy-java Apache-2.0 (https://github.com/xerial/snappy-java) diff --git 
a/licenses-binary/LICENSE-bsd-licence.txt b/licenses-binary/LICENSE-bsd-licence.txt new file mode 100644 index 0000000000..ce7787d52f --- /dev/null +++ b/licenses-binary/LICENSE-bsd-licence.txt @@ -0,0 +1,39 @@ +(source: http://antlr.org/license.html) + +ANTLR v4 License + +ANTLR + +ANTLR 4 License +[The BSD License] +Copyright (c) 2012 Terence Parr and Sam Harwell +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +Neither the name of the author nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Developer's Certificate of Origin +As of 4.10, ANTLR uses the Linux Foundation's Developer Certificate of Origin, DCO, version 1.1. See certificate +of origin. 
To contribute: + +- fork the dev branch of the ANTLR v4 github repository +- make your changes +- commit your changes, signing your commits with git commit -s .... +- send a pull request diff --git a/licenses-binary/LICENSE-gnu-general-public-license-version-2-gpl2-with-the-classpath-exception.txt b/licenses-binary/LICENSE-gnu-general-public-license-version-2-gpl2-with-the-classpath-exception.txt deleted file mode 100644 index a25e8c704e..0000000000 --- a/licenses-binary/LICENSE-gnu-general-public-license-version-2-gpl2-with-the-classpath-exception.txt +++ /dev/null @@ -1,15 +0,0 @@ -(source: http://www.gnu.org/software/classpath/license.html) - - -GNU Classpath License - GNU Project - Free Software Foundation (FSF) - - - - -Classpath is distributed under the terms of the GNU General Public License with the following clarification and special exception. - - Linking this library statically or dynamically with other modules is making a combined work based on this library. Thus, the terms and conditions of the GNU General Public License cover the whole combination. - - As a special exception, the copyright holders of this library give you permission to link this library with independent modules to produce an executable, regardless of the license terms of these independent modules, and to copy and distribute the resulting executable under terms of your choice, provided that you also meet, for each linked independent module, the terms and conditions of the license of that module. An independent module is a module which is not derived from or based on this library. If you modify this library, you may extend this exception to your version of the library, but you are not obligated to do so. If you do not wish to do so, delete this exception statement from your version. - -As such, it can be used to run, create and distribute a large class of applications and applets. 
When GNU Classpath is used unmodified as the core class library for a virtual machine, compiler for the java languge, or for a program written in the java programming language it does not affect the licensing for distributing those programs directly. diff --git a/licenses-binary/LICENSE-indiana-university-extreme-lab-software-license-vesion-1.1.1.txt b/licenses-binary/LICENSE-indiana-university-extreme-lab-software-license-vesion-1.1.1.txt deleted file mode 100644 index e69de29bb2..0000000000 From 0eda915e602b325d78e0ef62018b6926d2fc3962 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 26 Feb 2026 10:26:48 +0100 Subject: [PATCH 25/27] NUTCH-3132 Standardize existing Nutch metrics naming and implementation Apply metrics naming conventions to CCF-specific classes and extensions: lower-case counter names of sitemap types in SitemapInjector. --- src/java/org/apache/nutch/crawl/SitemapInjector.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/java/org/apache/nutch/crawl/SitemapInjector.java b/src/java/org/apache/nutch/crawl/SitemapInjector.java index 7dff68cf73..b643e3368a 100644 --- a/src/java/org/apache/nutch/crawl/SitemapInjector.java +++ b/src/java/org/apache/nutch/crawl/SitemapInjector.java @@ -26,6 +26,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.PriorityQueue; import java.util.Random; @@ -452,7 +453,8 @@ public void process(String url) { LOG.info("parsed sitemap {} ({})", url, sitemap.getType()); context .getCounter(NutchMetrics.GROUP_SITEMAP_INJECTOR, - NutchMetrics.SITEMAP_TYPE_PREFIX + sitemap.getType()) + NutchMetrics.SITEMAP_TYPE_PREFIX + + sitemap.getType().toString().toLowerCase(Locale.ROOT)) .increment(1); if (checkCrossSubmits) { From 044dfd2d95cdfbb42bfc4614ca8c221ae6e7a213 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 26 Feb 2026 11:40:34 +0100 Subject: [PATCH 26/27] NUTCH-3132 Standardize existing Nutch 
metrics naming and implementation Apply metrics naming conventions to WARC writer counters. --- .../apache/nutch/metrics/NutchMetrics.java | 40 ++++++++++ .../commoncrawl/util/WarcRecordWriter.java | 80 +++++++++++-------- 2 files changed, 87 insertions(+), 33 deletions(-) diff --git a/src/java/org/apache/nutch/metrics/NutchMetrics.java b/src/java/org/apache/nutch/metrics/NutchMetrics.java index 1f70db09dd..ccb2d70ed3 100644 --- a/src/java/org/apache/nutch/metrics/NutchMetrics.java +++ b/src/java/org/apache/nutch/metrics/NutchMetrics.java @@ -178,6 +178,46 @@ private NutchMetrics() { */ public static final String FETCHER_ROBOTSTXT_ARCHIVING_ROBOTS_DENIED_TOTAL = "robots_denied"; + // ========================================================================= + // Common Crawl's WarcWriter + // ========================================================================= + + /** Counter group for Common Crawl's WARC writer. */ + public static final String GROUP_WARC_WRITER = "warc_writer"; + + /** Skipped records because no content (and protocol status) is available. */ + public static final String WARC_WRITER_SKIPPED_NO_CONTENT_TOTAL = "skipped_no_content"; + + /** Fixed records: invalid URI normalized. */ + public static final String WARC_WRITER_URI_NORMALIZED_TOTAL = "fixed_uri"; + + /** Skipped records because URL is not a valid URI (no WARC-Target-URI). */ + public static final String WARC_WRITER_SKIPPED_INVALID_URI_TOTAL = "skipped_invalid_uri"; + + /** Skipped records by content type / MIME type. */ + public static final String WARC_WRITER_SKIPPED_BY_CONTENT_TYPE_TOTAL = "skipped_by_content_type"; + + /** Skipped duplicate records. */ + public static final String WARC_WRITER_SKIPPED_DUPLICATE_TOTAL = "skipped_duplicate"; + + /** Skipped records: no protocol status. */ + public static final String WARC_WRITER_SKIPPED_NO_PROTOCOL_STATUS_TOTAL = "skipped_no_protocol_status"; + + /** Skipped records: unknown protocol status. 
*/ + public static final String WARC_WRITER_SKIPPED_UNKNOWN_PROTOCOL_STATUS_TOTAL = "skipped_unknown_protocol_status"; + + /** Prefix for error status of language identification (LID), returned by CLD2 Java bindings. */ + public static final String WARC_WRITER_LID_ERROR_PREFIX = "lid_error: "; + + /** Language identification (LID): no result. */ + public static final String WARC_WRITER_LID_NO_RESULT_TOTAL = "lid_no_result"; + + /** Language identification (LID): result is reliable. */ + public static final String WARC_WRITER_LID_RESULT_RELIABLE_TOTAL = "lid_reliable"; + + /** Language identification (LID): result is not reliable. */ + public static final String WARC_WRITER_LID_RESULT_NOT_RELIABLE_TOTAL = "lid_not_reliable"; + // ========================================================================= // Generator Counters // ========================================================================= diff --git a/src/java/org/commoncrawl/util/WarcRecordWriter.java b/src/java/org/commoncrawl/util/WarcRecordWriter.java index 5656c2b3a3..4f9b22943a 100644 --- a/src/java/org/commoncrawl/util/WarcRecordWriter.java +++ b/src/java/org/commoncrawl/util/WarcRecordWriter.java @@ -52,6 +52,7 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.metrics.NutchMetrics; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.net.protocols.HttpDateFormat; import org.apache.nutch.net.protocols.Response; @@ -76,7 +77,6 @@ class WarcRecordWriter extends RecordWriter { protected static final Pattern PROBLEMATIC_HEADERS = Pattern .compile("(?i)(?:Content-(?:Encoding|Length)|Transfer-Encoding)"); protected static final String X_HIDE_HEADER = "X-Crawler-"; - public static final String WARC_WRITER_COUNTER_GROUP = "WARC-Writer"; protected static final Pattern STATUS_LINE_PATTERN = Pattern .compile("^HTTP/1\\.[01] [0-9]{3}(?: .*)?$"); @@ -527,18 +527,22 @@ public 
synchronized void write(Text key, WarcCapture value) throws IOException { if (value.content == null) { - String reason = ""; + ProtocolStatus pstatus = null; if (value.datum != null) { - ProtocolStatus pstatus = (ProtocolStatus) value.datum.getMetaData() + pstatus = (ProtocolStatus) value.datum.getMetaData() .get(Nutch.WRITABLE_PROTO_STATUS_KEY); - if (pstatus != null) { - reason = ": " + pstatus.getName() + " - " + pstatus.getMessage(); - } } - LOG.warn("Cannot write WARC record, no content for {}{}", value.url, - reason); - context.getCounter(WARC_WRITER_COUNTER_GROUP, - "skipped records (no content)").increment(1); + if (pstatus != null) { + LOG.warn( + "Cannot write WARC record, no content for {}, protocol status: {} - {}", + value.url, pstatus.getName(), pstatus.getMessage()); + } else { + LOG.warn( + "Cannot write WARC record, no content and protocol status for {}", + value.url); + } + context.getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_SKIPPED_NO_CONTENT_TOTAL).increment(1); return; } @@ -560,10 +564,8 @@ public synchronized void write(Text key, WarcCapture value) try { targetUri = new URI(urlNorm); LOG.info("Normalized URL to valid URI: {} -> {}", url, urlNorm); - context - .getCounter(WARC_WRITER_COUNTER_GROUP, - "fixed records (invalid URI successfully normalized)") - .increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_URI_NORMALIZED_TOTAL).increment(1); } catch (URISyntaxException ee) { // ignore, log exception observed on original URL } @@ -571,8 +573,10 @@ public synchronized void write(Text key, WarcCapture value) } if (targetUri == null) { LOG.error("Cannot write WARC record, invalid URI: {}", url); - context.getCounter(WARC_WRITER_COUNTER_GROUP, - "skipped records (invalid URI)").increment(1); + context + .getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_SKIPPED_INVALID_URI_TOTAL) + .increment(1); return; } } @@ -594,8 +598,10 @@ public synchronized void 
write(Text key, WarcCapture value) (truncated != null ? truncated : "-"), value.content.getContentType(), value.content.getContent().length, value.url); - context.getCounter(WARC_WRITER_COUNTER_GROUP, - "skipped records (by content)").increment(1); + context + .getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_SKIPPED_BY_CONTENT_TYPE_TOTAL) + .increment(1); return; } } @@ -637,8 +643,8 @@ public synchronized void write(Text key, WarcCapture value) } catch (Throwable t) { LOG.error(t.getMessage()); } - context.getCounter(WARC_WRITER_COUNTER_GROUP, - "skipped records (duplicate)").increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_SKIPPED_DUPLICATE_TOTAL).increment(1); return; } precedingURL = url; @@ -668,8 +674,10 @@ public synchronized void write(Text key, WarcCapture value) if (pstatus == null) { LOG.warn("Cannot write WARC record, no protocol status for {}", value.url); - context.getCounter(WARC_WRITER_COUNTER_GROUP, - "skipped records (no protocol status)").increment(1); + context + .getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_SKIPPED_NO_PROTOCOL_STATUS_TOTAL) + .increment(1); return; } switch (pstatus.getCode()) { @@ -698,8 +706,9 @@ public synchronized void write(Text key, WarcCapture value) if (value.content.getMetadata() .get(Response.RESPONSE_HEADERS) == null) { LOG.warn("Unknown or ambiguous protocol status: {}", pstatus); - context.getCounter(WARC_WRITER_COUNTER_GROUP, - "skipped records (unknown protocol status)").increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_SKIPPED_UNKNOWN_PROTOCOL_STATUS_TOTAL) + .increment(1); return; } } @@ -839,7 +848,7 @@ public synchronized void write(Text key, WarcCapture value) } LOG.info("WARC {} record {} ({}, status: {}, size: {})", - (notModified ? "revisit" : "response"), targetUri, date, httpStatusCode, + (notModified ? 
"revisit" : "response"), targetUri, date , httpStatusCode, value.content.getContent().length); URI requestId = null; @@ -860,17 +869,22 @@ public synchronized void write(Text key, WarcCapture value) // detect language only for successfully fetched primary documents ldres = langDetect.detectLanguage(targetUri, value.content); if (ldres.errorReason != null) { - context.getCounter(WARC_WRITER_COUNTER_GROUP, - "language detection: " + ldres.errorStatus.name).increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_LID_ERROR_PREFIX + ldres.errorStatus.name) + .increment(1); } else if (ldres.languages == null) { - context.getCounter(WARC_WRITER_COUNTER_GROUP, - "language detection: no result").increment(1); + context.getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_LID_NO_RESULT_TOTAL).increment(1); } else if (ldres.languages.isReliable()) { - context.getCounter(WARC_WRITER_COUNTER_GROUP, - "language detection: reliable").increment(1); + context + .getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_LID_RESULT_RELIABLE_TOTAL) + .increment(1); } else { - context.getCounter(WARC_WRITER_COUNTER_GROUP, - "language detection: not reliable").increment(1); + context + .getCounter(NutchMetrics.GROUP_WARC_WRITER, + NutchMetrics.WARC_WRITER_LID_RESULT_NOT_RELIABLE_TOTAL) + .increment(1); } if (generateCdx) { if (ldres.charset != null) { From bf01b431dc0d3e193b1a3c632dc8624bd37a32df Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 26 Feb 2026 12:06:10 +0100 Subject: [PATCH 27/27] WARC writer: log capture date as ISO date --- src/java/org/commoncrawl/util/WarcRecordWriter.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/java/org/commoncrawl/util/WarcRecordWriter.java b/src/java/org/commoncrawl/util/WarcRecordWriter.java index 4f9b22943a..05f2a304f6 100644 --- a/src/java/org/commoncrawl/util/WarcRecordWriter.java +++ 
b/src/java/org/commoncrawl/util/WarcRecordWriter.java @@ -117,6 +117,8 @@ class WarcRecordWriter extends RecordWriter { private URLNormalizers urlNormalizers; private URLNormalizers urlNormalizersRedirect; + private SimpleDateFormat isoDate; + public WarcRecordWriter(Configuration conf, Path outputPath, int partition, TaskAttemptContext context) throws IOException { @@ -128,6 +130,9 @@ public WarcRecordWriter(Configuration conf, Path outputPath, int partition, Locale.ROOT); fileDate.setTimeZone(TimeZone.getTimeZone("UTC")); + isoDate = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.ROOT); + isoDate.setTimeZone(TimeZone.getTimeZone("UTC")); + String prefix = conf.get("warc.export.prefix", "NUTCH-CRAWL"); /* @@ -848,8 +853,8 @@ public synchronized void write(Text key, WarcCapture value) } LOG.info("WARC {} record {} ({}, status: {}, size: {})", - (notModified ? "revisit" : "response"), targetUri, date , httpStatusCode, - value.content.getContent().length); + (notModified ? "revisit" : "response"), targetUri, isoDate.format(date), + httpStatusCode, value.content.getContent().length); URI requestId = null; if (verbatimRequestHeaders != null) {