From 6c9f73c568c81c7b3594df5f2fbd40dbb2b3a122 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 25 Apr 2024 21:43:49 +0200 Subject: [PATCH 1/4] Generator2: count URLs rejected by URL filters - add counters URL_FILTERS_REJECTED and URL_FILTER_EXCEPTION --- src/java/org/apache/nutch/crawl/Generator2.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/Generator2.java b/src/java/org/apache/nutch/crawl/Generator2.java index af9562fe42..532329823b 100644 --- a/src/java/org/apache/nutch/crawl/Generator2.java +++ b/src/java/org/apache/nutch/crawl/Generator2.java @@ -365,12 +365,13 @@ public void map(Text key, CrawlDatum value, Context context) // If filtering is on don't generate URLs that don't pass // URLFilters try { - if (filters.filter(urlString) == null) + if (filters.filter(urlString) == null) { + context.getCounter("Generator", "URL_FILTERS_REJECTED").increment(1); return; - } catch (URLFilterException e) { - if (LOG.isWarnEnabled()) { - LOG.warn("Couldn't filter url {}: {}", key, e.getMessage()); } + } catch (URLFilterException e) { + LOG.warn("Couldn't filter url {}: {}", key, e.getMessage()); + context.getCounter("Generator", "URL_FILTER_EXCEPTION").increment(1); } } From a889a3fae20bc1b4267ea33669f4b02099e617cf Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 25 Apr 2024 22:04:40 +0200 Subject: [PATCH 2/4] fix(Generator2): make optional URL normalization work - pass the configuration property set via command-line flag (-noNorm) forward to the class SelectorMapper --- src/java/org/apache/nutch/crawl/Generator2.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/java/org/apache/nutch/crawl/Generator2.java b/src/java/org/apache/nutch/crawl/Generator2.java index 532329823b..f355218b89 100644 --- a/src/java/org/apache/nutch/crawl/Generator2.java +++ b/src/java/org/apache/nutch/crawl/Generator2.java @@ -342,6 +342,11 @@ public void setup( filters = new URLFilters(conf); scfilters = new ScoringFilters(conf); filter = conf.getBoolean(GENERATOR_FILTER, true); + normalise = conf.getBoolean(GENERATOR_NORMALISE, true); + if (normalise) { + normalizers = new URLNormalizers(conf, + URLNormalizers.SCOPE_GENERATE_HOST_COUNT); + } genDelay = conf.getLong(GENERATOR_DELAY, 7L) * 3600L * 24L * 1000L; long time = conf.getLong(Nutch.GENERATE_TIME_KEY, 0L); if (time > 0) From 2ad13b60c9bfd0ff3833c1d7eb25a2e386aab135 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 25 Apr 2024 22:10:17 +0200 Subject: [PATCH 3/4] refactor: add override annotations and remove unnecessary casts --- src/java/org/apache/nutch/crawl/Generator2.java | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/java/org/apache/nutch/crawl/Generator2.java b/src/java/org/apache/nutch/crawl/Generator2.java index f355218b89..c271a3963b 100644 --- a/src/java/org/apache/nutch/crawl/Generator2.java +++ b/src/java/org/apache/nutch/crawl/Generator2.java @@ -255,18 +255,21 @@ public SelectorEntry() { segnum = new IntWritable(0); } + @Override public void readFields(DataInput in) throws IOException { url.readFields(in); datum.readFields(in); segnum.readFields(in); } + @Override public void write(DataOutput out) throws IOException { url.write(out); datum.write(out); segnum.write(out); } + @Override public String toString() { return "url=" + url.toString() + ", datum=" + datum.toString() + ", segnum=" + segnum.toString(); @@ -362,6 +365,7 @@ public void setup( } /** Select & invert subset due for fetch. */ + @Override public void map(Text key, CrawlDatum value, Context context) throws IOException, InterruptedException { String urlString = key.toString(); @@ -681,6 +685,7 @@ private static Map readLimitsFile(Reader limitsReader, * Limit the number of URLs per host/domain and assign segment number to * every record. */ + @Override public void reduce(DomainScorePair key, Iterable values, Context context) throws IOException, InterruptedException { @@ -977,6 +982,7 @@ public static class SegmenterMapper extends SegmenterKey outputKey = new SegmenterKey(); + @Override public void map(FloatWritable key, SelectorEntry value, Context context) throws IOException, InterruptedException { outputKey.set(value.url, value.segnum); @@ -1002,6 +1008,7 @@ public void setup(Context context) { mos = new MultipleOutputs(context); } + @Override public void reduce(SegmenterKey key, Iterable values, Context context) throws IOException, InterruptedException { long count = 0; @@ -1064,6 +1071,7 @@ public void setup(Context context) { partitioner.setDomainLimits(SelectorReducer.readLimitsFile(conf, acceptor)); } + @Override public void map(Text key, SelectorEntry value, Context context) throws IOException, InterruptedException { out.write("sequenceFilesPartitions", key, value.datum, @@ -1090,6 +1098,7 @@ public HashComparator() { super(Text.class); } + @Override @SuppressWarnings("rawtypes") public int compare(WritableComparable a, WritableComparable b) { Text url1 = (Text) a; @@ -1099,6 +1108,7 @@ public int compare(WritableComparable a, WritableComparable b) { return (hash1 < hash2 ? -1 : (hash1 == hash2 ? 0 : 1)); } + @Override public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { int hash1 = hash(b1, s1, l1); int hash2 = hash(b2, s2, l2); @@ -1110,7 +1120,7 @@ private static int hash(byte[] bytes, int start, int length) { // make later bytes more significant in hash code, so that sorting // by hashcode correlates less with by-host ordering. for (int i = length - 1; i >= 0; i--) - hash = (31 * hash) + (int) bytes[start + i]; + hash = (31 * hash) + bytes[start + i]; return hash; } } @@ -1426,6 +1436,7 @@ public static void main(String args[]) throws Exception { System.exit(res); } + @Override public int run(String[] args) throws Exception { if (args.length < 2) { System.out.println( From 09e795ed2638eae7ee5425c9ebd5dbb0f108fd42 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 25 Apr 2024 22:12:27 +0200 Subject: [PATCH 4/4] refactor: simplify logging statements --- src/java/org/apache/nutch/crawl/Generator2.java | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/Generator2.java b/src/java/org/apache/nutch/crawl/Generator2.java index c271a3963b..ff8c38780f 100644 --- a/src/java/org/apache/nutch/crawl/Generator2.java +++ b/src/java/org/apache/nutch/crawl/Generator2.java @@ -404,9 +404,7 @@ public void map(Text key, CrawlDatum value, Context context) try { sort = scfilters.generatorSortValue(key, value, sort); } catch (ScoringFilterException sfe) { - if (LOG.isWarnEnabled()) { - LOG.warn("Couldn't filter generatorSortValue for {}: {}", key, sfe); - } + LOG.warn("Couldn't filter generatorSortValue for {}: {}", key, sfe); } if (restrictStatus != null && !restrictStatus @@ -1357,9 +1355,7 @@ public Path[] generate(Path dbDir, String dbVersion, Path segments, */ private List partitionSegments(FileSystem fs, Path segmentsDir, List inputDirs, int numLists) throws Exception { - if (LOG.isInfoEnabled()) { - LOG.info("Generator: Partitioning selected urls for politeness."); - } + LOG.info("Generator: Partitioning selected urls for politeness."); List generatedSegments = new ArrayList();