Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1257,6 +1257,7 @@
<!--source path="${plugins.dir}/protocol-selenium/src/java"/-->
<!--source path="${plugins.dir}/publish-rabbitmq/src/java"/-->
<source path="${plugins.dir}/scoring-adaptive/src/java"/>
<source path="${plugins.dir}/scoring-adaptive/src/test"/>
<source path="${plugins.dir}/scoring-depth/src/java/" />
<source path="${plugins.dir}/scoring-link/src/java/" />
<source path="${plugins.dir}/scoring-opic/src/java/" />
Expand Down
17 changes: 13 additions & 4 deletions conf/nutch-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2216,12 +2216,21 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
</description>
</property>

<property>
<name>scoring.adaptive.penalty.non_canonical</name>
<value>.07</value>
<description>
Penalize non-canonical pages, i.e., pages with a canonical link not equal to the URL.
The default is to delay the revisit up to 7 days (7 * scoring.adaptive.factor.fetchtime).
</description>
</property>

<property>
<name>scoring.adaptive.mark.orphan.after</name>
<value>518400</value>
<description>
Time span (in minutes) after which a page not seen anymore by inlink or
seed is marked as orpaned. Default = 518400 minutes = one year.
seed is marked as orphaned. Default = 518400 minutes = one year.
</description>
</property>

Expand All @@ -2230,7 +2239,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
<value>172800</value>
<description>
Time span (in minutes) after which a gone page not seen anymore
by inlink or seed is marked as orpaned. Also duplicates and unfetched pages
by inlink or seed is marked as orphaned. Also duplicates and unfetched pages
with a retry count >= 3 are considered as gone.
Default = 172800 minutes = four months.
</description>
Expand All @@ -2241,7 +2250,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
<value>518400</value>
<description>
Time span (in minutes) after which a redirect not seen anymore by inlink
or seed is marked as orpaned. Default = 518400 minutes = one year.
or seed is marked as orphaned. Default = 518400 minutes = one year.
</description>
</property>

Expand All @@ -2250,7 +2259,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
<value>518400</value>
<description>
Time span (in minutes) after which a still unfetched page not seen anymore
by inlink or seed is marked as orpaned. Default = 518400 minutes = one year.
by inlink or seed is marked as orphaned. Default = 518400 minutes = one year.
</description>
</property>

Expand Down
3 changes: 2 additions & 1 deletion src/java/org/apache/nutch/crawl/Generator2.java
Original file line number Diff line number Diff line change
Expand Up @@ -838,7 +838,8 @@ public void reduce(DomainScorePair key, Iterable<SelectorEntry> values,
// log metrics per host/domain
LOG.info(
"{} :: selected={}, selected_hosts={}, max_urls_overflow={}, max_hosts_overflow={}, max_urls_per_host_overflow={}",
key.getDomain(), hostOrDomainCount, hosts.size(), maxUrlsOverflow,
key.getDomain(), hostOrDomainCount,
(hosts == null ? 0 : hosts.size()), maxUrlsOverflow,
maxHostsOverflowCount, maxUrlsPerHostOverflowCount);
}

Expand Down
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/fetcher/FetcherThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ public class FetcherThread extends Thread {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());

private static Writable EMPTY_VALUE = NullWritable.get();
private static final Writable EMPTY_VALUE = NullWritable.get();

private Configuration conf;
private URLFilters urlFilters;
Expand Down
2 changes: 2 additions & 0 deletions src/plugin/build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@
<ant dir="protocol-http" target="test"/>
<ant dir="protocol-httpclient" target="test"/>
<ant dir="protocol-okhttp" target="test"/>
<ant dir="scoring-adaptive" target="test"/>
<ant dir="scoring-orphan" target="test"/>
<ant dir="scoring-metadata" target="test"/>
<ant dir="subcollection" target="test"/>
Expand Down Expand Up @@ -229,6 +230,7 @@
<ant dir="protocol-okhttp" target="clean"/>
<!--<ant dir="protocol-selenium" target="clean" />-->
<!--<ant dir="publish-rabbitmq" target="clean"/>-->
<ant dir="scoring-adaptive" target="clean"/>
<ant dir="scoring-depth" target="clean"/>
<ant dir="scoring-link" target="clean"/>
<ant dir="scoring-opic" target="clean"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,13 @@

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.scoring.AbstractScoringFilter;
import org.apache.nutch.scoring.ScoringFilterException;

Expand All @@ -50,6 +53,8 @@
* <li>the page score</li>
* <li>the crawl status (fetched, not modified, redirect, gone)</li>
* <li>the time elapsed since the scheduled fetch time</li>
* <li>whether or not a canonical link has been detected on the page and the
* link points to a different URL</li>
* </ul>
* </p>
*
Expand All @@ -66,16 +71,14 @@
*
* <p>
* The plugin is thought for large crawls where there are far more URLs than can
* be fetched and taking a good sample is mandatory. Sampling is, of course,
* usually based on the page score - relevant pages with a high score are
* fetched with higher probability. However, a dynamic rotation of generated
* be fetched and selecting a representative sample is mandatory. Sampling is,
* of course, usually based on the page score - relevant pages with a high score
* are fetched with higher probability. However, a dynamic rotation of generated
* items helps to avoid that the same page with a slightly higher score is
* fetched again while others are still waiting to be queued. It also allows to
* adjust the probabilities that gone or not modified pages are refetched.
* fetched again while others are still waiting to be queued. The plugin also
* allows adjusting when gone or not modified pages are revisited.
* </p>
*
* [TODO:experimental]
*
* The plugin also includes heuristics to &quot;retire&quot; pages to status
* db_orphan if they fail to fetch or are duplicates and are not seen in seeds
* or via inlinks (cf. the plugin scoring-orphan).
Expand Down Expand Up @@ -156,6 +159,20 @@ public class AdaptiveScoringFilter extends AbstractScoringFilter {
*/
public static final String ADAPTIVE_INJECTED_BOOST = "scoring.adaptive.boost.injected";

/**
* Penalty for pages with a canonical link different than the page URL.
*
* Revisits are delayed by subtracting this penalty from the generator sort
* value.
*
* Note: In order to avoid that pages without a canonical link are preferred,
* the penalty shouldn't be too high. The default is
* <code>7 * scoring.adaptive.factor.fetchtime</code>, that is a revisit can
* be delayed by up to 7 days, in comparison to a page where the canonical
* link equals the page URL, or a page without a canonical link.
*/
public static final String ADAPTIVE_NON_CANONICAL_PENALTY = "scoring.adaptive.penalty.non_canonical";

/*
* Time span (in minutes) after which a page not seen anymore by inlink or
* seed is marked as orphaned.
Expand Down Expand Up @@ -200,6 +217,8 @@ public class AdaptiveScoringFilter extends AbstractScoringFilter {
public static final String SUCCESSFUL_FETCH_TIME = "_sft_";
public static final Text WRITABLE_SUCCESSFUL_FETCH_TIME = new Text(SUCCESSFUL_FETCH_TIME);

private static final Writable EMPTY_VALUE = NullWritable.get();

private Configuration conf;

/**
Expand All @@ -215,6 +234,7 @@ public class AdaptiveScoringFilter extends AbstractScoringFilter {
private float adaptiveLastSeenTimeSort;
private float adaptiveFetchRetryPenalty;
private float adaptiveBoostInjected;
private float nonCanonicalPenalty;

private Map<Byte, Float> statusSortMap = new TreeMap<Byte, Float>();
private Map<String, Float> contentTypeSortMap = new HashMap<String, Float>();
Expand Down Expand Up @@ -279,6 +299,10 @@ public void setConf(Configuration conf) {
// is marked as orphaned when its last seen time is not given
orphanTimeLastSeenDefault = nowMinutes;
}

/* Penalize non-canonical pages: default is to delay revisits by 7 days */
nonCanonicalPenalty = conf.getFloat(ADAPTIVE_NON_CANONICAL_PENALTY,
7 * adaptiveFetchTimeSort);
}

private void readSortFile(Reader sortFileReader) throws IOException {
Expand Down Expand Up @@ -390,6 +414,10 @@ public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
initSort -= adaptiveLastSeenTimeSort * daysSinceLastSeen;
}
}
if (pageIsNotCanonical(url, datum)) {
// penalize for not being the canonical page
initSort -= nonCanonicalPenalty;
}
return initSort;
}

Expand Down Expand Up @@ -462,4 +490,22 @@ private static boolean pageIsRedirect(CrawlDatum datum) {
return false;
}

/**
 * Tells whether a page declares a canonical link that differs from its own
 * URL.
 *
 * @param url
 *          URL of the page, compared against the stored canonical link
 * @param datum
 *          CrawlDatum of the page; its status and metadata are inspected
 * @return <code>true</code> only if the datum has status
 *         {@link CrawlDatum#STATUS_DB_FETCHED}, its metadata holds a value
 *         under {@link Nutch#CANONICAL_LINK_KEY} which is neither missing
 *         nor equal to the empty placeholder value, and that value is not
 *         equal to <code>url</code>; <code>false</code> in all other cases
 */
private static boolean pageIsNotCanonical(Text url, CrawlDatum datum) {
  // Only successfully fetched pages are checked for a canonical link.
  if (CrawlDatum.STATUS_DB_FETCHED != datum.getStatus()) {
    return false;
  }

  Writable link = datum.getMetaData().get(Nutch.CANONICAL_LINK_KEY);

  // No canonical link recorded (absent or only the empty placeholder).
  if (link == null || link.equals(EMPTY_VALUE)) {
    return false;
  }

  // A canonical link exists: the page is non-canonical iff the link points
  // to a URL other than the page's own.
  return !url.equals(link);
}

}