#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage: crawl [options] <crawl_dir> <num_rounds>
#
# Arguments:
#   <crawl_dir>                         Directory where the crawl/host/link/segments dirs are saved
#   <num_rounds>                        The number of rounds to run this crawl for
#                                       (-1 runs until no more URLs are generated)
#
# Options:
#   -i|--index                          Indexes crawl results into a configured indexer
#   -D <property>=<value>               A Nutch or Hadoop property to pass to Nutch calls overwriting
#                                       properties defined in configuration files, e.g.
#                                       increase content limit to 2MB:
#                                         -D http.content.limit=2097152
#                                       (in distributed mode) configure memory of map and reduce tasks:
#                                         -D mapreduce.map.memory.mb=4608    -D mapreduce.map.java.opts=-Xmx4096m
#                                         -D mapreduce.reduce.memory.mb=4608 -D mapreduce.reduce.java.opts=-Xmx4096m
#   -w|--wait <NUMBER[SUFFIX]>          Time to wait before generating a new segment when no URLs
#                                       are scheduled for fetching. Suffix can be: s for second,
#                                       m for minute, h for hour and d for day. If no suffix is
#                                       specified second is used by default. [default: -1]
#   -s <seed_dir>                       Path to seeds file(s)
#   -sm <sitemap_dir>                   Path to sitemap URL file(s)
#
#   --hostdbupdate                      Boolean indicator if we call hostdbupdate or not
#   --hostdbgenerate                    Boolean indicator if we use hostdb in generate or not
#
#   --num-fetchers <num_fetchers>       Number of tasks used for fetching (fetcher map tasks) [default: 1]
#                                       Note: This can only be set when running in distributed mode and
#                                             should correspond to the number of worker nodes in the cluster.
#   --num-tasks <num_tasks>             Number of reducer tasks [default: 2]
#   --size-fetchlist <size_fetchlist>   Number of URLs to fetch in one iteration [default: 50000]
#   --time-limit-fetch <time_limit_fetch> Number of minutes allocated to the fetching [default: 180]
#   --num-threads <num_threads>         Number of threads for fetching / sitemap processing [default: 50]
#
#   --sitemaps-from-hostdb <frequency>  Whether and how often to process sitemaps based on HostDB.
#                                       Supported values are:
#                                         - never [default]
#                                         - always (processing takes place in every iteration)
#                                         - once (processing only takes place in the first iteration)
#
#   --dedup-group <none|host|domain>    Deduplication group method [default: none]
#

# Converts a wait time such as "30", "45s", "10m", "2h" or "1d" to seconds.
# Arguments: $1 - number with an optional unit suffix (s, m, h, d; upper or lower case)
# Outputs:   the number of seconds on stdout
# Returns:   1 (printing nothing) if $1 does not contain any digit
function __to_seconds() {
  local number modifier factor
  # lenient parsing: keep the digits resp. the unit letters, drop everything else
  number="${1//[^0-9]/}"
  modifier="${1//[^smhdSMHD]/}"

  if [[ -z "$number" ]]; then
    return 1
  fi

  case "$modifier" in
    m|M) factor=60 ;;
    h|H) factor=3600 ;;
    d|D) factor=86400 ;;
    *)   factor=1 ;;   # "s", no suffix or anything unrecognized: seconds
  esac

  # 10# forces base 10, otherwise values with leading zeros ("08") are parsed as octal
  echo $(( 10#$number * factor ))
}

# Prints the usage message to stdout and terminates the script with exit value 1.
function __print_usage {
  cat <<'EOF'
Usage: crawl [options] <crawl_dir> <num_rounds>

Arguments:
  <crawl_dir>                            Directory where the crawl/host/link/segments dirs are saved
  <num_rounds>                           The number of rounds to run this crawl for

Options:
  -i|--index                             Indexes crawl results into a configured indexer
  -D <property>=<value>                  A Nutch or Hadoop property to pass to Nutch calls overwriting
                                         properties defined in configuration files, e.g.
                                         increase content limit to 2MB:
                                           -D http.content.limit=2097152
                                         (distributed mode only) configure memory of map and reduce tasks:
                                           -D mapreduce.map.memory.mb=4608    -D mapreduce.map.java.opts=-Xmx4096m
                                           -D mapreduce.reduce.memory.mb=4608 -D mapreduce.reduce.java.opts=-Xmx4096m
  -w|--wait <NUMBER[SUFFIX]>             Time to wait before generating a new segment when no URLs
                                         are scheduled for fetching. Suffix can be: s for second,
                                         m for minute, h for hour and d for day. If no suffix is
                                         specified second is used by default. [default: -1]
  -s <seed_dir>                          Path to seeds file(s)
  -sm <sitemap_dir>                      Path to sitemap URL file(s)
  --hostdbupdate                         Boolean flag showing if we either update or not update hostdb for each round
  --hostdbgenerate                       Boolean flag showing if we use hostdb in generate or not
  --num-fetchers <num_fetchers>          Number of tasks used for fetching (fetcher map tasks) [default: 1]
                                         Note: This can only be set when running in distributed mode and
                                               should correspond to the number of worker nodes in the cluster.
  --num-tasks <num_tasks>                Number of reducer tasks [default: 2]
  --size-fetchlist <size_fetchlist>      Number of URLs to fetch in one iteration [default: 50000]
  --time-limit-fetch <time_limit_fetch>  Number of minutes allocated to the fetching [default: 180]
  --num-threads <num_threads>            Number of threads for fetching / sitemap processing [default: 50]
  --sitemaps-from-hostdb <frequency>     Whether and how often to process sitemaps based on HostDB.
                                         Supported values are:
                                           - never [default]
                                           - always (processing takes place in every iteration)
                                           - once (processing only takes place in the first iteration)
  --dedup-group <none|host|domain>       Deduplication group method [default: none]
EOF
  exit 1
}

# Aborts with the usage message if an option which takes a value is the last
# word on the command line. Without this check "shift 2" fails without
# shifting anything and the option parsing loop never terminates.
# Arguments: $1 - name of the option, $2 - number of remaining arguments ($#)
function __require_value {
  if [[ "$2" -lt 2 ]]; then
    echo "Error: option $1 requires a value."
    echo -e ""
    __print_usage
  fi
}

# default values
INDEXFLAG=false
HOSTDBUPDATE=false
HOSTDBGENERATE=false
HADOOP_PROPERTIES=()
WAIT=-1 # don't wait if there are no URLs to fetch
SEEDDIR=""
SITEMAPDIR=""
NUM_FETCHERS=1
NUM_TASKS=2 # 2 x NUM_FETCHERS
SIZE_FETCHLIST=50000 # 25K x NUM_TASKS
TIME_LIMIT_FETCH=180
NUM_THREADS=50
SITEMAPS_FROM_HOSTDB_FREQUENCY=never
DEDUP_GROUP=none

# parse options, stop at the first word which is not a known option
while (( $# > 0 ))
do
  case "$1" in
    -i|--index)
      INDEXFLAG=true
      shift
      ;;
    -D)
      __require_value "$1" "$#"
      HADOOP_PROPERTIES+=(-D"${2}")
      shift 2
      ;;
    -s)
      __require_value "$1" "$#"
      SEEDDIR="${2}"
      shift 2
      ;;
    -sm)
      __require_value "$1" "$#"
      SITEMAPDIR="${2}"
      shift 2
      ;;
    -w|--wait)
      __require_value "$1" "$#"
      WAIT="${2}"
      shift 2
      ;;
    --num-slaves|--num-fetchers)
      # --num-slaves: back-ward compatibility,
      # NUTCH-2759 renamed option --num-slaves to --num-fetchers
      __require_value "$1" "$#"
      NUM_FETCHERS="${2}"
      shift 2
      ;;
    --num-tasks)
      __require_value "$1" "$#"
      NUM_TASKS="${2}"
      shift 2
      ;;
    --size-fetchlist)
      __require_value "$1" "$#"
      SIZE_FETCHLIST="${2}"
      shift 2
      ;;
    --time-limit-fetch)
      __require_value "$1" "$#"
      TIME_LIMIT_FETCH="${2}"
      shift 2
      ;;
    --num-threads)
      __require_value "$1" "$#"
      NUM_THREADS="${2}"
      shift 2
      ;;
    --sitemaps-from-hostdb)
      __require_value "$1" "$#"
      SITEMAPS_FROM_HOSTDB_FREQUENCY="${2}"
      shift 2
      ;;
    --dedup-group)
      __require_value "$1" "$#"
      DEDUP_GROUP="${2}"
      shift 2
      ;;
    --hostdbupdate)
      HOSTDBUPDATE=true
      shift
      ;;
    --hostdbgenerate)
      HOSTDBGENERATE=true
      shift
      ;;
    *)
      break
      ;;
  esac
done

if [[ ! "$SITEMAPS_FROM_HOSTDB_FREQUENCY" =~ ^(never|always|once)$ ]]; then
  echo "Error: --sitemaps-from-hostdb <frequency> has to be one of never, always, once."
  echo -e ""
  __print_usage
fi

if [[ ! "$DEDUP_GROUP" =~ ^(none|host|domain)$ ]]; then
  echo "Error: --dedup-group <group> has to be one of none, host, domain."
  echo -e ""
  __print_usage
fi

# exactly two positional arguments must be left: <crawl_dir> <num_rounds>
if (( $# != 2 )); then
  __print_usage
fi

CRAWL_PATH="$1"
LIMIT="$2"

# a non-numeric limit would make the numeric tests in the main loop fail and
# the crawl would silently run without any limit
if [[ ! "$LIMIT" =~ ^-?[0-9]+$ ]]; then
  echo "Error: <num_rounds> has to be an integer (-1 for an unlimited number of rounds)."
  echo -e ""
  __print_usage
fi

# convert wait time to seconds for compatibility reasons
if [ "$WAIT" != "-1" ]; then
  if ! WAIT=$( __to_seconds "$WAIT" ); then
    echo "Error: invalid value for --wait, expected <NUMBER[SUFFIX]>."
    echo -e ""
    __print_usage
  fi
  echo "Time to wait (--wait) = $WAIT sec."
fi

# absolute path of the directory containing this script (and bin/nutch)
bin="$(dirname "$0")"
bin="$(cd "$bin" && pwd)"

# determines the mode based on presence of a job file
# (loop over the glob: it may match none, one or multiple files)
mode=local
for job_file in "${bin}"/../*nutch*.job; do
  if [[ -f "$job_file" ]]; then
    mode=distributed
    break
  fi
done

if [[ "$mode" = "local" ]]; then
  if [[ "$NUM_FETCHERS" -ne 1 ]]; then
    echo "Ignoring configured number of fetchers (--num-fetchers): a single fetcher task is used when running in local mode."
  fi
  NUM_FETCHERS=1
fi

# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
commonOptions=("${HADOOP_PROPERTIES[@]}" "-Dmapreduce.job.reduces=$NUM_TASKS" -Dmapreduce.reduce.speculative=false -Dmapreduce.map.speculative=false -Dmapreduce.map.output.compress=true)

# check that hadoop can be found on the path
if [[ "$mode" = "distributed" ]]; then
  if ! command -v hadoop >/dev/null 2>&1; then
    echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
    exit 255 # same exit value as the former "exit -1"
  fi
fi

# Runs $bin/nutch with the given arguments after echoing the command line,
# exits the script with the exit value of nutch if it indicates an error.
function __bin_nutch {
  local retcode
  echo "$bin/nutch $*" # echo command and arguments
  "$bin/nutch" "$@"
  retcode=$?

  if [ $retcode -ne 0 ]; then
    echo "Error running:"
    echo "  $bin/nutch $*"
    echo "Failed with exit value $retcode."
    exit $retcode
  fi
}

# Checks whether directory $1 exists: locally in local mode,
# on the Hadoop file system (hadoop fs -test -d) in distributed mode.
function __directory_exists {
  if [[ "$mode" == local && -d "$1" ]]; then
    return 0
  elif [[ "$mode" == distributed ]] && hadoop fs -test -d "$1"; then
    return 0
  else
    return 1
  fi
}

# Creates or updates the HostDB from the CrawlDB, provided that the CrawlDB exists.
function __update_hostdb {
  if __directory_exists "$CRAWL_PATH"/crawldb; then
    echo "Updating HostDB"
    __bin_nutch updatehostdb "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb
  fi
}

# initial injection
if [[ -n "$SEEDDIR" ]]; then
  echo "Injecting seed URLs"
  __bin_nutch inject "${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$SEEDDIR"
fi

# sitemap processing based on sitemap definition file(s)
if [[ -n "$SITEMAPDIR" ]]; then
  echo "Processing sitemaps defined in $SITEMAPDIR"
  __bin_nutch sitemap "${commonOptions[@]}" "$CRAWL_PATH/crawldb" -sitemapUrls "$SITEMAPDIR" -threads "$NUM_THREADS"
fi

# main loop : rounds of generate - fetch - parse - update
for ((a=1; ; a++))
do
  # a file named .STOP in the current working directory ends the crawl
  if [ -e ".STOP" ]; then
    echo "STOP file found - escaping loop"
    break
  fi

  # a limit of -1 means: no limit on the number of rounds
  if [ "$LIMIT" -ne -1 ]; then
    if [ "$a" -gt "$LIMIT" ]; then
      echo "$(date) : Finished loop with $LIMIT iterations"
      break
    fi
    echo "$(date) : Iteration $a of $LIMIT"
  else
    echo "$(date) : Iteration $a"
  fi

  if [[ "$SITEMAPS_FROM_HOSTDB_FREQUENCY" = "always" || ("$SITEMAPS_FROM_HOSTDB_FREQUENCY" = "once" && $a -eq 1) ]]; then
    # create / update HostDB on first run
    [[ $a -eq 1 ]] && __update_hostdb

    # sitemap processing based on HostDB
    if __directory_exists "$CRAWL_PATH"/hostdb; then
      echo "Processing sitemaps based on hosts in HostDB"
      __bin_nutch sitemap "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb -threads "$NUM_THREADS"
    fi
  fi

  echo "Generating a new segment"
  generate_args=("${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN "$SIZE_FETCHLIST" -numFetchers "$NUM_FETCHERS" -noFilter)
  if [[ "$HOSTDBGENERATE" == "true" ]] && __directory_exists "$CRAWL_PATH"/hostdb; then
    generate_args+=(-hostdb "$CRAWL_PATH"/hostdb)
  fi

  # generate is not run via __bin_nutch: exit value 1 is not an error
  # but signals that no new segment has been created
  echo "$bin/nutch generate ${generate_args[*]}"
  "$bin/nutch" generate "${generate_args[@]}"
  RETCODE=$?
  if [ $RETCODE -eq 0 ]; then
    : # ok: no error
  elif [ $RETCODE -eq 1 ]; then
    echo "Generate returned 1 (no new segments created)"

    if [ "$WAIT" -ne -1 ]; then
      echo "Waiting for $WAIT sec. ..."
      sleep "$WAIT"
      continue
    else
      echo "Escaping loop: no more URLs to fetch now"
      break
    fi
  else
    echo "Error running:"
    echo "  $bin/nutch generate ${generate_args[*]}"
    echo "Failed with exit value $RETCODE."
    exit $RETCODE
  fi

  # capture the name of the segment (the last one in numerical order)
  # call hadoop in distributed mode
  # or use ls
  if [[ "$mode" = "local" ]]; then
    SEGMENT=$(ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1)
  else
    # split the listed paths at "/" and keep the path elements looking like a segment name
    SEGMENT=$(hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e 's/\//\n/g' | grep -E '20[0-9]+' | sort -n | tail -n 1)
  fi

  # with an empty segment name all following steps would be run on the
  # parent directory of all segments
  if [[ -z "$SEGMENT" ]]; then
    echo "Error: no segment found in $CRAWL_PATH/segments/"
    exit 1
  fi

  echo "Operating on segment : $SEGMENT"

  # fetching the segment
  echo "Fetching : $SEGMENT"
  __bin_nutch fetch "${commonOptions[@]}" -D "fetcher.timelimit.mins=$TIME_LIMIT_FETCH" "$CRAWL_PATH"/segments/"$SEGMENT" -threads "$NUM_THREADS"

  # parsing the segment
  echo "Parsing : $SEGMENT"
  # enable the skipping of records for the parsing so that a dodgy document
  # does not fail the full task
  skipRecordsOptions=(-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1)
  __bin_nutch parse "${commonOptions[@]}" "${skipRecordsOptions[@]}" "$CRAWL_PATH"/segments/"$SEGMENT"

  # updatedb with this segment
  echo "CrawlDB update"
  __bin_nutch updatedb "${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/"$SEGMENT"

  if [[ "$HOSTDBUPDATE" == "true" ]]; then
    echo "HostDB update"
    __update_hostdb
  fi

  # Note that all steps below in this loop (link inversion, deduplication, indexing)
  # can be done
  # - either inside the loop on a per segment basis
  # - or after the loop over all segments created in all loop iterations
  #   (both invertlinks and index accept multiple segments as input)
  # The latter is more efficient but the index is then updated later.

  echo "Link inversion"
  __bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/"$SEGMENT" -noNormalize -nofilter

  echo "Dedup on crawldb"
  __bin_nutch dedup "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -group "$DEDUP_GROUP"

  if [[ "$INDEXFLAG" == "true" ]]; then
    echo "Indexing $SEGMENT to index"
    __bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/"$SEGMENT" -deleteGone
  else
    echo "Skipping indexing ..."
  fi

done

#######################################################
# The following commands fall into WebGraph territory
# and should be uncommented based on your requirements
#######################################################
#echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
#__bin_nutch webgraph "${commonOptions[@]}" -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"

#echo "Running Loops Job on WebGraph within $CRAWL_PATH"
#__bin_nutch org.apache.nutch.scoring.webgraph.Loops "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"

#echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
#__bin_nutch linkrank "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"

#echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
#__bin_nutch scoreupdater "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"

#echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
#__bin_nutch nodedumper "${commonOptions[@]}" -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores

exit 0