#!/bin/bash
#
# nutch/src/bin/crawl at cc-github-workflow · commoncrawl/nutch · GitHub
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage: crawl [options] <crawl_dir> <num_rounds>
#
# Arguments:
#   <crawl_dir>                           Directory where the crawl/host/link/segments dirs are saved
#   <num_rounds>                          The number of rounds to run this crawl for
#
# Options:
#   -i|--index                            Indexes crawl results into a configured indexer
#   -D <property>=<value>                 A Nutch or Hadoop property to pass to Nutch calls overwriting
#                                         properties defined in configuration files, e.g.
#                                           increase content limit to 2MB:
#                                             -D http.content.limit=2097152
#                                         (in distributed mode) configure memory of map and reduce tasks:
#                                           -D mapreduce.map.memory.mb=4608    -D mapreduce.map.java.opts=-Xmx4096m
#                                           -D mapreduce.reduce.memory.mb=4608 -D mapreduce.reduce.java.opts=-Xmx4096m
#   -w|--wait <NUMBER[SUFFIX]>            Time to wait before generating a new segment when no URLs
#                                         are scheduled for fetching. Suffix can be: s for second,
#                                         m for minute, h for hour and d for day. If no suffix is
#                                         specified second is used by default. [default: -1]
#   -s <seed_dir>                         Path to seeds file(s)
#   -sm <sitemap_dir>                     Path to sitemap URL file(s)
#
#   --hostdbupdate                        Boolean indicator if we call hostdbupdate or not
#   --hostdbgenerate                      Boolean indicator if we use hostdb in generate or not
#
#   --num-fetchers <num_fetchers>         Number of tasks used for fetching (fetcher map tasks) [default: 1]
#                                         Note: This can only be set when running in distributed mode and
#                                               should correspond to the number of worker nodes in the cluster.
#   --num-tasks <num_tasks>               Number of reducer tasks [default: 2]
#   --size-fetchlist <size_fetchlist>     Number of URLs to fetch in one iteration [default: 50000]
#   --time-limit-fetch <time_limit_fetch> Number of minutes allocated to the fetching [default: 180]
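#   --num-threads <num_threads>           Number of threads for fetching / sitemap processing [default: 50]
#   --sitemaps-from-hostdb <frequency>    Whether and how often to process sitemaps based on HostDB.
#                                         Supported values are:
#                                           - never [default]
#                                           - always (processing takes place in every iteration)
#                                           - once (processing only takes place in the first iteration)
#
#   --dedup-group <none|host|domain>      Deduplication group method [default: none]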
#

# Converts a duration given as NUMBER[SUFFIX] into seconds.
# Arguments: $1 - duration; the suffix is s (seconds, also used when no
#                 suffix is given), m (minutes), h (hours) or d (days),
#                 in lower or upper case
# Outputs:   the duration in seconds on stdout
function __to_seconds() {
  # locals keep the conversion from touching globals, in particular the
  # bash built-in SECONDS timer
  local number modifier seconds

  number=$(echo "$1" | tr -dc '0-9')
  modifier=$(echo "$1" | tr -dc 'smhdSMHD')

  # no digits at all counts as 0; 10# forces base 10 so that a value with
  # leading zeros (e.g. 08) is not read as an octal number
  number=${number:-0}

  case $modifier in
      m|M)
        seconds=$(( 10#$number * 60 ))
        ;;
      h|H)
        # an hour has 3600 seconds
        seconds=$(( 10#$number * 3600 ))
        ;;
      d|D)
        seconds=$(( 10#$number * 86400 ))
        ;;
      s|S|*)
        seconds=$(( 10#$number ))
        ;;
  esac

  echo "$seconds"
}

# Prints the command-line help to stdout and terminates the script with
# exit status 1.
function __print_usage {
  # the synopsis contains no escape sequences and is printed verbatim
  printf '%s\n' "Usage: crawl [options] <crawl_dir> <num_rounds>"

  # one argument per output line; %b expands the \t escapes used to align
  # the descriptions
  printf '%b\n' \
    "" \
    "Arguments:" \
    "  <crawl_dir>\t\t\t\tDirectory where the crawl/host/link/segments dirs are saved" \
    "  <num_rounds>\t\t\t\tThe number of rounds to run this crawl for" \
    "" \
    "Options:" \
    "  -i|--index\t\t\t\tIndexes crawl results into a configured indexer" \
    "  -D\t\t\t\t\tA Nutch or Hadoop property to pass to Nutch calls overwriting" \
    "  \t\t\t\t\tproperties defined in configuration files, e.g." \
    "  \t\t\t\t\tincrease content limit to 2MB:" \
    "  \t\t\t\t\t  -D http.content.limit=2097152" \
    "  \t\t\t\t\t(distributed mode only) configure memory of map and reduce tasks:" \
    "  \t\t\t\t\t  -D mapreduce.map.memory.mb=4608    -D mapreduce.map.java.opts=-Xmx4096m" \
    "  \t\t\t\t\t  -D mapreduce.reduce.memory.mb=4608 -D mapreduce.reduce.java.opts=-Xmx4096m" \
    "  -w|--wait <NUMBER[SUFFIX]>\t\tTime to wait before generating a new segment when no URLs" \
    "  \t\t\t\t\tare scheduled for fetching. Suffix can be: s for second," \
    "  \t\t\t\t\tm for minute, h for hour and d for day. If no suffix is" \
    "  \t\t\t\t\tspecified second is used by default. [default: -1]" \
    "  -s <seed_dir>\t\t\t\tPath to seeds file(s)" \
    "  -sm <sitemap_dir>\t\t\tPath to sitemap URL file(s)" \
    "  --hostdbupdate\t\t\tBoolean flag showing if we either update or not update hostdb for each round" \
    "  --hostdbgenerate\t\t\tBoolean flag showing if we use hostdb in generate or not" \
    "  --num-fetchers <num_fetchers>\t\tNumber of tasks used for fetching (fetcher map tasks) [default: 1]" \
    "  \t\t\t\t\tNote: This can only be set when running in distributed mode and" \
    "  \t\t\t\t\t      should correspond to the number of worker nodes in the cluster." \
    "  --num-tasks <num_tasks>\t\tNumber of reducer tasks [default: 2]" \
    "  --size-fetchlist <size_fetchlist>\tNumber of URLs to fetch in one iteration [default: 50000]" \
    "  --time-limit-fetch <time_limit_fetch>\tNumber of minutes allocated to the fetching [default: 180]" \
    "  --num-threads <num_threads>\t\tNumber of threads for fetching / sitemap processing [default: 50]" \
    "  --sitemaps-from-hostdb <frequency>\tWhether and how often to process sitemaps based on HostDB." \
    "  \t\t\t\t\tSupported values are:" \
    "  \t\t\t\t\t  - never [default]" \
    "  \t\t\t\t\t  - always (processing takes place in every iteration)" \
    "  \t\t\t\t\t  - once (processing only takes place in the first iteration)" \
    "  --dedup-group <none|host|domain>\tDeduplication group method [default: none]"

  exit 1
}

# default values
INDEXFLAG=false
HOSTDBUPDATE=false
HOSTDBGENERATE=false
HADOOP_PROPERTIES=()
WAIT=-1 # don't wait if there are no URLs to fetch
SEEDDIR=""
SITEMAPDIR="" # explicit default so that no value is inherited from the environment
NUM_FETCHERS=1
NUM_TASKS=2 # 2 x NUM_FETCHERS
SIZE_FETCHLIST=50000 # 25K x NUM_TASKS
TIME_LIMIT_FETCH=180
NUM_THREADS=50
SITEMAPS_FROM_HOSTDB_FREQUENCY=never
DEDUP_GROUP=none

# Parses the leading options of the given argument list into the global
# settings above. Parsing stops at the first argument that is not a known
# option.
# Globals:   the settings above (written),
#            OPTIONS_CONSUMED (written) - number of arguments consumed
# Arguments: the command-line arguments of the script
# Returns:   0 on success; if an option lacks its value an error and the
#            usage are printed (__print_usage terminates the script)
function __parse_options {
    local num_args=$#
    OPTIONS_CONSUMED=0

    while (( $# > 0 ))
    do
        # Options taking a value: make sure the value is there. "shift 2"
        # with a single remaining argument fails without shifting anything,
        # which would keep this loop spinning forever.
        case "$1" in
            -D|-s|-sm|-w|--wait|--num-slaves|--num-fetchers|--num-tasks|--size-fetchlist|--time-limit-fetch|--num-threads|--sitemaps-from-hostdb|--dedup-group)
                if (( $# < 2 )); then
                    echo "Error: option $1 requires a value."
                    echo -e ""
                    __print_usage
                    return 1 # safety net, __print_usage exits the script
                fi
                ;;
        esac

        case "$1" in
            -i|--index)
                INDEXFLAG=true
                shift
                ;;
            -D)
                HADOOP_PROPERTIES+=(-D"${2}")
                shift 2
                ;;
            -s)
                SEEDDIR="${2}"
                shift 2
                ;;
            -sm)
                SITEMAPDIR="${2}"
                shift 2
                ;;
            -w|--wait)
                WAIT="${2}"
                shift 2
                ;;
            --num-slaves)
                # back-ward compatibility: NUTCH-2759 renamed option --num-slaves to --num-fetchers
                NUM_FETCHERS="${2}"
                shift 2
                ;;
            --num-fetchers)
                NUM_FETCHERS="${2}"
                shift 2
                ;;
            --num-tasks)
                NUM_TASKS="${2}"
                shift 2
                ;;
            --size-fetchlist)
                SIZE_FETCHLIST="${2}"
                shift 2
                ;;
            --time-limit-fetch)
                TIME_LIMIT_FETCH="${2}"
                shift 2
                ;;
            --num-threads)
                NUM_THREADS="${2}"
                shift 2
                ;;
            --sitemaps-from-hostdb)
                SITEMAPS_FROM_HOSTDB_FREQUENCY="${2}"
                shift 2
                ;;
            --dedup-group)
                DEDUP_GROUP="${2}"
                shift 2
                ;;
            --hostdbupdate)
                HOSTDBUPDATE=true
                shift
                ;;
            --hostdbgenerate)
                HOSTDBGENERATE=true
                shift
                ;;
            *)
                # first positional argument: option parsing is done
                break
                ;;
        esac
    done

    OPTIONS_CONSUMED=$(( num_args - $# ))
}

__parse_options "$@" || exit 1
# drop the parsed options, only the positional arguments remain
shift "$OPTIONS_CONSUMED"

# validate option values which are restricted to a fixed set of keywords
if [[ ! "$SITEMAPS_FROM_HOSTDB_FREQUENCY" =~ ^(never|always|once)$ ]]; then
  echo "Error: --sitemaps-from-hostdb <frequency> has to be one of never, always, once."
  echo -e ""
  __print_usage
fi

if [[ ! "$DEDUP_GROUP" =~ ^(none|host|domain)$ ]]; then
  echo "Error: --dedup-group <mode> has to be one of none, host, domain."
  echo -e ""
  __print_usage
fi

# exactly two positional arguments must be left: <crawl_dir> <num_rounds>
if (( $# != 2 )); then
  __print_usage
fi

CRAWL_PATH="$1"
LIMIT="$2"

# <num_rounds> is compared numerically in every round of the main loop.
# A value which is not an integer would make that comparison fail and the
# loop would run without any limit on the number of rounds.
if [[ ! "$LIMIT" =~ ^-?[0-9]+$ ]]; then
  echo "Error: <num_rounds> has to be an integer (-1 to run without a limit on the number of rounds)."
  echo -e ""
  __print_usage
fi

# convert wait time to seconds for compatibility reasons
if [ "$WAIT" != "-1" ]; then
  WAIT=$( __to_seconds "$WAIT" )
  # the main loop compares the wait time numerically and passes it to sleep
  if [[ ! "$WAIT" =~ ^[0-9]+$ ]]; then
    echo "Error: --wait <NUMBER[SUFFIX]> could not be converted into seconds."
    echo -e ""
    __print_usage
  fi
  echo "Time to wait (--wait) = $WAIT sec."
fi

# absolute path of the directory this script is located in
bin="$(dirname "$0")"
bin="$(cd "$bin"; pwd)"

# Prints "distributed" if at least one Nutch job file (*nutch*.job) exists
# in the given directory, "local" otherwise.
# Arguments: $1 - directory to look into
# Outputs:   the mode on stdout
function __detect_mode {
  local job_file
  # Loop over the matches instead of passing the glob to a single
  # "[ -f ... ]": that test fails with a syntax error as soon as the
  # pattern matches more than one file.
  for job_file in "$1"/*nutch*.job; do
    if [ -f "$job_file" ]; then
      echo "distributed"
      return 0
    fi
  done
  echo "local"
}

# determine the mode based on the presence of a job file
mode=$(__detect_mode "${bin}/..")
if [[ "$mode" = "local" ]]; then
  if [[ "$NUM_FETCHERS" -ne 1 ]]; then
    echo "Ignoring configured number of fetchers (--num-fetchers): a single fetcher task is used when running in local mode."
  fi
  NUM_FETCHERS=1
fi

# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
commonOptions=("${HADOOP_PROPERTIES[@]}" "-Dmapreduce.job.reduces=$NUM_TASKS" -Dmapreduce.reduce.speculative=false -Dmapreduce.map.speculative=false -Dmapreduce.map.output.compress=true)

# check that hadoop can be found on the path
if [ "$mode" = "distributed" ]; then
  if ! command -v hadoop > /dev/null; then
    echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
    exit -1;
  fi
fi


# Runs "$bin/nutch" with the given arguments. The command line is echoed
# first; if the command ends with a non-zero exit value an error report is
# printed and the whole script exits with that value.
# Globals:   bin (read), RETCODE (written)
# Arguments: the Nutch command followed by its arguments
function __bin_nutch {
    local command_line="$bin/nutch $*"

    echo "$command_line"
    "$bin/nutch" "$@"
    RETCODE=$?

    # success: hand control back to the caller
    if [ $RETCODE -eq 0 ]; then
        return 0
    fi

    echo "Error running:"
    echo "  $command_line"
    echo "Failed with exit value $RETCODE."
    exit $RETCODE
}

# Tests whether a directory exists: on the local file system in local mode,
# via "hadoop fs -test -d" in distributed mode.
# Globals:   mode (read)
# Arguments: $1 - path of the directory
# Returns:   0 if the directory exists, 1 otherwise (also for any other mode)
function __directory_exists {
  local dir="$1"

  case "$mode" in
    local)
      [[ -d "$dir" ]] && return 0
      ;;
    distributed)
      hadoop fs -test -d "$dir" && return 0
      ;;
  esac

  return 1
}

# Runs the Nutch command "updatehostdb" on "$CRAWL_PATH"/crawldb, writing to
# "$CRAWL_PATH"/hostdb. Nothing is done if the CrawlDB directory does not
# exist.
# Globals:   CRAWL_PATH, commonOptions (read)
function __update_hostdb {
  local crawldb="$CRAWL_PATH"/crawldb
  local hostdb="$CRAWL_PATH"/hostdb

  # without a CrawlDB there is nothing to build the HostDB from
  __directory_exists "$crawldb" || return 0

  echo "Updating HostDB"
  __bin_nutch updatehostdb "${commonOptions[@]}" -crawldb "$crawldb" -hostdb "$hostdb"
}

# initial injection: only done if a seed directory was passed (-s)
if [[ -n "$SEEDDIR" ]]; then
  inject_args=("${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$SEEDDIR")
  echo "Injecting seed URLs"
  __bin_nutch inject "${inject_args[@]}"
fi

# sitemap processing based on sitemap definition file(s),
# only done if a sitemap directory was passed (-sm)
if [[ -n "$SITEMAPDIR" ]]; then
  sitemap_args=("${commonOptions[@]}" "$CRAWL_PATH/crawldb" -sitemapUrls "$SITEMAPDIR" -threads $NUM_THREADS)
  echo "Processing sitemaps defined in $SITEMAPDIR"
  __bin_nutch sitemap "${sitemap_args[@]}"
fi

# main loop : rounds of generate - fetch - parse - update
# Every round generates one segment, fetches and parses it and updates the
# CrawlDB (optionally also the HostDB), followed by link inversion,
# deduplication and (optionally) indexing. The loop is left when
# - a file .STOP exists in the current working directory,
# - $LIMIT rounds have been run (a limit of -1 disables this check),
# - generate did not create a segment and no wait time is configured.
for ((a=1; ; a++))
do
  if [ -e ".STOP" ]; then
    echo "STOP file found - escaping loop"
    break
  fi

  if [ "$LIMIT" -ne -1 ]; then
    if [ "$a" -gt "$LIMIT" ]; then
      echo $(date) ": Finished loop with $LIMIT iterations"
      break
    fi
    echo $(date) ": Iteration $a of $LIMIT"
  else
    echo $(date) ": Iteration $a"
  fi

  if [[ "$SITEMAPS_FROM_HOSTDB_FREQUENCY" = "always" || ("$SITEMAPS_FROM_HOSTDB_FREQUENCY" = "once" && $a -eq 1) ]]; then
    # create / update HostDB on first run
    [[ $a -eq 1 ]] && __update_hostdb

    # sitemap processing based on HostDB
    if __directory_exists "$CRAWL_PATH"/hostdb; then
      echo "Processing sitemaps based on hosts in HostDB"
      __bin_nutch sitemap "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb -threads "$NUM_THREADS"
    fi
  fi

  echo "Generating a new segment"
  generate_args=("${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN "$SIZE_FETCHLIST" -numFetchers "$NUM_FETCHERS" -noFilter)
  # pass the HostDB to generate only if requested and if it exists
  if [[ "$HOSTDBGENERATE" == "true" ]] && __directory_exists "$CRAWL_PATH"/hostdb; then
    generate_args+=(-hostdb "$CRAWL_PATH"/hostdb)
  fi

  # generate is not run via __bin_nutch because exit value 1 is not an
  # error here: it is handled below as "no new segment created"
  echo "$bin/nutch generate ${generate_args[*]}"
  "$bin/nutch" generate "${generate_args[@]}"
  RETCODE=$?
  if [ $RETCODE -eq 0 ]; then
      : # ok: no error
  elif [ $RETCODE -eq 1 ]; then
    echo "Generate returned 1 (no new segments created)"

    if [ "$WAIT" -ne -1 ]; then
      echo "Waiting for $WAIT sec. ..."
      sleep "$WAIT"
      continue
    else
      echo "Escaping loop: no more URLs to fetch now"
      break
    fi
  else
    echo "Error running:"
    echo "  $bin/nutch generate ${generate_args[*]}"
    echo "Failed with exit value $RETCODE."
    exit $RETCODE
  fi

  # capture the name of the segment
  # call hadoop in distributed mode
  # or use ls
  # (in both cases the entry sorted last is taken)

  if [ "$mode" = "local" ]; then
    SEGMENT=$(ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1)
  else
    SEGMENT=$(hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | grep -E '20[0-9]+' | sort -n | tail -n 1)
  fi

  # without a segment name all following steps would be run on the
  # segments directory itself
  if [[ -z "$SEGMENT" ]]; then
    echo "Error: unable to determine the name of the generated segment in $CRAWL_PATH/segments/"
    exit 1
  fi

  echo "Operating on segment : $SEGMENT"

  # fetching the segment
  echo "Fetching : $SEGMENT"
  __bin_nutch fetch "${commonOptions[@]}" -D fetcher.timelimit.mins="$TIME_LIMIT_FETCH" "$CRAWL_PATH"/segments/"$SEGMENT" -threads "$NUM_THREADS"

  # parsing the segment
  echo "Parsing : $SEGMENT"
  # enable the skipping of records for the parsing so that a dodgy document
  # does not fail the full task
  skipRecordsOptions=(-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1)
  __bin_nutch parse "${commonOptions[@]}" "${skipRecordsOptions[@]}" "$CRAWL_PATH"/segments/"$SEGMENT"

  # updatedb with this segment
  echo "CrawlDB update"
  __bin_nutch updatedb "${commonOptions[@]}" "$CRAWL_PATH"/crawldb  "$CRAWL_PATH"/segments/"$SEGMENT"

  # the HostDB is updated in every round only if requested (--hostdbupdate)
  if $HOSTDBUPDATE; then
    echo "HostDB update"
    __update_hostdb
  fi

  # Note that all steps below in this loop (link inversion, deduplication, indexing)
  # can be done
  # - either inside the loop on a per segment basis
  # - or after the loop over all segments created in all loop iterations
  #   (both invertlinks and index accept multiple segments as input)
  # The latter is more efficient but the index is then updated later.
  echo "Link inversion"
  __bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/"$SEGMENT" -noNormalize -nofilter

  echo "Dedup on crawldb"
  __bin_nutch dedup "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -group "$DEDUP_GROUP"

  if $INDEXFLAG; then
      echo "Indexing $SEGMENT to index"
      __bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/"$SEGMENT" -deleteGone
  else
      echo "Skipping indexing ..."
  fi

done

#######################################################
# The following commands fall into WebGraph territory
# and should be uncommented based on your requirements.
# They are placed after the main loop, i.e. they are run
# once when the loop over the crawl rounds has been left.
#######################################################
#echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
#__bin_nutch webgraph "${commonOptions[@]}" -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"

#echo "Running Loops Job on WebGraph within $CRAWL_PATH"
#__bin_nutch org.apache.nutch.scoring.webgraph.Loops "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"

#echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
#__bin_nutch linkrank "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"

#echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and  WebGraph within $CRAWL_PATH"
#__bin_nutch scoreupdater "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"

#echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
#__bin_nutch nodedumper "${commonOptions[@]}" -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores

# reached when the main loop was left via "break": every failing step
# above exits with its own non-zero exit value
exit 0