forked from apache/nutch
-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathcrawl
More file actions
executable file
·432 lines (384 loc) · 15.9 KB
/
Copy pathcrawl
File metadata and controls
executable file
·432 lines (384 loc) · 15.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage: crawl [options] <crawl_dir> <num_rounds>
#
# Arguments:
# <crawl_dir> Directory where the crawl/host/link/segments dirs are saved
# <num_rounds> The number of rounds to run this crawl for
#
# Options:
# -i|--index Indexes crawl results into a configured indexer
# -D <property>=<value> A Nutch or Hadoop property to pass to Nutch calls overwriting
# properties defined in configuration files, e.g.
# increase content limit to 2MB:
# -D http.content.limit=2097152
# (in distributed mode) configure memory of map and reduce tasks:
# -D mapreduce.map.memory.mb=4608 -D mapreduce.map.java.opts=-Xmx4096m
# -D mapreduce.reduce.memory.mb=4608 -D mapreduce.reduce.java.opts=-Xmx4096m
# -w|--wait <NUMBER[SUFFIX]> Time to wait before generating a new segment when no URLs
# are scheduled for fetching. Suffix can be: s for second,
# m for minute, h for hour and d for day. If no suffix is
# specified second is used by default. [default: -1]
# -s <seed_dir> Path to seeds file(s)
# -sm <sitemap_dir> Path to sitemap URL file(s)
#
# --hostdbupdate Boolean indicator if we call hostdbupdate or not
# --hostdbgenerate Boolean indicator if we use hostdb in generate or not
#
# --num-fetchers <num_fetchers> Number of tasks used for fetching (fetcher map tasks) [default: 1]
# Note: This can only be set when running in distributed mode and
# should correspond to the number of worker nodes in the cluster.
# --num-tasks <num_tasks> Number of reducer tasks [default: 2]
# --size-fetchlist <size_fetchlist> Number of URLs to fetch in one iteration [default: 50000]
# --time-limit-fetch <time_limit_fetch> Number of minutes allocated to the fetching [default: 180]
# --num-threads <num_threads> Number of threads for fetching / sitemap processing [default: 50]
#
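# --sitemaps-from-hostdb <frequency> Whether and how often to process sitemaps based on HostDB.
# Supported values are: never [default], always, once
#
# --dedup-group <none|host|domain> Deduplication group method [default: none]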
#
# Converts a wait-time specification into a number of seconds.
#
# Arguments:
#   $1 - time given as NUMBER[SUFFIX]; SUFFIX is one of s (seconds),
#        m (minutes), h (hours) or d (days), in lower or upper case.
#        Without a single known suffix the number is taken as seconds.
# Outputs:
#   the number of seconds on stdout (0 if $1 contains no digit)
function __to_seconds() {
  local number modifier
  # keep only the digits, respectively only the unit letters, of the argument
  number=${1//[^0-9]/}
  modifier=${1//[^smhdSMHD]/}
  # force base 10 so that a leading zero is not interpreted as octal
  number=$(( 10#${number:-0} ))
  case "$modifier" in
    m|M) echo $(( number * 60 )) ;;
    h|H) echo $(( number * 3600 )) ;;
    d|D) echo $(( number * 86400 )) ;;
    *)   echo "$number" ;;
  esac
}
# Prints the command-line help to stdout and terminates the script with
# exit status 1.
function __print_usage {
  # every argument is one output line; %b expands the \t escapes
  printf '%b\n' \
    "Usage: crawl [options] <crawl_dir> <num_rounds>" \
    "" \
    "Arguments:" \
    " <crawl_dir>\t\t\t\tDirectory where the crawl/host/link/segments dirs are saved" \
    " <num_rounds>\t\t\t\tThe number of rounds to run this crawl for" \
    "" \
    "Options:" \
    " -i|--index\t\t\t\tIndexes crawl results into a configured indexer" \
    " -D\t\t\t\t\tA Nutch or Hadoop property to pass to Nutch calls overwriting" \
    " \t\t\t\t\tproperties defined in configuration files, e.g." \
    " \t\t\t\t\tincrease content limit to 2MB:" \
    " \t\t\t\t\t -D http.content.limit=2097152" \
    " \t\t\t\t\t(distributed mode only) configure memory of map and reduce tasks:" \
    " \t\t\t\t\t -D mapreduce.map.memory.mb=4608 -D mapreduce.map.java.opts=-Xmx4096m" \
    " \t\t\t\t\t -D mapreduce.reduce.memory.mb=4608 -D mapreduce.reduce.java.opts=-Xmx4096m" \
    " -w|--wait <NUMBER[SUFFIX]>\t\tTime to wait before generating a new segment when no URLs" \
    " \t\t\t\t\tare scheduled for fetching. Suffix can be: s for second," \
    " \t\t\t\t\tm for minute, h for hour and d for day. If no suffix is" \
    " \t\t\t\t\tspecified second is used by default. [default: -1]" \
    " -s <seed_dir>\t\t\t\tPath to seeds file(s)" \
    " -sm <sitemap_dir>\t\t\tPath to sitemap URL file(s)" \
    " --hostdbupdate\t\t\tBoolean flag showing if we either update or not update hostdb for each round" \
    " --hostdbgenerate\t\t\tBoolean flag showing if we use hostdb in generate or not" \
    " --num-fetchers <num_fetchers>\t\tNumber of tasks used for fetching (fetcher map tasks) [default: 1]" \
    " \t\t\t\t\tNote: This can only be set when running in distributed mode and" \
    " \t\t\t\t\t should correspond to the number of worker nodes in the cluster." \
    " --num-tasks <num_tasks>\t\tNumber of reducer tasks [default: 2]" \
    " --size-fetchlist <size_fetchlist>\tNumber of URLs to fetch in one iteration [default: 50000]" \
    " --time-limit-fetch <time_limit_fetch>\tNumber of minutes allocated to the fetching [default: 180]" \
    " --num-threads <num_threads>\t\tNumber of threads for fetching / sitemap processing [default: 50]" \
    " --sitemaps-from-hostdb <frequency>\tWhether and how often to process sitemaps based on HostDB." \
    " \t\t\t\t\tSupported values are:" \
    " \t\t\t\t\t - never [default]" \
    " \t\t\t\t\t - always (processing takes place in every iteration)" \
    " \t\t\t\t\t - once (processing only takes place in the first iteration)" \
    " --dedup-group <none|host|domain>\tDeduplication group method [default: none]"
  exit 1
}
# default values, may be overridden by the command-line options parsed below
INDEXFLAG=false
HOSTDBUPDATE=false
HOSTDBGENERATE=false
HADOOP_PROPERTIES=()
WAIT=-1 # don't wait if there are no URLs to fetch
SEEDDIR=""
# initialised explicitly so that a SITEMAPDIR variable inherited from the
# environment cannot trigger sitemap processing by accident
SITEMAPDIR=""
NUM_FETCHERS=1
NUM_TASKS=2 # 2 x NUM_FETCHERS
SIZE_FETCHLIST=50000 # 25K x NUM_TASKS
TIME_LIMIT_FETCH=180
NUM_THREADS=50
SITEMAPS_FROM_HOSTDB_FREQUENCY=never
DEDUP_GROUP=none
# Parse the command-line options. Parsing stops at the first argument which
# is not a known option; the remaining arguments are the positional ones.
while (( $# > 0 )); do
  # All options listed here consume the following argument as their value.
  # If that value is missing, "shift 2" fails without shifting anything and
  # the loop would spin forever on the same option, so bail out early.
  case "$1" in
    -D|-s|-sm|-w|--wait|--num-slaves|--num-fetchers|--num-tasks|--size-fetchlist|--time-limit-fetch|--num-threads|--sitemaps-from-hostdb|--dedup-group)
      if (( $# < 2 )); then
        echo "Error: option $1 requires a value."
        echo -e ""
        __print_usage
      fi
      ;;
  esac

  case "$1" in
    -i|--index)
      INDEXFLAG=true
      shift
      ;;
    -D)
      HADOOP_PROPERTIES+=(-D"${2}")
      shift 2
      ;;
    -s)
      SEEDDIR="${2}"
      shift 2
      ;;
    -sm)
      SITEMAPDIR="${2}"
      shift 2
      ;;
    -w|--wait)
      WAIT="${2}"
      shift 2
      ;;
    --num-slaves)
      # back-ward compatibility: NUTCH-2759 renamed option --num-slaves to --num-fetchers
      NUM_FETCHERS="${2}"
      shift 2
      ;;
    --num-fetchers)
      NUM_FETCHERS="${2}"
      shift 2
      ;;
    --num-tasks)
      NUM_TASKS="${2}"
      shift 2
      ;;
    --size-fetchlist)
      SIZE_FETCHLIST="${2}"
      shift 2
      ;;
    --time-limit-fetch)
      TIME_LIMIT_FETCH="${2}"
      shift 2
      ;;
    --num-threads)
      NUM_THREADS="${2}"
      shift 2
      ;;
    --sitemaps-from-hostdb)
      SITEMAPS_FROM_HOSTDB_FREQUENCY="${2}"
      shift 2
      ;;
    --dedup-group)
      DEDUP_GROUP="${2}"
      shift 2
      ;;
    --hostdbupdate)
      HOSTDBUPDATE=true
      shift
      ;;
    --hostdbgenerate)
      HOSTDBGENERATE=true
      shift
      ;;
    *)
      # first positional argument: stop option parsing
      break
      ;;
  esac
done
# Validate option values and positional arguments. __print_usage prints the
# help text and exits the script with status 1.
if [[ ! "$SITEMAPS_FROM_HOSTDB_FREQUENCY" =~ ^(never|always|once)$ ]]; then
  echo "Error: --sitemaps-from-hostdb <frequency> has to be one of never, always, once."
  echo -e ""
  __print_usage
fi

if [[ ! "$DEDUP_GROUP" =~ ^(none|host|domain)$ ]]; then
  echo "Error: --dedup-group <mode> has to be one of none, host, domain."
  echo -e ""
  __print_usage
fi

# exactly two positional arguments must remain: <crawl_dir> <num_rounds>
if [[ $# != 2 ]]; then
  __print_usage
fi

CRAWL_PATH="$1"
LIMIT="$2"

# <num_rounds> is compared numerically in the crawl loop (-1 disables the
# limit). A non-numeric value would make these comparisons fail and the
# crawl run without any limit, so reject it right away.
if [[ ! "$LIMIT" =~ ^-?[0-9]+$ ]]; then
  echo "Error: <num_rounds> has to be an integer."
  echo -e ""
  __print_usage
fi

# convert wait time to seconds for compatibility reasons
if [[ "$WAIT" != "-1" ]]; then
  WAIT=$( __to_seconds "$WAIT" )
  echo "Time to wait (--wait) = $WAIT sec."
fi
# absolute path of the directory this script is located in
bin="$(dirname "$0")"
bin="$(cd "$bin" && pwd)"

# determine the mode (local or distributed) based on the presence of a job
# file in the parent directory of $bin
mode=local
# iterate over the glob: "[ -f glob ]" fails with "too many arguments" as
# soon as more than one job file matches
for job_file in "${bin}"/../*nutch*.job; do
  if [[ -f "$job_file" ]]; then
    mode=distributed
    break
  fi
done

if [[ "$mode" = "local" ]]; then
  if [[ "$NUM_FETCHERS" -ne 1 ]]; then
    echo "Ignoring configured number of fetchers (--num-fetchers): a single fetcher task is used when running in local mode."
  fi
  NUM_FETCHERS=1
fi

# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
commonOptions=(
  "${HADOOP_PROPERTIES[@]}"
  "-Dmapreduce.job.reduces=$NUM_TASKS"
  -Dmapreduce.reduce.speculative=false
  -Dmapreduce.map.speculative=false
  -Dmapreduce.map.output.compress=true
)

# check that hadoop can be found on the path
if [[ "$mode" = "distributed" ]]; then
  # "command -v" is a shell builtin; unlike "which | wc -l" it does not depend
  # on what an external tool prints when nothing is found
  if ! command -v hadoop >/dev/null 2>&1; then
    echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
    # same exit status as the former "exit -1"
    exit 255
  fi
fi
# Runs $bin/nutch with the given arguments after echoing the command line.
# The exit value of the call is stored in the global RETCODE; if it is not
# zero, an error report is printed and the whole script exits with it.
function __bin_nutch {
  local command_line="$bin/nutch $*"
  echo "$command_line"
  "$bin/nutch" "$@"
  RETCODE=$?
  if [ $RETCODE -eq 0 ]; then
    return 0
  fi
  echo "Error running:"
  echo " $command_line"
  echo "Failed with exit value $RETCODE."
  exit $RETCODE
}
# Tests whether the directory $1 exists: on the local file system in local
# mode, via "hadoop fs -test -d" in distributed mode.
# Returns 0 if it exists, 1 otherwise (also for any other value of $mode).
function __directory_exists {
  case "$mode" in
    local)
      [[ -d "$1" ]] && return 0
      ;;
    distributed)
      hadoop fs -test -d "$1" && return 0
      ;;
  esac
  return 1
}
# Runs "nutch updatehostdb" to build / update $CRAWL_PATH/hostdb from
# $CRAWL_PATH/crawldb. Does nothing if the CrawlDB directory does not exist.
function __update_hostdb {
  local crawldb="$CRAWL_PATH"/crawldb
  # nothing to update from as long as there is no CrawlDB
  __directory_exists "$crawldb" || return 0
  echo "Updating HostDB"
  __bin_nutch updatehostdb "${commonOptions[@]}" \
    -crawldb "$crawldb" \
    -hostdb "$CRAWL_PATH"/hostdb
}
# initial injection: only if a seed directory was given (-s)
if [[ -n "$SEEDDIR" ]]; then
  echo "Injecting seed URLs"
  __bin_nutch inject "${commonOptions[@]}" \
    "$CRAWL_PATH"/crawldb "$SEEDDIR"
fi

# sitemap processing based on sitemap definition file(s): only if a
# sitemap directory was given (-sm)
if [[ -n "$SITEMAPDIR" ]]; then
  echo "Processing sitemaps defined in $SITEMAPDIR"
  __bin_nutch sitemap "${commonOptions[@]}" \
    "$CRAWL_PATH/crawldb" -sitemapUrls "$SITEMAPDIR" -threads $NUM_THREADS
fi
# main loop : rounds of generate - fetch - parse - update
#
# The loop ends
#  - after $LIMIT rounds (never if LIMIT is -1),
#  - when a file .STOP exists in the current working directory,
#  - when generate creates no new segment and no wait time (--wait) is set.
# Any failing Nutch job terminates the script with the job's exit value.
for ((a=1; ; a++)); do
  if [ -e ".STOP" ]; then
    echo "STOP file found - escaping loop"
    break
  fi

  if [ "$LIMIT" -ne -1 ]; then
    if [ "$a" -gt "$LIMIT" ]; then
      echo $(date) ": Finished loop with $LIMIT iterations"
      break
    fi
    echo $(date) ": Iteration $a of $LIMIT"
  else
    echo $(date) ": Iteration $a"
  fi

  if [[ "$SITEMAPS_FROM_HOSTDB_FREQUENCY" = "always" || ("$SITEMAPS_FROM_HOSTDB_FREQUENCY" = "once" && $a -eq 1) ]]; then
    # create / update HostDB on first run
    [[ $a -eq 1 ]] && __update_hostdb

    # sitemap processing based on HostDB
    if __directory_exists "$CRAWL_PATH"/hostdb; then
      echo "Processing sitemaps based on hosts in HostDB"
      __bin_nutch sitemap "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb -threads "$NUM_THREADS"
    fi
  fi

  echo "Generating a new segment"
  generate_args=("${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN "$SIZE_FETCHLIST" -numFetchers "$NUM_FETCHERS" -noFilter)
  if [[ "$HOSTDBGENERATE" == "true" ]] && __directory_exists "$CRAWL_PATH"/hostdb; then
    generate_args+=(-hostdb "$CRAWL_PATH"/hostdb)
  fi

  # generate is not run via __bin_nutch because exit value 1 is not an error
  # here: it signals that no new segment has been created
  echo "$bin/nutch generate ${generate_args[*]}"
  "$bin/nutch" generate "${generate_args[@]}"
  RETCODE=$?
  if [ $RETCODE -eq 0 ]; then
    : # ok: no error
  elif [ $RETCODE -eq 1 ]; then
    echo "Generate returned 1 (no new segments created)"

    if [ "$WAIT" -ne -1 ]; then
      echo "Waiting for $WAIT sec. ..."
      sleep "$WAIT"
      continue
    else
      echo "Escaping loop: no more URLs to fetch now"
      break
    fi
  else
    echo "Error running:"
    echo " $bin/nutch generate ${generate_args[*]}"
    echo "Failed with exit value $RETCODE."
    exit $RETCODE
  fi

  # capture the name of the segment (the last one in numerical order):
  # call hadoop in distributed mode
  # or use ls
  if [[ "$mode" = "local" ]]; then
    SEGMENT=$(ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1)
  else
    SEGMENT=$(hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e 's/\//\n/g' | grep -E '20[0-9]+' | sort -n | tail -n 1)
  fi

  # without a segment name the following jobs would be run on the segments
  # directory itself
  if [[ -z "$SEGMENT" ]]; then
    echo "Error: unable to determine the generated segment in $CRAWL_PATH/segments"
    exit 1
  fi

  echo "Operating on segment : $SEGMENT"

  # fetching the segment
  echo "Fetching : $SEGMENT"
  __bin_nutch fetch "${commonOptions[@]}" -D fetcher.timelimit.mins="$TIME_LIMIT_FETCH" "$CRAWL_PATH"/segments/"$SEGMENT" -threads "$NUM_THREADS"

  # parsing the segment
  echo "Parsing : $SEGMENT"

  # enable the skipping of records for the parsing so that a dodgy document
  # does not fail the full task
  skipRecordsOptions=(-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1)
  __bin_nutch parse "${commonOptions[@]}" "${skipRecordsOptions[@]}" "$CRAWL_PATH"/segments/"$SEGMENT"

  # updatedb with this segment
  echo "CrawlDB update"
  __bin_nutch updatedb "${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/"$SEGMENT"

  # only announce the HostDB update if it is actually requested
  if [[ "$HOSTDBUPDATE" == "true" ]]; then
    echo "HostDB update"
    __update_hostdb
  fi

  # Note that all steps below in this loop (link inversion, deduplication, indexing)
  # can be done
  # - either inside the loop on a per segment basis
  # - or after the loop over all segments created in all loop iterations
  #   (both invertlinks and index accept multiple segments as input)
  # The latter is more efficient but the index is then updated later.

  echo "Link inversion"
  __bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/"$SEGMENT" -noNormalize -nofilter

  echo "Dedup on crawldb"
  __bin_nutch dedup "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -group "$DEDUP_GROUP"

  if [[ "$INDEXFLAG" == "true" ]]; then
    echo "Indexing $SEGMENT to index"
    __bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/"$SEGMENT" -deleteGone
  else
    echo "Skipping indexing ..."
  fi
done
#######################################################
# The following commands fall into WebGraph territory
# and should be uncommented based on your requirements.
# They are placed after the crawl loop, i.e. once enabled
# they run a single time when the loop has ended, not
# once per round.
#######################################################
#echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
#__bin_nutch webgraph "${commonOptions[@]}" -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
#echo "Running Loops Job on WebGraph within $CRAWL_PATH"
#__bin_nutch org.apache.nutch.scoring.webgraph.Loops "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"
#echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
#__bin_nutch linkrank "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"
#echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
#__bin_nutch scoreupdater "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"
#echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
#__bin_nutch nodedumper "${commonOptions[@]}" -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
# reached when the crawl loop has ended without a fatal error
exit 0