cosr-back/scripts/build_devindex.py at master · commonsearch/cosr-back · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python

#
# The dev index is a small index of sample pages used to make development in cosr-front easier
#

# We get the top N domains from alexa.
# get_documents() only fetches them when this value is greater than 0.
TOP_ALEXA_DOMAINS = 1000

# Plus the following hardcoded documents.
# Format is: url => {"static_rank": <static rank, between 0 and 1>}
# NOTE: get_documents() adds the Alexa domains to this same dict in place.
DOCUMENTS = {

    "https://about.commonsearch.org/": {"static_rank": 1},
    "https://en.wikipedia.org/wiki/Maceo_Parker": {"static_rank": 0.98},  # Significance of this rank left to the listener...

}


import gevent.monkey
gevent.monkey.patch_all()

from gevent.pool import Pool
import sys
import os
sys.path.insert(-1, os.path.dirname(os.path.dirname(__file__)))

import itertools
import requests
from cosrlib.sources.webarchive import create_warc_from_corpus
from urlserver.datasources import list_datasources
from cosrlib.indexer import Indexer
from cosrlib.config import config

# gevent pool (size 20) that generate_corpus() uses to map crawl() over the documents
request_pool = Pool(20)


def get_documents():
    """ Return the mapping of every URL to crawl: url => {"static_rank": rank}

    When TOP_ALEXA_DOMAINS is positive, the first TOP_ALEXA_DOMAINS rows of the
    "alexa_top1m" datasource are added to the module-level DOCUMENTS dict
    (in place) as "http://<domain>" entries before it is returned.
    """
    # Nothing to fetch: only the hardcoded documents are crawled
    if TOP_ALEXA_DOMAINS <= 0:
        return DOCUMENTS

    alexa = list_datasources()["alexa_top1m"]
    print("Fetching top %s domains from Alexa" % TOP_ALEXA_DOMAINS)

    for entry in itertools.islice(alexa.iter_dump(), TOP_ALEXA_DOMAINS):
        # Each row starts with the Alexa rank, followed by the domain name
        position = int(entry[0])
        homepage = "http://%s" % entry[1]

        # The static rank starts just under 0.5 and decreases linearly with the Alexa rank
        DOCUMENTS[homepage] = {"static_rank": 0.5 - (0.0001 * position)}

    return DOCUMENTS


def crawl(url):
    """ Fetch one URL over HTTP

    Returns a dict with the "content", "url" and "headers" of the response,
    or None when the request raised an exception or the final status code
    was not 200. Failures are reported on stdout and never raised.
    """

    print("Crawling %s..." % url)

    try:
        response = requests.get(url, timeout=30)
    except Exception as err:
        # Best effort: a single failing document must not abort the whole crawl
        print("[ERROR] %s failed: %s" % (url, err))
        return None

    if response.status_code == 200:
        return {
            "content": response.content,
            "url": url,  # the requested URL, which is also the key in DOCUMENTS
            "headers": response.headers
        }

    print("[WARNING] %s had status=%s" % (url, response.status_code))
    return None


def generate_corpus():
    """ Crawl all the documents and yield every successfully fetched one

    Results come back in completion order, not in DOCUMENTS order. Each yielded
    dict is the one returned by crawl(), with the document's static rank added
    under ["url_metadata_extra"]["url"]["rank"].
    """

    for page in request_pool.imap_unordered(crawl, get_documents().iterkeys()):

        # crawl() returns None for documents that could not be fetched
        if page is None:
            continue

        rank = DOCUMENTS[page["url"]]["static_rank"]
        page["url_metadata_extra"] = {"url": {"rank": rank}}
        yield page


# Command-line entry point. "--warc" takes precedence over "--index";
# with neither flag we print the usage and exit with status 1.
if "--warc" not in sys.argv and "--index" not in sys.argv:
    print("Usage: python build_devindex.py [--warc | --index]")
    sys.exit(1)

if "--warc" in sys.argv:

    # Crawl everything into <PATH_LOCALDATA>/devindex/crawl.warc
    devindex_dir = os.path.join(config["PATH_LOCALDATA"], "devindex")
    warc_file = os.path.join(devindex_dir, "crawl.warc")
    if not os.path.isdir(devindex_dir):
        os.makedirs(devindex_dir)

    create_warc_from_corpus(generate_corpus(), filename=warc_file)

    print("%s %s" % ("Created WARC file:", warc_file))

else:

    # Crawl everything straight into the index; "--empty" calls indexer.empty() first
    indexer = Indexer()
    if "--empty" in sys.argv:
        indexer.empty()

    docs = indexer.index_corpus(generate_corpus(), flush=True, refresh=True)
    print("Indexed %s documents." % len(docs))