Commit b0bbd1e

Adopt victoriadrake/hydra-link-checker v1.0.1
Much faster. No dependencies. Actively maintained. From https://github.com/victoriadrake/hydra-link-checker/blob/v1.0.1/hydra.py
1 parent 9c01e7b commit b0bbd1e

File tree: 3 files changed, +239 −11 lines

.travis.yml

Lines changed: 3 additions & 7 deletions
@@ -1,7 +1,3 @@
-language: node_js
-
-node_js: "10"
-
-script:
-- npm install spider.js@arschmitz/spider.js#master
-- node_modules/spider.js/bin/spider.js --url=https://code.jquery.com
+language: python
+python: "3.8"
+script: python3 hydra.py "https://code.jquery.com"
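
Note that the new config needs no install step: hydra.py is vendored on this branch and uses only the Python standard library, so the script line runs it directly against https://code.jquery.com.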

README.md

Lines changed: 3 additions & 4 deletions
@@ -1,7 +1,6 @@
-[![Build Status](https://travis-ci.com/jquery/codeorigin.jquery.com.svg?branch=spider-check)](https://travis-ci.com/jquery/codeorigin.jquery.com)
+[![Build Status](https://travis-ci.com/jquery/codeorigin.jquery.com.svg?branch=spider-check)](https://travis-ci.com/github/jquery/codeorigin.jquery.com/branches)
 
-codeorigin.jquery.com
+[code.jquery.com](https://code.jquery.com)
 =====================
 
-This branch takes care of running [spider.js](https://github.com/arschmitz/spider.js) on a regular
-basis via Travis CI to uncover broken links and such.
+This branch runs on a regular basis via Travis CI to uncover broken links and such.

hydra.py

Lines changed: 233 additions & 0 deletions
(new file)
import gzip
import sys
from concurrent import futures
from html.parser import HTMLParser
from http.client import IncompleteRead, InvalidURL
from queue import Queue, Empty
from socket import timeout as SocketTimeoutError
from urllib import error, parse, request


class Parser(HTMLParser):
    # Tags to check
    TAGS = ["a", "link", "img", "script"]
    # Valid attributes to check
    ATTRS = ["href", "src"]

    def __init__(self):
        super(Parser, self).__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag not in self.TAGS:
            return
        for a in attrs:
            if a[0] in self.ATTRS:
                self.links.append(a[1])

    def feed_me(self, data):
        self.links = []
        self.feed(data)
        return self.links

    def error(self, msg):
        return msg


def extract_domain(link):
    domain = parse.urlsplit(link).netloc
    return domain


class Checker:
    TO_PROCESS = Queue()
    # Maximum workers to run
    THREADS = 50
    # Maximum seconds to wait for HTTP response
    TIMEOUT = 60

    def __init__(self, url):
        self.broken = []
        self.domain = extract_domain(url)
        self.visited = set()
        self.mailto_links = list()
        self.pool = futures.ThreadPoolExecutor(max_workers=self.THREADS)
        self.report = ''

    def add_entry(self, code, reason, page):
        entry = {
            "code": code,
            "link": page["url"],
            "parent": page["parent"],
            "err": reason,
        }
        self.broken.append(entry)

    # Try to retrieve contents of a page and record result
    def load_url(self, page, timeout):
        # Store the link to be checked and its parent in the result
        result = {
            "url": page["url"],
            "parent": page["parent"],
            "data": "",
            "valid_content_type": False,
        }

        # Use GET as HEAD is frequently not allowed
        r = request.Request(
            page["url"],
            headers={
                "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0"
            },
        )

        try:
            http_response = request.urlopen(r, timeout=self.TIMEOUT)

            encoding = http_response.headers.get("Content-Encoding")
            if encoding and "gzip" in encoding:
                data = gzip.decompress(http_response.read()).decode(
                    encoding="utf-8", errors="ignore"
                )
            elif encoding is None:
                data = http_response.read().decode(encoding="utf-8", errors="ignore")
            else:
                # Support for other less common directives not handled
                raise NotImplementedError
            result["data"] = data

            content_type = http_response.headers.get("Content-Type")
            # Parenthesized so a missing Content-Type header is handled here;
            # without the grouping, `"text/plain" in content_type` raised
            # TypeError on None and the page was misreported as broken
            if content_type is not None and (
                "text/html" in content_type or "text/plain" in content_type
            ):
                valid_content_type = True
            else:
                valid_content_type = False
            result["valid_content_type"] = valid_content_type

        except error.HTTPError as e:
            code = e.getcode()
            reason = e.reason
            self.add_entry(code, reason, page)
            return
        # TimeoutError is caught here with the rest, so no separate
        # timeout handler is needed
        except (
            error.URLError,
            ConnectionRefusedError,
            ConnectionResetError,
            IncompleteRead,
            InvalidURL,
            NotImplementedError,
            SocketTimeoutError,
            TimeoutError,
            TypeError,
            UnicodeEncodeError,
            UnicodeDecodeError,
        ) as e:
            code = 0
            reason = e
            self.add_entry(code, reason, page)
            return

        return result

    def handle_future(self, result):
        if result.result():
            page = result.result()
            self.parse_page(page)

    # Get more links from successfully retrieved pages in the same domain
    def parse_page(self, page):
        if (
            self.domain == extract_domain(page["url"])
            and page["valid_content_type"]
        ):
            parent = page["url"]
            parser = Parser()
            links = parser.feed_me(page["data"])
            new_links = [x for x in links if x not in self.visited]
            full_links = [parse.urljoin(parent, l) for l in new_links]
            for l in full_links:
                if l not in self.visited:
                    li = {"parent": parent, "url": l}
                    self.TO_PROCESS.put(li)

    # Parse broken links list into YAML report
    def make_report(self):
        self.report = "---\ntitle: Broken Link Report"
        self.report += "\nchecked: " + str(len(self.visited))
        self.report += "\nnumber of email links: " + str(len(self.mailto_links))
        self.report += "\nemails: " + ", ".join([str(m) for m in set(self.mailto_links)])
        self.report += "\nbroken: " + str(len(self.broken))
        self.report += "\n---\n"
        sorted_list = sorted(self.broken, key=lambda k: k["code"], reverse=True)
        for link in sorted_list:
            self.report += f"\n- code: {link['code']}\n  url: {link['link']}\n  parent: {link['parent']}\n  error: {link['err']}\n"
        return self.report

    # Run crawler until TO_PROCESS queue is empty
    def run(self):
        while True:
            try:
                target_url = self.TO_PROCESS.get(block=True, timeout=4)
                if target_url["url"].startswith("mailto:"):
                    email = target_url["url"][len("mailto:"):]
                    self.mailto_links.append(email)

                elif target_url["url"] not in self.visited:
                    self.visited.add(target_url["url"])
                    job = self.pool.submit(self.load_url, target_url, self.TIMEOUT)
                    job.add_done_callback(self.handle_future)
            except Empty:
                return
            except Exception as e:
                print(e)


def main():
    if len(sys.argv) == 1:
        print("url missing as a sh parameter")
        sys.exit(1)

    url = sys.argv[1]
    first_url = {"parent": url, "url": url}

    check = Checker(url)
    check.TO_PROCESS.put(first_url)
    check.run()
    print(check.make_report())


if __name__ == "__main__":
    main()


# MIT License
#
# Copyright (c) 2020 Victoria Drake
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
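
For illustration, a minimal sketch of driving the crawler programmatically rather than via the command line. It mirrors main() above; the import and the example.com URL are assumptions for the sketch, not part of this commit.

# Sketch: programmatic use of hydra's Checker, mirroring main() above.
# Assumes hydra.py is importable; the URL is a placeholder.
from hydra import Checker

url = "https://example.com"
check = Checker(url)

# Seed the work queue with the root page (it is its own parent),
# then crawl until the queue has been empty for 4 seconds.
check.TO_PROCESS.put({"parent": url, "url": url})
check.run()

# make_report() returns a YAML document: counts of checked links,
# mailto: links, and broken links, sorted by status code descending.
print(check.make_report())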
