Adopt victoriadrake/hydra-link-checker v1.0.1

Krinkle · Krinkle · commit b0bbd1ecbd7e · 2020-08-31T01:46:25.000+01:00
Much faster. No dependencies. Actively maintained. From https://github.com/victoriadrake/hydra-link-checker/blob/v1.0.1/hydra.py
diff --git a/.travis.yml b/.travis.yml
@@ -1,7 +1,3 @@
-language: node_js
-
-node_js: "10"
-
-script:
-- npm install spider.js@arschmitz/spider.js#master
-- node_modules/spider.js/bin/spider.js --url=https://code.jquery.com
+language: python
+python: "3.8"
+script: python3 hydra.py "https://code.jquery.com"
diff --git a/README.md b/README.md
@@ -1,7 +1,6 @@
-[![Build Status](https://travis-ci.com/jquery/codeorigin.jquery.com.svg?branch=spider-check)](https://travis-ci.com/jquery/codeorigin.jquery.com)
+[![Build Status](https://travis-ci.com/jquery/codeorigin.jquery.com.svg?branch=spider-check)](https://travis-ci.com/github/jquery/codeorigin.jquery.com/branches)
 
-codeorigin.jquery.com
+[code.jquery.com](https://code.jquery.com)
 =====================
 
-This branch takes care of running [spider.js](https://github.com/arschmitz/spider.js) on a regular
-basis via Travis CI to uncover broken links and such.
+This branch runs [hydra-link-checker](https://github.com/victoriadrake/hydra-link-checker) on a regular basis via Travis CI to uncover broken links and such.
diff --git a/hydra.py b/hydra.py
@@ -0,0 +1,233 @@
+import gzip
+import sys
+from concurrent import futures
+from html.parser import HTMLParser
+from http.client import IncompleteRead, InvalidURL
+from queue import Queue, Empty
+from socket import timeout as SocketTimeoutError
+from urllib import error, parse, request
+
+
+class Parser(HTMLParser):
+    """HTML parser that gathers the href/src values of link-bearing tags."""
+
+    # Tags to check
+    TAGS = ["a", "link", "img", "script"]
+    # Valid attributes to check
+    ATTRS = ["href", "src"]
+
+    def __init__(self):
+        super().__init__()
+        # Attribute values collected by the most recent feed_me() call.
+        self.links = []
+
+    def handle_starttag(self, tag, attrs):
+        """Record every href/src value found on a tag listed in TAGS."""
+        if tag in self.TAGS:
+            # A valueless attribute (e.g. <a href>) contributes None.
+            self.links.extend(
+                value for name, value in attrs if name in self.ATTRS
+            )
+
+    def feed_me(self, data):
+        """Parse *data* and return the list of link values found in it."""
+        self.links = []
+        self.feed(data)
+        return self.links
+
+    def error(self, msg):
+        """Return the parser error message instead of raising."""
+        return msg
+
+
+def extract_domain(link):
+    """Return the network-location (netloc) component of *link*."""
+    return parse.urlsplit(link).netloc
+
+
+class Checker:
+    """Crawl outward from a start URL and collect the links that fail to load.
+
+    Links are fetched concurrently on a thread pool. A fetched page is only
+    parsed for further links when it is on the same domain as the start URL
+    and was served as ``text/html`` or ``text/plain``.
+    """
+
+    # Work queue of {"parent": ..., "url": ...} dicts waiting to be fetched.
+    # It is defined on the class, so all Checker instances share one queue.
+    TO_PROCESS = Queue()
+    # Maximum workers to run
+    THREADS = 50
+    # Maximum seconds to wait for HTTP response
+    TIMEOUT = 60
+
+    def __init__(self, url):
+        """Set up empty crawl state for the site that *url* belongs to."""
+        self.broken = []
+        self.domain = extract_domain(url)
+        self.visited = set()
+        self.mailto_links = []
+        self.pool = futures.ThreadPoolExecutor(max_workers=self.THREADS)
+        self.report = ''
+
+    def add_entry(self, code, reason, page):
+        """Append a broken-link record for *page* to ``self.broken``.
+
+        Args:
+            code: numeric status recorded for the failure.
+            reason: HTTP reason phrase or the exception that was raised.
+            page: dict holding the failing "url" and its "parent" page.
+        """
+        self.broken.append(
+            {
+                "code": code,
+                "link": page["url"],
+                "parent": page["parent"],
+                "err": reason,
+            }
+        )
+
+    # Try to retrieve contents of a page and record result
+    def load_url(self, page, timeout):
+        """Fetch ``page["url"]`` and return its decoded body, or record a failure.
+
+        Args:
+            page: dict with the "url" to fetch and the "parent" that linked it.
+            timeout: maximum seconds to wait for the HTTP response.
+
+        Returns:
+            A dict with "url", "parent", "data" (the decoded body) and
+            "valid_content_type" (True for text/html or text/plain), or None
+            when the fetch failed and an entry was added to ``self.broken``.
+        """
+        # Store the link to be checked and its parent in the result
+        result = {
+            "url": page["url"],
+            "parent": page["parent"],
+            "data": "",
+            "valid_content_type": False,
+        }
+        timeouts = (SocketTimeoutError, TimeoutError)
+
+        try:
+            # Built inside the try block: Request() raises ValueError for a
+            # URL without a scheme, and that link must end up in the report.
+            # Use GET as HEAD is frequently not allowed
+            r = request.Request(
+                page["url"],
+                headers={
+                    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0"
+                }
+            )
+
+            # The context manager closes the response even if reading fails.
+            with request.urlopen(r, timeout=timeout) as http_response:
+                encoding = http_response.headers.get("Content-Encoding")
+                content_type = http_response.headers.get("Content-Type")
+                if encoding and "gzip" in encoding:
+                    raw = gzip.decompress(http_response.read())
+                elif encoding is None:
+                    raw = http_response.read()
+                else:
+                    # Support for other less common directives not handled
+                    raise NotImplementedError
+
+            result["data"] = raw.decode(encoding="utf-8", errors="ignore")
+            # Parenthesised so that a response without a Content-Type header
+            # is treated as "not parseable" instead of raising TypeError.
+            result["valid_content_type"] = content_type is not None and (
+                "text/html" in content_type or "text/plain" in content_type
+            )
+
+        except error.HTTPError as e:
+            self.add_entry(e.getcode(), e.reason, page)
+            return None
+        except timeouts as e:
+            # Must precede the generic handler below, which would otherwise
+            # swallow timeouts and report them with code 0.
+            self.add_entry(408, e, page)
+            return None
+        except (
+                error.URLError,
+                ConnectionRefusedError,
+                ConnectionResetError,
+                EOFError,  # truncated gzip body
+                IncompleteRead,
+                InvalidURL,
+                NotImplementedError,
+                OSError,  # includes malformed gzip data
+                TypeError,
+                UnicodeEncodeError,
+                UnicodeDecodeError,
+                ValueError,  # e.g. URL without a scheme
+        ) as e:
+            # urllib wraps timeouts hit while sending the request in URLError;
+            # report those as timeouts as well.
+            timed_out = isinstance(getattr(e, "reason", None), timeouts)
+            self.add_entry(408 if timed_out else 0, e, page)
+            return None
+
+        return result
+
+    def handle_future(self, result):
+        """Done-callback for a fetch job: parse the page if one was returned.
+
+        Args:
+            result: the finished future whose value is load_url()'s return.
+        """
+        page = result.result()
+        if page:
+            self.parse_page(page)
+
+    # Get more links from successfully retrieved pages in the same domain
+    def parse_page(self, page):
+        """Queue every not-yet-visited link found on a same-domain page.
+
+        Pages on another domain, or not served as HTML/plain text, are left
+        unparsed.
+        """
+        if (
+                self.domain != extract_domain(page["url"])
+                or not page["valid_content_type"]
+        ):
+            return
+
+        parent = page["url"]
+        for raw_link in Parser().feed_me(page["data"]):
+            if raw_link in self.visited:
+                continue
+            try:
+                full_link = parse.urljoin(parent, raw_link)
+            except ValueError as e:
+                # A malformed link must not abort the rest of the page;
+                # report it as broken and carry on.
+                self.add_entry(0, e, {"url": raw_link, "parent": parent})
+                continue
+            if full_link not in self.visited:
+                self.TO_PROCESS.put({"parent": parent, "url": full_link})
+
+    # Parse broken links list into YAML report
+    def make_report(self):
+        """Build, store in ``self.report`` and return the text report.
+
+        Broken links are listed by descending status code. E-mail addresses
+        are de-duplicated and sorted so the report is reproducible.
+        """
+        emails = ", ".join(sorted({str(m) for m in self.mailto_links}))
+        header = (
+            "---\ntitle: Broken Link Report"
+            f"\nchecked: {len(self.visited)}"
+            f"\nnumber of email links: {len(self.mailto_links)}"
+            f"\nemails: {emails}"
+            f"\nbroken: {len(self.broken)}"
+            "\n---\n"
+        )
+        sorted_list = sorted(self.broken, key=lambda k: k["code"], reverse=True)
+        entries = [
+            f"\n- code:    {link['code']}\n  url:     {link['link']}\n  parent:  {link['parent']}\n  error:   {link['err']}\n"
+            for link in sorted_list
+        ]
+        self.report = header + "".join(entries)
+        return self.report
+
+    # Run crawler until TO_PROCESS queue is empty
+    def run(self):
+        """Process queued links until the queue is empty and no fetch is running.
+
+        An idle queue alone is not enough to stop: a fetch may take up to
+        TIMEOUT seconds, and its page can still add new links afterwards.
+        """
+        submitted = 0
+        # Receives one token per fetch job once its callback has finished,
+        # i.e. after any links found on that page have been queued.
+        completed = Queue()
+
+        def on_done(job):
+            try:
+                self.handle_future(job)
+            finally:
+                completed.put(job)
+
+        while True:
+            try:
+                target_url = self.TO_PROCESS.get(block=True, timeout=4)
+            except Empty:
+                # With every job accounted for there are no producers left,
+                # so an empty queue now really means the crawl is finished.
+                if completed.qsize() >= submitted and self.TO_PROCESS.empty():
+                    return
+                continue
+
+            try:
+                url = target_url["url"]
+                if url.startswith("mailto:"):
+                    self.mailto_links.append(url[len("mailto:"):])
+                elif url not in self.visited:
+                    self.visited.add(url)
+                    job = self.pool.submit(self.load_url, target_url, self.TIMEOUT)
+                    submitted += 1
+                    job.add_done_callback(on_done)
+            except Exception as e:
+                # Best effort: a bad queue item must not stop the crawl.
+                print(e)
+
+
+def main():
+    """Crawl the URL given as the first command-line argument and print the report."""
+    if len(sys.argv) == 1:
+        print("url missing as a sh parameter")
+        sys.exit(1)
+
+    start = sys.argv[1]
+
+    crawler = Checker(start)
+    # Seed the queue with the start page, recorded as its own parent.
+    crawler.TO_PROCESS.put({"parent": start, "url": start})
+    crawler.run()
+
+    report = crawler.make_report()
+    print(report)
+
+
+# Run the crawler only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
+
+
+# MIT License
+#
+# Copyright (c) 2020 Victoria Drake
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.