import chalk from "chalk"; import cheerio from "cheerio"; import fetch from "node-fetch"; function getPageUrl(url) { if (typeof url === "string") url = new URL(url); // Use the URL w/out the hash. return new URL(url.origin + url.pathname + url.search); } const pageCache = {}; function createPage(url) { let pageUrl = getPageUrl(url); let cachedPage = pageCache[pageUrl]; if (cachedPage) return cachedPage; let page = { url: pageUrl, anchors: null, links: null, checkedLinks: [], brokenLinks: [] }; pageCache[page.url] = page; return page; } async function getPageAnchorsAndLinks(page) { if (page.anchors && page.links) return; let res; try { res = await fetch(page.url.toString()); } catch (error) { console.error(error.message); return null; } if (res.status !== 200) { throw new Error(`${res.status} error fetching URL: ${page.url}`); } let $ = cheerio.load(await res.text()); page.anchors = []; page.links = []; // GH puts all user-generated markdown in a
let isGitHubMarkdown = false; let selectorContext = undefined; if (page.url.hostname === "github.com") { let $markdownBody = $(".markdown-body"); if ($markdownBody.length > 0) { isGitHubMarkdown = true; selectorContext = ".markdown-body"; } } $("[id]", selectorContext).each((index, a) => { let id = $(a).attr("id"); // GH prefixes the ids of links that point to themselves with the // string "user-content-", so you end up with links like // // GH makes these links work by using JavaScript to adjust the page's scroll // position when the URL fragment id matches an anchor with the "user-content-" // prefix, so just treat this link as if it had the correct id to begin with if (isGitHubMarkdown && id.startsWith("user-content-")) { id = id.replace(/^user-content-/, ""); } page.anchors.push({ id, text: $(a).text() }); }); $("a[href]", selectorContext).each((index, a) => { let to = new URL($(a).attr("href"), page.url); page.links.push({ to, text: $(a).text() }); }); } function defaultShouldCheckPage(url) { return true; } function defaultShouldCheckLink(url) { return true; } async function checkPageLinks(page, options = {}, checkedPages = []) { let { shouldCheckPage = defaultShouldCheckPage, shouldCheckLink = defaultShouldCheckLink } = options; console.log(`Checking ${page.url} ...`); checkedPages.push(page); await getPageAnchorsAndLinks(page); for (let link of page.links) { if (!shouldCheckLink(link)) continue; page.checkedLinks.push(link); // Make sure the link points to a valid page. let linkedPage = createPage(link.to); try { await getPageAnchorsAndLinks(linkedPage); } catch (error) { page.brokenLinks.push(link); continue; } // Make sure the link points to a valid anchor on that page. if (link.to.hash) { let id = link.to.hash.slice(1); let anchor = linkedPage.anchors.find(a => a.id === id); if (anchor == null) { page.brokenLinks.push(link); } } // Check the page it links to. if (!checkedPages.includes(linkedPage) && shouldCheckPage(linkedPage)) { await checkPageLinks(linkedPage, options, checkedPages); } } return checkedPages; } const startPage = createPage( "https://github.com/ReactTraining/react-router/tree/dev/docs" ); checkPageLinks(startPage, { shouldCheckPage(page) { return ( page.url.hostname === "github.com" && /^\/ReactTraining\/react-router\/(tree|blob)\/dev\/docs/i.test( page.url.pathname ) ); }, shouldCheckLink(link) { return link.to.hash !== "#TODO"; } }).then(checkedPages => { checkedPages.forEach(page => { let { url, checkedLinks, brokenLinks } = page; if (brokenLinks.length === 0) { console.log( chalk.green( `Found 0 broken links at ${url} (out of ${checkedLinks.length} total)` ) ); } else { console.log( chalk.red( `Found ${brokenLinks.length} broken link${ brokenLinks.length === 1 ? "" : "s" } at ${url} (out of ${checkedLinks.length} total):` ) ); brokenLinks.forEach(link => { console.log(" " + link.to); }); } }); });