Skip to content

Commit 9af013f

Browse files
bartveneman and Bart Veneman authored
Add v2 endpoint to extract CSS with scraping (#48)
* quick to check if duped css is now gone
* Add v2 extract-css endpoint
* rm lru-cache from v1 endpoint
* Fix direct link to CSS file
* restore v1 api

Co-authored-by: Bart Veneman <bart.veneman@drukwerkdeal.nl>
1 parent d122d10 commit 9af013f

File tree

11 files changed

+840
-1710
lines changed

11 files changed

+840
-1710
lines changed

api/_chromium.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ export const extractCss = async url => {
1919
// Start CSS coverage. This is the meat and bones of this module
2020
await page.coverage.startCSSCoverage().catch(() => { })
2121

22-
url = normalizeUrl(url, {stripWWW: false})
22+
url = normalizeUrl(url, { stripWWW: false })
2323
let response
2424

2525
try {
@@ -74,7 +74,7 @@ export const extractCss = async url => {
7474
return {
7575
type: stylesheet.ownerNode.tagName.toLowerCase(),
7676
href: stylesheet.href || document.location.href,
77-
css: [...stylesheet.cssRules].map(({cssText}) => cssText).join('\n')
77+
css: [...stylesheet.cssRules].map(({ cssText }) => cssText).join('\n')
7878
}
7979
})
8080
})
@@ -105,7 +105,7 @@ export const extractCss = async url => {
105105

106106
const inlineCss = inlineCssRules
107107
.map(rule => `[x-extract-css-inline-style] { ${rule} }`)
108-
.map(css => ({type: 'inline', href: url, css}))
108+
.map(css => ({ type: 'inline', href: url, css }))
109109

110110
const links = coverage
111111
// Filter out the <style> tags that were found in the coverage

api/extract-css.js

Lines changed: 7 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,8 @@
1-
import LRU from 'lru-cache'
2-
import {extractCss} from './_chromium.js'
3-
import {isUrl} from './_is-url.js'
4-
5-
const cssCache = new LRU({
6-
max: 1000,
7-
maxAge: 60 * 1000 // 60 seconds
8-
})
1+
import { extractCss } from './_chromium.js'
2+
import { isUrl } from './_is-url.js'
93

104
export default async (req, res) => {
11-
const {url} = req.query
5+
const { url } = req.query
126

137
if (!isUrl(url)) {
148
res.statusCode = 400
@@ -18,33 +12,20 @@ export default async (req, res) => {
1812
})
1913
}
2014

21-
res.statusCode = 200
22-
23-
if (cssCache.has(url)) {
24-
const result = cssCache.get(url)
25-
26-
if (req.headers.accept === 'application/json') {
27-
return res.json(result)
28-
}
29-
30-
res.setHeader('Content-Type', 'text/css')
31-
const css = result.map(({css}) => css).join('\n')
32-
return res.end(css)
33-
}
34-
3515
try {
3616
const result = await extractCss(url)
37-
cssCache.set(url, result)
17+
18+
res.statusCode = 200
3819

3920
if (req.headers.accept === 'application/json') {
4021
return res.json(result)
4122
}
4223

4324
res.setHeader('Content-Type', 'text/css')
44-
const css = result.map(({css}) => css).join('\n')
25+
const css = result.map(({ css }) => css).join('\n')
4526
return res.end(css)
4627
} catch (error) {
4728
res.statusCode = 500
48-
return res.json({message: error.message})
29+
return res.json({ message: error.message })
4930
}
5031
}

api/v2/_extract-css-basic.js

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
import got from 'got'
2+
import { DOMParser } from 'linkedom'
3+
import _resolveUrl from '@jridgewell/resolve-uri'
4+
import parse from 'css-tree/parser'
5+
import walk from 'css-tree/walker'
6+
7+
/**
 * Resolve `url` against `base`, making sure a query string on `url` survives.
 *
 * The underlying resolver may drop the `?query` part of `url`, in which case
 * it is re-attached here. It is only re-attached when the resolved result
 * does not already contain a query, so a resolver that keeps the query does
 * not produce `file.css?v=1?v=1`.
 *
 * @param {string} url - URL to resolve; may be relative and may carry a query string
 * @param {string} base - URL that `url` is resolved against
 * @returns {string} The resolved URL, including the query string of `url` if it had one
 */
function resolveUrl(url, base) {
  var resolved = _resolveUrl(url, base)
  var queryStart = url.indexOf('?')

  // Only restore the query when the resolver actually lost it
  if (queryStart !== -1 && !resolved.includes('?')) {
    return resolved + url.substring(queryStart)
  }

  return resolved
}
18+
19+
/**
 * Collect the target of every `@import` rule in a piece of CSS.
 *
 * Both notations are recognised: `@import url(file.css)` and
 * `@import "file.css"`. Only the first URL or string of each `@import` is
 * taken, so a `url()` nested further down the same prelude is not mistaken
 * for a stylesheet.
 *
 * @param {string} css - Stylesheet source to scan
 * @returns {string[]} Import targets in source order, exactly as written (unresolved)
 */
function getImportUrls(css) {
  var ast = parse(css, {
    // At-rule preludes MUST be parsed, otherwise the prelude of `@import`
    // stays a single raw node and no Url/String node is ever visited.
    // NOTE: css-tree spells this option `parseAtrulePrelude` (lowercase "r").
    parseAtrulePrelude: true,
    parseRulePrelude: false,
    parseValue: false,
    parseCustomProperty: false,
  })
  var urls = []
  // @import rules whose target has already been collected
  var handled = new Set()

  walk(ast, function (node) {
    var atrule = this.atrule

    if (!atrule || atrule.name !== 'import' || handled.has(atrule)) {
      return
    }

    if (node.type === 'Url' || node.type === 'String') {
      handled.add(atrule)
      urls.push(node.value)
    }
  })

  return urls
}
35+
36+
/**
 * Download a stylesheet and return its text.
 *
 * This is best-effort: any failure is logged and an empty string is returned,
 * so a single unreachable stylesheet does not fail the whole extraction.
 *
 * @param {string} url - Absolute URL of the stylesheet
 * @returns {Promise<string>} The response body, or `''` when the request failed
 */
export async function getCssFile(url) {
  try {
    var { body } = await got(url)
    return body
  } catch (error) {
    // Network-level failures (DNS, timeout, refused connection) carry no
    // `response`, so the status code has to be read defensively.
    var statusCode = error.response?.statusCode
    var reason = statusCode === undefined ? 'no HTTP response' : `HTTP ${statusCode}`

    console.error(`CSS not found at ${url} (${reason})`)
    console.error(error.message)
    return ''
  }
}
46+
47+
/**
 * Parse an HTML string and select every node that can carry CSS:
 * stylesheet `<link>`s that have an `href`, `<style>` elements and any
 * element with a `style` attribute.
 *
 * @param {string} html - HTML source of the page
 * @returns The result of `querySelectorAll` on the parsed document
 */
function getStyleNodes(html) {
  const styleSelector = 'link[rel*="stylesheet"][href], style, [style]'
  const parser = new DOMParser()
  const parsedDocument = parser.parseFromString(html, 'text/html')

  return parsedDocument.querySelectorAll(styleSelector)
}
51+
52+
/**
 * Turn style-carrying DOM nodes into plain descriptor objects.
 *
 * - `LINK` nodes become `{ type: 'link', href, media, rel }`
 * - `STYLE` nodes become `{ type: 'style', css }`
 * - any node with a `style` attribute additionally yields
 *   `{ type: 'inline', css }`, with its declarations wrapped in a rule
 *
 * @param nodes - Iterable of DOM elements
 * @returns {Array<object>} Descriptors in document order
 */
export function getStyles(nodes) {
  const collected = []

  for (const element of nodes) {
    switch (element.nodeName) {
      case 'LINK':
        collected.push({
          type: 'link',
          href: element.getAttribute('href'),
          media: element.getAttribute('media'),
          rel: element.getAttribute('rel'),
        })
        break
      case 'STYLE':
        collected.push({ type: 'style', css: element.textContent })
        break
    }

    if (!element.hasAttribute('style')) {
      continue
    }

    const declarations = element.getAttribute('style')
    collected.push({
      type: 'inline',
      // using :where() to keep specificity 0 (but complexity += 2 here)
      css: `:where([x-inline]) { ${declarations} }`
    })
  }

  return collected
}
82+
83+
/**
 * Fetch `url` and collect all CSS it references, without running a browser.
 *
 * - If the response itself is a stylesheet, it is returned as one `file` item.
 * - Otherwise the HTML is scanned for `<link rel="stylesheet">`, `<style>`
 *   and `style=""` attributes. Linked stylesheets are downloaded.
 * - `@import` rules found in `<style>` blocks and in downloaded stylesheets
 *   are followed one level deep. Imports are resolved relative to the
 *   stylesheet that contains them (the page URL for `<style>` blocks).
 *
 * @param {string} url - Absolute URL of a page or stylesheet
 * @returns {Promise<Array<object>>} Items of type `file`, `link`, `style`,
 *   `inline` or `import`, each with a `css` string, in discovery order
 * @throws Whatever `got` rejects with when `url` itself cannot be fetched
 */
export async function extractCss(url) {
  var { body, headers } = await got(url)

  // Return early if our response was a CSS file already.
  // A response is not guaranteed to carry a Content-Type header.
  if (headers['content-type']?.includes('text/css')) {
    return [{
      type: 'file',
      href: url,
      css: body
    }]
  }

  var items = getStyles(getStyleNodes(body))
  var result = []

  for (const item of items) {
    // CSS text to scan for @import rules, and the URL they are relative to
    let css = item.css
    let importBase = url

    if (item.type === 'link') {
      importBase = resolveUrl(item.href, url)
      css = await getCssFile(importBase)
      result.push({
        ...item,
        css
      })
    } else if (item.type === 'inline' || item.type === 'style') {
      result.push(item)
    }

    if (item.type !== 'style' && item.type !== 'link') {
      continue
    }

    // Resolve @import CSS 1 level deep (to avoid infinite loops)
    // And c'mon, don't @import inside your @import.
    const importUrls = getImportUrls(css)
    if (importUrls.length === 0) {
      continue
    }

    const importedFiles = await Promise.all(
      importUrls.map(importUrl => getCssFile(resolveUrl(importUrl, importBase)))
    )

    importedFiles.forEach((importedCss, index) => {
      result.push({
        type: 'import',
        css: importedCss,
        href: importUrls[index]
      })
    })
  }

  return result
}

api/v2/extract-css.js

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import { isUrl } from '../_is-url.js'
2+
import { extractCss } from './_extract-css-basic.js'
3+
4+
/**
 * HTTP handler for the v2 `extract-css` endpoint.
 *
 * Reads the target from the `url` query parameter and responds with:
 * - 400 and a JSON message when `url` is not a valid URL
 * - 200 and the raw extraction result as JSON when the `Accept` header
 *   contains `application/json`
 * - 200 and all CSS joined by newlines as `text/css` otherwise (including
 *   when no `Accept` header was sent)
 * - 500 and a JSON message when extraction throws
 *
 * Successful responses are marked cacheable for 60 seconds.
 *
 * @param req - Incoming request; `req.query` and `req.headers` are read
 * @param res - Outgoing response
 */
export default async function handler(req, res) {
  const { url } = req.query

  if (!isUrl(url)) {
    res.statusCode = 400

    return res.send({
      message: `The provided URL \`${url}\` is not valid`
    })
  }

  try {
    const result = await extractCss(url)

    res.statusCode = 200
    res.setHeader('Cache-Control', 'max-age=60')

    // The Accept header is optional; a request without it gets plain CSS
    if (req.headers.accept?.includes('application/json')) {
      return res.json(result)
    }

    res.setHeader('Content-Type', 'text/css')
    const stylesheet = result.map(({ css }) => css).join('\n')
    return res.end(stylesheet)
  } catch (error) {
    res.statusCode = 500
    return res.json({ message: error.message })
  }
}

0 commit comments

Comments
 (0)