#!/usr/bin/env python3
"""
Scrape wiki.csswg.org and produce a static site.
This script crawls the CSS Working Group Wiki (DokuWiki-based) and generates
a static HTML archive suitable for hosting on GitHub Pages or any static host.
Usage:
python3 bin/scrape.py [output_dir]
If output_dir is not specified, outputs to the current directory.
"""
import re
import sys
import time
import urllib.parse
import urllib.request
from pathlib import Path
# Root URL of the wiki being scraped; index and media URLs are built from it.
BASE_URL = "https://wiki.csswg.org"
# User-Agent header value sent with every request made by fetch() and fetch_binary().
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
DELAY = 0.3 # Seconds between requests (be nice to the server)
def fetch(url):
    """Return the body of *url* as text, or None when the request fails.

    The request carries the module's User-Agent header and times out after
    30 seconds. The response is decoded as UTF-8, with undecodable bytes
    replaced rather than raising. Any exception is reported on stdout and
    turned into a None result.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            raw = response.read()
            return raw.decode("utf-8", errors="replace")
    except Exception as exc:
        print(f" Error: {exc}")
    return None
def fetch_binary(url):
    """Return the raw bytes served at *url*, or None when the request fails.

    The request carries the module's User-Agent header and times out after
    30 seconds. Any exception is reported on stdout (together with the URL)
    and turned into a None result.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = response.read()
    except Exception as exc:
        print(f" Error downloading {url}: {exc}")
        return None
    return payload
def download_media(content, output_dir, page_path):
    """Download wiki media referenced by *content* and rewrite it to local paths.

    Four kinds of reference are handled, in this order:

    1. ``<img ... src="/_media/PATH?...">`` - the file is downloaded (without
       the query string, so the full-size original is fetched) and ``src`` is
       rewritten to the local copy.
    2. ``href="/_detail/PATH?..."`` - rewritten to the local image path.
    3. ``<img ... src="...lib/exe/fetch.php?...media=ID...">`` - the media id
       is converted to a path, downloaded and ``src`` is rewritten.
    4. ``href="...lib/exe/fetch.php?...media=ID..."`` - rewritten to the local
       path.

    Only the ``<img>`` cases trigger a download; when that download fails the
    tag is left exactly as it was, for every occurrence on the page.

    Args:
        content: HTML of the page.
        output_dir: Root of the generated site (a ``pathlib.Path``); files are
            written below ``output_dir / "assets" / "images"``.
        page_path: Wiki path of the page. ``"main"`` is treated as living at
            the site root; any other path is treated as sitting one directory
            deep per ``/``-separated segment.

    Returns:
        The HTML with the references above rewritten.
    """
    media_dir = output_dir / "assets" / "images"

    # Relative prefix leading from the page to assets/images/.
    if page_path == "main":
        asset_prefix = "assets/images/"
    else:
        asset_prefix = "../" * len(page_path.split("/")) + "assets/images/"

    # media path -> True when the download succeeded, False when it failed.
    # Remembering failures keeps repeated references consistent (all left
    # untouched) and avoids retrying the same broken URL.
    fetched = {}

    def ensure_local(media_path):
        """Download *media_path* at most once; return True if a local copy was written."""
        if media_path not in fetched:
            time.sleep(DELAY)
            data = fetch_binary(f"{BASE_URL}/_media/{media_path}")
            if data:
                dest = media_dir / media_path
                # Create directories only when there is something to store.
                dest.parent.mkdir(parents=True, exist_ok=True)
                dest.write_bytes(data)
                print(f" Downloaded: {media_path}")
            fetched[media_path] = bool(data)
        return fetched[media_path]

    def fetch_media_path(fetch_url):
        """Return the media path named by a fetch.php URL, or None if it has no media parameter."""
        media_m = re.search(r'media=([^&"]+)', fetch_url)
        if not media_m:
            return None
        # DokuWiki uses ":" as namespace separator; it may arrive URL-encoded.
        return re.sub(r':|%3A', '/', media_m.group(1), flags=re.IGNORECASE)

    def rewrite_img(m):
        # Groups: 1 = tag text up to the src value, 2 = path after /_media/,
        # 3 = optional query string (dropped), 4 = rest of the tag.
        media_path = m.group(2)
        if not ensure_local(media_path):
            return m.group(0)  # leave unchanged if download fails
        return f'{m.group(1)}{asset_prefix}{media_path}{m.group(4)}'

    def rewrite_detail_link(m):
        # Query parameters (group 2) are dropped for the local path.
        return f'href="{asset_prefix}{m.group(1)}"'

    def rewrite_fetch_img(m):
        # Groups: 1 = tag text up to the src value, 2 = full fetch.php URL,
        # 3 = rest of the tag.
        media_path = fetch_media_path(m.group(2))
        if media_path is None or not ensure_local(media_path):
            return m.group(0)
        return f'{m.group(1)}{asset_prefix}{media_path}{m.group(3)}'

    def rewrite_fetch_link(m):
        media_path = fetch_media_path(m.group(1))
        if media_path is None:
            return m.group(0)
        return f'href="{asset_prefix}{media_path}"'

    # Rewrite src="/_media/..." in img tags
    content = re.sub(
        r'(<img[^>]*src=")/_media/([^"?]+)(\?[^"]*)?("[^>]*/?>)',
        rewrite_img,
        content,
    )
    # Also rewrite href="/_detail/..." links to point to the image
    content = re.sub(
        r'href="/_detail/([^"?]+)(\?[^"]*)?"',
        rewrite_detail_link,
        content,
    )
    # Handle lib/exe/fetch.php image URLs; the whole URL is captured so the
    # media parameter can be extracted from it.
    content = re.sub(
        r'(<img[^>]*src=")([^"]*lib/exe/fetch\.php[^"]*)("[^>]*/?>)',
        rewrite_fetch_img,
        content,
    )
    # Also rewrite fetch.php href links
    content = re.sub(
        r'href="([^"]*lib/exe/fetch\.php[^"]*)"',
        rewrite_fetch_link,
        content,
    )
    return content
def get_all_pages():
    """Discover all wiki pages by crawling the index recursively.

    Starts at the root index (``?do=index``) and follows every namespace
    link (``?idx=...``) found there, collecting page links on the way.

    Returns:
        A sorted list of page paths: the part of each collected ``href``
        after the leading ``/``.
    """
    pages = set()
    # Namespaces already expanded. Guards against fetching the same index
    # page again when a namespace link appears more than once, and against
    # unbounded recursion should the links ever form a cycle.
    visited = set()

    def expand_namespace(ns, label=None):
        """Expand a DokuWiki namespace, recursing into sub-namespaces."""
        if ns in visited:
            return
        visited.add(ns)
        if label:
            print(f"Expanding: {label}")
        time.sleep(DELAY)
        idx_param = f"&idx={ns}" if ns else ""
        html = fetch(f"{BASE_URL}/?do=index{idx_param}")
        if not html:
            return
        # Collect page links. The pattern already excludes "?" and "#", so
        # only the prefix filter is needed to drop library, media and feed URLs.
        for m in re.finditer(r'href="/([^"?#]+)"', html):
            p = m.group(1)
            if not p.startswith(('lib/', '_', 'feed')):
                pages.add(p)
        # Recurse into sub-namespaces (colons may be URL-encoded as %3A).
        # NOTE(review): the top-level pattern does not accept "." or "+",
        # unlike the nested one - confirm this difference is intended.
        if ns:
            escaped_ns = re.escape(ns).replace(":", "(?::|%3A)")
            ns_pattern = rf'\?idx=({escaped_ns}(?::|%3A)[a-z0-9_.+-]+)'
        else:
            ns_pattern = r'\?idx=([a-z0-9_-]+)'
        for m in re.finditer(ns_pattern, html, re.IGNORECASE):
            sub_ns = urllib.parse.unquote(m.group(1))
            expand_namespace(sub_ns, label=sub_ns.replace(":", "/"))

    print("Fetching main index...")
    expand_namespace("")
    return sorted(pages)
PAGE_TEMPLATE = '''
Content could not be extracted.
" # Extract title from first h1 or page title title_m = re.search(r'\s*', '', content) content = re.sub(r'\s*