#!/usr/bin/env python3
"""
Scrape wiki.csswg.org and produce a static site.

This script crawls the CSS Working Group Wiki (DokuWiki-based) and generates
a static HTML archive suitable for hosting on GitHub Pages or any static host.

Usage:
    python3 bin/scrape.py [output_dir]

If output_dir is not specified, outputs to the current directory.
"""

import re
import sys
import time
import urllib.parse
import urllib.request
from html import escape as html_escape
from pathlib import Path

BASE_URL = "https://wiki.csswg.org"
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
DELAY = 0.3  # Seconds between requests (be nice to the server)

# <img src="/_media/path?params" ...>: prefix, media path, query, rest of tag.
_IMG_MEDIA_RE = re.compile(r'(<img[^>]*src=")/_media/([^"?]+)(\?[^"]*)?("[^>]*/?>)')
# href="/_detail/path?params": media path, query.
_DETAIL_HREF_RE = re.compile(r'href="/_detail/([^"?]+)(\?[^"]*)?"')
# <img src="...lib/exe/fetch.php?...">: prefix, whole fetch.php URL, rest of tag.
_IMG_FETCH_RE = re.compile(r'(<img[^>]*src=")([^"]*lib/exe/fetch\.php[^"]*)("[^>]*/?>)')
# href="...lib/exe/fetch.php?...": whole fetch.php URL.
_FETCH_HREF_RE = re.compile(r'href="([^"]*lib/exe/fetch\.php[^"]*)"')
# The media id inside a fetch.php query string.
_FETCH_MEDIA_PARAM_RE = re.compile(r'media=([^&"]+)')

_WIKIPAGE_RE = re.compile(
    r'<!-- wikipage start -->\s*(.*?)\s*<!-- wikipage stop -->', re.DOTALL
)
# NOTE(review): fallback used only when the wikipage markers are absent; the
# terminator alternatives are a best guess at the template - confirm.
_PAGE_DIV_RE = re.compile(
    r'<div class="page[^"]*"[^>]*>(.*?)</div>\s*(?:<div class="docInfo"|<hr\b|</body>)',
    re.DOTALL,
)
_H1_RE = re.compile(r'<h1[^>]*>([^<]+)</h1>')
_TITLE_RE = re.compile(r'<title>\s*([^<\[]+)')
_DIV_TAG_RE = re.compile(r'<div\b[^>]*>|</div\s*>', re.IGNORECASE)
_LEVEL_DIV_RE = re.compile(r'<div\s+class="level\d"\s*>')
_SECEDIT_DIV_RE = re.compile(r'<div\b[^>]*\bclass="[^"]*\bsecedit\b[^"]*"[^>]*>')
_INTERNAL_HREF_RE = re.compile(r'href="/([a-z][^"#?]*)(\?[^"#]*)?(#[^"]*)?"')


def _fetch_bytes(url):
    """Return the raw response body for url; network errors propagate."""
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(req, timeout=30) as resp:
        return resp.read()


def fetch(url):
    """Fetch a URL with proper headers.

    Returns the body decoded as UTF-8 (undecodable bytes replaced), or None
    if the request failed; the error is printed, not raised.
    """
    try:
        return _fetch_bytes(url).decode("utf-8", errors="replace")
    except Exception as e:  # deliberate best-effort: one bad page must not stop the crawl
        print(f" Error: {e}")
        return None


def fetch_binary(url):
    """Fetch binary content (images) from a URL.

    Returns the body as bytes, or None if the request failed; the error is
    printed, not raised.
    """
    try:
        return _fetch_bytes(url)
    except Exception as e:  # deliberate best-effort, see fetch()
        print(f" Error downloading {url}: {e}")
        return None


def _page_rel_root(page_path):
    """Return the relative prefix leading from a page's directory to the site root."""
    if page_path == "main":
        return ""
    return "../" * len(page_path.split("/"))


def _local_media_path(raw, namespaced=False):
    """Map a media path/id taken from page HTML to a path under assets/images.

    raw is percent-decoded; when namespaced is true, DokuWiki's ':' namespace
    separator is turned into '/'. Returns None for external media (the id is
    a full URL) and for paths that could escape the media directory.
    """
    path = urllib.parse.unquote(raw)
    if "://" in path or "\\" in path:
        return None
    if namespaced:
        path = path.replace(":", "/")
    path = path.lstrip("/")
    if not path or ".." in path.split("/"):
        return None
    return path


def download_media(content, output_dir, page_path):
    """Download referenced wiki media and rewrite the references to local paths.

    Handles <img src="/_media/...">, href="/_detail/...", and both <img src>
    and href forms of lib/exe/fetch.php?media=... URLs. Files are stored under
    output_dir/assets/images and fetched without resize parameters to get the
    full-size original. A reference is rewritten only when its file is present
    locally (already on disk or downloaded now); otherwise it is left
    unchanged.

    Args:
        content: HTML fragment of the page.
        output_dir: Path of the site root.
        page_path: wiki path of the page ("main" for the homepage); determines
            how many "../" segments the rewritten references need.

    Returns:
        The HTML fragment with rewritten references.
    """
    media_dir = output_dir / "assets" / "images"
    rel_root = _page_rel_root(page_path)
    available = {}  # local media path -> whether the file exists locally

    def localize(local_path, download_url):
        """Return the page-relative URL of local_path, or None if unavailable."""
        if local_path not in available:
            dest = media_dir / local_path
            present = dest.is_file()  # fetched for an earlier page or run
            if not present:
                time.sleep(DELAY)
                data = fetch_binary(download_url)
                if data:
                    dest.parent.mkdir(parents=True, exist_ok=True)
                    dest.write_bytes(data)
                    print(f" Downloaded: {local_path}")
                    present = True
            available[local_path] = present
        if not available[local_path]:
            return None
        return f"{rel_root}assets/images/{urllib.parse.quote(local_path)}"

    def localize_media_path(raw_path):
        """Localize a path as it appears after /_media/ or /_detail/."""
        local_path = _local_media_path(raw_path)
        if local_path is None:
            return None
        return localize(local_path, f"{BASE_URL}/_media/{raw_path}")

    def localize_fetch_url(fetch_url):
        """Localize the media named by a lib/exe/fetch.php URL."""
        media_m = _FETCH_MEDIA_PARAM_RE.search(fetch_url)
        if not media_m:
            return None
        local_path = _local_media_path(media_m.group(1), namespaced=True)
        if local_path is None:
            return None
        return localize(
            local_path, f"{BASE_URL}/_media/{urllib.parse.quote(local_path)}"
        )

    def rewrite_img(m):
        rel = localize_media_path(m.group(2))
        if rel is None:
            return m.group(0)  # leave unchanged if the file is unavailable
        # group(3), the resize/token query, is intentionally dropped
        return f"{m.group(1)}{rel}{m.group(4)}"

    def rewrite_detail_link(m):
        rel = localize_media_path(m.group(1))
        return m.group(0) if rel is None else f'href="{rel}"'

    def rewrite_fetch_img(m):
        rel = localize_fetch_url(m.group(2))
        if rel is None:
            return m.group(0)
        return f"{m.group(1)}{rel}{m.group(3)}"

    def rewrite_fetch_link(m):
        rel = localize_fetch_url(m.group(1))
        return m.group(0) if rel is None else f'href="{rel}"'

    content = _IMG_MEDIA_RE.sub(rewrite_img, content)
    content = _DETAIL_HREF_RE.sub(rewrite_detail_link, content)
    content = _IMG_FETCH_RE.sub(rewrite_fetch_img, content)
    content = _FETCH_HREF_RE.sub(rewrite_fetch_link, content)
    return content


def get_all_pages():
    """Discover all wiki pages by crawling the index recursively.

    Returns a sorted list of page paths (no leading slash).
    """
    pages = set()
    expanded = set()  # namespaces already crawled, so each index is fetched once

    def expand_namespace(ns, label=None):
        """Expand a DokuWiki namespace, recursing into sub-namespaces."""
        if ns in expanded:
            return
        expanded.add(ns)
        if label:
            print(f"Expanding: {label}")
        time.sleep(DELAY)
        idx_param = f"&idx={ns}" if ns else ""
        html = fetch(f"{BASE_URL}/?do=index{idx_param}")
        if not html:
            return

        # Collect page links
        for m in re.finditer(r'href="/([^"?#]+)"', html):
            p = m.group(1)
            if not p.startswith(('lib/', '_', 'feed')):
                pages.add(p)

        # Recurse into sub-namespaces (colons may be URL-encoded as %3A)
        if ns:
            sep = "(?::|%3A)"
            escaped_ns = sep.join(re.escape(part) for part in ns.split(":"))
            ns_pattern = rf'\?idx=({escaped_ns}{sep}[a-z0-9_.+-]+)'
        else:
            ns_pattern = r'\?idx=([a-z0-9_-]+)'
        for m in re.finditer(ns_pattern, html, re.IGNORECASE):
            sub_ns = urllib.parse.unquote(m.group(1))
            expand_namespace(sub_ns, label=sub_ns.replace(":", "/"))

    print("Fetching main index...")
    expand_namespace("")
    return sorted(pages)


# NOTE(review): only the text nodes (title, site name, breadcrumb, content)
# are certain; the surrounding markup is a minimal shell - confirm against the
# intended layout and stylesheet before publishing.
PAGE_TEMPLATE = '''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{title} - CSS Working Group Wiki (Archive)</title>
</head>
<body>
<header>
<h1><a href="{home_path}">CSS Working Group Wiki</a></h1>
</header>
{breadcrumb}
<main>
{content}
</main>
</body>
</html>
'''


def _clean_divs(content):
    """Drop edit-button divs entirely and unwrap layout-only "levelN" divs.

    Walks every <div>/</div> tag with a stack so that the closing tag removed
    is the one that belongs to the removed opening tag; nested divs that are
    kept (e.g. table wrappers) stay balanced.
    """
    out = []
    stack = []  # one entry per open div: "keep", "unwrap" or "remove"
    pos = 0  # start of the text not yet copied to out
    skip_depth = None  # stack depth at which a removed div was opened
    for m in _DIV_TAG_RE.finditer(content):
        tag = m.group(0)
        opening = not tag.startswith("</")
        if skip_depth is not None:
            # Inside a removed element: track nesting only, copy nothing.
            if opening:
                stack.append("keep")
            else:
                stack.pop()
                if len(stack) == skip_depth:
                    skip_depth = None
                    pos = m.end()
            continue
        if opening:
            if _SECEDIT_DIV_RE.fullmatch(tag):
                out.append(content[pos:m.start()])
                # Keep pos at the tag start: if the div is never closed, the
                # final append restores it rather than losing the tail.
                pos = m.start()
                skip_depth = len(stack)
                stack.append("remove")
                continue
            kind = "unwrap" if _LEVEL_DIV_RE.fullmatch(tag) else "keep"
            stack.append(kind)
        else:
            kind = stack.pop() if stack else "keep"
        if kind == "unwrap":
            out.append(content[pos:m.start()])
            pos = m.end()
    out.append(content[pos:])
    return "".join(out)


def extract_content(html, page_path):
    """Extract the main content from a DokuWiki page.

    Args:
        html: full HTML of the fetched wiki page.
        page_path: wiki path of the page, used as the title of last resort.

    Returns:
        (title, content): the page title and the cleaned HTML fragment. When
        no content can be located, content is a short placeholder paragraph.
    """
    # Find content between wikipage start/stop comments, else the page div
    m = _WIKIPAGE_RE.search(html) or _PAGE_DIV_RE.search(html)
    content = m.group(1) if m else "<p>Content could not be extracted.</p>"

    # Extract title from first h1 or page title
    title_m = _H1_RE.search(content) or _TITLE_RE.search(html)
    title = title_m.group(1).strip() if title_m else ""
    if not title:
        title = html_escape(page_path)

    # Remove edit section buttons and div wrappers that are just for layout
    content = _clean_divs(content)
    # Remove section edit IDs
    content = re.sub(r' id="[^"]*sectionedit[^"]*"', '', content)
    content = re.sub(r' class="sectionedit\d+"', '', content)
    # Remove wiki-specific link classes
    content = re.sub(r' class="wikilink\d?"', '', content)
    content = re.sub(r' data-wiki-id="[^"]*"', '', content)
    # External link class cleanup
    content = re.sub(r' class="urlextern"', '', content)
    content = re.sub(r' rel="ugc nofollow"', ' rel="noopener"', content)
    return title, content.strip()


def make_breadcrumb(page_path, home_path):
    """Generate breadcrumb navigation.

    Returns an empty string for the homepage; otherwise a <nav> with a Home
    link, one link per parent path segment, and the page's own segment as
    plain text.
    """
    if page_path in ('main', ''):
        return ''
    parts = page_path.split('/')
    crumbs = [f'<a href="{home_path}">Home</a>']
    for i, part in enumerate(parts[:-1]):
        # Each page lives in its own directory, so parent i is this many levels up.
        path = "../" * (len(parts) - i - 1)
        crumbs.append(f'<a href="{path}">{html_escape(part)}</a>')
    crumbs.append(html_escape(parts[-1]))
    # NOTE(review): wrapper element and separator are a best guess - confirm.
    return f'<nav class="breadcrumb">{" &raquo; ".join(crumbs)}</nav>'


def fix_internal_links(content, home_path):
    """Convert absolute wiki links to relative paths.

    href="/page" becomes href="{home_path}page/"; query string and fragment
    are preserved. Paths not starting with a lowercase letter are left alone.
    """
    def fix_link(m):
        path = m.group(1)
        query = m.group(2) or ""
        fragment = m.group(3) or ""
        if not path.endswith("/"):
            path += "/"
        return f'href="{home_path}{path}{query}{fragment}"'

    return _INTERNAL_HREF_RE.sub(fix_link, content)


def save_page(output_dir, page_path, html):
    """Process a fetched page and write it as index.html.

    The homepage ("main") is written to output_dir/index.html; every other
    page to output_dir/<page_path>/index.html.
    """
    title, content = extract_content(html, page_path)

    if page_path == 'main':
        home_path = "./"
        out_path = output_dir / 'index.html'
    else:
        home_path = _page_rel_root(page_path)
        out_dir = output_dir / page_path
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = out_dir / 'index.html'

    # Media references must be localized before the generic link rewrite.
    content = download_media(content, output_dir, page_path)
    breadcrumb = make_breadcrumb(page_path, home_path)
    content = fix_internal_links(content, home_path)

    output = PAGE_TEMPLATE.format(
        title=title,
        content=content,
        breadcrumb=breadcrumb,
        home_path=home_path,
    )
    out_path.write_text(output, encoding='utf-8')


def main():
    """Crawl the wiki index and write every page under the output directory."""
    output_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
    output_dir.mkdir(parents=True, exist_ok=True)

    pages = get_all_pages()
    print(f"\nFound {len(pages)} pages. Starting download...\n")

    # Always include 'main' as the homepage
    if 'main' not in pages:
        pages = ['main'] + list(pages)

    for i, page in enumerate(pages, start=1):
        print(f"[{i}/{len(pages)}] {page}")
        time.sleep(DELAY)
        html = fetch(f"{BASE_URL}/{page}")
        if html:
            save_page(output_dir, page, html)

    print(f"\nDone! Static site written to {output_dir}")
    print(f"Preview with: python3 -m http.server -d {output_dir}")


if __name__ == "__main__":
    main()