Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion mincss/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import collections
import random
import re
from urlparse import urlparse
from urlparse import urlparse, urlunparse
from lxml import etree
from lxml.cssselect import CSSSelector, SelectorSyntaxError, ExpressionError
import urllib
Expand Down Expand Up @@ -45,6 +45,7 @@ def __init__(self, debug=False):
self.inlines = []
self.links = []
self._bodies = []
self.url_queue = []

def _download(self, url):
try:
Expand Down Expand Up @@ -91,6 +92,9 @@ def process(self, *urls):

def process_url(self, url):
    """
    fetch `url` through self._download, remember it in self.url_queue
    (once only) and pass the stripped markup on to self.process_html
    """
    page_source = self._download(url)
    # membership is tested after the download, so a failing download
    # leaves the queue untouched
    already_queued = url in self.url_queue
    if not already_queued:
        self.url_queue.append(url)

    stripped_source = page_source.strip()
    self.process_html(stripped_source, url=url)

def process_html(self, html, url):
Expand Down Expand Up @@ -122,6 +126,16 @@ def process_html(self, html, url):
key = (link_url, link.attrib['href'])
self.blocks[key] = self._download(link_url)

for anchor in CSSSelector('a')(page):
link_url = self._pre_process_link(anchor.attrib['href'])
if link_url:
link_url = self._make_absolute_url(url, link_url)
if link_url \
and self._is_same_host_url(url, link_url) \
and link_url not in self.url_queue:
self.url_queue.append(link_url)
self.process_url(link_url)

def _process_content(self, content, bodies):
# Find all of the unique media queries

Expand Down Expand Up @@ -334,6 +348,23 @@ def _make_absolute_url(self, url, href):
path = '/'.join(parts)
return parsed.scheme + '://' + parsed.netloc + path

def _pre_process_link(self, link_url):
"""
should we process this link?
1. preprocess link_url and removes fragments and query parameters
2. remove javascript and mailto stuff from link_url
"""
if link_url.startswith('mailto:') \
or link_url.startswith('javascript:'):
return None
parsed = urlparse(link_url)
return urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params, '', ''))

def _is_same_host_url(self, url1, url2):
"""
given 2 absolute urls, test whether both have same host
"""
return (urlparse(url1).netloc == urlparse(url2).netloc)

class _Result(object):
def __init__(self, before, after):
Expand Down