From a08663df3f94a14d4a1c03429649f5ad2e430b7c Mon Sep 17 00:00:00 2001
From: Varialus
Date: Wed, 18 Apr 2012 18:28:20 -0600
Subject: [PATCH 001/208] Fixed case sensitive matching on lxml stable, but
patched for external cssselect, on Windows with Python 2.7 64-bit.
---
cssselect/xpath.py | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/cssselect/xpath.py b/cssselect/xpath.py
index 23a165c..aa8d376 100644
--- a/cssselect/xpath.py
+++ b/cssselect/xpath.py
@@ -249,7 +249,14 @@ def xpath_hash(self, id_selector):
def xpath_element(self, selector):
"""Translate a type or universal selector."""
if selector.namespace == '*':
- element = selector.element.lower()
+ # Fixed case sensitive matching on lxml 2.3.4 patched for external cssselect with Python 2.7 64-bit on Windows.
+ # Case insensitive matching is not working unless source elements are lower case.
+ # For HTMLTranslator, I kept the existing behavior of setting the element to lower case.
+ # "...in HTML, element names are case-insensitive, but in XML they are case-sensitive."
+ # http://www.w3.org/TR/CSS2/selector.html#pattern-matching
+ element = selector.element
+ if isinstance(self, HTMLTranslator):
+ element = element.lower()
else:
# FIXME: Should we lowercase here?
element = '%s:%s' % (selector.namespace, selector.element)
From 35a2f57fefcbc92ecc56719eda59f82b00fc1238 Mon Sep 17 00:00:00 2001
From: Simon Sapin
Date: Thu, 19 Apr 2012 15:40:46 +0200
Subject: [PATCH 002/208] Fix case-sensitivity issues
* (Functional) pseudo-classes are always case-insensitive
* Add the 'xhtml' flag
* Element names and attribute names are case-insensitive
for HTML, but not XHTML or XML.
---
CHANGES | 9 +++++
cssselect/__init__.py | 2 +-
cssselect/parser.py | 5 +--
cssselect/tests.py | 80 ++++++++++++++++++++++++-------------------
cssselect/xpath.py | 80 ++++++++++++++++++++++++++++++++-----------
docs/index.rst | 3 --
6 files changed, 117 insertions(+), 62 deletions(-)
diff --git a/CHANGES b/CHANGES
index 4583cef..1ffae2d 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,15 @@
Changelog
=========
+Version 0.5
+-----------
+
+Not released yet.
+
+* Fix case sensitivity issues.
+* Add the ``xhtml`` parameter for :class:`HTMLTranslator`.
+
+
Version 0.4
-----------
diff --git a/cssselect/__init__.py b/cssselect/__init__.py
index 3129a42..4e044f0 100644
--- a/cssselect/__init__.py
+++ b/cssselect/__init__.py
@@ -17,5 +17,5 @@
from cssselect.xpath import GenericTranslator, HTMLTranslator, ExpressionError
-VERSION = '0.4'
+VERSION = '0.5'
__version__ = VERSION
diff --git a/cssselect/parser.py b/cssselect/parser.py
index f6b42c8..11ff6be 100644
--- a/cssselect/parser.py
+++ b/cssselect/parser.py
@@ -376,7 +376,8 @@ def parse_simple_selector(stream, inside_negation=False):
if stream.peek() == '(':
stream.next()
stream.skip_whitespace()
- if ident == 'not':
+ is_negation = ident.lower() == 'not'
+ if is_negation:
if inside_negation:
raise SelectorSyntaxError('Got nested :not()')
argument, argument_pseudo_element = parse_simple_selector(
@@ -396,7 +397,7 @@ def parse_simple_selector(stream, inside_negation=False):
if not next == ')':
raise SelectorSyntaxError(
"Expected ')', got '%s'" % next)
- if ident == 'not':
+ if is_negation:
result = Negation(result, argument)
else:
result = Function(result, ident, argument)
diff --git a/cssselect/tests.py b/cssselect/tests.py
index 086f01f..2ee1ef9 100755
--- a/cssselect/tests.py
+++ b/cssselect/tests.py
@@ -284,72 +284,72 @@ def xpath(css):
return str(GenericTranslator().css_to_xpath(css, prefix=''))
assert xpath('*') == "*"
- assert xpath('E') == "e"
- assert xpath('E[foo]') == "e[@foo]"
- assert xpath('E[foo="bar"]') == "e[@foo = 'bar']"
- assert xpath('E[foo~="bar"]') == (
+ assert xpath('e') == "e"
+ assert xpath('e[foo]') == "e[@foo]"
+ assert xpath('e[foo="bar"]') == "e[@foo = 'bar']"
+ assert xpath('e[foo~="bar"]') == (
"e[@foo and contains("
"concat(' ', normalize-space(@foo), ' '), ' bar ')]")
- assert xpath('E[foo^="bar"]') == (
+ assert xpath('e[foo^="bar"]') == (
"e[@foo and starts-with(@foo, 'bar')]")
- assert xpath('E[foo$="bar"]') == (
+ assert xpath('e[foo$="bar"]') == (
"e[@foo and substring(@foo, string-length(@foo)-2) = 'bar']")
- assert xpath('E[foo*="bar"]') == (
+ assert xpath('e[foo*="bar"]') == (
"e[@foo and contains(@foo, 'bar')]")
- assert xpath('E[hreflang|="en"]') == (
+ assert xpath('e[hreflang|="en"]') == (
"e[@hreflang and ("
"@hreflang = 'en' or starts-with(@hreflang, 'en-'))]")
- assert xpath('E:nth-child(1)') == (
+ assert xpath('e:nth-child(1)') == (
"*/*[name() = 'e' and (position() = 1)]")
- assert xpath('E:nth-last-child(1)') == (
+ assert xpath('e:nth-last-child(1)') == (
"*/*[name() = 'e' and (position() = last() - 1)]")
- assert xpath('E:nth-last-child(2n+2)') == (
+ assert xpath('e:nth-last-child(2n+2)') == (
"*/*[name() = 'e' and ("
"(position() +2) mod -2 = 0 and position() < (last() -2))]")
- assert xpath('E:nth-of-type(1)') == (
+ assert xpath('e:nth-of-type(1)') == (
"*/e[position() = 1]")
- assert xpath('E:nth-last-of-type(1)') == (
+ assert xpath('e:nth-last-of-type(1)') == (
"*/e[position() = last() - 1]")
- assert xpath('E:nth-last-of-type(1)') == (
+ assert xpath('e:nth-last-of-type(1)') == (
"*/e[position() = last() - 1]")
- assert xpath('div E:nth-last-of-type(1) .aclass') == (
+ assert xpath('div e:nth-last-of-type(1) .aclass') == (
"div/descendant-or-self::*/e[position() = last() - 1]"
"/descendant-or-self::*/*[@class and contains("
"concat(' ', normalize-space(@class), ' '), ' aclass ')]")
- assert xpath('E:first-child') == (
+ assert xpath('e:first-child') == (
"*/*[name() = 'e' and (position() = 1)]")
- assert xpath('E:last-child') == (
+ assert xpath('e:last-child') == (
"*/*[name() = 'e' and (position() = last())]")
- assert xpath('E:first-of-type') == (
+ assert xpath('e:first-of-type') == (
"*/e[position() = 1]")
- assert xpath('E:last-of-type') == (
+ assert xpath('e:last-of-type') == (
"*/e[position() = last()]")
- assert xpath('E:only-child') == (
+ assert xpath('e:only-child') == (
"*/*[name() = 'e' and (last() = 1)]")
- assert xpath('E:only-of-type') == (
+ assert xpath('e:only-of-type') == (
"e[last() = 1]")
- assert xpath('E:empty') == (
+ assert xpath('e:empty') == (
"e[not(*) and not(normalize-space())]")
- assert xpath('E:root') == (
+ assert xpath('e:root') == (
"e[not(parent::*)]")
- assert xpath('E:contains("foo")') == (
+ assert xpath('e:contains("foo")') == (
"e[contains(string(.), 'foo')]")
- assert xpath('E:contains(foo)') == (
+ assert xpath('e:contains(foo)') == (
"e[contains(string(.), 'foo')]")
- assert xpath('E.warning') == (
+ assert xpath('e.warning') == (
"e[@class and contains("
"concat(' ', normalize-space(@class), ' '), ' warning ')]")
- assert xpath('E#myid') == (
+ assert xpath('e#myid') == (
"e[@id = 'myid']")
- assert xpath('E:not(:nth-child(odd))') == (
+ assert xpath('e:not(:nth-child(odd))') == (
"e[not((position() -1) mod 2 = 0 and position() >= 1)]")
- assert xpath('E F') == (
+ assert xpath('e f') == (
"e/descendant-or-self::*/f")
- assert xpath('E > F') == (
+ assert xpath('e > f') == (
"e/f")
- assert xpath('E + F') == (
+ assert xpath('e + f') == (
"e/following-sibling::*[name() = 'f' and (position() = 1)]")
- assert xpath('E ~ F') == (
+ assert xpath('e ~ f') == (
"e/following-sibling::f")
assert xpath('div#container p') == (
"div[@id = 'container']/descendant-or-self::*/p")
@@ -426,12 +426,17 @@ def pcss(main, *selectors, **kwargs):
return result
all_ids = pcss('*')
+ assert len(all_ids) == 27
assert all_ids[:4] == ['html', 'nil', 'nil', 'outer-div']
assert all_ids[-1:] == ['foobar-span']
assert pcss('div') == ['outer-div', 'li-div', 'foobar-div']
+ assert pcss('DIV', html_only=True) == [
+ 'outer-div', 'li-div', 'foobar-div'] # case-insensitive in HTML
assert pcss('div div') == ['li-div']
assert pcss('div, div div') == ['outer-div', 'li-div', 'foobar-div']
assert pcss('a[name]') == ['name-anchor']
+ assert pcss('a[NAme]', html_only=True) == [
+ 'name-anchor'] # case-insensitive in HTML:
assert pcss('a[rel]') == ['tag-anchor', 'nofollow-anchor']
assert pcss('a[rel="tag"]') == ['tag-anchor']
assert pcss('a[href*="localhost"]') == ['tag-anchor']
@@ -441,7 +446,7 @@ def pcss(main, *selectors, **kwargs):
assert pcss('div[foobar~="bc"]', 'div[foobar~="cde"]') == [
'foobar-div']
assert pcss('div[foobar~="cd"]') == []
- assert pcss('*[lang|="en"]', '*[lang|="en-US"]') == ['second-li']
+ assert pcss('*[lang|="en"]', '[lang|="en-US"]') == ['second-li']
assert pcss('*[lang|="e"]') == []
assert pcss('li:nth-child(3)') == ['third-li']
assert pcss('li:nth-child(10)') == []
@@ -471,12 +476,12 @@ def pcss(main, *selectors, **kwargs):
self.assertRaises(ExpressionError, pcss, 'p *:only-of-type')
self.assertRaises(ExpressionError, pcss, 'p:lang(fr)')
assert pcss('p:only-of-type') == ['paragraph']
- assert pcss('a:empty') == ['name-anchor']
+ assert pcss('a:empty', 'a:EMpty') == ['name-anchor']
assert pcss('li:empty') == [
'third-li', 'fourth-li', 'fifth-li', 'sixth-li', 'seventh-li']
assert pcss(':root', 'html:root') == ['html']
assert pcss('li:root', '* :root') == []
- assert pcss('*:contains("link")') == [
+ assert pcss('*:contains("link")', ':CONtains("link")') == [
'html', 'nil', 'outer-div', 'tag-anchor', 'nofollow-anchor']
assert pcss('*:contains("LInk")') == [] # case sensitive
assert pcss('*:contains("e")') == [
@@ -488,7 +493,6 @@ def pcss(main, *selectors, **kwargs):
assert pcss('ol *.c', 'ol li.c', 'li ~ li.c', 'ol > li.c') == [
'third-li', 'fourth-li']
assert pcss('#first-li', 'li#first-li', '*#first-li') == ['first-li']
- # Need some tests of :not()']
assert pcss('li div', 'li > div', 'div div') == ['li-div']
assert pcss('div > div') == []
assert pcss('div>.c', 'div > .c') == ['first-ol']
@@ -507,6 +511,10 @@ def pcss(main, *selectors, **kwargs):
'fieldset', 'checkbox-disabled']
assert pcss(':enabled', html_only=True) == [
'checkbox-unchecked', 'checkbox-checked']
+ assert pcss('a:not([href])') == ['name-anchor']
+ assert pcss('ol :Not(li[class])') == [
+ 'first-li', 'second-li', 'li-div',
+ 'fifth-li', 'sixth-li', 'seventh-li']
def test_select_shakespeare(self):
document = html.document_fromstring(HTML_SHAKESPEARE)
diff --git a/cssselect/xpath.py b/cssselect/xpath.py
index aa8d376..e31b037 100644
--- a/cssselect/xpath.py
+++ b/cssselect/xpath.py
@@ -93,6 +93,10 @@ def join(self, combiner, other):
class GenericTranslator(object):
"""
Translator for "generic" XML documents.
+
+ Everything is case-sensitive, no assumption is made on the meaning
+ of element names and attribute names.
+
"""
combinator_mapping = {
' ': 'descendant',
@@ -116,6 +120,24 @@ class GenericTranslator(object):
#: http://www.w3.org/TR/selectors/#id-selectors
id_attribute = 'id'
+ #: The case sensitivity of document language element names,
+ #: attribute names, and attribute values in selectors depends
+ #: on the document language.
+ #: http://www.w3.org/TR/selectors/#casesens
+ #:
+ #: When a document language defines one of these as case-insensitive,
+ #: cssselect assumes that the document parser makes the parsed values
+ #: lower-case. Making the selector lower-case too makes the comparison
+ #: case-insensitive.
+ #:
+ #: In HTML, element names and attribute names (but not attribute values)
+ #: are case-insensitive. All of lxml.html, html5lib, BeautifulSoup4
+ #: and HTMLParser make them lower-case in their parse result, so
+ #: the assumption holds.
+ lower_case_element_names = False
+ lower_case_attribute_names = False
+ lower_case_attribute_values = False
+
def css_to_xpath(self, css, prefix='descendant-or-self::'):
"""Translate a *group of selectors* to XPath.
@@ -201,7 +223,7 @@ def xpath_negation(self, negation):
def xpath_function(self, function):
"""Translate a functional pseudo-class."""
- method = 'xpath_%s_function' % function.name.replace('-', '_')
+ method = 'xpath_%s_function' % function.name.replace('-', '_').lower()
method = getattr(self, method, None)
if not method:
raise ExpressionError(
@@ -210,7 +232,7 @@ def xpath_function(self, function):
def xpath_pseudo(self, pseudo):
"""Translate a pseudo-class."""
- method = 'xpath_%s_pseudo' % pseudo.ident.replace('-', '_')
+ method = 'xpath_%s_pseudo' % pseudo.ident.replace('-', '_').lower()
method = getattr(self, method, None)
if not method:
# TODO: better error message for pseudo-elements?
@@ -226,12 +248,19 @@ def xpath_attrib(self, selector):
raise ExpressionError(
"Unknown attribute operator: %r" % selector.operator)
method = getattr(self, 'xpath_attrib_%s' % operator)
- # FIXME: what if attrib is *?
+ if self.lower_case_attribute_names:
+ name = selector.attrib.lower()
+ else:
+ name = selector.attrib
if selector.namespace == '*':
- name = '@' + selector.attrib
+ name = '@' + name
+ else:
+ name = '@%s:%s' % (selector.namespace, name)
+ if self.lower_case_attribute_values:
+ value = selector.value.lower()
else:
- name = '@%s:%s' % (selector.namespace, selector.attrib)
- return method(self.xpath(selector.selector), name, selector.value)
+ value = selector.value
+ return method(self.xpath(selector.selector), name, value)
def xpath_class(self, class_selector):
"""Translate a class selector."""
@@ -243,23 +272,18 @@ def xpath_class(self, class_selector):
def xpath_hash(self, id_selector):
"""Translate an ID selector."""
xpath = self.xpath(id_selector.selector)
- return xpath.add_condition('@%s = %s' % (
- self.id_attribute, self.xpath_literal(id_selector.id)))
+ return self.xpath_attrib_equals(xpath, '@id', id_selector.id)
def xpath_element(self, selector):
"""Translate a type or universal selector."""
- if selector.namespace == '*':
- # Fixed case sensitive matching on lxml 2.3.4 patched for external cssselect with Python 2.7 64-bit on Windows.
- # Case insensitive matching is not working unless source elements are lower case.
- # For HTMLTranslator, I kept the existing behavior of setting the element to lower case.
- # "...in HTML, element names are case-insensitive, but in XML they are case-sensitive."
- # http://www.w3.org/TR/CSS2/selector.html#pattern-matching
- element = selector.element
- if isinstance(self, HTMLTranslator):
- element = element.lower()
+ if self.lower_case_element_names:
+ element = selector.element.lower()
else:
- # FIXME: Should we lowercase here?
- element = '%s:%s' % (selector.namespace, selector.element)
+ element = selector.element
+ if selector.namespace != '*':
+ # Namespace prefixes are case-sensitive.
+ # http://www.w3.org/TR/css3-namespace/#prefixes
+ element = '%s:%s' % (selector.namespace, element)
return XPathExpr(element=element)
@@ -465,8 +489,24 @@ def xpath_attrib_substringmatch(self, xpath, name, value):
class HTMLTranslator(GenericTranslator):
"""
- Translator for HTML documents.
+ Translator for (X)HTML documents.
+
+ Has a more useful implementation of some pseudo-classes, based on
+ HTML-specific element names and attribute names.
+ The API is the same as :class:`GenericTranslator`.
+
+ :param xhtml:
+ If false (the default), element names and attribute names
+ are case-insensitive.
+
"""
+ def __init__(self, xhtml=False):
+ self.xhtml = xhtml # Might be useful for sub-classes?
+ if not xhtml:
+ # See their definition in GenericTranslator.
+ self.lower_case_element_names = True
+ self.lower_case_attribute_names = True
+
def xpath_checked_pseudo(self, xpath):
# FIXME: is this really all the elements?
return xpath.add_condition(
diff --git a/docs/index.rst b/docs/index.rst
index 0c060fc..4aed4c2 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -57,9 +57,6 @@ selectors. For example, ``div, h1.title + p`` is a group of 2 selectors.
:members: css_to_xpath, selector_to_xpath
.. autoclass:: HTMLTranslator
-
- The API is the same as :class:`GenericTranslator`.
-
.. autoexception:: SelectorError
.. autoexception:: SelectorSyntaxError
.. autoexception:: ExpressionError
From 95e655dfcf4454ab9140ff2decd9c1ad28c38d70 Mon Sep 17 00:00:00 2001
From: Simon Sapin
Date: Thu, 19 Apr 2012 19:51:13 +0200
Subject: [PATCH 003/208] Reduce nesting level in the parser.
---
cssselect/parser.py | 53 +++++++++++++++++++++------------------------
1 file changed, 25 insertions(+), 28 deletions(-)
diff --git a/cssselect/parser.py b/cssselect/parser.py
index 11ff6be..31b086b 100644
--- a/cssselect/parser.py
+++ b/cssselect/parser.py
@@ -373,36 +373,33 @@ def parse_simple_selector(stream, inside_negation=False):
# Any new pseudo-element must have two.
pseudo_element = ident
continue
- if stream.peek() == '(':
- stream.next()
- stream.skip_whitespace()
- is_negation = ident.lower() == 'not'
- if is_negation:
- if inside_negation:
- raise SelectorSyntaxError('Got nested :not()')
- argument, argument_pseudo_element = parse_simple_selector(
- stream, inside_negation=True)
- if argument_pseudo_element:
- raise SelectorSyntaxError(
- 'Pseudo-elements are not allowed inside :not()')
- else:
- peek = stream.peek()
- if isinstance(peek, (Symbol, String)):
- argument = stream.next()
- else:
- raise SelectorSyntaxError(
- "Expected argument, got '%s'" % peek)
- stream.skip_whitespace()
- next = stream.next()
- if not next == ')':
+ if stream.peek() != '(':
+ result = Pseudo(result, ident)
+ continue
+ stream.next()
+ stream.skip_whitespace()
+ if ident.lower() == 'not':
+ if inside_negation:
+ raise SelectorSyntaxError('Got nested :not()')
+ argument, argument_pseudo_element = parse_simple_selector(
+ stream, inside_negation=True)
+ if argument_pseudo_element:
raise SelectorSyntaxError(
- "Expected ')', got '%s'" % next)
- if is_negation:
- result = Negation(result, argument)
- else:
- result = Function(result, ident, argument)
+ 'Pseudo-elements are not allowed inside :not()')
+ result = Negation(result, argument)
else:
- result = Pseudo(result, ident)
+ peek = stream.peek()
+ if isinstance(peek, (Symbol, String)):
+ argument = stream.next()
+ else:
+ raise SelectorSyntaxError(
+ "Expected argument, got '%s'" % peek)
+ result = Function(result, ident, argument)
+ stream.skip_whitespace()
+ next = stream.next()
+ if not next == ')':
+ raise SelectorSyntaxError(
+ "Expected ')', got '%s'" % next)
continue
else:
raise SelectorSyntaxError(
From c6137ceed4e596c313f18a46d9b194fa73a8270d Mon Sep 17 00:00:00 2001
From: Simon Sapin
Date: Thu, 19 Apr 2012 19:54:51 +0200
Subject: [PATCH 004/208] Aesthetics.
---
docs/index.rst | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/docs/index.rst b/docs/index.rst
index 4aed4c2..079d2d7 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -45,7 +45,7 @@ User API
========
In CSS3 terms, a `group of selectors`_ is a sequence of comma-separated
-selectors. For example, ``div, h1.title + p`` is a group of 2 selectors.
+selectors. For example, ``div, h1.title + p`` is a group of 2 selectors.
.. _group of selectors: http://www.w3.org/TR/selectors/#grouping
@@ -57,6 +57,10 @@ selectors. For example, ``div, h1.title + p`` is a group of 2 selectors.
:members: css_to_xpath, selector_to_xpath
.. autoclass:: HTMLTranslator
+
+Exceptions
+----------
+
.. autoexception:: SelectorError
.. autoexception:: SelectorSyntaxError
.. autoexception:: ExpressionError
From 3d8fd09aac1578497de0bec7b9fdc164f88af62c Mon Sep 17 00:00:00 2001
From: Simon Sapin
Date: Fri, 20 Apr 2012 11:07:25 +0200
Subject: [PATCH 005/208] Implement :link per the HTML5 spec.
---
CHANGES | 5 ++++-
cssselect/tests.py | 35 +++++++++++++++++++++++++----------
cssselect/xpath.py | 10 +++++++---
3 files changed, 36 insertions(+), 14 deletions(-)
diff --git a/CHANGES b/CHANGES
index 1ffae2d..f7d0c00 100644
--- a/CHANGES
+++ b/CHANGES
@@ -7,7 +7,10 @@ Version 0.5
Not released yet.
* Fix case sensitivity issues.
-* Add the ``xhtml`` parameter for :class:`HTMLTranslator`.
+* Implement :class:`HTMLTranslator` based on the `HTML5 specification`_
+ rather than guessing; add the ``xhtml`` parameter.
+
+.. _HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors
Version 0.4
diff --git a/cssselect/tests.py b/cssselect/tests.py
index 2ee1ef9..b60acf0 100755
--- a/cssselect/tests.py
+++ b/cssselect/tests.py
@@ -20,7 +20,7 @@
import operator
import unittest
-from lxml import html
+from lxml import etree, html
from cssselect import (parse, GenericTranslator, HTMLTranslator,
SelectorSyntaxError, ExpressionError)
from cssselect.parser import tokenize, parse_series
@@ -401,7 +401,7 @@ def test_series(self):
assert parse_series('5') == (0, 5)
def test_select(self):
- document = html.document_fromstring(HTML_IDS)
+ document = etree.fromstring(HTML_IDS)
sort_key = dict(
(el, count) for count, el in enumerate(document.getiterator())
).__getitem__
@@ -426,8 +426,9 @@ def pcss(main, *selectors, **kwargs):
return result
all_ids = pcss('*')
- assert len(all_ids) == 27
- assert all_ids[:4] == ['html', 'nil', 'nil', 'outer-div']
+ assert len(all_ids) == 32
+ assert all_ids[:6] == [
+ 'html', 'nil', 'link-href', 'link-nohref', 'nil', 'outer-div']
assert all_ids[-1:] == ['foobar-span']
assert pcss('div') == ['outer-div', 'li-div', 'foobar-div']
assert pcss('DIV', html_only=True) == [
@@ -503,9 +504,16 @@ def pcss(main, *selectors, **kwargs):
assert pcss('ol#first-ol *:last-child') == ['li-div', 'seventh-li']
assert pcss('#outer-div:first-child') == ['outer-div']
assert pcss('#outer-div :first-child') == [
- 'name-anchor', 'first-li', 'li-div', 'p-b', 'checkbox-disabled']
+ 'name-anchor', 'first-li', 'li-div', 'p-b', 'checkbox-disabled',
+ 'area-href']
assert pcss('a[href]') == ['tag-anchor', 'nofollow-anchor']
- assert pcss(':link', html_only=True) == pcss('a[href]')
+
+
+ assert pcss(':link', html_only=True) == [
+ 'link-href', 'tag-anchor', 'nofollow-anchor', 'area-href']
+ assert pcss(':visited', html_only=True) == []
+
+
assert pcss(':checked', html_only=True) == ['checkbox-checked']
assert pcss(':disabled', html_only=True) == [
'fieldset', 'checkbox-disabled']
@@ -590,7 +598,10 @@ def count(selector):
assert count('div[class~=dialog]') == 51 # ? Seems right
HTML_IDS = '''
-
+
+
+
+
diff --git a/cssselect/xpath.py b/cssselect/xpath.py
index e31b037..0d8a3f6 100644
--- a/cssselect/xpath.py
+++ b/cssselect/xpath.py
@@ -491,10 +491,13 @@ class HTMLTranslator(GenericTranslator):
"""
Translator for (X)HTML documents.
- Has a more useful implementation of some pseudo-classes, based on
- HTML-specific element names and attribute names.
+ Has a more useful implementation of some pseudo-classes based on
+ HTML-specific element names and attribute names, as described in
+ the `HTML5 specification`_. It assumes no-quirks mode.
The API is the same as :class:`GenericTranslator`.
+ .. _HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors
+
:param xhtml:
If false (the default), element names and attribute names
are case-insensitive.
@@ -514,7 +517,8 @@ def xpath_checked_pseudo(self, xpath):
"(@checked and name(.) = 'input')")
def xpath_link_pseudo(self, xpath):
- return xpath.add_condition("@href and name(.) = 'a'")
+ return xpath.add_condition("@href and "
+ "(name(.) = 'a' or name(.) = 'link' or name(.) = 'area')")
# Links are never visited, the implementation for :visited is the same
# as in GenericTranslator
From 1c12e2aa0b57984f8889fb8de617d5461eb88221 Mon Sep 17 00:00:00 2001
From: Simon Sapin
Date: Fri, 20 Apr 2012 11:44:46 +0200
Subject: [PATCH 006/208] Implement :enabled/:disabled per the HTML5 spec.
(Almost, see #6)
---
cssselect/tests.py | 34 ++++++++++++++++++----------------
cssselect/xpath.py | 31 +++++++++++++++++++++----------
2 files changed, 39 insertions(+), 26 deletions(-)
diff --git a/cssselect/tests.py b/cssselect/tests.py
index b60acf0..79d5f19 100755
--- a/cssselect/tests.py
+++ b/cssselect/tests.py
@@ -426,7 +426,6 @@ def pcss(main, *selectors, **kwargs):
return result
all_ids = pcss('*')
- assert len(all_ids) == 32
assert all_ids[:6] == [
'html', 'nil', 'link-href', 'link-nohref', 'nil', 'outer-div']
assert all_ids[-1:] == ['foobar-span']
@@ -472,8 +471,7 @@ def pcss(main, *selectors, **kwargs):
assert pcss('ol:nth-last-of-type(1)') == ['first-ol']
assert pcss('span:only-child') == ['foobar-span']
assert pcss('li div:only-child') == ['li-div']
- assert pcss('div *:only-child') == [
- 'li-div', 'checkbox-disabled', 'foobar-span']
+ assert pcss('div *:only-child') == ['li-div', 'foobar-span']
self.assertRaises(ExpressionError, pcss, 'p *:only-of-type')
self.assertRaises(ExpressionError, pcss, 'p:lang(fr)')
assert pcss('p:only-of-type') == ['paragraph']
@@ -504,25 +502,25 @@ def pcss(main, *selectors, **kwargs):
assert pcss('ol#first-ol *:last-child') == ['li-div', 'seventh-li']
assert pcss('#outer-div:first-child') == ['outer-div']
assert pcss('#outer-div :first-child') == [
- 'name-anchor', 'first-li', 'li-div', 'p-b', 'checkbox-disabled',
- 'area-href']
+ 'name-anchor', 'first-li', 'li-div', 'p-b',
+ 'checkbox-fieldset-disabled', 'area-href']
assert pcss('a[href]') == ['tag-anchor', 'nofollow-anchor']
+ assert pcss('a:not([href])') == ['name-anchor']
+ assert pcss('ol :Not(li[class])') == [
+ 'first-li', 'second-li', 'li-div',
+ 'fifth-li', 'sixth-li', 'seventh-li']
-
+ # HTML-specific
assert pcss(':link', html_only=True) == [
'link-href', 'tag-anchor', 'nofollow-anchor', 'area-href']
assert pcss(':visited', html_only=True) == []
-
+ assert pcss(':enabled', html_only=True) == [
+ 'link-href', 'tag-anchor', 'nofollow-anchor',
+ 'checkbox-unchecked', 'checkbox-checked', 'area-href']
+ assert pcss(':disabled', html_only=True) == [
+ 'checkbox-disabled', 'fieldset', 'checkbox-fieldset-disabled']
assert pcss(':checked', html_only=True) == ['checkbox-checked']
- assert pcss(':disabled', html_only=True) == [
- 'fieldset', 'checkbox-disabled']
- assert pcss(':enabled', html_only=True) == [
- 'checkbox-unchecked', 'checkbox-checked']
- assert pcss('a:not([href])') == ['name-anchor']
- assert pcss('ol :Not(li[class])') == [
- 'first-li', 'second-li', 'li-div',
- 'fifth-li', 'sixth-li', 'seventh-li']
def test_select_shakespeare(self):
document = html.document_fromstring(HTML_SHAKESPEARE)
@@ -624,9 +622,13 @@ def count(selector):
hi there
guy
+
+
+
diff --git a/cssselect/xpath.py b/cssselect/xpath.py
index 0d8a3f6..5e25cea 100644
--- a/cssselect/xpath.py
+++ b/cssselect/xpath.py
@@ -529,11 +529,10 @@ def xpath_disabled_pseudo(self, xpath):
(
@disabled and
(
- name(.) = 'input' or
+ (name(.) = 'input' and @type != 'hidden') or
name(.) = 'button' or
name(.) = 'select' or
name(.) = 'textarea' or
- name(.) = 'keygen' or
name(.) = 'command' or
name(.) = 'fieldset' or
name(.) = 'optgroup' or
@@ -541,11 +540,10 @@ def xpath_disabled_pseudo(self, xpath):
)
) or (
(
- name(.) = 'input' or
+ (name(.) = 'input' and @type != 'hidden') or
name(.) = 'button' or
name(.) = 'select' or
- name(.) = 'textarea' or
- name(.) = 'keygen'
+ name(.) = 'textarea'
)
and ancestor::fieldset[@disabled]
)
@@ -557,23 +555,36 @@ def xpath_enabled_pseudo(self, xpath):
# http://www.w3.org/TR/html5/section-index.html#attributes-1
return xpath.add_condition('''
(
+ @href and (
+ name(.) = 'a' or
+ name(.) = 'link' or
+ name(.) = 'area'
+ )
+ ) or (
(
name(.) = 'command' or
name(.) = 'fieldset' or
- name(.) = 'optgroup' or
- name(.) = 'option'
+ name(.) = 'optgroup'
)
and not(@disabled)
) or (
(
- name(.) = 'input' or
+ (name(.) = 'input' and @type != 'hidden') or
name(.) = 'button' or
name(.) = 'select' or
name(.) = 'textarea' or
name(.) = 'keygen'
)
and not (@disabled or ancestor::fieldset[@disabled])
+ ) or (
+ name(.) = 'option' and not(
+ @disabled or ancestor::optgroup[@disabled]
+ )
)
''')
- # FIXME: in the second half, add "and is not a descendant of that
- # fieldset element's first legend element child, if any."
+ # FIXME: ... or "li elements that are children of menu elements,
+ # and that have a child element that defines a command, if the first
+ # such element's Disabled State facet is false (not disabled)".
+ # FIXME: after ancestor::fieldset[@disabled], add "and is not a
+ # descendant of that fieldset element's first legend element child,
+ # if any."
From de70d4a74caac9ba72f7aa1598a4e16b21aa53b6 Mon Sep 17 00:00:00 2001
From: Simon Sapin
Date: Fri, 20 Apr 2012 11:52:03 +0200
Subject: [PATCH 007/208] Implement :checked per the HTML5 spec.
---
cssselect/tests.py | 13 +++++++++----
cssselect/xpath.py | 4 +++-
2 files changed, 12 insertions(+), 5 deletions(-)
diff --git a/cssselect/tests.py b/cssselect/tests.py
index 79d5f19..c8a0bf4 100755
--- a/cssselect/tests.py
+++ b/cssselect/tests.py
@@ -516,11 +516,13 @@ def pcss(main, *selectors, **kwargs):
assert pcss(':visited', html_only=True) == []
assert pcss(':enabled', html_only=True) == [
'link-href', 'tag-anchor', 'nofollow-anchor',
- 'checkbox-unchecked', 'checkbox-checked', 'area-href']
+ 'checkbox-unchecked', 'text-checked', 'checkbox-checked',
+ 'area-href']
assert pcss(':disabled', html_only=True) == [
- 'checkbox-disabled', 'fieldset', 'checkbox-fieldset-disabled']
-
- assert pcss(':checked', html_only=True) == ['checkbox-checked']
+ 'checkbox-disabled', 'checkbox-disabled-checked', 'fieldset',
+ 'checkbox-fieldset-disabled']
+ assert pcss(':checked', html_only=True) == [
+ 'checkbox-checked', 'checkbox-disabled-checked']
def test_select_shakespeare(self):
document = html.document_fromstring(HTML_SHAKESPEARE)
@@ -623,9 +625,12 @@ def count(selector):
guy
+
+