diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..38558bf
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,11 @@
+root = true
+
+[*]
+charset = utf-8
+indent_style = space
+indent_size = 4
+insert_final_newline = true
+end_of_line = lf
+
+[*.{yml,yaml}]
+indent_size = 2
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 0000000..bb4f6e1
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,2 @@
+# applying pre-commit hooks to the project
+e91101b37f82558db84a6b8ee9a6dba1fd2ae0bb
diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
new file mode 100644
index 0000000..41ff7e1
--- /dev/null
+++ b/.github/workflows/checks.yml
@@ -0,0 +1,43 @@
+name: Checks
+on: [push, pull_request]
+
+jobs:
+ checks:
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ include:
+ - python-version: 3.14
+ env:
+ TOXENV: pylint
+ - python-version: 3.14 # Keep in sync with .readthedocs.yml
+ env:
+ TOXENV: docs
+ - python-version: 3.14
+ env:
+ TOXENV: typing
+ - python-version: 3.14
+ env:
+ TOXENV: twinecheck
+
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v6
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Run check
+ env: ${{ matrix.env }}
+ run: |
+ pip install -U pip
+ pip install -U tox
+ tox
+
+ pre-commit:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v6
+ - uses: pre-commit/action@v3.0.1
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 0000000..526c458
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,32 @@
+name: Publish
+on:
+ push:
+ tags:
+ - 'v[0-9]+.[0-9]+.[0-9]+'
+
+jobs:
+ publish:
+ runs-on: ubuntu-latest
+
+ environment:
+ name: pypi
+ url: https://pypi.org/p/cssselect
+
+ permissions:
+ id-token: write
+
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Set up Python
+ uses: actions/setup-python@v6
+ with:
+ python-version: 3.14
+
+ - name: Build
+ run: |
+ python -m pip install --upgrade build
+ python -m build
+
+ - name: Publish to PyPI
+ uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/tests-macos.yml b/.github/workflows/tests-macos.yml
new file mode 100644
index 0000000..4947937
--- /dev/null
+++ b/.github/workflows/tests-macos.yml
@@ -0,0 +1,27 @@
+name: macOS
+on: [push, pull_request]
+
+jobs:
+ tests:
+ runs-on: macos-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v6
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Run tests
+ run: |
+ pip install -U pip
+ pip install -U tox
+ tox -e py
+
+ - name: Upload coverage report
+ uses: codecov/codecov-action@v5
diff --git a/.github/workflows/tests-ubuntu.yml b/.github/workflows/tests-ubuntu.yml
new file mode 100644
index 0000000..1ef905b
--- /dev/null
+++ b/.github/workflows/tests-ubuntu.yml
@@ -0,0 +1,33 @@
+name: Ubuntu
+on: [push, pull_request]
+
+jobs:
+ tests:
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ python-version: ["3.10", "3.11", "3.12", "3.13", "3.14", "pypy3.11"]
+
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Install system libraries
+ if: contains(matrix.python-version, 'pypy')
+ run: |
+ sudo apt-get update
+ sudo apt-get install libxml2-dev libxslt-dev
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v6
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Run tests
+ run: |
+ pip install -U pip
+ pip install -U tox
+ tox -e py
+
+ - name: Upload coverage report
+ uses: codecov/codecov-action@v5
diff --git a/.github/workflows/tests-windows.yml b/.github/workflows/tests-windows.yml
new file mode 100644
index 0000000..24d7ee8
--- /dev/null
+++ b/.github/workflows/tests-windows.yml
@@ -0,0 +1,27 @@
+name: Windows
+on: [push, pull_request]
+
+jobs:
+ tests:
+ runs-on: windows-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v6
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Run tests
+ run: |
+ pip install -U pip
+ pip install -U tox
+ tox -e py
+
+ - name: Upload coverage report
+ uses: codecov/codecov-action@v5
diff --git a/.gitignore b/.gitignore
index 36120ab..c276bd1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,7 @@
/MANIFEST
/dist
/docs/_build
+/.coverage
+.idea
+htmlcov/
+coverage.xml
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..81ca890
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,26 @@
+repos:
+- repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.14.4
+ hooks:
+ - id: ruff-check
+ args: [ --fix ]
+ - id: ruff-format
+- repo: https://github.com/adamchainz/blacken-docs
+ rev: 1.20.0
+ hooks:
+ - id: blacken-docs
+ additional_dependencies:
+ - black==26.1.0
+- repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v6.0.0
+ hooks:
+ - id: end-of-file-fixer
+ - id: trailing-whitespace
+- repo: https://github.com/sphinx-contrib/sphinx-lint
+ rev: v1.0.0
+ hooks:
+ - id: sphinx-lint
+- repo: https://github.com/rhysd/actionlint
+ rev: v1.7.10
+ hooks:
+ - id: actionlint
diff --git a/.readthedocs.yml b/.readthedocs.yml
new file mode 100644
index 0000000..b91642a
--- /dev/null
+++ b/.readthedocs.yml
@@ -0,0 +1,15 @@
+version: 2
+formats: all
+sphinx:
+ configuration: docs/conf.py
+ fail_on_warning: true
+build:
+ os: ubuntu-24.04
+ tools:
+ # For available versions, see:
+ # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python
+ python: "3.14" # Keep in sync with .github/workflows/checks.yml
+python:
+ install:
+ - requirements: docs/requirements.txt
+ - path: .
diff --git a/AUTHORS b/AUTHORS
index 8c69e8f..66dcc22 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,4 +1,13 @@
+Daniel Graña
Ian Bicking
+James Salter
Laurence Rowe
+Mikhail Korobov
+Nik Nyby
+Paul Tremberth
+Simon Potter
Simon Sapin
Stefan Behnel
+Thomas Grainger
+Varialus
+Arthur Darcet
diff --git a/CHANGES b/CHANGES
index 4583cef..5ca2959 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,251 @@
Changelog
=========
+Version 1.4.0
+-------------
+
+Released on 2026-01-29.
+
+* Dropped support for Python 3.9 and PyPy 3.10.
+
+* Added support for Python 3.14 and PyPy 3.11.
+
+* Switched the build system to ``hatchling``.
+
+* CI fixes and improvements.
+
+Version 1.3.0
+-------------
+
+Released on 2025-03-10.
+
+* Dropped support for Python 3.7-3.8, added support for Python 3.12-3.13 and
+ PyPy 3.10.
+
+* Removed ``_unicode_safe_getattr()``, deprecated in 1.2.0.
+
+* Added ``pre-commit`` and formatted the code with ``ruff``.
+
+* Many CI additions and improvements.
+
+
+Version 1.2.0
+-------------
+
+Released on 2022-10-27.
+
+* Drop support for Python 2.7, 3.4-3.6, add support for Python 3.7-3.11.
+
+* Add type annotations (PEP 484 and PEP 561).
+
+* More features from the CSS Selectors Level 4:
+
+ * The ``:is()`` pseudo-class.
+
+ * The ``:where()`` pseudo-class.
+
+ * The ``:has()`` pseudo-class, with some limitations.
+
+* Fix parsing ``:scope`` after a comma.
+
+* Add parentheses to fix condition precedence in some cases.
+
+* Private API changes related to the removal of the Python 2 support:
+
+ * Remove ``_unicode`` and ``_unichr`` aliases from ``cssselect.parser``.
+
+ * Remove ``_basestring`` and ``_unicode`` aliases from ``cssselect.xpath``.
+
+ * Deprecate ``cssselect.xpath._unicode_safe_getattr()`` and change it to just
+ call ``getattr()``.
+
+* Include tests in the PyPI tarball.
+
+* Many CI additions and improvements.
+
+* Improve the test coverage.
+
+
+Version 1.1.0
+-------------
+
+Released on 2019-08-09.
+
+* Support for the ``:scope`` selector, which allows accessing the immediate
+ children of a selector.
+
+* Support for the ``|E`` syntax for type selectors without a namespace.
+
+* A new selector method, ``canonical``, returns the CSS expression of the
+ selector, as a string.
+
+
+Version 1.0.3
+-------------
+
+Released on 2017-12-27.
+
+* Fix artifact uploads to pypi
+
+
+Version 1.0.2
+-------------
+
+Released on 2017-12-26.
+
+* Drop support for Python 2.6 and Python 3.3.
+* Fix deprecation warning in Python 3.6.
+* Minor cleanups.
+
+
+Version 1.0.1
+-------------
+
+Released on 2017-01-10.
+
+* Add support for Python 3.6.
+* Documentation hosted `on Read the Docs <https://cssselect.readthedocs.io/>`_
+
+
+Version 1.0.0
+-------------
+
+Released on 2016-10-21.
+
+* Add code coverage reports.
+* Fix ``:nth-*(an+b)`` pseudo-classes selectors.
+ (except ``*:nth-child()`` which looks untranslatable to XPath 1.0.)
+
+
+Version 0.9.2
+-------------
+
+Released on 2016-06-15.
+
+* Distribute as universal wheel.
+* Add support for Python 3.3, 3.4 and 3.5.
+* Drop support for Python 2.5 as testing is getting difficult.
+* Improve tests on pseudo-elements.
+
+
+Version 0.9.1
+-------------
+
+Released on 2013-10-17.
+
+* **Backward incompatible change from 0.9**:
+ :meth:`~GenericTranslator.selector_to_xpath` defaults to
+ ignoring pseudo-elements,
+ as it did in 0.8 and previous versions.
+ (:meth:`~GenericTranslator.css_to_xpath` doesn’t change.)
+* Drop official support for Python 2.4 and 3.1,
+ as testing was becoming difficult.
+ Nothing will break overnight,
+ but future releases may or may not work on these versions.
+ Older releases will remain available on PyPI.
+
+
+Version 0.9
+-----------
+
+Released on 2013-10-11.
+
+Add parser support for :attr:`functional
+pseudo-elements <Selector.pseudo_element>`.
+
+*Update:*
+This version accidentally introduced a **backward incompatible** change:
+:meth:`~GenericTranslator.selector_to_xpath` defaults to
+rejecting pseudo-elements instead of ignoring them.
+
+
+Version 0.8
+-----------
+
+Released on 2013-03-15.
+
+Improvements:
+
+* `#22 <https://github.com/scrapy/cssselect/issues/22>`_
+ Let extended translators override what XPathExpr class is used
+* `#19 <https://github.com/scrapy/cssselect/issues/19>`_
+ Use the built-in ``lang()`` XPath function
+ for implementing the ``:lang()`` pseudo-class
+ with XML documents.
+ This is probably faster than ``ancestor-or-self::``.
+
+Bug fixes:
+
+* `#14 <https://github.com/scrapy/cssselect/issues/14>`_
+ Fix non-ASCII pseudo-classes. (Invalid selector instead of crash.)
+* `#20 <https://github.com/scrapy/cssselect/issues/20>`_
+ As per the spec, elements containing only whitespace are not considered empty
+ for the ``:empty`` pseudo-class.
+
+
+Version 0.7.1
+-------------
+
+Released on 2012-06-14. Code name *remember-to-test-with-tox*.
+
+0.7 broke the parser in Python 2.4 and 2.5; the tests in 2.x.
+Now all is well again.
+
+Also, pseudo-elements are now correctly made lower-case. (They are supposed
+to be case-insensitive.)
+
+
+Version 0.7
+-----------
+
+Released on 2012-06-14.
+
+Bug fix release: see #2, #7 and #10 on GitHub.
+
+* The tokenizer and parser have been rewritten to be much closer to the
+ specified grammar. In particular, non-ASCII characters and backslash-escapes
+ are now handled correctly.
+* Special characters are protected in the output so that generated XPath
+ expressions should always be valid.
+* The ``~=``, ``^=`` and ``*=`` attribute operators now correctly never match
+ when used with an empty string.
+
+
+Version 0.6.1
+-------------
+
+Released on 2012-04-25.
+
+Make sure that internal token objects do not "leak" into the public API and
+:attr:`Selector.pseudo_element` is a Unicode string.
+
+
+Version 0.6
+-----------
+
+Released on 2012-04-24.
+
+* In ``setup.py`` use setuptools/distribute if available, but fall back
+ on distutils.
+* Implement the ``:lang()`` pseudo-class, although it is only based on
+ ``xml:lang`` or ``lang`` attributes. If the document language is known from
+ some other meta-data (like a ``Content-Language`` HTTP header or ``<meta>``
+ element), a workaround is to set a lang attribute on the root element.
+
+
+Version 0.5
+-----------
+
+Released on 2012-04-20.
+
+* Fix case sensitivity issues.
+* Implement :class:`HTMLTranslator` based on the `HTML5 specification`_
+ rather than guessing; add the ``xhtml`` parameter.
+* Several bug fixes and better test coverage.
+
+.. _HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors
+
+
Version 0.4
-----------
@@ -19,14 +264,14 @@ Version 0.3
Released on 2012-04-17.
* Fix many parsing bugs.
-* Rename the :class:`Translator` class to :class:`GenericTranslator`
+* Rename the ``Translator`` class to :class:`GenericTranslator`
* There, implement ``:target``, ``:hover``, ``:focus``, ``:active``
``:checked``, ``:enabled``, ``:disabled``, ``:link`` and ``:visited``
as never matching.
* Make a new HTML-specific ``HTMLTranslator`` subclass. There, implement
``:checked``, ``:enabled``, ``:disabled``, ``:link`` and ``:visited``
as appropriate for HTML, with all links "not visited".
-* Remove the :func:`css_to_xpath` function. The translator classes
+* Remove the ``css_to_xpath`` function. The translator classes
are the new API.
* Add support for ``:contains()`` back, but case-sensitive. lxml will
override it to be case-insensitive for backward-compatibility.
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index c8f5dc3..0000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,3 +0,0 @@
-include AUTHORS CHANGES LICENSE README.rst tox.ini
-recursive-include docs *
-prune docs/_build
diff --git a/README.rst b/README.rst
index fa53a5b..c055295 100644
--- a/README.rst
+++ b/README.rst
@@ -1,25 +1,40 @@
+
===================================
cssselect: CSS Selectors for Python
===================================
-*cssselect* parses `CSS3 Selectors`_ and translate them to `XPath 1.0`_
-expressions. Such expressions can be used in lxml_ or another XPath engine
-to find the matching elements in an XML or HTML document.
+.. image:: https://img.shields.io/pypi/v/cssselect.svg
+ :target: https://pypi.python.org/pypi/cssselect
+ :alt: PyPI Version
+
+.. image:: https://img.shields.io/pypi/pyversions/cssselect.svg
+ :target: https://pypi.python.org/pypi/cssselect
+ :alt: Supported Python Versions
+
+.. image:: https://github.com/scrapy/cssselect/actions/workflows/tests-ubuntu.yml/badge.svg
+ :target: https://github.com/scrapy/cssselect/actions/workflows/tests-ubuntu.yml
+ :alt: Tests
-This module used to live inside of lxml as ``lxml.cssselect`` before it was
-extracted as a stand-alone project.
+.. image:: https://img.shields.io/codecov/c/github/scrapy/cssselect/master.svg
+ :target: https://codecov.io/github/scrapy/cssselect?branch=master
+ :alt: Coverage report
-.. _CSS3 Selectors: http://www.w3.org/TR/2011/REC-css3-selectors-20110929/
-.. _XPath 1.0: http://www.w3.org/TR/xpath/
-.. _lxml: http://lxml.de/
+**cssselect** is a BSD-licensed Python library to parse `CSS3 selectors`_ and
+translate them to `XPath 1.0`_ expressions.
+`XPath 1.0`_ expressions can be used in lxml_ or another XPath engine to find
+the matching elements in an XML or HTML document.
+
+Find the cssselect online documentation at https://cssselect.readthedocs.io.
Quick facts:
-* Free software: BSD licensed
-* Compatible with Python 2.4+ and 3.x
-* Latest documentation `on python.org `_
-* Source, issues and pull requests `on Github
- `_
-* Releases `on PyPI `_
+* Source, issues and pull requests `on GitHub
+ <https://github.com/scrapy/cssselect>`_
+* Releases `on PyPI <https://pypi.python.org/pypi/cssselect>`_
* Install with ``pip install cssselect``
+
+
+.. _CSS3 selectors: https://www.w3.org/TR/selectors-3/
+.. _XPath 1.0: https://www.w3.org/TR/xpath/all/
+.. _lxml: https://lxml.de/
diff --git a/cssselect/__init__.py b/cssselect/__init__.py
index 3129a42..59d62df 100644
--- a/cssselect/__init__.py
+++ b/cssselect/__init__.py
@@ -1,21 +1,36 @@
"""
- CSS Selectors based on XPath
- ============================
+CSS Selectors based on XPath
+============================
- This module supports selecting XML/HTML elements based on CSS selectors.
- See the `CSSSelector` class for details.
+This module supports selecting XML/HTML elements based on CSS selectors.
+See the `CSSSelector` class for details.
- :copyright: (c) 2007-2012 Ian Bicking and contributors.
- See AUTHORS for more details.
- :license: BSD, see LICENSE for more details.
+:copyright: (c) 2007-2012 Ian Bicking and contributors.
+See AUTHORS for more details.
+:license: BSD, see LICENSE for more details.
"""
-from cssselect.parser import (parse, Selector, SelectorError,
- SelectorSyntaxError)
-from cssselect.xpath import GenericTranslator, HTMLTranslator, ExpressionError
+from cssselect.parser import (
+ FunctionalPseudoElement,
+ Selector,
+ SelectorError,
+ SelectorSyntaxError,
+ parse,
+)
+from cssselect.xpath import ExpressionError, GenericTranslator, HTMLTranslator
+__all__ = (
+ "ExpressionError",
+ "FunctionalPseudoElement",
+ "GenericTranslator",
+ "HTMLTranslator",
+ "Selector",
+ "SelectorError",
+ "SelectorSyntaxError",
+ "parse",
+)
-VERSION = '0.4'
+VERSION = "1.4.0"
__version__ = VERSION
diff --git a/cssselect/parser.py b/cssselect/parser.py
index f6b42c8..f969769 100644
--- a/cssselect/parser.py
+++ b/cssselect/parser.py
@@ -1,26 +1,33 @@
"""
- cssselect.parser
- ================
+cssselect.parser
+================
- Tokenizer, parser and parsed objects for CSS selectors.
+Tokenizer, parser and parsed objects for CSS selectors.
- :copyright: (c) 2007-2012 Ian Bicking and contributors.
- See AUTHORS for more details.
- :license: BSD, see LICENSE for more details.
+:copyright: (c) 2007-2012 Ian Bicking and contributors.
+See AUTHORS for more details.
+:license: BSD, see LICENSE for more details.
"""
+from __future__ import annotations
+
+import operator
import re
+import sys
+from typing import TYPE_CHECKING, Literal, Protocol, TypeAlias, Union, cast, overload
+
+if TYPE_CHECKING:
+ from collections.abc import Iterable, Iterator, Sequence
+
+ # typing.Self requires Python 3.11
+ from typing_extensions import Self
-try:
- _unicode = unicode
- _unichr = unichr
-except NameError:
- # Python 3
- _unicode = str
- _unichr = chr
+def ascii_lower(string: str) -> str:
+ """Lower-case, but only in the ASCII range."""
+ return string.encode("utf8").lower().decode("utf8")
class SelectorError(Exception):
@@ -32,205 +39,444 @@ class SelectorError(Exception):
"""
+
class SelectorSyntaxError(SelectorError, SyntaxError):
"""Parsing a selector that does not match the grammar."""
#### Parsed objects
-class Selector(object):
+Tree: TypeAlias = Union[
+ "Element",
+ "Hash",
+ "Class",
+ "Function",
+ "Pseudo",
+ "Attrib",
+ "Negation",
+ "Relation",
+ "Matching",
+ "SpecificityAdjustment",
+ "CombinedSelector",
+]
+PseudoElement: TypeAlias = Union["FunctionalPseudoElement", str]
+
+
+class Selector:
"""
- Represents a selector with an optional pseudo element.
+ Represents a parsed selector.
+
+ :meth:`~GenericTranslator.selector_to_xpath` accepts this object,
+ but ignores :attr:`pseudo_element`. It is the user’s responsibility
+ to account for pseudo-elements and reject selectors with unknown
+ or unsupported pseudo-elements.
+
"""
- def __init__(self, tree, pseudo_element=None):
- self._tree = tree
- #: If the selector has a pseudo-element: a string like ``'after'``.
- #: Otherwise, ``None``.
- #: Any identifier preceded by ``::`` is accepted as a pseudo-element.
- #: It is the user’s responsibility to reject selectors with
- #: unknown or unsupported pseudo-elements.
+
+ def __init__(self, tree: Tree, pseudo_element: PseudoElement | None = None) -> None:
+ self.parsed_tree = tree
+ if pseudo_element is not None and not isinstance(
+ pseudo_element, FunctionalPseudoElement
+ ):
+ pseudo_element = ascii_lower(pseudo_element)
+ #: A :class:`FunctionalPseudoElement`,
+ #: or the identifier for the pseudo-element as a string,
+ #: or ``None``.
+ #:
+ #: +-------------------------+----------------+--------------------------------+
+ #: | | Selector | Pseudo-element |
+ #: +=========================+================+================================+
+ #: | CSS3 syntax | ``a::before`` | ``'before'`` |
+ #: +-------------------------+----------------+--------------------------------+
+ #: | Older syntax | ``a:before`` | ``'before'`` |
+ #: +-------------------------+----------------+--------------------------------+
+ #: | From the Lists3_ draft, | ``li::marker`` | ``'marker'`` |
+ #: | not in Selectors3 | | |
+ #: +-------------------------+----------------+--------------------------------+
+ #: | Invalid pseudo-class | ``li:marker`` | ``None`` |
+ #: +-------------------------+----------------+--------------------------------+
+ #: | Functional | ``a::foo(2)`` | ``FunctionalPseudoElement(…)`` |
+ #: +-------------------------+----------------+--------------------------------+
+ #:
+ #: .. _Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement
self.pseudo_element = pseudo_element
- def __repr__(self):
- if self.pseudo_element:
- pseudo_element = '::%s' % self.pseudo_element
+ def __repr__(self) -> str:
+ if isinstance(self.pseudo_element, FunctionalPseudoElement):
+ pseudo_element = repr(self.pseudo_element)
+ elif self.pseudo_element:
+ pseudo_element = f"::{self.pseudo_element}"
+ else:
+ pseudo_element = ""
+ return f"{self.__class__.__name__}[{self.parsed_tree!r}{pseudo_element}]"
+
+ def canonical(self) -> str:
+ """Return a CSS representation for this selector (a string)"""
+ if isinstance(self.pseudo_element, FunctionalPseudoElement):
+ pseudo_element = f"::{self.pseudo_element.canonical()}"
+ elif self.pseudo_element:
+ pseudo_element = f"::{self.pseudo_element}"
else:
- pseudo_element = ''
- return '%s[%r%s]' % (
- self.__class__.__name__, self._tree, pseudo_element)
+ pseudo_element = ""
+ res = f"{self.parsed_tree.canonical()}{pseudo_element}"
+ if len(res) > 1:
+ res = res.lstrip("*")
+ return res
- def specificity(self):
+ def specificity(self) -> tuple[int, int, int]:
"""Return the specificity_ of this selector as a tuple of 3 integers.
.. _specificity: http://www.w3.org/TR/selectors/#specificity
"""
- a, b, c = self._tree.specificity()
+ a, b, c = self.parsed_tree.specificity()
if self.pseudo_element:
c += 1
return a, b, c
-class Class(object):
+class Class:
"""
Represents selector.class_name
"""
- def __init__(self, selector, class_name):
+
+ def __init__(self, selector: Tree, class_name: str) -> None:
self.selector = selector
self.class_name = class_name
- def __repr__(self):
- return '%s[%r.%s]' % (
- self.__class__.__name__, self.selector, self.class_name)
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}[{self.selector!r}.{self.class_name}]"
+
+ def canonical(self) -> str:
+ return f"{self.selector.canonical()}.{self.class_name}"
- def specificity(self):
+ def specificity(self) -> tuple[int, int, int]:
a, b, c = self.selector.specificity()
b += 1
return a, b, c
-class Function(object):
+class FunctionalPseudoElement:
+ """
+ Represents selector::name(arguments)
+
+ .. attribute:: name
+
+ The name (identifier) of the pseudo-element, as a string.
+
+ .. attribute:: arguments
+
+ The arguments of the pseudo-element, as a list of tokens.
+
+ **Note:** tokens are not part of the public API,
+ and may change between cssselect versions.
+ Use at your own risk.
+
+ """
+
+ def __init__(self, name: str, arguments: Sequence[Token]):
+ self.name = ascii_lower(name)
+ self.arguments = arguments
+
+ def __repr__(self) -> str:
+ token_values = [token.value for token in self.arguments]
+ return f"{self.__class__.__name__}[::{self.name}({token_values!r})]"
+
+ def argument_types(self) -> list[str]:
+ return [token.type for token in self.arguments]
+
+ def canonical(self) -> str:
+ args = "".join(token.css() for token in self.arguments)
+ return f"{self.name}({args})"
+
+
+class Function:
"""
Represents selector:name(expr)
"""
- def __init__(self, selector, name, arguments):
+
+ def __init__(self, selector: Tree, name: str, arguments: Sequence[Token]) -> None:
self.selector = selector
- self.name = name
+ self.name = ascii_lower(name)
self.arguments = arguments
- def __repr__(self):
- return '%s[%r:%s(%r)]' % (
- self.__class__.__name__, self.selector, self.name, self.arguments)
+ def __repr__(self) -> str:
+ token_values = [token.value for token in self.arguments]
+ return f"{self.__class__.__name__}[{self.selector!r}:{self.name}({token_values!r})]"
- def specificity(self):
+ def argument_types(self) -> list[str]:
+ return [token.type for token in self.arguments]
+
+ def canonical(self) -> str:
+ args = "".join(token.css() for token in self.arguments)
+ return f"{self.selector.canonical()}:{self.name}({args})"
+
+ def specificity(self) -> tuple[int, int, int]:
a, b, c = self.selector.specificity()
b += 1
return a, b, c
-class Pseudo(object):
+class Pseudo:
"""
Represents selector:ident
"""
- def __init__(self, selector, ident):
+
+ def __init__(self, selector: Tree, ident: str) -> None:
self.selector = selector
- self.ident = ident
+ self.ident = ascii_lower(ident)
- def __repr__(self):
- return '%s[%r:%s]' % (
- self.__class__.__name__, self.selector, self.ident)
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}[{self.selector!r}:{self.ident}]"
- def specificity(self):
+ def canonical(self) -> str:
+ return f"{self.selector.canonical()}:{self.ident}"
+
+ def specificity(self) -> tuple[int, int, int]:
a, b, c = self.selector.specificity()
b += 1
return a, b, c
-class Negation(object):
+class Negation:
"""
Represents selector:not(subselector)
"""
- def __init__(self, selector, subselector):
+
+ def __init__(self, selector: Tree, subselector: Tree) -> None:
self.selector = selector
self.subselector = subselector
- def __repr__(self):
- return '%s[%r:not(%r)]' % (
- self.__class__.__name__, self.selector, self.subselector)
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}[{self.selector!r}:not({self.subselector!r})]"
+
+ def canonical(self) -> str:
+ subsel = self.subselector.canonical()
+ if len(subsel) > 1:
+ subsel = subsel.lstrip("*")
+ return f"{self.selector.canonical()}:not({subsel})"
- def specificity(self):
+ def specificity(self) -> tuple[int, int, int]:
a1, b1, c1 = self.selector.specificity()
- a2, b2, c2 = self.sub_selector.specificity()
+ a2, b2, c2 = self.subselector.specificity()
return a1 + a2, b1 + b2, c1 + c2
-class Attrib(object):
+class Relation:
+ """
+ Represents selector:has(subselector)
+ """
+
+ def __init__(self, selector: Tree, combinator: Token, subselector: Selector):
+ self.selector = selector
+ self.combinator = combinator
+ self.subselector = subselector
+
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}[{self.selector!r}:has({self.subselector!r})]"
+
+ def canonical(self) -> str:
+ try:
+ subsel = self.subselector[0].canonical() # type: ignore[index]
+ except TypeError:
+ subsel = self.subselector.canonical()
+ if len(subsel) > 1:
+ subsel = subsel.lstrip("*")
+ return f"{self.selector.canonical()}:has({subsel})"
+
+ def specificity(self) -> tuple[int, int, int]:
+ a1, b1, c1 = self.selector.specificity()
+ try:
+ a2, b2, c2 = self.subselector[-1].specificity() # type: ignore[index]
+ except TypeError:
+ a2, b2, c2 = self.subselector.specificity()
+ return a1 + a2, b1 + b2, c1 + c2
+
+
+class Matching:
+ """
+ Represents selector:is(selector_list)
+ """
+
+ def __init__(self, selector: Tree, selector_list: Iterable[Tree]):
+ self.selector = selector
+ self.selector_list = selector_list
+
+ def __repr__(self) -> str:
+ args_str = ", ".join(repr(s) for s in self.selector_list)
+ return f"{self.__class__.__name__}[{self.selector!r}:is({args_str})]"
+
+ def canonical(self) -> str:
+ selector_arguments = []
+ for s in self.selector_list:
+ selarg = s.canonical()
+ selector_arguments.append(selarg.lstrip("*"))
+ args_str = ", ".join(str(s) for s in selector_arguments)
+ return f"{self.selector.canonical()}:is({args_str})"
+
+ def specificity(self) -> tuple[int, int, int]:
+ return max(x.specificity() for x in self.selector_list)
+
+
+class SpecificityAdjustment:
+ """
+ Represents selector:where(selector_list)
+ Same as selector:is(selector_list), but its specificity is always 0
+ """
+
+ def __init__(self, selector: Tree, selector_list: list[Tree]):
+ self.selector = selector
+ self.selector_list = selector_list
+
+ def __repr__(self) -> str:
+ args_str = ", ".join(repr(s) for s in self.selector_list)
+ return f"{self.__class__.__name__}[{self.selector!r}:where({args_str})]"
+
+ def canonical(self) -> str:
+ selector_arguments = []
+ for s in self.selector_list:
+ selarg = s.canonical()
+ selector_arguments.append(selarg.lstrip("*"))
+ args_str = ", ".join(str(s) for s in selector_arguments)
+ return f"{self.selector.canonical()}:where({args_str})"
+
+ def specificity(self) -> tuple[int, int, int]:
+ return 0, 0, 0
+
+
+class Attrib:
"""
Represents selector[namespace|attrib operator value]
"""
- def __init__(self, selector, namespace, attrib, operator, value):
+
+ @overload
+ def __init__(
+ self,
+ selector: Tree,
+ namespace: str | None,
+ attrib: str,
+ operator: Literal["exists"],
+ value: None,
+ ) -> None: ...
+
+ @overload
+ def __init__(
+ self,
+ selector: Tree,
+ namespace: str | None,
+ attrib: str,
+ operator: str,
+ value: Token,
+ ) -> None: ...
+
+ def __init__(
+ self,
+ selector: Tree,
+ namespace: str | None,
+ attrib: str,
+ operator: str,
+ value: Token | None,
+ ) -> None:
self.selector = selector
self.namespace = namespace
self.attrib = attrib
self.operator = operator
self.value = value
- def __repr__(self):
- if self.namespace == '*':
- attrib = self.attrib
- else:
- attrib = '%s|%s' % (self.namespace, self.attrib)
- if self.operator == 'exists':
- return '%s[%r[%s]]' % (
- self.__class__.__name__, self.selector, attrib)
+ def __repr__(self) -> str:
+ attrib = f"{self.namespace}|{self.attrib}" if self.namespace else self.attrib
+ if self.operator == "exists":
+ return f"{self.__class__.__name__}[{self.selector!r}[{attrib}]]"
+ assert self.value is not None
+ return f"{self.__class__.__name__}[{self.selector!r}[{attrib} {self.operator} {self.value.value!r}]]"
+
+ def canonical(self) -> str:
+ attrib = f"{self.namespace}|{self.attrib}" if self.namespace else self.attrib
+
+ if self.operator == "exists":
+ op = attrib
else:
- return '%s[%r[%s %s %r]]' % (
- self.__class__.__name__, self.selector, attrib,
- self.operator, self.value)
+ assert self.value is not None
+ op = f"{attrib}{self.operator}{self.value.css()}"
- def specificity(self):
+ return f"{self.selector.canonical()}[{op}]"
+
+ def specificity(self) -> tuple[int, int, int]:
a, b, c = self.selector.specificity()
b += 1
return a, b, c
-class Element(object):
+class Element:
"""
Represents namespace|element
+
+ `None` is for the universal selector '*'
+
"""
- def __init__(self, namespace, element):
+
+ def __init__(
+ self, namespace: str | None = None, element: str | None = None
+ ) -> None:
self.namespace = namespace
self.element = element
- def __repr__(self):
- if self.namespace == '*':
- element = self.element
- else:
- element = '%s|%s' % (self.namespace, self.element)
- return '%s[%s]' % (
- self.__class__.__name__, element)
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}[{self.canonical()}]"
- def specificity(self):
- if self.element == '*':
- return 0, 0, 0
- else:
+ def canonical(self) -> str:
+ element = self.element or "*"
+ if self.namespace:
+ element = f"{self.namespace}|{element}"
+ return element
+
+ def specificity(self) -> tuple[int, int, int]:
+ if self.element:
return 0, 0, 1
+ return 0, 0, 0
-class Hash(object):
+class Hash:
"""
Represents selector#id
"""
- def __init__(self, selector, id):
+
+ def __init__(self, selector: Tree, id: str) -> None: # noqa: A002
self.selector = selector
self.id = id
- def __repr__(self):
- return '%s[%r#%s]' % (
- self.__class__.__name__, self.selector, self.id)
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}[{self.selector!r}#{self.id}]"
+
+ def canonical(self) -> str:
+ return f"{self.selector.canonical()}#{self.id}"
- def specificity(self):
+ def specificity(self) -> tuple[int, int, int]:
a, b, c = self.selector.specificity()
a += 1
return a, b, c
-class CombinedSelector(object):
- def __init__(self, selector, combinator, subselector):
+class CombinedSelector:
+ def __init__(self, selector: Tree, combinator: str, subselector: Tree) -> None:
assert selector is not None
self.selector = selector
self.combinator = combinator
self.subselector = subselector
- def __repr__(self):
- if self.combinator == ' ':
- comb = ''
- else:
- comb = self.combinator
- return '%s[%r %s %r]' % (
- self.__class__.__name__, self.selector, comb, self.subselector)
+ def __repr__(self) -> str:
+ comb = "" if self.combinator == " " else self.combinator
+ return (
+ f"{self.__class__.__name__}[{self.selector!r} {comb} {self.subselector!r}]"
+ )
+
+ def canonical(self) -> str:
+ subsel = self.subselector.canonical()
+ if len(subsel) > 1:
+ subsel = subsel.lstrip("*")
+ return f"{self.selector.canonical()} {self.combinator} {subsel}"
- def specificity(self):
+ def specificity(self) -> tuple[int, int, int]:
a1, b1, c1 = self.selector.specificity()
a2, b2, c2 = self.subselector.specificity()
return a1 + a2, b1 + b2, c1 + c2
@@ -238,19 +484,26 @@ def specificity(self):
#### Parser
-_el_re = re.compile(r'^\s*(\w+)$')
-_id_re = re.compile(r'^\s*(\w*)#(\w+)\s*$')
-_class_re = re.compile(r'^\s*(\w*)\.(\w+)\s*$')
+# foo
+_el_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$")
+
+# foo#bar or #bar
+_id_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$")
+# foo.bar or .bar
+_class_re = re.compile(
+ r"^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$"
+)
-def parse(css):
+
+def parse(css: str) -> list[Selector]:
"""Parse a CSS *group of selectors*.
If you don't care about pseudo-elements or selector specificity,
you can skip this and use :meth:`~GenericTranslator.css_to_xpath`.
:param css:
- A *group of selectors* as an Unicode string.
+ A *group of selectors* as a string.
:raises:
:class:`SelectorSyntaxError` on invalid selectors.
:returns:
@@ -261,415 +514,533 @@ def parse(css):
# Fast path for simple cases
match = _el_re.match(css)
if match:
- return [Selector(Element('*', match.group(1)))]
+ return [Selector(Element(element=match.group(1)))]
match = _id_re.match(css)
if match is not None:
- return [Selector(Hash(Element(
- '*', match.group(1) or '*'), match.group(2)))]
+ return [Selector(Hash(Element(element=match.group(1) or None), match.group(2)))]
match = _class_re.match(css)
if match is not None:
- return [Selector(Class(Element(
- '*', match.group(1) or '*'), match.group(2)))]
+ return [
+ Selector(Class(Element(element=match.group(1) or None), match.group(2)))
+ ]
stream = TokenStream(tokenize(css))
stream.source = css
- try:
- return list(parse_selector_group(stream))
- except SelectorSyntaxError:
- import sys
- e = sys.exc_info()[1]
- message = "%s at %s -> %r" % (
- e, stream.used, stream.peek())
- e.msg = message
- if sys.version_info < (2,6):
- e.message = message
- e.args = tuple([message])
- raise
-
-
-def parse_selector_group(stream):
+ return list(parse_selector_group(stream))
+
+
+# except SelectorSyntaxError:
+# e = sys.exc_info()[1]
+# message = "%s at %s -> %r" % (
+# e, stream.used, stream.peek())
+# e.msg = message
+# e.args = tuple([message])
+# raise
+
+
+def parse_selector_group(stream: TokenStream) -> Iterator[Selector]:
stream.skip_whitespace()
while 1:
yield Selector(*parse_selector(stream))
- if stream.peek() == ',':
+ if stream.peek() == ("DELIM", ","):
stream.next()
stream.skip_whitespace()
else:
break
-def parse_selector(stream):
+
+def parse_selector(stream: TokenStream) -> tuple[Tree, PseudoElement | None]:
result, pseudo_element = parse_simple_selector(stream)
while 1:
stream.skip_whitespace()
peek = stream.peek()
- if peek == ',' or peek is None:
+ if peek in (("EOF", None), ("DELIM", ",")):
break
if pseudo_element:
raise SelectorSyntaxError(
- 'A pseudo-element must be at the end of a selector')
- if peek in ('+', '>', '~'):
+ f"Got pseudo-element ::{pseudo_element} not at the end of a selector"
+ )
+ if peek.is_delim("+", ">", "~"):
# A combinator
- combinator = stream.next()
+ combinator = cast("str", stream.next().value)
stream.skip_whitespace()
else:
# By exclusion, the last parse_simple_selector() ended
# at peek == ' '
- combinator = ' '
+ combinator = " "
next_selector, pseudo_element = parse_simple_selector(stream)
result = CombinedSelector(result, combinator, next_selector)
return result, pseudo_element
-def parse_simple_selector(stream, inside_negation=False):
+def parse_simple_selector(
+ stream: TokenStream, inside_negation: bool = False
+) -> tuple[Tree, PseudoElement | None]:
stream.skip_whitespace()
+ selector_start = len(stream.used)
peek = stream.peek()
- consumed = len(stream.used)
- if peek == '*' or isinstance(peek, Symbol):
- next = stream.next()
- if stream.peek() == '|':
- namespace = next
+ if peek.type == "IDENT" or peek == ("DELIM", "*"):
+ if peek.type == "IDENT":
+ namespace = stream.next().value
+ else:
+ stream.next()
+ namespace = None
+ if stream.peek() == ("DELIM", "|"):
stream.next()
- element = stream.next_symbol_or_star()
+ element = stream.next_ident_or_star()
else:
- namespace = '*'
- element = next
+ element = namespace
+ namespace = None
else:
- element = namespace = '*'
- result = Element(namespace, element)
- pseudo_element = None
+ element = namespace = None
+ result: Tree = Element(namespace, element)
+ pseudo_element: PseudoElement | None = None
while 1:
peek = stream.peek()
- if peek in (None, ' ', ',', '+', '>', '~') or (
- inside_negation and peek == ')'):
+ if (
+ peek.type in ("S", "EOF")
+ or peek.is_delim(",", "+", ">", "~")
+ or (inside_negation and peek == ("DELIM", ")"))
+ ):
break
if pseudo_element:
raise SelectorSyntaxError(
- 'A pseudo-element must be at the end of a selector')
- if peek == '#':
+ f"Got pseudo-element ::{pseudo_element} not at the end of a selector"
+ )
+ if peek.type == "HASH":
+ result = Hash(result, cast("str", stream.next().value))
+ elif peek == ("DELIM", "."):
stream.next()
- result = Hash(result, stream.next_symbol())
- continue
- elif peek == '.':
+ result = Class(result, stream.next_ident())
+ elif peek == ("DELIM", "|"):
stream.next()
- result = Class(result, stream.next_symbol())
- continue
- elif peek == '[':
+ result = Element(None, stream.next_ident())
+ elif peek == ("DELIM", "["):
stream.next()
result = parse_attrib(result, stream)
- next = stream.next()
- if next != ']':
- raise SelectorSyntaxError(
- "] expected, got '%s'" % next)
- continue
- elif peek == '::':
+ elif peek == ("DELIM", ":"):
stream.next()
- pseudo_element = stream.next_symbol()
- continue
- elif peek == ':':
- stream.next()
- ident = stream.next_symbol()
- if ident in ('first-line', 'first-letter', 'before', 'after'):
+ if stream.peek() == ("DELIM", ":"):
+ stream.next()
+ pseudo_element = stream.next_ident()
+ if stream.peek() == ("DELIM", "("):
+ stream.next()
+ pseudo_element = FunctionalPseudoElement(
+ pseudo_element, parse_arguments(stream)
+ )
+ continue
+ ident = stream.next_ident()
+ if ident.lower() in ("first-line", "first-letter", "before", "after"):
# Special case: CSS 2.1 pseudo-elements can have a single ':'
# Any new pseudo-element must have two.
- pseudo_element = ident
+ pseudo_element = str(ident)
continue
- if stream.peek() == '(':
- stream.next()
- stream.skip_whitespace()
- if ident == 'not':
- if inside_negation:
- raise SelectorSyntaxError('Got nested :not()')
- argument, argument_pseudo_element = parse_simple_selector(
- stream, inside_negation=True)
- if argument_pseudo_element:
- raise SelectorSyntaxError(
- 'Pseudo-elements are not allowed inside :not()')
- else:
- peek = stream.peek()
- if isinstance(peek, (Symbol, String)):
- argument = stream.next()
- else:
- raise SelectorSyntaxError(
- "Expected argument, got '%s'" % peek)
- stream.skip_whitespace()
- next = stream.next()
- if not next == ')':
+ if stream.peek() != ("DELIM", "("):
+ result = Pseudo(result, ident)
+ if repr(result) == "Pseudo[Element[*]:scope]" and not (
+ len(stream.used) == 2
+ or (len(stream.used) == 3 and stream.used[0].type == "S")
+ or (len(stream.used) >= 3 and stream.used[-3].is_delim(","))
+ or (
+ len(stream.used) >= 4
+ and stream.used[-3].type == "S"
+ and stream.used[-4].is_delim(",")
+ )
+ ):
raise SelectorSyntaxError(
- "Expected ')', got '%s'" % next)
- if ident == 'not':
- result = Negation(result, argument)
- else:
- result = Function(result, ident, argument)
+ 'Got immediate child pseudo-element ":scope" '
+ "not at the start of a selector"
+ )
+ continue
+ stream.next()
+ stream.skip_whitespace()
+ if ident.lower() == "not":
+ if inside_negation:
+ raise SelectorSyntaxError("Got nested :not()")
+ argument, argument_pseudo_element = parse_simple_selector(
+ stream, inside_negation=True
+ )
+ next_ = stream.next()
+ if argument_pseudo_element:
+ raise SelectorSyntaxError(
+ f"Got pseudo-element ::{argument_pseudo_element} inside :not() at {next_.pos}"
+ )
+ if next_ != ("DELIM", ")"):
+ raise SelectorSyntaxError(f"Expected ')', got {next_}")
+ result = Negation(result, argument)
+ elif ident.lower() == "has":
+ combinator, arguments = parse_relative_selector(stream)
+ result = Relation(result, combinator, arguments)
+
+ elif ident.lower() in ("matches", "is"):
+ selectors = parse_simple_selector_arguments(stream)
+ result = Matching(result, selectors)
+ elif ident.lower() == "where":
+ selectors = parse_simple_selector_arguments(stream)
+ result = SpecificityAdjustment(result, selectors)
else:
- result = Pseudo(result, ident)
- continue
+ result = Function(result, ident, parse_arguments(stream))
else:
- raise SelectorSyntaxError(
- "Expected selector, got '%s'" % peek)
- if consumed == len(stream.used):
- raise SelectorSyntaxError(
- "Expected selector, got '%s'" % stream.peek())
+ raise SelectorSyntaxError(f"Expected selector, got {peek}")
+ if len(stream.used) == selector_start:
+ raise SelectorSyntaxError(f"Expected selector, got {stream.peek()}")
return result, pseudo_element
-def parse_attrib(selector, stream):
+def parse_arguments(stream: TokenStream) -> list[Token]: # noqa: RET503
+ arguments: list[Token] = []
+ while 1:
+ stream.skip_whitespace()
+ next_ = stream.next()
+ if next_.type in ("IDENT", "STRING", "NUMBER") or next_ in [
+ ("DELIM", "+"),
+ ("DELIM", "-"),
+ ]:
+ arguments.append(next_)
+ elif next_ == ("DELIM", ")"):
+ return arguments
+ else:
+ raise SelectorSyntaxError(f"Expected an argument, got {next_}")
+
+
+def parse_relative_selector(stream: TokenStream) -> tuple[Token, Selector]: # noqa: RET503
stream.skip_whitespace()
- attrib = stream.next_symbol_or_star()
- if attrib == '*' and stream.peek() != '|':
- raise SelectorSyntaxError(
- "Expected '|', got '%s'" % stream.peek())
- if stream.peek() == '|':
- namespace = attrib
- stream.next()
- attrib = stream.next_symbol()
+ subselector = ""
+ next_ = stream.next()
+
+ if next_ in [("DELIM", "+"), ("DELIM", "-"), ("DELIM", ">"), ("DELIM", "~")]:
+ combinator = next_
+ stream.skip_whitespace()
+ next_ = stream.next()
else:
- namespace = '*'
+ combinator = Token("DELIM", " ", pos=0)
+
+ while 1:
+ if next_.type in ("IDENT", "STRING", "NUMBER") or next_ in [
+ ("DELIM", "."),
+ ("DELIM", "*"),
+ ]:
+ subselector += cast("str", next_.value)
+ elif next_ == ("DELIM", ")"):
+ result = parse(subselector)
+ return combinator, result[0]
+ else:
+ raise SelectorSyntaxError(f"Expected an argument, got {next_}")
+ next_ = stream.next()
+
+
+def parse_simple_selector_arguments(stream: TokenStream) -> list[Tree]:
+    """Parse the comma-separated selector list of :is(), :matches(), :where()."""
+    arguments = []
+    while 1:
+        result, pseudo_element = parse_simple_selector(stream, True)
+        if pseudo_element:
+            raise SelectorSyntaxError(
+                f"Got pseudo-element ::{pseudo_element} inside function"
+            )
+        arguments.append(result)
+        stream.skip_whitespace()
+        next_ = stream.next()
+        if next_ == ("DELIM", ")"):
+            break
+        if next_ != ("DELIM", ","):
+            # EOF lands here too: an unterminated list is a syntax error.
+            raise SelectorSyntaxError(f"Expected an argument, got {next_}")
+        # The comma is already consumed; a second stream.next() here would
+        # swallow the first token of the next argument, e.g. in ":is(a,b)".
+    return arguments
+
+
+def parse_attrib(selector: Tree, stream: TokenStream) -> Attrib:
stream.skip_whitespace()
- if stream.peek() == ']':
- return Attrib(selector, namespace, attrib, 'exists', None)
- op = stream.next()
- if not op in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
- raise SelectorSyntaxError(
- "Operator expected, got '%s'" % op)
+ attrib = stream.next_ident_or_star()
+ if attrib is None and stream.peek() != ("DELIM", "|"):
+ raise SelectorSyntaxError(f"Expected '|', got {stream.peek()}")
+ namespace: str | None
+ op: str | None
+ if stream.peek() == ("DELIM", "|"):
+ stream.next()
+ if stream.peek() == ("DELIM", "="):
+ namespace = None
+ stream.next()
+ op = "|="
+ else:
+ namespace = attrib
+ attrib = stream.next_ident()
+ op = None
+ else:
+ namespace = op = None
+ if op is None:
+ stream.skip_whitespace()
+ next_ = stream.next()
+ if next_ == ("DELIM", "]"):
+ return Attrib(selector, namespace, cast("str", attrib), "exists", None)
+ if next_ == ("DELIM", "="):
+ op = "="
+ elif next_.is_delim("^", "$", "*", "~", "|", "!") and (
+ stream.peek() == ("DELIM", "=")
+ ):
+ op = cast("str", next_.value) + "="
+ stream.next()
+ else:
+ raise SelectorSyntaxError(f"Operator expected, got {next_}")
stream.skip_whitespace()
value = stream.next()
- if not isinstance(value, (Symbol, String)):
- raise SelectorSyntaxError(
- "Expected string or symbol, got '%s'" % value)
+ if value.type not in ("IDENT", "STRING"):
+ raise SelectorSyntaxError(f"Expected string or ident, got {value}")
stream.skip_whitespace()
- return Attrib(selector, namespace, attrib, op, value)
+ next_ = stream.next()
+ if next_ != ("DELIM", "]"):
+ raise SelectorSyntaxError(f"Expected ']', got {next_}")
+ return Attrib(selector, namespace, cast("str", attrib), op, value)
-def parse_series(s):
+def parse_series(tokens: Iterable[Token]) -> tuple[int, int]:
"""
- Parses things like '1n+2', or 'an+b' generally, returning (a, b)
+ Parses the arguments for :nth-child() and friends.
+
+    :param tokens: A list of tokens
+    :returns: ``(a, b)``, the integer coefficients of ``an+b``
+
"""
- if isinstance(s, Element):
- s = s._format_element()
- if not s or s == '*':
- # Happens when there's nothing, which the CSS parser thinks of as *
- return (0, 0)
- if isinstance(s, int):
- # Happens when you just get a number
- return (0, s)
- if s == 'odd':
- return (2, 1)
- elif s == 'even':
- return (2, 0)
- elif s == 'n':
- return (1, 0)
- if 'n' not in s:
- # Just a b
- return (0, int(s))
- a, b = s.split('n', 1)
+ for token in tokens:
+ if token.type == "STRING":
+ raise ValueError("String tokens not allowed in series.")
+ s = "".join(cast("str", token.value) for token in tokens).strip()
+ if s == "odd":
+ return 2, 1
+ if s == "even":
+ return 2, 0
+ if s == "n":
+ return 1, 0
+ if "n" not in s:
+ # Just b
+ return 0, int(s)
+ a, b = s.split("n", 1)
+ a_as_int: int
if not a:
- a = 1
- elif a == '-' or a == '+':
- a = int(a+'1')
- else:
- a = int(a)
- if not b:
- b = 0
- elif b == '-' or b == '+':
- b = int(b+'1')
+ a_as_int = 1
+ elif a in {"-", "+"}:
+ a_as_int = int(a + "1")
else:
- b = int(b)
- return (a, b)
+ a_as_int = int(a)
+ b_as_int = int(b) if b else 0
+ return a_as_int, b_as_int
#### Token objects
-class _UniToken(_unicode):
- def __new__(cls, contents, pos):
- obj = _unicode.__new__(cls, contents)
+
+class Token(tuple[str, str | None]): # noqa: SLOT001
+ @overload
+ def __new__(
+ cls,
+ type_: Literal["IDENT", "HASH", "STRING", "S", "DELIM", "NUMBER"],
+ value: str,
+ pos: int,
+ ) -> Self: ...
+
+ @overload
+ def __new__(cls, type_: Literal["EOF"], value: None, pos: int) -> Self: ...
+
+ def __new__(cls, type_: str, value: str | None, pos: int) -> Self:
+ obj = tuple.__new__(cls, (type_, value))
obj.pos = pos
return obj
- def __repr__(self):
- return '%s(%s, %r)' % (
- self.__class__.__name__,
- _unicode.__repr__(self),
- self.pos)
+ def __repr__(self) -> str:
+ return f"<{self.type} '{self.value}' at {self.pos}>"
+
+ def is_delim(self, *values: str) -> bool:
+ return self.type == "DELIM" and self.value in values
+
+ pos: int
+
+ @property
+ def type(self) -> str:
+ return self[0]
-class Symbol(_UniToken):
- pass
+ @property
+ def value(self) -> str | None:
+ return self[1]
-class String(_UniToken):
- pass
+ def css(self) -> str:
+ if self.type == "STRING":
+ return repr(self.value)
+ return cast("str", self.value)
-class Token(_UniToken):
- pass
+
+class EOFToken(Token):
+ def __new__(cls, pos: int) -> Self:
+ return Token.__new__(cls, "EOF", None, pos)
+
+ def __repr__(self) -> str:
+ return f"<{self.type} at {self.pos}>"
#### Tokenizer
-_match_whitespace = re.compile(r'\s+', re.UNICODE).match
-_replace_comments = re.compile(r'/\*.*?\*/', re.DOTALL).sub
+class TokenMacros:
+ unicode_escape = r"\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?"
+ escape = unicode_escape + r"|\\[^\n\r\f0-9a-f]"
+ string_escape = r"\\(?:\n|\r\n|\r|\f)|" + escape
+ nonascii = r"[^\0-\177]"
+ nmchar = f"[_a-z0-9-]|{escape}|{nonascii}"
+ nmstart = f"[_a-z]|{escape}|{nonascii}"
+
+
+class MatchFunc(Protocol):
+ def __call__(
+ self, string: str, pos: int = ..., endpos: int = ...
+ ) -> re.Match[str] | None: ...
+
+
+def _compile(pattern: str) -> MatchFunc:
+ return re.compile(pattern % vars(TokenMacros), re.IGNORECASE).match
-_match_count_number = re.compile(r'[+-]?\d*n(?:[+-]\d+)?').match
-def tokenize(s):
+_match_whitespace = _compile(r"[ \t\r\n\f]+")
+_match_number = _compile(r"[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)")
+_match_hash = _compile("#(?:%(nmchar)s)+")
+_match_ident = _compile("-?(?:%(nmstart)s)(?:%(nmchar)s)*")
+_match_string_by_quote = {
+ "'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"),
+ '"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'),
+}
+
+_sub_simple_escape = re.compile(r"\\(.)").sub
+_sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.IGNORECASE).sub
+_sub_newline_escape = re.compile(r"\\(?:\n|\r\n|\r|\f)").sub
+
+# Same as r'\1', but faster on CPython
+_replace_simple = operator.methodcaller("group", 1)
+
+
+def _replace_unicode(match: re.Match[str]) -> str:
+ codepoint = int(match.group(1), 16)
+ if codepoint > sys.maxunicode:
+ codepoint = 0xFFFD
+ return chr(codepoint)
+
+
+def unescape_ident(value: str) -> str:
+ value = _sub_unicode_escape(_replace_unicode, value)
+ return _sub_simple_escape(_replace_simple, value)
+
+
+def tokenize(s: str) -> Iterator[Token]:
pos = 0
- s = _replace_comments('', s)
len_s = len(s)
while pos < len_s:
match = _match_whitespace(s, pos=pos)
if match:
- yield Token(' ', pos)
+ yield Token("S", " ", pos)
pos = match.end()
continue
- match = _match_count_number(s, pos=pos)
- if match and match.group() != 'n':
- sym = s[pos:match.end()]
- yield Symbol(sym, pos)
+
+ match = _match_ident(s, pos=pos)
+ if match:
+ value = _sub_simple_escape(
+ _replace_simple, _sub_unicode_escape(_replace_unicode, match.group())
+ )
+ yield Token("IDENT", value, pos)
pos = match.end()
continue
- c = s[pos]
- c2 = s[pos:pos+2]
- if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='):
- yield Token(c2, pos)
- pos += 2
- continue
- if c in '>+~,.*=[]()|:#':
- yield Token(c, pos)
- pos += 1
- continue
- if c == '"' or c == "'":
- # Quoted string
- old_pos = pos
- sym, pos = tokenize_escaped_string(s, pos)
- yield String(sym, old_pos)
- continue
- old_pos = pos
- sym, pos = tokenize_symbol(s, pos)
- yield Symbol(sym, old_pos)
- continue
-split_at_string_escapes = re.compile(r'(\\(?:%s))'
- % '|'.join(['[A-Fa-f0-9]{1,6}(?:\r\n|\s)?',
- '[^A-Fa-f0-9]'])).split
+ match = _match_hash(s, pos=pos)
+ if match:
+ value = _sub_simple_escape(
+ _replace_simple,
+ _sub_unicode_escape(_replace_unicode, match.group()[1:]),
+ )
+ yield Token("HASH", value, pos)
+ pos = match.end()
+ continue
+ quote = s[pos]
+ if quote in _match_string_by_quote:
+ match = _match_string_by_quote[quote](s, pos=pos + 1)
+ assert match, "Should have found at least an empty match"
+ end_pos = match.end()
+ if end_pos == len_s:
+ raise SelectorSyntaxError(f"Unclosed string at {pos}")
+ if s[end_pos] != quote:
+ raise SelectorSyntaxError(f"Invalid string at {pos}")
+ value = _sub_simple_escape(
+ _replace_simple,
+ _sub_unicode_escape(
+ _replace_unicode, _sub_newline_escape("", match.group())
+ ),
+ )
+ yield Token("STRING", value, pos)
+ pos = end_pos + 1
+ continue
-def unescape_string_literal(literal):
- substrings = []
- for substring in split_at_string_escapes(literal):
- if not substring:
+ match = _match_number(s, pos=pos)
+ if match:
+ value = match.group()
+ yield Token("NUMBER", value, pos)
+ pos = match.end()
continue
- elif '\\' in substring:
- if substring[0] == '\\' and len(substring) > 1:
- substring = substring[1:]
- if substring[0] in '0123456789ABCDEFabcdef':
- # int() correctly ignores the potentially trailing whitespace
- substring = _unichr(int(substring, 16))
+
+ pos2 = pos + 2
+ if s[pos:pos2] == "/*":
+ pos = s.find("*/", pos2)
+ if pos == -1:
+ pos = len_s
else:
- raise SelectorSyntaxError(
- "Invalid escape sequence %r in string %r"
- % (substring.split('\\')[1], literal))
- substrings.append(substring)
- return ''.join(substrings)
-
-
-def tokenize_escaped_string(s, pos):
- quote = s[pos]
- assert quote in ('"', "'")
- pos = pos+1
- start = pos
- while 1:
- next = s.find(quote, pos)
- if next == -1:
- raise SelectorSyntaxError(
- "Expected closing %s for string in: %r"
- % (quote, s[start:]))
- result = s[start:next]
- if result.endswith('\\'):
- # next quote character is escaped
- pos = next+1
+ pos += 2
continue
- if '\\' in result:
- result = unescape_string_literal(result)
- return result, next+1
-
-
-_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)
-
-def tokenize_symbol(s, pos):
- start = pos
- match = _illegal_symbol.search(s, pos=pos)
- if not match:
- # Goes to end of s
- return s[start:], len(s)
- if match.start() == pos:
- raise SelectorSyntaxError(
- "Unexpected symbol: %r" % s[pos])
- if not match:
- result = s[start:]
- pos = len(s)
- else:
- result = s[start:match.start()]
- pos = match.start()
- try:
- result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape')
- except UnicodeDecodeError:
- import sys
- e = sys.exc_info()[1]
- raise SelectorSyntaxError(
- "Bad symbol %r: %s" % (result, e))
- return result, pos
-
-
-class TokenStream(object):
- def __init__(self, tokens, source=None):
- self.used = []
+
+ yield Token("DELIM", s[pos], pos)
+ pos += 1
+
+ assert pos == len_s
+ yield EOFToken(pos)
+
+
+class TokenStream:
+ def __init__(self, tokens: Iterable[Token], source: str | None = None) -> None:
+ self.used: list[Token] = []
self.tokens = iter(tokens)
self.source = source
- self.peeked = None
+ self.peeked: Token | None = None
self._peeking = False
- try:
- self.next_token = self.tokens.next
- except AttributeError:
- # Python 3
- self.next_token = self.tokens.__next__
+ self.next_token = self.tokens.__next__
- def next(self):
+ def next(self) -> Token:
if self._peeking:
self._peeking = False
+ assert self.peeked is not None
self.used.append(self.peeked)
return self.peeked
- else:
- try:
- next = self.next_token()
- self.used.append(next)
- return next
- except StopIteration:
- return None
+ next_ = self.next_token()
+ self.used.append(next_)
+ return next_
- def __iter__(self):
- return iter(self.next, None)
-
- def peek(self):
+ def peek(self) -> Token:
if not self._peeking:
- try:
- self.peeked = self.next_token()
- except StopIteration:
- return None
+ self.peeked = self.next_token()
self._peeking = True
+ assert self.peeked is not None
return self.peeked
- def next_symbol(self):
- next = self.next()
- if not isinstance(next, Symbol):
- raise SelectorSyntaxError(
- "Expected symbol, got '%s'" % next)
- return next
-
- def next_symbol_or_star(self):
- next = self.next()
- if next != '*' and not isinstance(next, Symbol):
- raise SelectorSyntaxError(
- "Expected symbol or '*', got '%s'" % next)
- return next
-
- def skip_whitespace(self):
- if self.peek() == ' ':
+ def next_ident(self) -> str:
+ next_ = self.next()
+ if next_.type != "IDENT":
+ raise SelectorSyntaxError(f"Expected ident, got {next_}")
+ return cast("str", next_.value)
+
+ def next_ident_or_star(self) -> str | None:
+ next_ = self.next()
+ if next_.type == "IDENT":
+ return next_.value
+ if next_ == ("DELIM", "*"):
+ return None
+ raise SelectorSyntaxError(f"Expected ident or '*', got {next_}")
+
+ def skip_whitespace(self) -> None:
+ peek = self.peek()
+ if peek.type == "S":
self.next()
diff --git a/cssselect/py.typed b/cssselect/py.typed
new file mode 100644
index 0000000..e69de29
diff --git a/cssselect/tests.py b/cssselect/tests.py
deleted file mode 100755
index 086f01f..0000000
--- a/cssselect/tests.py
+++ /dev/null
@@ -1,935 +0,0 @@
-#!/usr/bin/env python
-"""
- Tests for cssselect
- ===================
-
- These tests can be run either by py.test or by the standard library's
- unittest. They use plain ``assert`` statements and do little reporting
- themselves in case of failure.
-
- Use py.test to get fancy error reporting and assert introspection.
-
-
- :copyright: (c) 2007-2012 Ian Bicking and contributors.
- See AUTHORS for more details.
- :license: BSD, see LICENSE for more details.
-
-"""
-
-import sys
-import operator
-import unittest
-
-from lxml import html
-from cssselect import (parse, GenericTranslator, HTMLTranslator,
- SelectorSyntaxError, ExpressionError)
-from cssselect.parser import tokenize, parse_series
-
-
-class TestCssselect(unittest.TestCase):
- def test_tokenizer(self):
- tokens = [repr(item).replace("u'", "'")
- for item in tokenize('E > f[a~="y\\"x"]')]
- assert tokens == [
- "Symbol('E', 0)",
- "Token(' ', 1)",
- "Token('>', 2)",
- "Token(' ', 3)",
- "Symbol('f', 4)",
- "Token('[', 5)",
- "Symbol('a', 6)",
- "Token('~=', 7)",
- "String('y\"x', 9)",
- "Token(']', 15)"]
-
- def test_parser(self):
- def repr_parse(css):
- selectors = parse(css)
- for selector in selectors:
- assert selector.pseudo_element is None
- return [repr(selector._tree).replace("(u'", "('")
- for selector in selectors]
-
- def parse_many(first, *others):
- result = repr_parse(first)
- for other in others:
- assert repr_parse(other) == result
- return result
-
- assert parse_many('*') == ['Element[*]']
- assert parse_many('*|*') == ['Element[*]']
- assert parse_many('*|foo') == ['Element[foo]']
- assert parse_many('foo|*') == ['Element[foo|*]']
- assert parse_many('foo|bar') == ['Element[foo|bar]']
- # This will never match, but it is valid:
- assert parse_many('#foo#bar') == ['Hash[Hash[Element[*]#foo]#bar]']
- assert parse_many(
- 'div>.foo',
- 'div> .foo',
- 'div >.foo',
- 'div > .foo',
- 'div \n> \t \t .foo', 'div\r>\n\n\n.foo', 'div\f>\f.foo'
- ) == ['CombinedSelector[Element[div] > Class[Element[*].foo]]']
- assert parse_many('td.foo,.bar',
- 'td.foo, .bar',
- 'td.foo\t\r\n\f ,\t\r\n\f .bar'
- ) == [
- 'Class[Element[td].foo]',
- 'Class[Element[*].bar]'
- ]
- assert parse_many('div, td.foo, div.bar span') == [
- 'Element[div]',
- 'Class[Element[td].foo]',
- 'CombinedSelector[Class[Element[div].bar] '
- ' Element[span]]']
- assert parse_many('div > p') == [
- 'CombinedSelector[Element[div] > Element[p]]']
- assert parse_many('td:first') == [
- 'Pseudo[Element[td]:first]']
- assert parse_many('td:first') == [
- 'Pseudo[Element[td]:first]']
- assert parse_many('td :first') == [
- 'CombinedSelector[Element[td] '
- ' Pseudo[Element[*]:first]]']
- assert parse_many('td :first') == [
- 'CombinedSelector[Element[td] '
- ' Pseudo[Element[*]:first]]']
- assert parse_many('a[name]', 'a[ name\t]') == [
- 'Attrib[Element[a][name]]']
- assert parse_many('a [name]') == [
- 'CombinedSelector[Element[a] Attrib[Element[*][name]]]']
- assert parse_many('a[rel="include"]') == [
- "Attrib[Element[a][rel = String('include', 6)]]"]
- assert parse_many('a[rel = include]') == [
- "Attrib[Element[a][rel = Symbol('include', 8)]]"]
- assert parse_many("a[hreflang |= 'en']") == [
- "Attrib[Element[a][hreflang |= String('en', 14)]]"]
- assert parse_many('div:nth-child(10)') == [
- "Function[Element[div]:nth-child(Symbol('10', 14))]"]
- assert parse_many(':nth-child(2n+2)') == [
- "Function[Element[*]:nth-child(Symbol('2n+2', 11))]"]
- assert parse_many('div:nth-of-type(10)') == [
- "Function[Element[div]:nth-of-type(Symbol('10', 16))]"]
- assert parse_many('div div:nth-of-type(10) .aclass') == [
- 'CombinedSelector[CombinedSelector[Element[div] '
- "Function[Element[div]:nth-of-type(Symbol('10', 20))]] "
- ' Class[Element[*].aclass]]']
- assert parse_many('label:only') == [
- 'Pseudo[Element[label]:only]']
- assert parse_many('a:lang(fr)') == [
- "Function[Element[a]:lang(Symbol('fr', 7))]"]
- assert parse_many('div:contains("foo")') == [
- "Function[Element[div]:contains(String('foo', 13))]"]
- assert parse_many('div#foobar') == [
- 'Hash[Element[div]#foobar]']
- assert parse_many('div:not(div.foo)') == [
- 'Negation[Element[div]:not(Class[Element[div].foo])]']
- assert parse_many('td ~ th') == [
- 'CombinedSelector[Element[td] ~ Element[th]]']
-
- def test_pseudo_elements(self):
- def parse_pseudo(css):
- result = []
- for selector in parse(css):
- result.append((
- repr(selector._tree).replace("(u'", "('"),
- selector.pseudo_element))
- return result
-
- def parse_one(css):
- result = parse_pseudo(css)
- assert len(result) == 1
- return result[0]
-
- assert parse_one('foo') == ('Element[foo]', None)
- assert parse_one('*') == ('Element[*]', None)
- assert parse_one(':empty') == ('Pseudo[Element[*]:empty]', None)
-
- # Special cases for CSS 2.1 pseudo-elements
- assert parse_one(':before') == ('Element[*]', 'before')
- assert parse_one(':after') == ('Element[*]', 'after')
- assert parse_one(':first-line') == ('Element[*]', 'first-line')
- assert parse_one(':first-letter') == ('Element[*]', 'first-letter')
-
- assert parse_one('::before') == ('Element[*]', 'before')
- assert parse_one('::after') == ('Element[*]', 'after')
- assert parse_one('::first-line') == ('Element[*]', 'first-line')
- assert parse_one('::first-letter') == ('Element[*]', 'first-letter')
-
- assert parse_one('::selection') == ('Element[*]', 'selection')
- assert parse_one('foo:after') == ('Element[foo]', 'after')
- assert parse_one('foo::selection') == ('Element[foo]', 'selection')
- assert parse_one('lorem#ipsum ~ a#b.c[href]:empty::selection') == (
- 'CombinedSelector[Hash[Element[lorem]#ipsum] ~ '
- 'Pseudo[Attrib[Class[Hash[Element[a]#b].c][href]]:empty]]',
- 'selection')
-
- parse_pseudo('foo:before, bar, baz:after') == [
- ('Element[foo]', 'before'),
- ('Element[bar]', None),
- ('Element[baz]', 'after')]
-
- def test_specificity(self):
- def specificity(css):
- selectors = parse(css)
- assert len(selectors) == 1
- return selectors[0].specificity()
-
- assert specificity('*') == (0, 0, 0)
- assert specificity(' foo') == (0, 0, 1)
- assert specificity(':empty ') == (0, 1, 0)
- assert specificity(':before') == (0, 0, 1)
- assert specificity('*:before') == (0, 0, 1)
- assert specificity(':nth-child(2)') == (0, 1, 0)
- assert specificity('.bar') == (0, 1, 0)
- assert specificity('[baz]') == (0, 1, 0)
- assert specificity('[baz="4"]') == (0, 1, 0)
- assert specificity('[baz^="4"]') == (0, 1, 0)
- assert specificity('#lipsum') == (1, 0, 0)
-
- assert specificity('foo:empty') == (0, 1, 1)
- assert specificity('foo:before') == (0, 0, 2)
- assert specificity('foo::before') == (0, 0, 2)
- assert specificity('foo:empty::before') == (0, 1, 2)
-
- assert specificity('#lorem + foo#ipsum:first-child > bar:first-line'
- ) == (2, 1, 3)
-
- def test_parse_errors(self):
- def get_error(css):
- try:
- parse(css)
- except SelectorSyntaxError:
- # Py2, Py3, ...
- return str(sys.exc_info()[1]).replace("(u'", "('")
-
- assert get_error('attributes(href)/html/body/a') == (
- "Expected selector, got '(' at "
- "[Symbol('attributes', 0)] -> Token('(', 10)")
- assert get_error('attributes(href)') == (
- "Expected selector, got '(' at "
- "[Symbol('attributes', 0)] -> Token('(', 10)")
- assert get_error('html/body/a') == (
- "Unexpected symbol: '/' at [Symbol('html', 0)] -> None")
- assert get_error(' ') == (
- "Expected selector, got 'None' at [Token(' ', 0)] -> None")
- assert get_error('div, ') == (
- "Expected selector, got 'None' at "
- "[Symbol('div', 0), Token(',', 3), Token(' ', 4)] -> None")
- assert get_error(' , div') == (
- "Expected selector, got ',' at "
- "[Token(' ', 0)] -> Token(',', 1)")
- assert get_error('p, , div') == (
- "Expected selector, got ',' at "
- "[Symbol('p', 0), Token(',', 1), Token(' ', 2)] -> Token(',', 3)")
- assert get_error('div > ') == (
- "Expected selector, got 'None' at "
- "[Symbol('div', 0), Token(' ', 3), Token('>', 4), Token(' ', 5)]"
- " -> None")
- assert get_error(' > div') == (
- "Expected selector, got '>' at [Token(' ', 0)] -> Token('>', 2)")
- assert get_error('foo|#bar') == (
- "Expected symbol or '*', got '#' at "
- "[Symbol('foo', 0), Token('|', 3), "
- "Token('#', 4)] -> Symbol('bar', 5)")
- assert get_error('#.foo') == (
- "Expected symbol, got '.' at "
- "[Token('#', 0), Token('.', 1)] -> Symbol('foo', 2)")
- assert get_error('.#foo') == (
- "Expected symbol, got '#' at "
- "[Token('.', 0), Token('#', 1)] -> Symbol('foo', 2)")
- assert get_error(':#foo') == (
- "Expected symbol, got '#' at "
- "[Token(':', 0), Token('#', 1)] -> Symbol('foo', 2)")
- assert get_error('[*]') == (
- "Expected '|', got ']' at "
- "[Token('[', 0), Token('*', 1)] -> Token(']', 2)")
- assert get_error('[foo|]') == (
- "Expected symbol, got ']' at "
- "[Token('[', 0), Symbol('foo', 1), Token('|', 4), Token(']', 5)]"
- " -> None")
- assert get_error('[#]') == (
- "Expected symbol or '*', got '#' at "
- "[Token('[', 0), Token('#', 1)] -> Token(']', 2)")
- assert get_error('[foo=#]') == (
- "Expected string or symbol, got '#' at "
- "[Token('[', 0), Symbol('foo', 1), Token('=', 4), Token('#', 5)]"
- " -> Token(']', 6)")
- assert get_error(':nth-child()') == (
- "Expected argument, got ')' at "
- "[Token(':', 0), Symbol('nth-child', 1), Token('(', 10)]"
- " -> Token(')', 11)")
- assert get_error('[href]a') == (
- "Expected selector, got 'a' at "
- "[Token('[', 0), Symbol('href', 1), Token(']', 5)]"
- " -> Symbol('a', 6)")
-
- # Mis-placed pseudo-elements
- assert get_error('a:before:empty') == (
- "A pseudo-element must be at the end of a selector at "
- "[Symbol('a', 0), Token(':', 1), Symbol('before', 2)] "
- "-> Token(':', 8)")
- assert get_error('li:before a') == (
- "A pseudo-element must be at the end of a selector at "
- "[Symbol('li', 0), Token(':', 2), Symbol('before', 3), "
- "Token(' ', 9)] -> Symbol('a', 10)")
- assert get_error(':not(:before)') == (
- "Pseudo-elements are not allowed inside :not() at "
- "[Token(':', 0), Symbol('not', 1), Token('(', 4), Token(':', 5),"
- " Symbol('before', 6)] -> Token(')', 12)")
-
-
- def test_translation(self):
- def xpath(css):
- return str(GenericTranslator().css_to_xpath(css, prefix=''))
-
- assert xpath('*') == "*"
- assert xpath('E') == "e"
- assert xpath('E[foo]') == "e[@foo]"
- assert xpath('E[foo="bar"]') == "e[@foo = 'bar']"
- assert xpath('E[foo~="bar"]') == (
- "e[@foo and contains("
- "concat(' ', normalize-space(@foo), ' '), ' bar ')]")
- assert xpath('E[foo^="bar"]') == (
- "e[@foo and starts-with(@foo, 'bar')]")
- assert xpath('E[foo$="bar"]') == (
- "e[@foo and substring(@foo, string-length(@foo)-2) = 'bar']")
- assert xpath('E[foo*="bar"]') == (
- "e[@foo and contains(@foo, 'bar')]")
- assert xpath('E[hreflang|="en"]') == (
- "e[@hreflang and ("
- "@hreflang = 'en' or starts-with(@hreflang, 'en-'))]")
- assert xpath('E:nth-child(1)') == (
- "*/*[name() = 'e' and (position() = 1)]")
- assert xpath('E:nth-last-child(1)') == (
- "*/*[name() = 'e' and (position() = last() - 1)]")
- assert xpath('E:nth-last-child(2n+2)') == (
- "*/*[name() = 'e' and ("
- "(position() +2) mod -2 = 0 and position() < (last() -2))]")
- assert xpath('E:nth-of-type(1)') == (
- "*/e[position() = 1]")
- assert xpath('E:nth-last-of-type(1)') == (
- "*/e[position() = last() - 1]")
- assert xpath('E:nth-last-of-type(1)') == (
- "*/e[position() = last() - 1]")
- assert xpath('div E:nth-last-of-type(1) .aclass') == (
- "div/descendant-or-self::*/e[position() = last() - 1]"
- "/descendant-or-self::*/*[@class and contains("
- "concat(' ', normalize-space(@class), ' '), ' aclass ')]")
- assert xpath('E:first-child') == (
- "*/*[name() = 'e' and (position() = 1)]")
- assert xpath('E:last-child') == (
- "*/*[name() = 'e' and (position() = last())]")
- assert xpath('E:first-of-type') == (
- "*/e[position() = 1]")
- assert xpath('E:last-of-type') == (
- "*/e[position() = last()]")
- assert xpath('E:only-child') == (
- "*/*[name() = 'e' and (last() = 1)]")
- assert xpath('E:only-of-type') == (
- "e[last() = 1]")
- assert xpath('E:empty') == (
- "e[not(*) and not(normalize-space())]")
- assert xpath('E:root') == (
- "e[not(parent::*)]")
- assert xpath('E:contains("foo")') == (
- "e[contains(string(.), 'foo')]")
- assert xpath('E:contains(foo)') == (
- "e[contains(string(.), 'foo')]")
- assert xpath('E.warning') == (
- "e[@class and contains("
- "concat(' ', normalize-space(@class), ' '), ' warning ')]")
- assert xpath('E#myid') == (
- "e[@id = 'myid']")
- assert xpath('E:not(:nth-child(odd))') == (
- "e[not((position() -1) mod 2 = 0 and position() >= 1)]")
- assert xpath('E F') == (
- "e/descendant-or-self::*/f")
- assert xpath('E > F') == (
- "e/f")
- assert xpath('E + F') == (
- "e/following-sibling::*[name() = 'f' and (position() = 1)]")
- assert xpath('E ~ F') == (
- "e/following-sibling::f")
- assert xpath('div#container p') == (
- "div[@id = 'container']/descendant-or-self::*/p")
- self.assertRaises(ExpressionError, xpath, 'p *:only-of-type')
-
- def test_unicode(self):
- if sys.version_info[0] >= 3:
- css = '.a\xc1b'
- else:
- css = '.a\xc1b'.decode('ISO-8859-1')
-
- xpath = GenericTranslator().css_to_xpath(css)
- assert css[1:] in xpath
- xpath = xpath.encode('ascii', 'xmlcharrefreplace').decode('ASCII')
- assert xpath == (
- "descendant-or-self::*[@class and contains("
- "concat(' ', normalize-space(@class), ' '), ' aÁb ')]")
-
- def test_quoting(self):
- css_to_xpath = GenericTranslator().css_to_xpath
- assert css_to_xpath('*[aval="\'"]') == (
- '''descendant-or-self::*[@aval = "'"]''')
- assert css_to_xpath('*[aval="\'\'\'"]') == (
- """descendant-or-self::*[@aval = "'''"]""")
- assert css_to_xpath('*[aval=\'"\']') == (
- '''descendant-or-self::*[@aval = '"']''')
- assert css_to_xpath('*[aval=\'"""\']') == (
- '''descendant-or-self::*[@aval = '"""']''')
-
- def test_unicode_escapes(self):
- # \22 == '"' \20 == ' '
- css_to_xpath = GenericTranslator().css_to_xpath
- assert css_to_xpath(r'*[aval="\'\22\'"]') == (
- '''descendant-or-self::*[@aval = concat("'",'"',"'")]''')
- assert css_to_xpath(r'*[aval="\'\22 2\'"]') == (
- '''descendant-or-self::*[@aval = concat("'",'"2',"'")]''')
- assert css_to_xpath(r'*[aval="\'\20 \'"]') == (
- '''descendant-or-self::*[@aval = "' '"]''')
- assert css_to_xpath('*[aval="\'\\20\r\n \'"]') == (
- '''descendant-or-self::*[@aval = "' '"]''')
-
- def test_series(self):
- assert parse_series('1n+3') == (1, 3)
- assert parse_series('n-5') == (1, -5)
- assert parse_series('odd') == (2, 1)
- assert parse_series('even') == (2, 0)
- assert parse_series('3n') == (3, 0)
- assert parse_series('n') == (1, 0)
- assert parse_series('5') == (0, 5)
-
- def test_select(self):
- document = html.document_fromstring(HTML_IDS)
- sort_key = dict(
- (el, count) for count, el in enumerate(document.getiterator())
- ).__getitem__
- css_to_xpath = GenericTranslator().css_to_xpath
- html_css_to_xpath = HTMLTranslator().css_to_xpath
-
- def select_ids(selector, html_only):
- xpath = css_to_xpath(selector)
- items = document.xpath(xpath)
- if html_only:
- assert items == []
- xpath = html_css_to_xpath(selector)
- items = document.xpath(xpath)
- items.sort(key=sort_key)
- return [element.get('id', 'nil') for element in items]
-
- def pcss(main, *selectors, **kwargs):
- html_only = kwargs.pop('html_only', False)
- result = select_ids(main, html_only)
- for selector in selectors:
- assert select_ids(selector, html_only) == result
- return result
-
- all_ids = pcss('*')
- assert all_ids[:4] == ['html', 'nil', 'nil', 'outer-div']
- assert all_ids[-1:] == ['foobar-span']
- assert pcss('div') == ['outer-div', 'li-div', 'foobar-div']
- assert pcss('div div') == ['li-div']
- assert pcss('div, div div') == ['outer-div', 'li-div', 'foobar-div']
- assert pcss('a[name]') == ['name-anchor']
- assert pcss('a[rel]') == ['tag-anchor', 'nofollow-anchor']
- assert pcss('a[rel="tag"]') == ['tag-anchor']
- assert pcss('a[href*="localhost"]') == ['tag-anchor']
- assert pcss('a[href^="http"]') == ['tag-anchor', 'nofollow-anchor']
- assert pcss('a[href^="http:"]') == ['tag-anchor']
- assert pcss('a[href$="org"]') == ['nofollow-anchor']
- assert pcss('div[foobar~="bc"]', 'div[foobar~="cde"]') == [
- 'foobar-div']
- assert pcss('div[foobar~="cd"]') == []
- assert pcss('*[lang|="en"]', '*[lang|="en-US"]') == ['second-li']
- assert pcss('*[lang|="e"]') == []
- assert pcss('li:nth-child(3)') == ['third-li']
- assert pcss('li:nth-child(10)') == []
- assert pcss('li:nth-child(2n)', 'li:nth-child(even)',
- 'li:nth-child(2n+0)') == [
- 'second-li', 'fourth-li', 'sixth-li']
- assert pcss('li:nth-child(+2n+1)', 'li:nth-child(odd)') == [
- 'first-li', 'third-li', 'fifth-li', 'seventh-li']
- assert pcss('li:nth-child(2n+4)') == ['fourth-li', 'sixth-li']
- # FIXME: I'm not 100% sure this is right:
- assert pcss('li:nth-child(3n+1)') == [
- 'first-li', 'fourth-li', 'seventh-li']
- assert pcss('li:nth-last-child(0)') == [
- 'seventh-li']
- assert pcss('li:nth-last-child(2n)', 'li:nth-last-child(even)') == [
- 'second-li', 'fourth-li', 'sixth-li']
- assert pcss('li:nth-last-child(2n+2)') == ['second-li', 'fourth-li']
- assert pcss('ol:first-of-type') == ['first-ol']
- assert pcss('ol:nth-child(1)') == []
- assert pcss('ol:nth-of-type(2)') == ['second-ol']
- # FIXME: like above', '(1) or (2)?
- assert pcss('ol:nth-last-of-type(1)') == ['first-ol']
- assert pcss('span:only-child') == ['foobar-span']
- assert pcss('li div:only-child') == ['li-div']
- assert pcss('div *:only-child') == [
- 'li-div', 'checkbox-disabled', 'foobar-span']
- self.assertRaises(ExpressionError, pcss, 'p *:only-of-type')
- self.assertRaises(ExpressionError, pcss, 'p:lang(fr)')
- assert pcss('p:only-of-type') == ['paragraph']
- assert pcss('a:empty') == ['name-anchor']
- assert pcss('li:empty') == [
- 'third-li', 'fourth-li', 'fifth-li', 'sixth-li', 'seventh-li']
- assert pcss(':root', 'html:root') == ['html']
- assert pcss('li:root', '* :root') == []
- assert pcss('*:contains("link")') == [
- 'html', 'nil', 'outer-div', 'tag-anchor', 'nofollow-anchor']
- assert pcss('*:contains("LInk")') == [] # case sensitive
- assert pcss('*:contains("e")') == [
- 'html', 'nil', 'outer-div', 'first-ol', 'first-li',
- 'paragraph', 'p-em']
- assert pcss('*:contains("E")') == [] # case-sensitive
- assert pcss('.a', '.b', '*.a', 'ol.a') == ['first-ol']
- assert pcss('.c', '*.c') == ['first-ol', 'third-li', 'fourth-li']
- assert pcss('ol *.c', 'ol li.c', 'li ~ li.c', 'ol > li.c') == [
- 'third-li', 'fourth-li']
- assert pcss('#first-li', 'li#first-li', '*#first-li') == ['first-li']
- # Need some tests of :not()']
- assert pcss('li div', 'li > div', 'div div') == ['li-div']
- assert pcss('div > div') == []
- assert pcss('div>.c', 'div > .c') == ['first-ol']
- assert pcss('div + div') == ['foobar-div']
- assert pcss('a ~ a') == ['tag-anchor', 'nofollow-anchor']
- assert pcss('a[rel="tag"] ~ a') == ['nofollow-anchor']
- assert pcss('ol#first-ol li:last-child') == ['seventh-li']
- assert pcss('ol#first-ol *:last-child') == ['li-div', 'seventh-li']
- assert pcss('#outer-div:first-child') == ['outer-div']
- assert pcss('#outer-div :first-child') == [
- 'name-anchor', 'first-li', 'li-div', 'p-b', 'checkbox-disabled']
- assert pcss('a[href]') == ['tag-anchor', 'nofollow-anchor']
- assert pcss(':link', html_only=True) == pcss('a[href]')
- assert pcss(':checked', html_only=True) == ['checkbox-checked']
- assert pcss(':disabled', html_only=True) == [
- 'fieldset', 'checkbox-disabled']
- assert pcss(':enabled', html_only=True) == [
- 'checkbox-unchecked', 'checkbox-checked']
-
- def test_select_shakespeare(self):
- document = html.document_fromstring(HTML_SHAKESPEARE)
- body = document.xpath('//body')[0]
- css_to_xpath = GenericTranslator().css_to_xpath
-
- try:
- basestring_ = basestring
- except NameError:
- basestring_ = (str, bytes)
-
- def count(selector):
- xpath = css_to_xpath(selector)
- results = body.xpath(xpath)
- assert not isinstance(results, basestring_)
- found = set()
- for item in results:
- assert item not in found
- found.add(item)
- assert not isinstance(item, basestring_)
- return len(results)
-
- # Data borrowed from http://mootools.net/slickspeed/
-
- ## Changed from original; probably because I'm only
- ## searching the body.
- #assert count('*') == 252
- assert count('*') == 246
- assert count('div:contains(CELIA)') == 26
- assert count('div:only-child') == 22 # ?
- assert count('div:nth-child(even)') == 106
- assert count('div:nth-child(2n)') == 106
- assert count('div:nth-child(odd)') == 137
- assert count('div:nth-child(2n+1)') == 137
- assert count('div:nth-child(n)') == 243
- assert count('div:last-child') == 53
- assert count('div:first-child') == 51
- assert count('div > div') == 242
- assert count('div + div') == 190
- assert count('div ~ div') == 190
- assert count('body') == 1
- assert count('body div') == 243
- assert count('div') == 243
- assert count('div div') == 242
- assert count('div div div') == 241
- assert count('div, div, div') == 243
- assert count('div, a, span') == 243
- assert count('.dialog') == 51
- assert count('div.dialog') == 51
- assert count('div .dialog') == 51
- assert count('div.character, div.dialog') == 99
- assert count('div.direction.dialog') == 0
- assert count('div.dialog.direction') == 0
- assert count('div.dialog.scene') == 1
- assert count('div.scene.scene') == 1
- assert count('div.scene .scene') == 0
- assert count('div.direction .dialog ') == 0
- assert count('div .dialog .direction') == 4
- assert count('div.dialog .dialog .direction') == 4
- assert count('#speech5') == 1
- assert count('div#speech5') == 1
- assert count('div #speech5') == 1
- assert count('div.scene div.dialog') == 49
- assert count('div#scene1 div.dialog div') == 142
- assert count('#scene1 #speech1') == 1
- assert count('div[class]') == 103
- assert count('div[class=dialog]') == 50
- assert count('div[class^=dia]') == 51
- assert count('div[class$=log]') == 50
- assert count('div[class*=sce]') == 1
- assert count('div[class|=dialog]') == 50 # ? Seems right
- assert count('div[class!=madeup]') == 243 # ? Seems right
- assert count('div[class~=dialog]') == 51 # ? Seems right
-
-HTML_IDS = '''
-
-
-
-
link
-
- link
-
- - content
- -
-
-
-
-
-
-
-
- -
-
-
- hi there
- guy
-
-
-
-
-
-
-
-
-
-'''
-
-
-HTML_SHAKESPEARE = '''
-
-
-
-
-
-
-
-
-
As You Like It
-
- by William Shakespeare
-
-
-
ACT I, SCENE III. A room in the palace.
-
-
Enter CELIA and ROSALIND
-
-
CELIA
-
-
Why, cousin! why, Rosalind! Cupid have mercy! not a word?
-
-
ROSALIND
-
-
Not one to throw at a dog.
-
-
CELIA
-
-
No, thy words are too precious to be cast away upon
-
curs; throw some of them at me; come, lame me with reasons.
-
-
ROSALIND
-
CELIA
-
-
But is all this for your father?
-
-
-
Then there were two cousins laid up; when the one
-
should be lamed with reasons and the other mad
-
without any.
-
-
ROSALIND
-
-
No, some of it is for my child's father. O, how
-
full of briers is this working-day world!
-
-
CELIA
-
-
They are but burs, cousin, thrown upon thee in
-
holiday foolery: if we walk not in the trodden
-
paths our very petticoats will catch them.
-
-
ROSALIND
-
-
I could shake them off my coat: these burs are in my heart.
-
-
CELIA
-
-
ROSALIND
-
-
I would try, if I could cry 'hem' and have him.
-
-
CELIA
-
-
Come, come, wrestle with thy affections.
-
-
ROSALIND
-
-
O, they take the part of a better wrestler than myself!
-
-
CELIA
-
-
O, a good wish upon you! you will try in time, in
-
despite of a fall. But, turning these jests out of
-
service, let us talk in good earnest: is it
-
possible, on such a sudden, you should fall into so
-
strong a liking with old Sir Rowland's youngest son?
-
-
ROSALIND
-
-
The duke my father loved his father dearly.
-
-
CELIA
-
-
Doth it therefore ensue that you should love his son
-
dearly? By this kind of chase, I should hate him,
-
for my father hated his father dearly; yet I hate
-
not Orlando.
-
-
ROSALIND
-
-
No, faith, hate him not, for my sake.
-
-
CELIA
-
-
Why should I not? doth he not deserve well?
-
-
ROSALIND
-
-
Let me love him for that, and do you love him
-
because I do. Look, here comes the duke.
-
-
CELIA
-
-
With his eyes full of anger.
-
Enter DUKE FREDERICK, with Lords
-
-
DUKE FREDERICK
-
-
Mistress, dispatch you with your safest haste
-
And get you from our court.
-
-
ROSALIND
-
-
DUKE FREDERICK
-
-
You, cousin
-
Within these ten days if that thou be'st found
-
So near our public court as twenty miles,
-
Thou diest for it.
-
-
ROSALIND
-
-
I do beseech your grace,
-
Let me the knowledge of my fault bear with me:
-
If with myself I hold intelligence
-
Or have acquaintance with mine own desires,
-
If that I do not dream or be not frantic,--
-
As I do trust I am not--then, dear uncle,
-
Never so much as in a thought unborn
-
Did I offend your highness.
-
-
DUKE FREDERICK
-
-
Thus do all traitors:
-
If their purgation did consist in words,
-
They are as innocent as grace itself:
-
Let it suffice thee that I trust thee not.
-
-
ROSALIND
-
-
Yet your mistrust cannot make me a traitor:
-
Tell me whereon the likelihood depends.
-
-
DUKE FREDERICK
-
-
Thou art thy father's daughter; there's enough.
-
-
ROSALIND
-
-
So was I when your highness took his dukedom;
-
So was I when your highness banish'd him:
-
Treason is not inherited, my lord;
-
Or, if we did derive it from our friends,
-
What's that to me? my father was no traitor:
-
Then, good my liege, mistake me not so much
-
To think my poverty is treacherous.
-
-
CELIA
-
-
Dear sovereign, hear me speak.
-
-
DUKE FREDERICK
-
-
Ay, Celia; we stay'd her for your sake,
-
Else had she with her father ranged along.
-
-
CELIA
-
-
I did not then entreat to have her stay;
-
It was your pleasure and your own remorse:
-
I was too young that time to value her;
-
But now I know her: if she be a traitor,
-
Why so am I; we still have slept together,
-
Rose at an instant, learn'd, play'd, eat together,
-
And wheresoever we went, like Juno's swans,
-
Still we went coupled and inseparable.
-
-
DUKE FREDERICK
-
-
She is too subtle for thee; and her smoothness,
-
Her very silence and her patience
-
Speak to the people, and they pity her.
-
Thou art a fool: she robs thee of thy name;
-
And thou wilt show more bright and seem more virtuous
-
When she is gone. Then open not thy lips:
-
Firm and irrevocable is my doom
-
Which I have pass'd upon her; she is banish'd.
-
-
CELIA
-
-
Pronounce that sentence then on me, my liege:
-
I cannot live out of her company.
-
-
DUKE FREDERICK
-
-
You are a fool. You, niece, provide yourself:
-
If you outstay the time, upon mine honour,
-
And in the greatness of my word, you die.
-
Exeunt DUKE FREDERICK and Lords
-
-
CELIA
-
-
O my poor Rosalind, whither wilt thou go?
-
Wilt thou change fathers? I will give thee mine.
-
I charge thee, be not thou more grieved than I am.
-
-
ROSALIND
-
-
CELIA
-
-
Thou hast not, cousin;
-
Prithee be cheerful: know'st thou not, the duke
-
Hath banish'd me, his daughter?
-
-
ROSALIND
-
-
CELIA
-
-
No, hath not? Rosalind lacks then the love
-
Which teacheth thee that thou and I am one:
-
Shall we be sunder'd? shall we part, sweet girl?
-
No: let my father seek another heir.
-
Therefore devise with me how we may fly,
-
Whither to go and what to bear with us;
-
And do not seek to take your change upon you,
-
To bear your griefs yourself and leave me out;
-
For, by this heaven, now at our sorrows pale,
-
Say what thou canst, I'll go along with thee.
-
-
ROSALIND
-
-
Why, whither shall we go?
-
-
CELIA
-
-
To seek my uncle in the forest of Arden.
-
-
ROSALIND
-
-
Alas, what danger will it be to us,
-
Maids as we are, to travel forth so far!
-
Beauty provoketh thieves sooner than gold.
-
-
CELIA
-
-
I'll put myself in poor and mean attire
-
And with a kind of umber smirch my face;
-
The like do you: so shall we pass along
-
And never stir assailants.
-
-
ROSALIND
-
-
Were it not better,
-
Because that I am more than common tall,
-
That I did suit me all points like a man?
-
A gallant curtle-axe upon my thigh,
-
A boar-spear in my hand; and--in my heart
-
Lie there what hidden woman's fear there will--
-
We'll have a swashing and a martial outside,
-
As many other mannish cowards have
-
That do outface it with their semblances.
-
-
CELIA
-
-
What shall I call thee when thou art a man?
-
-
ROSALIND
-
-
I'll have no worse a name than Jove's own page;
-
And therefore look you call me Ganymede.
-
But what will you be call'd?
-
-
CELIA
-
-
Something that hath a reference to my state
-
No longer Celia, but Aliena.
-
-
ROSALIND
-
-
But, cousin, what if we assay'd to steal
-
The clownish fool out of your father's court?
-
Would he not be a comfort to our travel?
-
-
CELIA
-
-
He'll go along o'er the wide world with me;
-
Leave me alone to woo him. Let's away,
-
And get our jewels and our wealth together,
-
Devise the fittest time and safest way
-
To hide us from pursuit that will be made
-
After my flight. Now go we in content
-
To liberty and not to banishment.
-
Exeunt
-
-
-
-
-
-
-'''
-
-
-if __name__ == '__main__':
- unittest.main()
diff --git a/cssselect/xpath.py b/cssselect/xpath.py
index 23a165c..96eac3f 100644
--- a/cssselect/xpath.py
+++ b/cssselect/xpath.py
@@ -1,27 +1,46 @@
"""
- cssselect.xpath
- ===============
+cssselect.xpath
+===============
- Translation of parsed CSS selectors to XPath expressions.
+Translation of parsed CSS selectors to XPath expressions.
- :copyright: (c) 2007-2012 Ian Bicking and contributors.
- See AUTHORS for more details.
- :license: BSD, see LICENSE for more details.
+:copyright: (c) 2007-2012 Ian Bicking and contributors.
+See AUTHORS for more details.
+:license: BSD, see LICENSE for more details.
"""
-import re
-from cssselect.parser import parse, parse_series, SelectorError
-
+from __future__ import annotations
-try:
- _basestring = basestring
- _unicode = unicode
-except NameError:
- # Python 3
- _basestring = str
- _unicode = str
+import re
+from typing import TYPE_CHECKING, cast
+
+from cssselect.parser import (
+ Attrib,
+ Class,
+ CombinedSelector,
+ Element,
+ Function,
+ Hash,
+ Matching,
+ Negation,
+ Pseudo,
+ PseudoElement,
+ Relation,
+ Selector,
+ SelectorError,
+ SpecificityAdjustment,
+ Tree,
+ parse,
+ parse_series,
+)
+
+if TYPE_CHECKING:
+ from collections.abc import Callable
+
+ # typing.Self requires Python 3.11
+ from typing_extensions import Self
class ExpressionError(SelectorError, RuntimeError):
@@ -30,368 +49,656 @@ class ExpressionError(SelectorError, RuntimeError):
#### XPath Helpers
-class XPathExpr(object):
- def __init__(self, path='', element='*', condition='', star_prefix=False):
+class XPathExpr:
+ def __init__(
+ self,
+ path: str = "",
+ element: str = "*",
+ condition: str = "",
+ star_prefix: bool = False,
+ ) -> None:
self.path = path
self.element = element
self.condition = condition
- self.star_prefix = star_prefix
- def __str__(self):
- path = _unicode(self.path) + _unicode(self.element)
+ def __str__(self) -> str:
+ path = str(self.path) + str(self.element)
if self.condition:
- path += '[%s]' % self.condition
+ path += f"[{self.condition}]"
return path
- def __repr__(self):
- return '%s[%s]' % (self.__class__.__name__, self)
+ def __repr__(self) -> str:
+ return f"{self.__class__.__name__}[{self}]"
- def add_condition(self, condition):
+ def add_condition(self, condition: str, conjuction: str = "and") -> Self:
if self.condition:
- self.condition = '%s and (%s)' % (self.condition, condition)
+ self.condition = f"({self.condition}) {conjuction} ({condition})"
else:
self.condition = condition
return self
- def add_name_test(self):
- if self.element == '*':
+ def add_name_test(self) -> None:
+ if self.element == "*":
# We weren't doing a test anyway
return
- self.add_condition(
- "name() = %s" % GenericTranslator.xpath_literal(self.element))
- self.element = '*'
+ self.add_condition(f"name() = {GenericTranslator.xpath_literal(self.element)}")
+ self.element = "*"
- def add_star_prefix(self):
+ def add_star_prefix(self) -> None:
"""
- Adds a /* prefix if there is no prefix. This is when you need
- to keep context's constrained to a single parent.
+ Append '*/' to the path to keep the context constrained
+ to a single parent.
"""
- if self.path:
- self.path += '*/'
- else:
- self.path = '*/'
- self.star_prefix = True
-
- def join(self, combiner, other):
- path = _unicode(self) + combiner
- # We don't need a star prefix if we are joining to this other
- # prefix; so we'll get rid of it
- if not(other.star_prefix and other.path == '*/'):
+ self.path += "*/"
+
+ def join(
+ self,
+ combiner: str,
+ other: XPathExpr,
+ closing_combiner: str | None = None,
+ has_inner_condition: bool = False,
+ ) -> Self:
+ path = str(self) + combiner
+ # Any "star prefix" is redundant when joining.
+ if other.path != "*/":
path += other.path
self.path = path
- self.element = other.element
- self.condition = other.condition
+ if not has_inner_condition:
+ self.element = (
+ other.element + closing_combiner if closing_combiner else other.element
+ )
+ self.condition = other.condition
+ else:
+ self.element = other.element
+ if other.condition:
+ self.element += "[" + other.condition + "]"
+ if closing_combiner:
+ self.element += closing_combiner
return self
split_at_single_quotes = re.compile("('+)").split
+# The spec is actually more permissive than that, but don’t bother.
+# This is just for the fast path.
+# http://www.w3.org/TR/REC-xml/#NT-NameStartChar
+is_safe_name = re.compile("^[a-zA-Z_][a-zA-Z0-9_.-]*$").match
+
+# Test that the string is not empty and does not contain whitespace
+is_non_whitespace = re.compile(r"^[^ \t\r\n\f]+$").match
+
#### Translation
-class GenericTranslator(object):
+
+class GenericTranslator:
"""
Translator for "generic" XML documents.
+
+ Everything is case-sensitive, no assumption is made on the meaning
+ of element names and attribute names.
+
"""
+
+ ####
+ #### HERE BE DRAGONS
+ ####
+ #### You are welcome to hook into this to change some behavior,
+ #### but do so at your own risk.
+ #### Until it has received a lot more work and review,
+ #### I reserve the right to change this API in backward-incompatible ways
+ #### with any minor version of cssselect.
+ #### See https://github.com/scrapy/cssselect/pull/22
+ #### -- Simon Sapin.
+ ####
+
combinator_mapping = {
- ' ': 'descendant',
- '>': 'child',
- '+': 'direct_adjacent',
- '~': 'indirect_adjacent',
+ " ": "descendant",
+ ">": "child",
+ "+": "direct_adjacent",
+ "~": "indirect_adjacent",
}
attribute_operator_mapping = {
- 'exists': 'exists',
- '=': 'equals',
- '~=': 'includes',
- '|=': 'dashmatch',
- '^=': 'prefixmatch',
- '$=': 'suffixmatch',
- '*=': 'substringmatch',
- '!=': 'different', # XXX Not in Level 3 but meh
+ "exists": "exists",
+ "=": "equals",
+ "~=": "includes",
+ "|=": "dashmatch",
+ "^=": "prefixmatch",
+ "$=": "suffixmatch",
+ "*=": "substringmatch",
+ "!=": "different", # XXX Not in Level 3 but meh
}
#: The attribute used for ID selectors depends on the document language:
#: http://www.w3.org/TR/selectors/#id-selectors
- id_attribute = 'id'
-
- def css_to_xpath(self, css, prefix='descendant-or-self::'):
+ id_attribute = "id"
+
+ #: The attribute used for ``:lang()`` depends on the document language:
+ #: http://www.w3.org/TR/selectors/#lang-pseudo
+ lang_attribute = "xml:lang"
+
+ #: The case sensitivity of document language element names,
+ #: attribute names, and attribute values in selectors depends
+ #: on the document language.
+ #: http://www.w3.org/TR/selectors/#casesens
+ #:
+ #: When a document language defines one of these as case-insensitive,
+ #: cssselect assumes that the document parser makes the parsed values
+ #: lower-case. Making the selector lower-case too makes the comparison
+ #: case-insensitive.
+ #:
+ #: In HTML, element names and attribute names (but not attribute values)
+ #: are case-insensitive. All of lxml.html, html5lib, BeautifulSoup4
+ #: and HTMLParser make them lower-case in their parse result, so
+ #: the assumption holds.
+ lower_case_element_names = False
+ lower_case_attribute_names = False
+ lower_case_attribute_values = False
+
+ # class used to represent an xpath expression
+ xpathexpr_cls = XPathExpr
+
+ def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
"""Translate a *group of selectors* to XPath.
Pseudo-elements are not supported here since XPath only knows
about "real" elements.
:param css:
- A *group of selectors* as an Unicode string.
+ A *group of selectors* as a string.
+ :param prefix:
+ This string is prepended to the XPath expression for each selector.
+ The default makes selectors scoped to the context node’s subtree.
:raises:
- :class:`SelectorSyntaxError` on invalid selectors,
+ :class:`~cssselect.SelectorSyntaxError` on invalid selectors,
:class:`ExpressionError` on unknown/unsupported selectors,
including pseudo-elements.
:returns:
- The equivalent XPath 1.0 expression as an Unicode string.
+ The equivalent XPath 1.0 expression as a string.
"""
- selectors = parse(css)
- for selector in selectors:
- if selector.pseudo_element:
- raise ExpressionError('Pseudo-elements are not supported.')
-
- return ' | '.join(
- self.selector_to_xpath(selector, prefix)
- for selector in selectors)
+ return " | ".join(
+ self.selector_to_xpath(selector, prefix, translate_pseudo_elements=True)
+ for selector in parse(css)
+ )
- def selector_to_xpath(self, selector, prefix='descendant-or-self::'):
+ def selector_to_xpath(
+ self,
+ selector: Selector,
+ prefix: str = "descendant-or-self::",
+ translate_pseudo_elements: bool = False,
+ ) -> str:
"""Translate a parsed selector to XPath.
- The :attr:`~Selector.pseudo_element` attribute of the selector
- is ignored. It is the caller's responsibility to reject selectors
- with pseudo-elements, or to account for them somehow.
:param selector:
A parsed :class:`Selector` object.
+ :param prefix:
+ This string is prepended to the resulting XPath expression.
+ The default makes selectors scoped to the context node’s subtree.
+ :param translate_pseudo_elements:
+ Unless this is set to ``True`` (as :meth:`css_to_xpath` does),
+ the :attr:`~Selector.pseudo_element` attribute of the selector
+ is ignored.
+ It is the caller's responsibility to reject selectors
+ with pseudo-elements, or to account for them somehow.
:raises:
:class:`ExpressionError` on unknown/unsupported selectors.
:returns:
- The equivalent XPath 1.0 expression as an Unicode string.
+ The equivalent XPath 1.0 expression as a string.
+
+ """
+ tree = getattr(selector, "parsed_tree", None)
+ if not tree:
+ raise TypeError(f"Expected a parsed selector, got {selector!r}")
+ xpath = self.xpath(tree)
+ assert isinstance(xpath, self.xpathexpr_cls) # help debug a missing 'return'
+ if translate_pseudo_elements and selector.pseudo_element:
+ xpath = self.xpath_pseudo_element(xpath, selector.pseudo_element)
+ return (prefix or "") + str(xpath)
+
+ def xpath_pseudo_element(
+ self, xpath: XPathExpr, pseudo_element: PseudoElement
+ ) -> XPathExpr:
+ """Translate a pseudo-element.
+
+ Defaults to not supporting pseudo-elements at all,
+ but can be overridden by sub-classes.
"""
- return (prefix or '') + _unicode(self.xpath(selector._tree))
+ raise ExpressionError("Pseudo-elements are not supported.")
@staticmethod
- def xpath_literal(s):
- s = _unicode(s)
+ def xpath_literal(s: str) -> str:
+ s = str(s)
if "'" not in s:
- s = "'%s'" % s
+ s = f"'{s}'"
elif '"' not in s:
- s = '"%s"' % s
+ s = f'"{s}"'
else:
- s = "concat(%s)" % ','.join([
- (("'" in part) and '"%s"' or "'%s'") % part
- for part in split_at_single_quotes(s) if part
- ])
+ parts_quoted = [
+ f'"{part}"' if "'" in part else f"'{part}'"
+ for part in split_at_single_quotes(s)
+ if part
+ ]
+ s = "concat({})".format(",".join(parts_quoted))
return s
- def xpath(self, parsed_selector):
+ def xpath(self, parsed_selector: Tree) -> XPathExpr:
"""Translate any parsed selector object."""
type_name = type(parsed_selector).__name__
- method = getattr(self, 'xpath_%s' % type_name.lower(), None)
- if not method:
- raise TypeError('Expected a parsed selector, got %s' % type_name)
+ method = cast(
+ "Callable[[Tree], XPathExpr] | None",
+ getattr(self, f"xpath_{type_name.lower()}", None),
+ )
+ if method is None:
+ raise ExpressionError(f"{type_name} is not supported.")
return method(parsed_selector)
-
# Dispatched by parsed object type
- def xpath_combinedselector(self, combined):
+ def xpath_combinedselector(self, combined: CombinedSelector) -> XPathExpr:
"""Translate a combined selector."""
- combinator = self.combinator_mapping.get(combined.combinator)
- if not combinator:
- raise ExpressionError(
- "Unknown combinator: %r" % combined.combinator)
- method = getattr(self, 'xpath_%s_combinator' % combinator)
- return method(self.xpath(combined.selector),
- self.xpath(combined.subselector))
+ combinator = self.combinator_mapping[combined.combinator]
+ method = cast(
+ "Callable[[XPathExpr, XPathExpr], XPathExpr]",
+ getattr(self, f"xpath_{combinator}_combinator"),
+ )
+ return method(self.xpath(combined.selector), self.xpath(combined.subselector))
- def xpath_negation(self, negation):
+ def xpath_negation(self, negation: Negation) -> XPathExpr:
xpath = self.xpath(negation.selector)
sub_xpath = self.xpath(negation.subselector)
sub_xpath.add_name_test()
- return xpath.add_condition('not(%s)' % sub_xpath.condition)
+ if sub_xpath.condition:
+ return xpath.add_condition(f"not({sub_xpath.condition})")
+ return xpath.add_condition("0")
+
+ def xpath_relation(self, relation: Relation) -> XPathExpr:
+ xpath = self.xpath(relation.selector)
+ combinator = relation.combinator
+ subselector = relation.subselector
+ right = self.xpath(subselector.parsed_tree)
+ method = cast(
+ "Callable[[XPathExpr, XPathExpr], XPathExpr]",
+ getattr(
+ self,
+ f"xpath_relation_{self.combinator_mapping[cast('str', combinator.value)]}_combinator",
+ ),
+ )
+ return method(xpath, right)
+
+ def xpath_matching(self, matching: Matching) -> XPathExpr:
+ xpath = self.xpath(matching.selector)
+ exprs = [self.xpath(selector) for selector in matching.selector_list]
+ for e in exprs:
+ e.add_name_test()
+ if e.condition:
+ xpath.add_condition(e.condition, "or")
+ return xpath
- def xpath_function(self, function):
+ def xpath_specificityadjustment(self, matching: SpecificityAdjustment) -> XPathExpr:
+ xpath = self.xpath(matching.selector)
+ exprs = [self.xpath(selector) for selector in matching.selector_list]
+ for e in exprs:
+ e.add_name_test()
+ if e.condition:
+ xpath.add_condition(e.condition, "or")
+ return xpath
+
+ def xpath_function(self, function: Function) -> XPathExpr:
"""Translate a functional pseudo-class."""
- method = 'xpath_%s_function' % function.name.replace('-', '_')
- method = getattr(self, method, None)
+ method_name = "xpath_{}_function".format(function.name.replace("-", "_"))
+ method = cast(
+ "Callable[[XPathExpr, Function], XPathExpr] | None",
+ getattr(self, method_name, None),
+ )
if not method:
- raise ExpressionError(
- "The pseudo-class :%s() is unknown" % function.name)
+ raise ExpressionError(f"The pseudo-class :{function.name}() is unknown")
return method(self.xpath(function.selector), function)
- def xpath_pseudo(self, pseudo):
+ def xpath_pseudo(self, pseudo: Pseudo) -> XPathExpr:
"""Translate a pseudo-class."""
- method = 'xpath_%s_pseudo' % pseudo.ident.replace('-', '_')
- method = getattr(self, method, None)
+ method_name = "xpath_{}_pseudo".format(pseudo.ident.replace("-", "_"))
+ method = cast(
+ "Callable[[XPathExpr], XPathExpr] | None",
+ getattr(self, method_name, None),
+ )
if not method:
# TODO: better error message for pseudo-elements?
- raise ExpressionError(
- "The pseudo-class :%s is unknown" % pseudo.ident)
+ raise ExpressionError(f"The pseudo-class :{pseudo.ident} is unknown")
return method(self.xpath(pseudo.selector))
-
- def xpath_attrib(self, selector):
+ def xpath_attrib(self, selector: Attrib) -> XPathExpr:
"""Translate an attribute selector."""
- operator = self.attribute_operator_mapping.get(selector.operator)
- if not operator:
- raise ExpressionError(
- "Unknown attribute operator: %r" % selector.operator)
- method = getattr(self, 'xpath_attrib_%s' % operator)
- # FIXME: what if attrib is *?
- if selector.namespace == '*':
- name = '@' + selector.attrib
+ operator = self.attribute_operator_mapping[selector.operator]
+ method = cast(
+ "Callable[[XPathExpr, str, str | None], XPathExpr]",
+ getattr(self, f"xpath_attrib_{operator}"),
+ )
+ if self.lower_case_attribute_names:
+ name = selector.attrib.lower()
+ else:
+ name = selector.attrib
+ safe = is_safe_name(name)
+ if selector.namespace:
+ name = f"{selector.namespace}:{name}"
+ safe = safe and is_safe_name(selector.namespace)
+ if safe:
+ attrib = "@" + name
+ else:
+ attrib = f"attribute::*[name() = {self.xpath_literal(name)}]"
+ if selector.value is None:
+ value = None
+ elif self.lower_case_attribute_values:
+ value = cast("str", selector.value.value).lower()
else:
- name = '@%s:%s' % (selector.namespace, selector.attrib)
- return method(self.xpath(selector.selector), name, selector.value)
+ value = selector.value.value
+ return method(self.xpath(selector.selector), attrib, value)
- def xpath_class(self, class_selector):
+ def xpath_class(self, class_selector: Class) -> XPathExpr:
"""Translate a class selector."""
# .foo is defined as [class~=foo] in the spec.
xpath = self.xpath(class_selector.selector)
- return self.xpath_attrib_includes(
- xpath, '@class', class_selector.class_name)
+ return self.xpath_attrib_includes(xpath, "@class", class_selector.class_name)
- def xpath_hash(self, id_selector):
+ def xpath_hash(self, id_selector: Hash) -> XPathExpr:
"""Translate an ID selector."""
xpath = self.xpath(id_selector.selector)
- return xpath.add_condition('@%s = %s' % (
- self.id_attribute, self.xpath_literal(id_selector.id)))
+ return self.xpath_attrib_equals(xpath, "@id", id_selector.id)
- def xpath_element(self, selector):
+ def xpath_element(self, selector: Element) -> XPathExpr:
"""Translate a type or universal selector."""
- if selector.namespace == '*':
- element = selector.element.lower()
+ element = selector.element
+ if not element:
+ element = "*"
+ safe = True
else:
- # FIXME: Should we lowercase here?
- element = '%s:%s' % (selector.namespace, selector.element)
- return XPathExpr(element=element)
-
+ safe = bool(is_safe_name(element))
+ if self.lower_case_element_names:
+ element = element.lower()
+ if selector.namespace:
+ # Namespace prefixes are case-sensitive.
+ # http://www.w3.org/TR/css3-namespace/#prefixes
+ element = f"{selector.namespace}:{element}"
+ safe = safe and bool(is_safe_name(selector.namespace))
+ xpath = self.xpathexpr_cls(element=element)
+ if not safe:
+ xpath.add_name_test()
+ return xpath
# CombinedSelector: dispatch by combinator
- def xpath_descendant_combinator(self, left, right):
+ def xpath_descendant_combinator(
+ self, left: XPathExpr, right: XPathExpr
+ ) -> XPathExpr:
"""right is a child, grand-child or further descendant of left"""
- return left.join('/descendant-or-self::*/', right)
+ return left.join("/descendant-or-self::*/", right)
- def xpath_child_combinator(self, left, right):
+ def xpath_child_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr:
"""right is an immediate child of left"""
- return left.join('/', right)
+ return left.join("/", right)
- def xpath_direct_adjacent_combinator(self, left, right):
+ def xpath_direct_adjacent_combinator(
+ self, left: XPathExpr, right: XPathExpr
+ ) -> XPathExpr:
"""right is a sibling immediately after left"""
- xpath = left.join('/following-sibling::', right)
+ xpath = left.join("/following-sibling::", right)
xpath.add_name_test()
- return xpath.add_condition('position() = 1')
+ return xpath.add_condition("position() = 1")
- def xpath_indirect_adjacent_combinator(self, left, right):
+ def xpath_indirect_adjacent_combinator(
+ self, left: XPathExpr, right: XPathExpr
+ ) -> XPathExpr:
"""right is a sibling after left, immediately or not"""
- return left.join('/following-sibling::', right)
+ return left.join("/following-sibling::", right)
+
+ def xpath_relation_descendant_combinator(
+ self, left: XPathExpr, right: XPathExpr
+ ) -> XPathExpr:
+ """right is a child, grand-child or further descendant of left; select left"""
+ return left.join(
+ "[descendant::", right, closing_combiner="]", has_inner_condition=True
+ )
+ def xpath_relation_child_combinator(
+ self, left: XPathExpr, right: XPathExpr
+ ) -> XPathExpr:
+ """right is an immediate child of left; select left"""
+ return left.join("[./", right, closing_combiner="]")
+
+ def xpath_relation_direct_adjacent_combinator(
+ self, left: XPathExpr, right: XPathExpr
+ ) -> XPathExpr:
+ """right is a sibling immediately after left; select left"""
+ return left.add_condition(
+ f"following-sibling::*[(name() = '{right.element}') and (position() = 1)]"
+ )
+
+ def xpath_relation_indirect_adjacent_combinator(
+ self, left: XPathExpr, right: XPathExpr
+ ) -> XPathExpr:
+ """right is a sibling after left, immediately or not; select left"""
+ return left.join("[following-sibling::", right, closing_combiner="]")
# Function: dispatch by function/pseudo-class name
- def xpath_nth_child_function(self, xpath, function, last=False,
- add_name_test=True):
- a, b = parse_series(function.arguments)
- if not a and not b and not last:
- # a=0 means nothing is returned...
- return xpath.add_condition('false() and position() = 0')
- if add_name_test:
- xpath.add_name_test()
- xpath.add_star_prefix()
- if a == 0:
- if last:
- b = 'last() - %s' % b
- return xpath.add_condition('position() = %s' % b)
- if last:
- # FIXME: I'm not sure if this is right
- a = -a
- b = -b
- if b > 0:
- b_neg = str(-b)
+ def xpath_nth_child_function(
+ self,
+ xpath: XPathExpr,
+ function: Function,
+ last: bool = False,
+ add_name_test: bool = True,
+ ) -> XPathExpr:
+ try:
+ a, b = parse_series(function.arguments)
+ except ValueError as ex:
+ raise ExpressionError(f"Invalid series: '{function.arguments!r}'") from ex
+
+ # From https://www.w3.org/TR/css3-selectors/#structural-pseudos:
+ #
+ # :nth-child(an+b)
+ # an+b-1 siblings before
+ #
+ # :nth-last-child(an+b)
+ # an+b-1 siblings after
+ #
+ # :nth-of-type(an+b)
+ # an+b-1 siblings with the same expanded element name before
+ #
+ # :nth-last-of-type(an+b)
+ # an+b-1 siblings with the same expanded element name after
+ #
+ # So,
+ # for :nth-child and :nth-of-type
+ #
+ # count(preceding-sibling::) = an+b-1
+ #
+ # for :nth-last-child and :nth-last-of-type
+ #
+ # count(following-sibling::) = an+b-1
+ #
+ # therefore,
+ # count(...) - (b-1) ≡ 0 (mod a)
+ #
+ # if a == 0:
+ # ~~~~~~~~~~
+ # count(...) = b-1
+ #
+ # if a < 0:
+ # ~~~~~~~~~
+ # count(...) - b +1 <= 0
+ # -> count(...) <= b-1
+ #
+ # if a > 0:
+ # ~~~~~~~~~
+ # count(...) - b +1 >= 0
+ # -> count(...) >= b-1
+
+ # work with b-1 instead
+ b_min_1 = b - 1
+
+ # early-exit condition 1:
+ # ~~~~~~~~~~~~~~~~~~~~~~~
+ # for a == 1, nth-*(an+b) means n+b-1 siblings before/after,
+ # and since n ∈ {0, 1, 2, ...}, if b-1<=0,
+ # there is always an "n" matching any number of siblings (maybe none)
+ if a == 1 and b_min_1 <= 0:
+ return xpath
+
+ # early-exit condition 2:
+ # ~~~~~~~~~~~~~~~~~~~~~~~
+ # an+b-1 siblings with a<0 and (b-1)<0 is not possible
+ if a < 0 and b_min_1 < 0:
+ return xpath.add_condition("0")
+
+ # `add_name_test` boolean is inverted and somewhat counter-intuitive:
+ #
+ # nth_of_type() calls nth_child(add_name_test=False)
+ nodetest = "*" if add_name_test else f"{xpath.element}"
+
+ # count siblings before or after the element
+ if not last:
+ siblings_count = f"count(preceding-sibling::{nodetest})"
else:
- b_neg = '+%s' % (-b)
- if a != 1:
- expr = ['(position() %s) mod %s = 0' % (b_neg, a)]
+ siblings_count = f"count(following-sibling::{nodetest})"
+
+ # special case of fixed position: nth-*(0n+b)
+ # if a == 0:
+ # ~~~~~~~~~~
+ # count(***-sibling::***) = b-1
+ if a == 0:
+ return xpath.add_condition(f"{siblings_count} = {b_min_1}")
+
+ expressions = []
+
+ if a > 0:
+ # siblings count, an+b-1, is always >= 0,
+ # so if a>0, and (b-1)<=0, an "n" exists to satisfy this,
+ # therefore, the predicate is only interesting if (b-1)>0
+ if b_min_1 > 0:
+ expressions.append(f"{siblings_count} >= {b_min_1}")
else:
- expr = []
- if b >= 0:
- expr.append('position() >= %s' % b)
- elif b < 0 and last:
- expr.append('position() < (last() %s)' % b)
- expr = ' and '.join(expr)
- if expr:
- xpath.add_condition(expr)
+ # if a<0, and (b-1)<0, no "n" satisfies this,
+ # this is tested above as an early exit condition
+ # otherwise,
+ expressions.append(f"{siblings_count} <= {b_min_1}")
+
+ # operations modulo 1 or -1 are simpler, one only needs to verify:
+ #
+ # - either:
+ # count(***-sibling::***) - (b-1) = n = 0, 1, 2, 3, etc.,
+ # i.e. count(***-sibling::***) >= (b-1)
+ #
+ # - or:
+ # count(***-sibling::***) - (b-1) = -n = 0, -1, -2, -3, etc.,
+ # i.e. count(***-sibling::***) <= (b-1)
+ # as we just did above.
+ #
+ if abs(a) != 1:
+ # count(***-sibling::***) - (b-1) ≡ 0 (mod a)
+ left = siblings_count
+
+ # apply "modulo a" on 2nd term, -(b-1),
+ # to simplify things like "(... +6) % -3",
+ # and also make it positive with |a|
+ b_neg = (-b_min_1) % abs(a)
+
+ if b_neg != 0:
+ left = f"({left} +{b_neg})"
+
+ expressions.append(f"{left} mod {a} = 0")
+
+ template = "(%s)" if len(expressions) > 1 else "%s"
+ xpath.add_condition(
+ " and ".join(template % expression for expression in expressions)
+ )
return xpath
- # FIXME: handle an+b, odd, even
- # an+b means every-a, plus b, e.g., 2n+1 means odd
- # 0n+b means b
- # n+0 means a=1, i.e., all elements
- # an means every a elements, i.e., 2n means even
- # -n means -1n
- # -1n+6 means elements 6 and previous
-
- def xpath_nth_last_child_function(self, xpath, function):
+
+ def xpath_nth_last_child_function(
+ self, xpath: XPathExpr, function: Function
+ ) -> XPathExpr:
return self.xpath_nth_child_function(xpath, function, last=True)
- def xpath_nth_of_type_function(self, xpath, function):
- if xpath.element == '*':
- raise ExpressionError(
- "*:nth-of-type() is not implemented")
- return self.xpath_nth_child_function(xpath, function,
- add_name_test=False)
+ def xpath_nth_of_type_function(
+ self, xpath: XPathExpr, function: Function
+ ) -> XPathExpr:
+ if xpath.element == "*":
+ raise ExpressionError("*:nth-of-type() is not implemented")
+ return self.xpath_nth_child_function(xpath, function, add_name_test=False)
+
+ def xpath_nth_last_of_type_function(
+ self, xpath: XPathExpr, function: Function
+ ) -> XPathExpr:
+ if xpath.element == "*":
+ raise ExpressionError("*:nth-of-type() is not implemented")
+ return self.xpath_nth_child_function(
+ xpath, function, last=True, add_name_test=False
+ )
- def xpath_nth_last_of_type_function(self, xpath, function):
- if xpath.element == '*':
+ def xpath_contains_function(
+ self, xpath: XPathExpr, function: Function
+ ) -> XPathExpr:
+ # Defined there, removed in later drafts:
+ # http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors
+ if function.argument_types() not in (["STRING"], ["IDENT"]):
raise ExpressionError(
- "*:nth-of-type() is not implemented")
- return self.xpath_nth_child_function(xpath, function, last=True,
- add_name_test=False)
-
- def xpath_contains_function(self, xpath, function):
- return xpath.add_condition('contains(string(.), %s)'
- % self.xpath_literal(function.arguments))
-
- def function_unsupported(self, xpath, pseudo):
- raise ExpressionError(
- "The pseudo-class :%s() is not supported" % pseudo.name)
-
- xpath_lang_function = function_unsupported
+ f"Expected a single string or ident for :contains(), got {function.arguments!r}"
+ )
+ value = cast("str", function.arguments[0].value)
+ return xpath.add_condition(f"contains(., {self.xpath_literal(value)})")
+ def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
+ if function.argument_types() not in (["STRING"], ["IDENT"]):
+ raise ExpressionError(
+ f"Expected a single string or ident for :lang(), got {function.arguments!r}"
+ )
+ value = cast("str", function.arguments[0].value)
+ return xpath.add_condition(f"lang({self.xpath_literal(value)})")
# Pseudo: dispatch by pseudo-class name
- def xpath_root_pseudo(self, xpath):
+ def xpath_root_pseudo(self, xpath: XPathExpr) -> XPathExpr:
return xpath.add_condition("not(parent::*)")
- def xpath_first_child_pseudo(self, xpath):
- xpath.add_star_prefix()
- xpath.add_name_test()
- return xpath.add_condition('position() = 1')
+ # CSS immediate children (CSS ":scope > div" to XPath "child::div" or "./div")
+ # Works only at the start of a selector
+ # Needed to get immediate children of a processed selector in Scrapy
+ # for product in response.css('.product'):
+ # description = product.css(':scope > div::text').get()
+ def xpath_scope_pseudo(self, xpath: XPathExpr) -> XPathExpr:
+ return xpath.add_condition("1")
- def xpath_last_child_pseudo(self, xpath):
- xpath.add_star_prefix()
- xpath.add_name_test()
- return xpath.add_condition('position() = last()')
+ def xpath_first_child_pseudo(self, xpath: XPathExpr) -> XPathExpr:
+ return xpath.add_condition("count(preceding-sibling::*) = 0")
- def xpath_first_of_type_pseudo(self, xpath):
- if xpath.element == '*':
- raise ExpressionError(
- "*:first-of-type is not implemented")
- xpath.add_star_prefix()
- return xpath.add_condition('position() = 1')
+ def xpath_last_child_pseudo(self, xpath: XPathExpr) -> XPathExpr:
+ return xpath.add_condition("count(following-sibling::*) = 0")
- def xpath_last_of_type_pseudo(self, xpath):
- if xpath.element == '*':
- raise ExpressionError(
- "*:last-of-type is not implemented")
- xpath.add_star_prefix()
- return xpath.add_condition('position() = last()')
+ def xpath_first_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr:
+ if xpath.element == "*":
+ raise ExpressionError("*:first-of-type is not implemented")
+ return xpath.add_condition(f"count(preceding-sibling::{xpath.element}) = 0")
- def xpath_only_child_pseudo(self, xpath):
- xpath.add_name_test()
- xpath.add_star_prefix()
- return xpath.add_condition('last() = 1')
+ def xpath_last_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr:
+ if xpath.element == "*":
+ raise ExpressionError("*:last-of-type is not implemented")
+ return xpath.add_condition(f"count(following-sibling::{xpath.element}) = 0")
- def xpath_only_of_type_pseudo(self, xpath):
- if xpath.element == '*':
- raise ExpressionError(
- "*:only-of-type is not implemented")
- return xpath.add_condition('last() = 1')
+ def xpath_only_child_pseudo(self, xpath: XPathExpr) -> XPathExpr:
+ return xpath.add_condition("count(parent::*/child::*) = 1")
- def xpath_empty_pseudo(self, xpath):
- return xpath.add_condition("not(*) and not(normalize-space())")
+ def xpath_only_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr:
+ if xpath.element == "*":
+ raise ExpressionError("*:only-of-type is not implemented")
+ return xpath.add_condition(f"count(parent::*/child::{xpath.element}) = 1")
- def pseudo_never_matches(self, xpath):
+ def xpath_empty_pseudo(self, xpath: XPathExpr) -> XPathExpr:
+ return xpath.add_condition("not(*) and not(string-length())")
+
+ def pseudo_never_matches(self, xpath: XPathExpr) -> XPathExpr:
"""Common implementation for pseudo-classes that never match."""
return xpath.add_condition("0")
@@ -407,82 +714,161 @@ def pseudo_never_matches(self, xpath):
# Attrib: dispatch by attribute operator
- def xpath_attrib_exists(self, xpath, name, value):
+ def xpath_attrib_exists(
+ self, xpath: XPathExpr, name: str, value: str | None
+ ) -> XPathExpr:
assert not value
xpath.add_condition(name)
return xpath
- def xpath_attrib_equals(self, xpath, name, value):
- xpath.add_condition('%s = %s' % (name, self.xpath_literal(value)))
+ def xpath_attrib_equals(
+ self, xpath: XPathExpr, name: str, value: str | None
+ ) -> XPathExpr:
+ assert value is not None
+ xpath.add_condition(f"{name} = {self.xpath_literal(value)}")
return xpath
- def xpath_attrib_different(self, xpath, name, value):
+ def xpath_attrib_different(
+ self, xpath: XPathExpr, name: str, value: str | None
+ ) -> XPathExpr:
+ assert value is not None
# FIXME: this seems like a weird hack...
if value:
- xpath.add_condition('not(%s) or %s != %s'
- % (name, name, self.xpath_literal(value)))
+ xpath.add_condition(f"not({name}) or {name} != {self.xpath_literal(value)}")
else:
- xpath.add_condition('%s != %s'
- % (name, self.xpath_literal(value)))
+ xpath.add_condition(f"{name} != {self.xpath_literal(value)}")
return xpath
- def xpath_attrib_includes(self, xpath, name, value):
- xpath.add_condition(
- "%s and contains(concat(' ', normalize-space(%s), ' '), %s)"
- % (name, name, self.xpath_literal(' '+value+' ')))
+ def xpath_attrib_includes(
+ self, xpath: XPathExpr, name: str, value: str | None
+ ) -> XPathExpr:
+ if value and is_non_whitespace(value):
+ arg = self.xpath_literal(" " + value + " ")
+ xpath.add_condition(
+ f"{name} and contains(concat(' ', normalize-space({name}), ' '), {arg})"
+ )
+ else:
+ xpath.add_condition("0")
return xpath
- def xpath_attrib_dashmatch(self, xpath, name, value):
+ def xpath_attrib_dashmatch(
+ self, xpath: XPathExpr, name: str, value: str | None
+ ) -> XPathExpr:
+ assert value is not None
+ arg = self.xpath_literal(value)
+ arg_dash = self.xpath_literal(value + "-")
# Weird, but true...
- xpath.add_condition('%s and (%s = %s or starts-with(%s, %s))' % (
- name,
- name, self.xpath_literal(value),
- name, self.xpath_literal(value + '-')))
+ xpath.add_condition(
+ f"{name} and ({name} = {arg} or starts-with({name}, {arg_dash}))"
+ )
return xpath
- def xpath_attrib_prefixmatch(self, xpath, name, value):
- return xpath.add_condition('%s and starts-with(%s, %s)' % (
- name, name, self.xpath_literal(value)))
+ def xpath_attrib_prefixmatch(
+ self, xpath: XPathExpr, name: str, value: str | None
+ ) -> XPathExpr:
+ if value:
+ xpath.add_condition(
+ f"{name} and starts-with({name}, {self.xpath_literal(value)})"
+ )
+ else:
+ xpath.add_condition("0")
+ return xpath
- def xpath_attrib_suffixmatch(self, xpath, name, value):
- # Oddly there is a starts-with in XPath 1.0, but not ends-with
- return xpath.add_condition(
- '%s and substring(%s, string-length(%s)-%s) = %s'
- % (name, name, name, len(value)-1, self.xpath_literal(value)))
+ def xpath_attrib_suffixmatch(
+ self, xpath: XPathExpr, name: str, value: str | None
+ ) -> XPathExpr:
+ if value:
+ # Oddly there is a starts-with in XPath 1.0, but not ends-with
+ xpath.add_condition(
+ f"{name} and substring({name}, string-length({name})-{len(value) - 1}) = {self.xpath_literal(value)}"
+ )
+ else:
+ xpath.add_condition("0")
+ return xpath
- def xpath_attrib_substringmatch(self, xpath, name, value):
- # Attribute selectors are case sensitive
- return xpath.add_condition('%s and contains(%s, %s)' % (
- name, name, self.xpath_literal(value)))
+ def xpath_attrib_substringmatch(
+ self, xpath: XPathExpr, name: str, value: str | None
+ ) -> XPathExpr:
+ if value:
+ # Attribute selectors are case sensitive
+ xpath.add_condition(
+ f"{name} and contains({name}, {self.xpath_literal(value)})"
+ )
+ else:
+ xpath.add_condition("0")
+ return xpath
class HTMLTranslator(GenericTranslator):
"""
- Translator for HTML documents.
+ Translator for (X)HTML documents.
+
+ Has a more useful implementation of some pseudo-classes based on
+ HTML-specific element names and attribute names, as described in
+ the `HTML5 specification`_. It assumes no-quirks mode.
+ The API is the same as :class:`GenericTranslator`.
+
+ .. _HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors
+
+ :param xhtml:
+ If false (the default), element names and attribute names
+ are case-insensitive.
+
"""
- def xpath_checked_pseudo(self, xpath):
+
+ lang_attribute = "lang"
+
+ def __init__(self, xhtml: bool = False) -> None:
+ self.xhtml = xhtml # Might be useful for sub-classes?
+ if not xhtml:
+ # See their definition in GenericTranslator.
+ self.lower_case_element_names = True
+ self.lower_case_attribute_names = True
+
+ def xpath_checked_pseudo(self, xpath: XPathExpr) -> XPathExpr:
# FIXME: is this really all the elements?
return xpath.add_condition(
"(@selected and name(.) = 'option') or "
- "(@checked and name(.) = 'input')")
+ "(@checked "
+ "and (name(.) = 'input' or name(.) = 'command')"
+ "and (@type = 'checkbox' or @type = 'radio'))"
+ )
- def xpath_link_pseudo(self, xpath):
- return xpath.add_condition("@href and name(.) = 'a'")
+ def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
+ if function.argument_types() not in (["STRING"], ["IDENT"]):
+ raise ExpressionError(
+ f"Expected a single string or ident for :lang(), got {function.arguments!r}"
+ )
+ value = function.arguments[0].value
+ assert value
+ arg = self.xpath_literal(value.lower() + "-")
+ return xpath.add_condition(
+ "ancestor-or-self::*[@lang][1][starts-with(concat("
+ # XPath 1.0 has no lower-case function...
+ f"translate(@{self.lang_attribute}, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
+ "'abcdefghijklmnopqrstuvwxyz'), "
+ f"'-'), {arg})]"
+ )
+
+ def xpath_link_pseudo(self, xpath: XPathExpr) -> XPathExpr:
+ return xpath.add_condition(
+ "@href and (name(.) = 'a' or name(.) = 'link' or name(.) = 'area')"
+ )
# Links are never visited, the implementation for :visited is the same
# as in GenericTranslator
- def xpath_disabled_pseudo(self, xpath):
+ def xpath_disabled_pseudo(self, xpath: XPathExpr) -> XPathExpr:
# http://www.w3.org/TR/html5/section-index.html#attributes-1
- return xpath.add_condition('''
+ return xpath.add_condition(
+ """
(
@disabled and
(
- name(.) = 'input' or
+ (name(.) = 'input' and @type != 'hidden') or
name(.) = 'button' or
name(.) = 'select' or
name(.) = 'textarea' or
- name(.) = 'keygen' or
name(.) = 'command' or
name(.) = 'fieldset' or
name(.) = 'optgroup' or
@@ -490,39 +876,54 @@ def xpath_disabled_pseudo(self, xpath):
)
) or (
(
- name(.) = 'input' or
+ (name(.) = 'input' and @type != 'hidden') or
name(.) = 'button' or
name(.) = 'select' or
- name(.) = 'textarea' or
- name(.) = 'keygen'
+ name(.) = 'textarea'
)
and ancestor::fieldset[@disabled]
)
- ''')
+ """
+ )
# FIXME: in the second half, add "and is not a descendant of that
# fieldset element's first legend element child, if any."
- def xpath_enabled_pseudo(self, xpath):
+ def xpath_enabled_pseudo(self, xpath: XPathExpr) -> XPathExpr:
# http://www.w3.org/TR/html5/section-index.html#attributes-1
- return xpath.add_condition('''
+ return xpath.add_condition(
+ """
(
+ @href and (
+ name(.) = 'a' or
+ name(.) = 'link' or
+ name(.) = 'area'
+ )
+ ) or (
(
name(.) = 'command' or
name(.) = 'fieldset' or
- name(.) = 'optgroup' or
- name(.) = 'option'
+ name(.) = 'optgroup'
)
and not(@disabled)
) or (
(
- name(.) = 'input' or
+ (name(.) = 'input' and @type != 'hidden') or
name(.) = 'button' or
name(.) = 'select' or
name(.) = 'textarea' or
name(.) = 'keygen'
)
and not (@disabled or ancestor::fieldset[@disabled])
+ ) or (
+ name(.) = 'option' and not(
+ @disabled or ancestor::optgroup[@disabled]
+ )
)
- ''')
- # FIXME: in the second half, add "and is not a descendant of that
- # fieldset element's first legend element child, if any."
+ """
+ )
+ # FIXME: ... or "li elements that are children of menu elements,
+ # and that have a child element that defines a command, if the first
+ # such element's Disabled State facet is false (not disabled)".
+ # FIXME: after ancestor::fieldset[@disabled], add "and is not a
+ # descendant of that fieldset element's first legend element child,
+ # if any."
diff --git a/docs/conf.py b/docs/conf.py
index 22e6032..da3f023 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
#
# cssselect documentation build configuration file, created by
# sphinx-quickstart on Tue Mar 27 14:20:34 2012.
@@ -12,217 +11,210 @@
# All configuration values have a default; values that are commented out
# serve to show the default.
-import sys, os, re
+import re
+from pathlib import Path
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.insert(0, os.path.abspath('.'))
+# sys.path.insert(0, os.path.abspath('.'))
# -- General configuration -----------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
+# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx',
- 'sphinx.ext.doctest']
+extensions = ["sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.doctest"]
# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
# The suffix of source filenames.
-source_suffix = '.rst'
+source_suffix = {".rst": "restructuredtext"}
# The encoding of source files.
-#source_encoding = 'utf-8-sig'
+# source_encoding = 'utf-8-sig'
# The master toctree document.
-master_doc = 'index'
+master_doc = "index"
# General information about the project.
-project = 'cssselect'
-copyright = '2012, Simon Sapin'
+project = "cssselect"
+project_copyright = "2012-2017, Simon Sapin, Scrapy developers"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The full version, including alpha/beta/rc tags.
-init_py = open(os.path.join(os.path.dirname(__file__),
- '..', 'cssselect', '__init__.py')).read()
-release = re.search("VERSION = '([^']+)'", init_py).group(1)
+init_py = (Path(__file__).parent.parent / "cssselect" / "__init__.py").read_text()
+release = re.search('VERSION = "([^"]+)"', init_py).group(1)
# The short X.Y version.
-version = release.rstrip('dev')
+version = release.removesuffix("dev")  # drop only a literal trailing "dev"
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
-#language = None
+# language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
-#today = ''
+# today = ''
# Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+# today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ["_build"]
# The reST default role (used for this markup: `text`) to use for all documents.
-#default_role = None
+# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+# add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
-#add_module_names = True
+# add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
-#show_authors = False
+# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
# A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
+# modindex_common_prefix = []
# -- Options for HTML output ---------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
-#html_theme = 'agogo'
+html_theme = "sphinx_rtd_theme"
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
-#html_theme_options = {}
+# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
+# html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
-#html_title = None
+# html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
-#html_short_title = None
+# html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
-#html_logo = None
+# html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
-#html_favicon = None
+# html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-#html_static_path = ['_static']
+# html_static_path = ['_static']
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+# html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
-#html_use_smartypants = True
+# html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+# html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
-#html_additional_pages = {}
+# html_additional_pages = {}
# If false, no module index is generated.
-#html_domain_indices = True
+# html_domain_indices = True
# If false, no index is generated.
-#html_use_index = True
+# html_use_index = True
# If true, the index is split into individual pages for each letter.
-#html_split_index = False
+# html_split_index = False
# If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+# html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
+# html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-#html_show_copyright = True
+# html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
-#html_use_opensearch = ''
+# html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
+# html_file_suffix = None
# Output file base name for HTML help builder.
-htmlhelp_basename = 'cssselectdoc'
+htmlhelp_basename = "cssselectdoc"
# -- Options for LaTeX output --------------------------------------------------
latex_elements = {
-# The paper size ('letterpaper' or 'a4paper').
-#'papersize': 'letterpaper',
-
-# The font size ('10pt', '11pt' or '12pt').
-#'pointsize': '10pt',
-
-# Additional stuff for the LaTeX preamble.
-#'preamble': '',
+ # The paper size ('letterpaper' or 'a4paper').
+ #'papersize': 'letterpaper',
+ # The font size ('10pt', '11pt' or '12pt').
+ #'pointsize': '10pt',
+ # Additional stuff for the LaTeX preamble.
+ #'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
- ('index', 'cssselect.tex', 'cssselect Documentation',
- 'Simon Sapin', 'manual'),
+ ("index", "cssselect.tex", "cssselect Documentation", "Simon Sapin", "manual"),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
-#latex_logo = None
+# latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
-#latex_use_parts = False
+# latex_use_parts = False
# If true, show page references after internal links.
-#latex_show_pagerefs = False
+# latex_show_pagerefs = False
# If true, show URL addresses after external links.
-#latex_show_urls = False
+# latex_show_urls = False
# Documents to append as an appendix to all manuals.
-#latex_appendices = []
+# latex_appendices = []
# If false, no module index is generated.
-#latex_domain_indices = True
+# latex_domain_indices = True
# -- Options for manual page output --------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
-man_pages = [
- ('index', 'cssselect', 'cssselect Documentation',
- ['Simon Sapin'], 1)
-]
+man_pages = [("index", "cssselect", "cssselect Documentation", ["Simon Sapin"], 1)]
# If true, show URL addresses after external links.
-#man_show_urls = False
+# man_show_urls = False
# -- Options for Texinfo output ------------------------------------------------
@@ -231,20 +223,35 @@
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
- ('index', 'cssselect', 'cssselect Documentation',
- 'Simon Sapin', 'cssselect', 'One line description of project.',
- 'Miscellaneous'),
+ (
+ "index",
+ "cssselect",
+ "cssselect Documentation",
+ "Simon Sapin",
+ "cssselect",
+ "One line description of project.",
+ "Miscellaneous",
+ ),
]
# Documents to append as an appendix to all manuals.
-#texinfo_appendices = []
+# texinfo_appendices = []
# If false, no module index is generated.
-#texinfo_domain_indices = True
+# texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
-#texinfo_show_urls = 'footnote'
+# texinfo_show_urls = 'footnote'
# Example configuration for intersphinx: refer to the Python standard library.
-intersphinx_mapping = {'http://docs.python.org/': None}
+intersphinx_mapping = {"python": ("https://docs.python.org/3", None)}
+
+
+# --- Nitpicking options ------------------------------------------------------
+
+nitpicky = True
+nitpick_ignore = [
+ # explicitly not a part of the public API
+ ("py:class", "Token"),
+]
diff --git a/docs/conftest.py b/docs/conftest.py
new file mode 100644
index 0000000..a71d108
--- /dev/null
+++ b/docs/conftest.py
@@ -0,0 +1,21 @@
+from doctest import ELLIPSIS, NORMALIZE_WHITESPACE
+
+from sybil import Sybil
+from sybil.parsers.doctest import DocTestParser
+from sybil.parsers.skip import skip
+
+try:
+ # sybil 3.0.0+
+ from sybil.parsers.codeblock import PythonCodeBlockParser
+except ImportError:
+ from sybil.parsers.codeblock import CodeBlockParser as PythonCodeBlockParser
+
+
+pytest_collect_file = Sybil(
+ parsers=[
+ DocTestParser(optionflags=ELLIPSIS | NORMALIZE_WHITESPACE),
+ PythonCodeBlockParser(future_imports=["print_function"]),
+ skip,
+ ],
+ pattern="*.rst",
+).pytest()
diff --git a/docs/index.rst b/docs/index.rst
index 0c060fc..a024f20 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -44,8 +44,9 @@ The resulting expression can be used with lxml's `XPath engine`_:
User API
========
-In CSS3 terms, a `group of selectors`_ is a sequence of comma-separated
-selectors. For example, ``div, h1.title + p`` is a group of 2 selectors.
+In CSS3 Selectors terms, the top-level object is a `group of selectors`_, a
+sequence of comma-separated selectors. For example, ``div, h1.title + p``
+is a group of two selectors.
.. _group of selectors: http://www.w3.org/TR/selectors/#grouping
@@ -53,12 +54,15 @@ selectors. For example, ``div, h1.title + p`` is a group of 2 selectors.
.. autoclass:: Selector()
:members:
+.. autoclass:: FunctionalPseudoElement
+
.. autoclass:: GenericTranslator
:members: css_to_xpath, selector_to_xpath
.. autoclass:: HTMLTranslator
- The API is the same as :class:`GenericTranslator`.
+Exceptions
+----------
.. autoexception:: SelectorError
.. autoexception:: SelectorSyntaxError
@@ -90,18 +94,29 @@ they never match:
These applicable pseudo-classes are not yet implemented:
-* ``:lang(language)``
* ``*:first-of-type``, ``*:last-of-type``, ``*:nth-of-type``,
``*:nth-last-of-type``, ``*:only-of-type``. All of these work when
you specify an element type, but not with ``*``
On the other hand, *cssselect* supports some selectors that are not
-in the Level 3 specification:
+in the Level 3 specification.
+
+These parts of the Level 4 specification are supported (many other Level 4
+additions either do not apply to cssselect, like ``:hover``, or cannot be
+represented in XPath 1.0, so the complete specification is unlikely to be
+implemented):
+
+* The ``:scope`` pseudo-class. Limitation: it can only be used at the start of a
+ selector.
+* The ``:is()``, ``:where()`` and ``:has()`` pseudo-classes. Limitation:
+ ``:has()`` cannot contain nested ``:has()`` or ``:not()``.
+
+These are non-standard extensions:
* The ``:contains(text)`` pseudo-class that existed in `an early draft`_
but was then removed.
* The ``!=`` attribute operator. ``[foo!=bar]`` is the same as
- ``:not([foo=bar])``
+ ``:not([foo=bar])``.
* ``:not()`` accepts a *sequence of simple selectors*, not just single
*simple selector*. For example, ``:not(a.important[rel])`` is allowed,
even though the negation contains 3 *simple selectors*.
@@ -134,9 +149,9 @@ implemented without forking or monkey-patching cssselect.
The "customization API" is the set of methods in translation classes
and their signature. You can look at the `source code`_ to see how it works.
However, be aware that this API is not very stable yet. It might change
-and break you sub-class.
+and break your sub-class.
-.. _source code: https://github.com/SimonSapin/cssselect/blob/master/cssselect/xpath.py
+.. _source code: https://github.com/scrapy/cssselect/blob/master/cssselect/xpath.py
Namespaces
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 0000000..21cb2eb
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,2 @@
+sphinx==8.2.3
+sphinx-rtd-theme==3.0.2
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..c7c54a0
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,239 @@
+[build-system]
+build-backend = "hatchling.build"
+requires = ["hatchling>=1.27.0"]
+
+[project]
+name = "cssselect"
+license = "BSD-3-Clause"
+license-files = ["LICENSE", "AUTHORS"]
+description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0"
+readme = "README.rst"
+authors = [{ name = "Ian Bicking", email = "ianb@colorstudy.com" }]
+maintainers = [{ name = "Paul Tremberth", email = "paul.tremberth@gmail.com" }]
+requires-python = ">=3.10"
+classifiers = [
+ "Development Status :: 4 - Beta",
+ "Intended Audience :: Developers",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
+ "Programming Language :: Python :: 3.14",
+ "Programming Language :: Python :: Implementation :: CPython",
+ "Programming Language :: Python :: Implementation :: PyPy",
+]
+dynamic = ["version"]
+
+[project.urls]
+"Homepage" = "https://github.com/scrapy/cssselect"
+
+[tool.hatch.version]
+path = "cssselect/__init__.py"
+
+[tool.hatch.build.targets.sdist]
+include = [
+ "/cssselect",
+ "/docs",
+ "/tests",
+ "/CHANGES",
+ "/README.rst",
+ "/tox.ini",
+]
+exclude = [
+ "/docs/_build",
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["cssselect"]
+
+[tool.bumpversion]
+current_version = "1.4.0"
+commit = true
+tag = true
+
+[[tool.bumpversion.files]]
+filename = "cssselect/__init__.py"
+
+[[tool.bumpversion.files]]
+filename = "CHANGES"
+search = "^Unreleased\\.$"
+replace = "Released on {now:%Y-%m-%d}."
+regex = true
+
+[tool.coverage.run]
+branch = true
+source = ["cssselect"]
+
+[tool.coverage.report]
+exclude_also = [
+ "def __repr__",
+ "if sys.version_info",
+ "if __name__ == '__main__':",
+]
+
+[tool.mypy]
+strict = true
+
+[tool.pylint.MASTER]
+persistent = "no"
+extension-pkg-allow-list = ["lxml"]
+
+[tool.pylint."MESSAGES CONTROL"]
+enable = [
+ "useless-suppression",
+]
+disable = [
+ "consider-using-f-string",
+ "fixme",
+ "invalid-name",
+ "line-too-long",
+ "missing-class-docstring",
+ "missing-function-docstring",
+ "missing-module-docstring",
+ "no-member",
+ "not-callable",
+ "redefined-builtin",
+ "redefined-outer-name",
+ "too-few-public-methods",
+ "too-many-arguments",
+ "too-many-branches",
+ "too-many-function-args",
+ "too-many-lines",
+ "too-many-locals",
+ "too-many-positional-arguments",
+ "too-many-public-methods",
+ "too-many-statements",
+ "unused-argument",
+]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+
+[tool.ruff.lint]
+extend-select = [
+ # flake8-builtins
+ "A",
+ # flake8-async
+ "ASYNC",
+ # flake8-bugbear
+ "B",
+ # flake8-comprehensions
+ "C4",
+ # flake8-commas
+ "COM",
+ # pydocstyle
+ "D",
+ # flake8-future-annotations
+ "FA",
+ # flynt
+ "FLY",
+ # refurb
+ "FURB",
+ # isort
+ "I",
+ # flake8-implicit-str-concat
+ "ISC",
+ # flake8-logging
+ "LOG",
+ # Perflint
+ "PERF",
+ # pygrep-hooks
+ "PGH",
+ # flake8-pie
+ "PIE",
+ # pylint
+ "PL",
+ # flake8-pytest-style
+ "PT",
+ # flake8-use-pathlib
+ "PTH",
+ # flake8-pyi
+ "PYI",
+ # flake8-quotes
+ "Q",
+ # flake8-return
+ "RET",
+ # flake8-raise
+ "RSE",
+ # Ruff-specific rules
+ "RUF",
+ # flake8-bandit
+ "S",
+ # flake8-simplify
+ "SIM",
+ # flake8-slots
+ "SLOT",
+ # flake8-debugger
+ "T10",
+ # flake8-type-checking
+ "TC",
+ # pyupgrade
+ "UP",
+ # pycodestyle warnings
+ "W",
+ # flake8-2020
+ "YTT",
+]
+ignore = [
+ # Trailing comma missing
+ "COM812",
+ # Missing docstring in public module
+ "D100",
+ # Missing docstring in public class
+ "D101",
+ # Missing docstring in public method
+ "D102",
+ # Missing docstring in public function
+ "D103",
+ # Missing docstring in public package
+ "D104",
+ # Missing docstring in magic method
+ "D105",
+ # Missing docstring in public nested class
+ "D106",
+ # Missing docstring in __init__
+ "D107",
+ # One-line docstring should fit on one line with quotes
+ "D200",
+ # No blank lines allowed after function docstring
+ "D202",
+ # 1 blank line required between summary line and description
+ "D205",
+ # Multi-line docstring closing quotes should be on a separate line
+ "D209",
+ # First line should end with a period
+ "D400",
+ # First line should be in imperative mood; try rephrasing
+ "D401",
+ # First line should not be the function's "signature"
+ "D402",
+ # First word of the first line should be properly capitalized
+ "D403",
+ # Too many return statements
+ "PLR0911",
+ # Too many branches
+ "PLR0912",
+ # Too many arguments in function definition
+ "PLR0913",
+ # Too many statements
+ "PLR0915",
+ # Magic value used in comparison
+ "PLR2004",
+ # String contains ambiguous {}.
+ "RUF001",
+ # Docstring contains ambiguous {}.
+ "RUF002",
+ # Comment contains ambiguous {}.
+ "RUF003",
+ # Mutable class attributes should be annotated with `typing.ClassVar`
+ "RUF012",
+ # Use of `assert` detected
+ "S101",
+]
+
+[tool.ruff.lint.isort]
+split-on-trailing-comma = false
+
+[tool.ruff.lint.pydocstyle]
+convention = "pep257"
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index ccddf11..0000000
--- a/setup.cfg
+++ /dev/null
@@ -1,10 +0,0 @@
-[build_sphinx]
-source-dir = docs
-build-dir = docs/_build
-#all_files = 1
-
-[upload_sphinx] # Sphinx-PyPI-upload
-upload-dir = docs/_build/html
-
-[pytest]
-python_files=tests.py
diff --git a/setup.py b/setup.py
deleted file mode 100644
index df95379..0000000
--- a/setup.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import re
-import os.path
-from setuptools import setup
-
-
-ROOT = os.path.dirname(__file__)
-README = open(os.path.join(ROOT, 'README.rst')).read()
-INIT_PY = open(os.path.join(ROOT, 'cssselect', '__init__.py')).read()
-VERSION = re.search("VERSION = '([^']+)'", INIT_PY).group(1)
-
-
-setup(
- name='cssselect',
- version=VERSION,
- author='Ian Bicking',
- author_email='ianb@colorstudy.com',
- maintainer='Simon Sapin',
- maintainer_email='simon.sapin@exyr.org',
- description=
- 'cssselect parses CSS3 Selectors and translates them to XPath 1.0',
- long_description=README,
- url='http://packages.python.org/cssselect/',
- license='BSD',
- packages=['cssselect'],
- test_suite='cssselect.tests',
- classifiers=[
- 'Development Status :: 4 - Beta',
- 'Intended Audience :: Developers',
- 'License :: OSI Approved :: BSD License',
- 'Programming Language :: Python :: 2',
- 'Programming Language :: Python :: 2.4',
- 'Programming Language :: Python :: 2.5',
- 'Programming Language :: Python :: 2.6',
- 'Programming Language :: Python :: 2.7',
- 'Programming Language :: Python :: 3',
- 'Programming Language :: Python :: 3.1',
- 'Programming Language :: Python :: 3.2',
- ],
-)
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_cssselect.py b/tests/test_cssselect.py
new file mode 100644
index 0000000..dc67bb7
--- /dev/null
+++ b/tests/test_cssselect.py
@@ -0,0 +1,1540 @@
+#!/usr/bin/env python
+"""
+Tests for cssselect
+===================
+
+These tests can be run either by pytest or by the standard library's
+unittest. They use plain ``assert`` statements and do little reporting
+themselves in case of failure.
+
+Use pytest to get fancy error reporting and assert introspection.
+
+
+:copyright: (c) 2007-2012 Ian Bicking and contributors.
+See AUTHORS for more details.
+:license: BSD, see LICENSE for more details.
+
+"""
+
+from __future__ import annotations
+
+import sys
+import typing
+import unittest
+from typing import TYPE_CHECKING
+
+import pytest
+from lxml import etree, html
+
+from cssselect import (
+ ExpressionError,
+ GenericTranslator,
+ HTMLTranslator,
+ SelectorSyntaxError,
+ parse,
+)
+from cssselect.parser import (
+ Function,
+ FunctionalPseudoElement,
+ PseudoElement,
+ Token,
+ parse_series,
+ tokenize,
+)
+from cssselect.xpath import XPathExpr
+
+if TYPE_CHECKING:
+ from collections.abc import Sequence
+
+
+class TestCssselect(unittest.TestCase):
+ def test_tokenizer(self) -> None:
+ tokens = [
+ str(item)
+ for item in tokenize(r'E\ é > f [a~="y\"x"]:nth(/* fu /]* */-3.7)')
+ ]
+ assert tokens == [
+ "",
+ "",
+ "' at 5>",
+ "",
+ # the no-break space is not whitespace in CSS
+ "", # f\xa0
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ ]
+
+ def test_parser(self) -> None:
+ def repr_parse(css: str) -> list[str]:
+ selectors = parse(css)
+ for selector in selectors:
+ assert selector.pseudo_element is None
+ return [repr(selector.parsed_tree) for selector in selectors]
+
+ def parse_many(first: str, *others: str) -> list[str]:
+ result = repr_parse(first)
+ for other in others:
+ assert repr_parse(other) == result
+ return result
+
+ assert parse_many("*") == ["Element[*]"]
+ assert parse_many("*|*") == ["Element[*]"]
+ assert parse_many("*|foo") == ["Element[foo]"]
+ assert parse_many("|foo") == ["Element[foo]"]
+ assert parse_many("foo|*") == ["Element[foo|*]"]
+ assert parse_many("foo|bar") == ["Element[foo|bar]"]
+ # This will never match, but it is valid:
+ assert parse_many("#foo#bar") == ["Hash[Hash[Element[*]#foo]#bar]"]
+ assert parse_many(
+ "div>.foo",
+ "div> .foo",
+ "div >.foo",
+ "div > .foo",
+ "div \n> \t \t .foo",
+ "div\r>\n\n\n.foo",
+ "div\f>\f.foo",
+ ) == ["CombinedSelector[Element[div] > Class[Element[*].foo]]"]
+ assert parse_many(
+ "td.foo,.bar", "td.foo, .bar", "td.foo\t\r\n\f ,\t\r\n\f .bar"
+ ) == [
+ "Class[Element[td].foo]",
+ "Class[Element[*].bar]",
+ ]
+ assert parse_many("div, td.foo, div.bar span") == [
+ "Element[div]",
+ "Class[Element[td].foo]",
+ "CombinedSelector[Class[Element[div].bar] Element[span]]",
+ ]
+ assert parse_many("div > p") == ["CombinedSelector[Element[div] > Element[p]]"]
+ assert parse_many("td:first") == ["Pseudo[Element[td]:first]"]
+ assert parse_many("td:first") == ["Pseudo[Element[td]:first]"]
+ assert parse_many("td :first") == [
+ "CombinedSelector[Element[td] Pseudo[Element[*]:first]]"
+ ]
+ assert parse_many("td :first") == [
+ "CombinedSelector[Element[td] Pseudo[Element[*]:first]]"
+ ]
+ assert parse_many("a[name]", "a[ name\t]") == ["Attrib[Element[a][name]]"]
+ assert parse_many("a [name]") == [
+ "CombinedSelector[Element[a] Attrib[Element[*][name]]]"
+ ]
+ assert parse_many('a[rel="include"]', "a[rel = include]") == [
+ "Attrib[Element[a][rel = 'include']]"
+ ]
+ assert parse_many("a[hreflang |= 'en']", "a[hreflang|=en]") == [
+ "Attrib[Element[a][hreflang |= 'en']]"
+ ]
+ assert parse_many("div:nth-child(10)") == [
+ "Function[Element[div]:nth-child(['10'])]"
+ ]
+ assert parse_many(":nth-child(2n+2)") == [
+ "Function[Element[*]:nth-child(['2', 'n', '+2'])]"
+ ]
+ assert parse_many("div:nth-of-type(10)") == [
+ "Function[Element[div]:nth-of-type(['10'])]"
+ ]
+ assert parse_many("div div:nth-of-type(10) .aclass") == [
+ "CombinedSelector[CombinedSelector[Element[div] "
+ "Function[Element[div]:nth-of-type(['10'])]] "
+ " Class[Element[*].aclass]]"
+ ]
+ assert parse_many("label:only") == ["Pseudo[Element[label]:only]"]
+ assert parse_many("a:lang(fr)") == ["Function[Element[a]:lang(['fr'])]"]
+ assert parse_many('div:contains("foo")') == [
+ "Function[Element[div]:contains(['foo'])]"
+ ]
+ assert parse_many("div#foobar") == ["Hash[Element[div]#foobar]"]
+ assert parse_many("div:not(div.foo)") == [
+ "Negation[Element[div]:not(Class[Element[div].foo])]"
+ ]
+ assert parse_many("div:has(div.foo)") == [
+ "Relation[Element[div]:has(Selector[Class[Element[div].foo]])]"
+ ]
+ assert parse_many("div:is(.foo, #bar)") == [
+ "Matching[Element[div]:is(Class[Element[*].foo], Hash[Element[*]#bar])]"
+ ]
+ assert parse_many(":is(:hover, :visited)") == [
+ "Matching[Element[*]:is(Pseudo[Element[*]:hover], Pseudo[Element[*]:visited])]"
+ ]
+ assert parse_many(":where(:hover, :visited)") == [
+ "SpecificityAdjustment[Element[*]:where(Pseudo[Element[*]:hover],"
+ " Pseudo[Element[*]:visited])]"
+ ]
+ assert parse_many("td ~ th") == ["CombinedSelector[Element[td] ~ Element[th]]"]
+ assert parse_many(":scope > foo") == [
+ "CombinedSelector[Pseudo[Element[*]:scope] > Element[foo]]"
+ ]
+ assert parse_many(" :scope > foo") == [
+ "CombinedSelector[Pseudo[Element[*]:scope] > Element[foo]]"
+ ]
+ assert parse_many(":scope > foo bar > div") == [
+ "CombinedSelector[CombinedSelector[CombinedSelector[Pseudo[Element[*]:scope] > "
+ "Element[foo]] Element[bar]] > Element[div]]"
+ ]
+ assert parse_many(":scope > #foo #bar") == [
+ "CombinedSelector[CombinedSelector[Pseudo[Element[*]:scope] > "
+ "Hash[Element[*]#foo]] Hash[Element[*]#bar]]"
+ ]
+
+ def test_pseudo_elements(self) -> None:
+ def parse_pseudo(css: str) -> list[tuple[str, str | None]]:
+ result: list[tuple[str, str | None]] = []
+ for selector in parse(css):
+ pseudo = selector.pseudo_element
+ pseudo = str(pseudo) if pseudo else pseudo
+ # No Symbol here
+ assert pseudo is None or isinstance(pseudo, str)
+ selector_as_str = repr(selector.parsed_tree)
+ result.append((selector_as_str, pseudo))
+ return result
+
+ def parse_one(css: str) -> tuple[str, str | None]:
+ result = parse_pseudo(css)
+ assert len(result) == 1
+ return result[0]
+
+ def test_pseudo_repr(css: str) -> str:
+ result = parse(css)
+ assert len(result) == 1
+ selector = result[0]
+ return repr(selector.parsed_tree)
+
+ assert parse_one("foo") == ("Element[foo]", None)
+ assert parse_one("*") == ("Element[*]", None)
+ assert parse_one(":empty") == ("Pseudo[Element[*]:empty]", None)
+ assert parse_one(":scope") == ("Pseudo[Element[*]:scope]", None)
+
+ # Special cases for CSS 2.1 pseudo-elements
+ assert parse_one(":BEfore") == ("Element[*]", "before")
+ assert parse_one(":aftER") == ("Element[*]", "after")
+ assert parse_one(":First-Line") == ("Element[*]", "first-line")
+ assert parse_one(":First-Letter") == ("Element[*]", "first-letter")
+
+ assert parse_one("::befoRE") == ("Element[*]", "before")
+ assert parse_one("::AFter") == ("Element[*]", "after")
+ assert parse_one("::firsT-linE") == ("Element[*]", "first-line")
+ assert parse_one("::firsT-letteR") == ("Element[*]", "first-letter")
+
+ assert parse_one("::text-content") == ("Element[*]", "text-content")
+ assert parse_one("::attr(name)") == (
+ "Element[*]",
+ "FunctionalPseudoElement[::attr(['name'])]",
+ )
+
+ assert parse_one("::Selection") == ("Element[*]", "selection")
+ assert parse_one("foo:after") == ("Element[foo]", "after")
+ assert parse_one("foo::selection") == ("Element[foo]", "selection")
+ assert parse_one("lorem#ipsum ~ a#b.c[href]:empty::selection") == (
+ "CombinedSelector[Hash[Element[lorem]#ipsum] ~ "
+ "Pseudo[Attrib[Class[Hash[Element[a]#b].c][href]]:empty]]",
+ "selection",
+ )
+ assert parse_pseudo(":scope > div, foo bar") == [
+ ("CombinedSelector[Pseudo[Element[*]:scope] > Element[div]]", None),
+ ("CombinedSelector[Element[foo] Element[bar]]", None),
+ ]
+ assert parse_pseudo("foo bar, :scope > div") == [
+ ("CombinedSelector[Element[foo] Element[bar]]", None),
+ ("CombinedSelector[Pseudo[Element[*]:scope] > Element[div]]", None),
+ ]
+ assert parse_pseudo("foo bar,:scope > div") == [
+ ("CombinedSelector[Element[foo] Element[bar]]", None),
+ ("CombinedSelector[Pseudo[Element[*]:scope] > Element[div]]", None),
+ ]
+ assert parse_pseudo("foo:before, bar, baz:after") == [
+ ("Element[foo]", "before"),
+ ("Element[bar]", None),
+ ("Element[baz]", "after"),
+ ]
+
+ # Special cases for CSS 2.1 pseudo-elements are ignored by default
+ for pseudo in ("after", "before", "first-line", "first-letter"):
+ (selector,) = parse(f"e:{pseudo}")
+ assert selector.pseudo_element == pseudo
+ assert GenericTranslator().selector_to_xpath(selector, prefix="") == "e"
+
+ # Pseudo Elements are ignored by default, but if allowed they are not
+ # supported by GenericTranslator
+ tr = GenericTranslator()
+ (selector,) = parse("e::foo")
+ assert selector.pseudo_element == "foo"
+ assert tr.selector_to_xpath(selector, prefix="") == "e"
+ with pytest.raises(ExpressionError):
+ tr.selector_to_xpath(selector, translate_pseudo_elements=True)
+
+ # Special test for Unicode symbols and the ':scope' element check:
+ # it errors if repr() is used instead of __repr__()
+ assert test_pseudo_repr(":fİrst-child") == "Pseudo[Element[*]:fİrst-child]"
+ assert test_pseudo_repr(":scope") == "Pseudo[Element[*]:scope]"
+
+ def test_specificity(self) -> None:
+ def specificity(css: str) -> tuple[int, int, int]:
+ selectors = parse(css)
+ assert len(selectors) == 1
+ return selectors[0].specificity()
+
+ assert specificity("*") == (0, 0, 0)
+ assert specificity(" foo") == (0, 0, 1)
+ assert specificity(":empty ") == (0, 1, 0)
+ assert specificity(":before") == (0, 0, 1)
+ assert specificity("*:before") == (0, 0, 1)
+ assert specificity(":nth-child(2)") == (0, 1, 0)
+ assert specificity(".bar") == (0, 1, 0)
+ assert specificity("[baz]") == (0, 1, 0)
+ assert specificity('[baz="4"]') == (0, 1, 0)
+ assert specificity('[baz^="4"]') == (0, 1, 0)
+ assert specificity("#lipsum") == (1, 0, 0)
+ assert specificity("::attr(name)") == (0, 0, 1)
+
+ assert specificity(":not(*)") == (0, 0, 0)
+ assert specificity(":not(foo)") == (0, 0, 1)
+ assert specificity(":not(.foo)") == (0, 1, 0)
+ assert specificity(":not([foo])") == (0, 1, 0)
+ assert specificity(":not(:empty)") == (0, 1, 0)
+ assert specificity(":not(#foo)") == (1, 0, 0)
+
+ assert specificity(":has(*)") == (0, 0, 0)
+ assert specificity(":has(foo)") == (0, 0, 1)
+ assert specificity(":has(.foo)") == (0, 1, 0)
+ assert specificity(":has(> foo)") == (0, 0, 1)
+
+ assert specificity(":is(.foo, #bar)") == (1, 0, 0)
+ assert specificity(":is(:hover, :visited)") == (0, 1, 0)
+ assert specificity(":where(:hover, :visited)") == (0, 0, 0)
+
+ assert specificity("foo:empty") == (0, 1, 1)
+ assert specificity("foo:before") == (0, 0, 2)
+ assert specificity("foo::before") == (0, 0, 2)
+ assert specificity("foo:empty::before") == (0, 1, 2)
+
+ assert specificity("#lorem + foo#ipsum:first-child > bar:first-line") == (
+ 2,
+ 1,
+ 3,
+ )
+
+ def test_css_export(self) -> None:
+ def css2css(css: str, res: str | None = None) -> None:
+ selectors = parse(css)
+ assert len(selectors) == 1
+ assert selectors[0].canonical() == (res or css)
+
+ css2css("*")
+ css2css(" foo", "foo")
+ css2css("Foo", "Foo")
+ css2css(":empty ", ":empty")
+ css2css(":before", "::before")
+ css2css(":beFOre", "::before")
+ css2css("*:before", "::before")
+ css2css(":nth-child(2)")
+ css2css(".bar")
+ css2css("[baz]")
+ css2css('[baz="4"]', "[baz='4']")
+ css2css('[baz^="4"]', "[baz^='4']")
+ css2css("[ns|attr='4']")
+ css2css("#lipsum")
+ css2css(":not(*)")
+ css2css(":not(foo)")
+ css2css(":not(*.foo)", ":not(.foo)")
+ css2css(":not(*[foo])", ":not([foo])")
+ css2css(":not(:empty)")
+ css2css(":not(#foo)")
+ css2css(":has(*)")
+ css2css(":has(foo)")
+ css2css(":has(*.foo)", ":has(.foo)")
+ css2css(":is(#bar, .foo)")
+ css2css(":is(:focused, :visited)")
+ css2css(":where(:focused, :visited)")
+ css2css("foo:empty")
+ css2css("foo::before")
+ css2css("foo:empty::before")
+ css2css('::name(arg + "val" - 3)', "::name(arg+'val'-3)")
+ css2css("#lorem + foo#ipsum:first-child > bar::first-line")
+ css2css("foo > *")
+
+ def test_parse_errors(self) -> None:
+ # Checks the SelectorSyntaxError message reported for malformed selectors;
+ # selectors that parse cleanly are expected to yield None.
+ # NOTE(review): most expected messages below stop right after "got " and two
+ # keep a stray "' at N>" tail, so the "<TOKEN ... at N>" part was presumably
+ # lost when this copy was produced -- confirm against the upstream test file.
+ def get_error(css: str) -> str | None:
+ # Returns str() of the SelectorSyntaxError raised by parse(css), or None
+ # when parse() does not raise it.
+ try:
+ parse(css)
+ except SelectorSyntaxError:
+ return str(sys.exc_info()[1])
+ return None
+
+ assert get_error("attributes(href)/html/body/a") == (
+ "Expected selector, got "
+ )
+ assert get_error("attributes(href)") == (
+ "Expected selector, got "
+ )
+ assert get_error("html/body/a") == ("Expected selector, got ")
+ assert get_error(" ") == ("Expected selector, got ")
+ assert get_error("div, ") == ("Expected selector, got ")
+ assert get_error(" , div") == ("Expected selector, got ")
+ assert get_error("p, , div") == ("Expected selector, got ")
+ assert get_error("div > ") == ("Expected selector, got ")
+ assert get_error(" > div") == ("Expected selector, got ' at 2>")
+ assert get_error("foo|#bar") == ("Expected ident or '*', got ")
+ assert get_error("#.foo") == ("Expected selector, got ")
+ assert get_error(".#foo") == ("Expected ident, got ")
+ assert get_error(":#foo") == ("Expected ident, got ")
+ assert get_error("[*]") == ("Expected '|', got ")
+ assert get_error("[foo|]") == ("Expected ident, got ")
+ assert get_error("[#]") == ("Expected ident or '*', got ")
+ assert get_error("[foo=#]") == (
+ "Expected string or ident, got "
+ )
+ assert get_error("[href]a") == ("Expected selector, got ")
+ assert get_error("[rel=stylesheet]") is None
+ assert get_error("[rel:stylesheet]") == (
+ "Operator expected, got "
+ )
+ assert get_error("[rel=stylesheet") == ("Expected ']', got ")
+ assert get_error(":lang(fr)") is None
+ assert get_error(":lang(fr") == ("Expected an argument, got ")
+ assert get_error(':contains("foo') == ("Unclosed string at 10")
+ assert get_error("foo!") == ("Expected selector, got ")
+
+ # Mis-placed pseudo-elements
+ assert get_error("a:before:empty") == (
+ "Got pseudo-element ::before not at the end of a selector"
+ )
+ assert get_error("li:before a") == (
+ "Got pseudo-element ::before not at the end of a selector"
+ )
+ assert get_error(":not(:before)") == (
+ "Got pseudo-element ::before inside :not() at 12"
+ )
+ assert get_error(":not(:not(a))") == ("Got nested :not()")
+ assert get_error(":is(:before)") == (
+ "Got pseudo-element ::before inside function"
+ )
+ assert get_error(":is(a b)") == ("Expected an argument, got ")
+ assert get_error(":where(:before)") == (
+ "Got pseudo-element ::before inside function"
+ )
+ assert get_error(":where(a b)") == (
+ "Expected an argument, got "
+ )
+ # ":scope" is only accepted as the first item of a selector
+ assert get_error(":scope > div :scope header") == (
+ 'Got immediate child pseudo-element ":scope" not at the start of a selector'
+ )
+ assert get_error("div :scope header") == (
+ 'Got immediate child pseudo-element ":scope" not at the start of a selector'
+ )
+ assert get_error("> div p") == ("Expected selector, got ' at 0>")
+
+ # Unsupported :has() with several arguments
+ assert get_error(":has(a, b)") == ("Expected an argument, got ")
+ assert get_error(":has()") == ("Expected selector, got ")
+
+ def test_translation(self) -> None:
+ # Compares the XPath text GenericTranslator produces for a range of CSS
+ # selectors, then lists inputs that must be rejected.
+ def xpath(css: str) -> str:
+ # prefix="" is passed so the expected strings below carry no leading
+ # axis prefix.
+ return str(GenericTranslator().css_to_xpath(css, prefix=""))
+
+ assert xpath("*") == "*"
+ assert xpath("e") == "e"
+ assert xpath("*|e") == "e"
+ assert xpath("e|f") == "e:f"
+ assert xpath("e[foo]") == "e[@foo]"
+ assert xpath("e[foo|bar]") == "e[@foo:bar]"
+ assert xpath('e[foo="bar"]') == "e[@foo = 'bar']"
+ assert xpath('e[foo~="bar"]') == (
+ "e[@foo and contains(concat(' ', normalize-space(@foo), ' '), ' bar ')]"
+ )
+ assert xpath('e[foo^="bar"]') == ("e[@foo and starts-with(@foo, 'bar')]")
+ assert xpath('e[foo$="bar"]') == (
+ "e[@foo and substring(@foo, string-length(@foo)-2) = 'bar']"
+ )
+ assert xpath('e[foo*="bar"]') == ("e[@foo and contains(@foo, 'bar')]")
+ assert xpath('e[hreflang|="en"]') == (
+ "e[@hreflang and (@hreflang = 'en' or starts-with(@hreflang, 'en-'))]"
+ )
+
+ # --- nth-* and nth-last-* -------------------------------------
+ assert xpath("e:nth-child(1)") == ("e[count(preceding-sibling::*) = 0]")
+
+ # always true
+ assert xpath("e:nth-child(n)") == ("e")
+ assert xpath("e:nth-child(n+1)") == ("e")
+ # always true too
+ assert xpath("e:nth-child(n-10)") == ("e")
+ # b=2 is the limit...
+ assert xpath("e:nth-child(n+2)") == ("e[count(preceding-sibling::*) >= 1]")
+ # always false
+ assert xpath("e:nth-child(-n)") == ("e[0]")
+ # equivalent to first child
+ assert xpath("e:nth-child(-n+1)") == ("e[count(preceding-sibling::*) <= 0]")
+
+ assert xpath("e:nth-child(3n+2)") == (
+ "e[(count(preceding-sibling::*) >= 1) and "
+ "((count(preceding-sibling::*) +2) mod 3 = 0)]"
+ )
+ assert xpath("e:nth-child(3n-2)") == (
+ "e[count(preceding-sibling::*) mod 3 = 0]"
+ )
+ assert xpath("e:nth-child(-n+6)") == ("e[count(preceding-sibling::*) <= 5]")
+
+ assert xpath("e:nth-last-child(1)") == ("e[count(following-sibling::*) = 0]")
+ assert xpath("e:nth-last-child(2n)") == (
+ "e[(count(following-sibling::*) +1) mod 2 = 0]"
+ )
+ assert xpath("e:nth-last-child(2n+1)") == (
+ "e[count(following-sibling::*) mod 2 = 0]"
+ )
+ assert xpath("e:nth-last-child(2n+2)") == (
+ "e[(count(following-sibling::*) >= 1) and "
+ "((count(following-sibling::*) +1) mod 2 = 0)]"
+ )
+ assert xpath("e:nth-last-child(3n+1)") == (
+ "e[count(following-sibling::*) mod 3 = 0]"
+ )
+ # represents the two last e elements
+ assert xpath("e:nth-last-child(-n+2)") == (
+ "e[count(following-sibling::*) <= 1]"
+ )
+
+ assert xpath("e:nth-of-type(1)") == ("e[count(preceding-sibling::e) = 0]")
+ assert xpath("e:nth-last-of-type(1)") == ("e[count(following-sibling::e) = 0]")
+ assert xpath("div e:nth-last-of-type(1) .aclass") == (
+ "div/descendant-or-self::*/e[count(following-sibling::e) = 0]"
+ "/descendant-or-self::*/*[@class and contains("
+ "concat(' ', normalize-space(@class), ' '), ' aclass ')]"
+ )
+
+ # Pseudo-class and function names are matched case-insensitively
+ # (see the ":EmPTY", ":ConTains" and ":nOT" cases below).
+ assert xpath("e:first-child") == ("e[count(preceding-sibling::*) = 0]")
+ assert xpath("e:last-child") == ("e[count(following-sibling::*) = 0]")
+ assert xpath("e:first-of-type") == ("e[count(preceding-sibling::e) = 0]")
+ assert xpath("e:last-of-type") == ("e[count(following-sibling::e) = 0]")
+ assert xpath("e:only-child") == ("e[count(parent::*/child::*) = 1]")
+ assert xpath("e:only-of-type") == ("e[count(parent::*/child::e) = 1]")
+ assert xpath("e:empty") == ("e[not(*) and not(string-length())]")
+ assert xpath("e:EmPTY") == ("e[not(*) and not(string-length())]")
+ assert xpath("e:root") == ("e[not(parent::*)]")
+ assert xpath("e:hover") == ("e[0]") # never matches
+ assert (
+ xpath("div:has(bar.foo)") == "div[descendant::bar"
+ "[@class and contains(concat(' ', normalize-space(@class), ' '), ' foo ')]]"
+ )
+ assert xpath("e:has(> f)") == "e[./f]"
+ assert xpath("e:has(f)") == "e[descendant::f]"
+ assert xpath("e:has(~ f)") == "e[following-sibling::f]"
+ assert (
+ xpath("e:has(+ f)")
+ == "e[following-sibling::*[(name() = 'f') and (position() = 1)]]"
+ )
+ assert xpath('e:contains("foo")') == ("e[contains(., 'foo')]")
+ assert xpath("e:ConTains(foo)") == ("e[contains(., 'foo')]")
+ assert xpath("e.warning") == (
+ "e[@class and contains("
+ "concat(' ', normalize-space(@class), ' '), ' warning ')]"
+ )
+ assert xpath("e#myid") == ("e[@id = 'myid']")
+ assert xpath("e:not(:nth-child(odd))") == (
+ "e[not(count(preceding-sibling::*) mod 2 = 0)]"
+ )
+ assert xpath("e:nOT(*)") == ("e[0]") # never matches
+ assert xpath("e f") == ("e/descendant-or-self::*/f")
+ assert xpath("e > f") == ("e/f")
+ assert xpath("e + f") == (
+ "e/following-sibling::*[(name() = 'f') and (position() = 1)]"
+ )
+ assert xpath("e ~ f") == ("e/following-sibling::f")
+ assert xpath("e ~ f:nth-child(3)") == (
+ "e/following-sibling::f[count(preceding-sibling::*) = 2]"
+ )
+ assert xpath("div#container p") == (
+ "div[@id = 'container']/descendant-or-self::*/p"
+ )
+ assert xpath("e:where(foo)") == "e[name() = 'foo']"
+ assert xpath("e:where(foo, bar)") == "e[(name() = 'foo') or (name() = 'bar')]"
+
+ # Invalid characters in XPath element names
+ # NOTE(review): the trailing comments say the expected names contain
+ # U+00A0; verify the whitespace inside the 'di v' / 'h ref' literals is
+ # still a non-breaking space in the real file.
+ assert xpath(r"di\a0 v") == ("*[name() = 'di v']") # di\xa0v
+ assert xpath(r"di\[v") == ("*[name() = 'di[v']")
+ assert xpath(r"[h\a0 ref]") == ("*[attribute::*[name() = 'h ref']]") # h\xa0ref
+ assert xpath(r"[h\]ref]") == ("*[attribute::*[name() = 'h]ref']]")
+
+ # Selectors that must raise ExpressionError during translation
+ with pytest.raises(ExpressionError):
+ xpath(":fİrst-child")
+ with pytest.raises(ExpressionError):
+ xpath(":first-of-type")
+ with pytest.raises(ExpressionError):
+ xpath(":only-of-type")
+ with pytest.raises(ExpressionError):
+ xpath(":last-of-type")
+ with pytest.raises(ExpressionError):
+ xpath(":nth-of-type(1)")
+ with pytest.raises(ExpressionError):
+ xpath(":nth-last-of-type(1)")
+ with pytest.raises(ExpressionError):
+ xpath(":nth-child(n-)")
+ with pytest.raises(ExpressionError):
+ xpath(":after")
+ with pytest.raises(ExpressionError):
+ xpath(":lorem-ipsum")
+ with pytest.raises(ExpressionError):
+ xpath(":lorem(ipsum)")
+ with pytest.raises(ExpressionError):
+ xpath("::lorem-ipsum")
+ # Arguments of the wrong type must raise TypeError
+ with pytest.raises(TypeError):
+ GenericTranslator().css_to_xpath(4) # type: ignore[arg-type]
+ with pytest.raises(TypeError):
+ GenericTranslator().selector_to_xpath("foo") # type: ignore[arg-type]
+
+ def test_unicode(self) -> None:
+ css = ".a\xc1b"
+ xpath = GenericTranslator().css_to_xpath(css)
+ assert css[1:] in xpath
+ xpath = xpath.encode("ascii", "xmlcharrefreplace").decode("ASCII")
+ assert xpath == (
+ "descendant-or-self::*[@class and contains("
+ "concat(' ', normalize-space(@class), ' '), ' aÁb ')]"
+ )
+
+ def test_quoting(self) -> None:
+ css_to_xpath = GenericTranslator().css_to_xpath
+ assert css_to_xpath('*[aval="\'"]') == (
+ """descendant-or-self::*[@aval = "'"]"""
+ )
+ assert css_to_xpath("*[aval=\"'''\"]") == (
+ """descendant-or-self::*[@aval = "'''"]"""
+ )
+ assert css_to_xpath("*[aval='\"']") == (
+ """descendant-or-self::*[@aval = '"']"""
+ )
+ assert css_to_xpath('*[aval=\'"""\']') == (
+ '''descendant-or-self::*[@aval = '"""']'''
+ )
+ assert css_to_xpath(':scope > div[dataimg=""]') == (
+ "descendant-or-self::*[1]/div[@dataimg = '']"
+ )
+
+ def test_unicode_escapes(self) -> None:
+ # CSS hexadecimal escapes inside attribute values; values that end up
+ # holding both quote characters are expected as an XPath concat() call.
+ # NOTE(review): whitespace inside these literals is significant, and runs of
+ # spaces may have been collapsed in this copy -- confirm the last two cases
+ # (input and expected value) against the upstream test file.
+ # \22 == '"' \20 == ' '
+ css_to_xpath = GenericTranslator().css_to_xpath
+ assert css_to_xpath(r'*[aval="\'\22\'"]') == (
+ """descendant-or-self::*[@aval = concat("'",'"',"'")]"""
+ )
+ assert css_to_xpath(r'*[aval="\'\22 2\'"]') == (
+ """descendant-or-self::*[@aval = concat("'",'"2',"'")]"""
+ )
+ assert css_to_xpath(r'*[aval="\'\20 \'"]') == (
+ """descendant-or-self::*[@aval = "' '"]"""
+ )
+ assert css_to_xpath("*[aval=\"'\\20\r\n '\"]") == (
+ """descendant-or-self::*[@aval = "' '"]"""
+ )
+
+ def test_xpath_pseudo_elements(self) -> None:
+ # Exercises a GenericTranslator subclass that adds custom pseudo-classes and
+ # pseudo-elements, checking both the generated XPath text and the elements
+ # the XPath selects in the OPERATOR_PRECEDENCE_IDS document.
+ class CustomTranslator(GenericTranslator):
+ # Dispatches on the pseudo-element name (dashes become underscores) to
+ # xpath_<name>_functional_pseudo_element or
+ # xpath_<name>_simple_pseudo_element; raises ExpressionError when the
+ # translator has no such method.
+ def xpath_pseudo_element(
+ self, xpath: XPathExpr, pseudo_element: PseudoElement
+ ) -> XPathExpr:
+ if isinstance(pseudo_element, FunctionalPseudoElement):
+ method_name = "xpath_{}_functional_pseudo_element".format(
+ pseudo_element.name.replace("-", "_")
+ )
+ method = getattr(self, method_name, None)
+ if not method:
+ raise ExpressionError(
+ f"The functional pseudo-element ::{pseudo_element.name}() is unknown"
+ )
+ xpath = method(xpath, pseudo_element.arguments)
+ else:
+ method_name = "xpath_{}_simple_pseudo_element".format(
+ pseudo_element.replace("-", "_")
+ )
+ method = getattr(self, method_name, None)
+ if not method:
+ raise ExpressionError(
+ f"The pseudo-element ::{pseudo_element} is unknown"
+ )
+ xpath = method(xpath)
+ return xpath
+
+ # functional pseudo-class:
+ # elements that have a certain number of attributes
+ def xpath_nb_attr_function(
+ self, xpath: XPathExpr, function: Function
+ ) -> XPathExpr:
+ assert function.arguments[0].value
+ nb_attributes = int(function.arguments[0].value)
+ return xpath.add_condition(f"count(@*)={nb_attributes}")
+
+ # pseudo-class:
+ # elements that have 5 attributes
+ def xpath_five_attributes_pseudo(self, xpath: XPathExpr) -> XPathExpr:
+ return xpath.add_condition("count(@*)=5")
+
+ # functional pseudo-element:
+ # element's attribute by name
+ def xpath_attr_functional_pseudo_element(
+ self, xpath: XPathExpr, arguments: Sequence[Token]
+ ) -> XPathExpr:
+ attribute_name = arguments[0].value
+ other = XPathExpr(
+ f"@{attribute_name}",
+ "",
+ )
+ return xpath.join("/", other)
+
+ # pseudo-element:
+ # element's text() nodes
+ def xpath_text_node_simple_pseudo_element(
+ self, xpath: XPathExpr
+ ) -> XPathExpr:
+ other = XPathExpr(
+ "text()",
+ "",
+ )
+ return xpath.join("/", other)
+
+ # pseudo-element:
+ # element's href attribute
+ def xpath_attr_href_simple_pseudo_element(
+ self, xpath: XPathExpr
+ ) -> XPathExpr:
+ other = XPathExpr(
+ "@href",
+ "",
+ )
+ return xpath.join("/", other)
+
+ # pseudo-class (used below as ":first-or-second"):
+ # its condition contains "or", used to demonstrate operator precedence
+ def xpath_first_or_second_pseudo(self, xpath: XPathExpr) -> XPathExpr:
+ return xpath.add_condition("@id = 'first' or @id = 'second'")
+
+ def xpath(css: str) -> str:
+ return str(CustomTranslator().css_to_xpath(css))
+
+ assert xpath(":five-attributes") == "descendant-or-self::*[count(@*)=5]"
+ assert xpath(":nb-attr(3)") == "descendant-or-self::*[count(@*)=3]"
+ assert xpath("::attr(href)") == "descendant-or-self::*/@href"
+ assert xpath("::text-node") == "descendant-or-self::*/text()"
+ assert xpath("::attr-href") == "descendant-or-self::*/@href"
+ assert xpath("p img::attr(src)") == (
+ "descendant-or-self::p/descendant-or-self::*/img/@src"
+ )
+ assert xpath(":scope") == "descendant-or-self::*[1]"
+ # the "or" condition is parenthesised before being and-ed with [href]
+ assert xpath(":first-or-second[href]") == (
+ "descendant-or-self::*[(@id = 'first' or @id = 'second') and (@href)]"
+ )
+
+ assert str(XPathExpr("", "", condition="@href")) == "[@href]"
+
+ document = etree.fromstring(OPERATOR_PRECEDENCE_IDS)
+ # maps each element to its index in document.iter() order
+ sort_key = {el: count for count, el in enumerate(document.iter())}.__getitem__
+
+ def operator_id(selector: str) -> list[str]:
+ # ids of the matched elements ("nil" when an element has no id), sorted
+ # by their position in document.iter()
+ xpath = CustomTranslator().css_to_xpath(selector)
+ items = typing.cast("list[etree._Element]", document.xpath(xpath))
+ items.sort(key=sort_key)
+ return [element.get("id", "nil") for element in items]
+
+ assert operator_id(":first-or-second") == ["first", "second"]
+ assert operator_id(":first-or-second[href]") == ["second"]
+ assert operator_id("[href]:first-or-second") == ["second"]
+
+ def test_series(self) -> None:
+ def series(css: str) -> tuple[int, int] | None:
+ (selector,) = parse(f":nth-child({css})")
+ args = typing.cast(
+ "FunctionalPseudoElement", selector.parsed_tree
+ ).arguments
+ try:
+ return parse_series(args)
+ except ValueError:
+ return None
+
+ assert series("1n+3") == (1, 3)
+ assert series("1n +3") == (1, 3)
+ assert series("1n + 3") == (1, 3)
+ assert series("1n+ 3") == (1, 3)
+ assert series("1n-3") == (1, -3)
+ assert series("1n -3") == (1, -3)
+ assert series("1n - 3") == (1, -3)
+ assert series("1n- 3") == (1, -3)
+ assert series("n-5") == (1, -5)
+ assert series("odd") == (2, 1)
+ assert series("even") == (2, 0)
+ assert series("3n") == (3, 0)
+ assert series("n") == (1, 0)
+ assert series("+n") == (1, 0)
+ assert series("-n") == (-1, 0)
+ assert series("5") == (0, 5)
+ assert series("foo") is None
+ assert series("n+") is None
+
+ def test_lang(self) -> None:
+ document = etree.fromstring(XMLLANG_IDS)
+ sort_key = {el: count for count, el in enumerate(document.iter())}.__getitem__
+ css_to_xpath = GenericTranslator().css_to_xpath
+
+ def langid(selector: str) -> list[str]:
+ xpath = css_to_xpath(selector)
+ items = typing.cast("list[etree._Element]", document.xpath(xpath))
+ items.sort(key=sort_key)
+ return [element.get("id", "nil") for element in items]
+
+ assert langid(':lang("EN")') == ["first", "second", "third", "fourth"]
+ assert langid(':lang("en-us")') == ["second", "fourth"]
+ assert langid(":lang(en-nz)") == ["third"]
+ assert langid(":lang(fr)") == ["fifth"]
+ assert langid(":lang(ru)") == ["sixth"]
+ assert langid(":lang('ZH')") == ["eighth"]
+ assert langid(":lang(de) :lang(zh)") == ["eighth"]
+ assert langid(":lang(en), :lang(zh)") == [
+ "first",
+ "second",
+ "third",
+ "fourth",
+ "eighth",
+ ]
+ assert langid(":lang(es)") == []
+
+ def test_argument_types(self) -> None:
+ class CustomTranslator(GenericTranslator):
+ def __init__(self) -> None:
+ self.argument_types: list[str] = []
+
+ def xpath_pseudo_element(
+ self, xpath: XPathExpr, pseudo_element: PseudoElement
+ ) -> XPathExpr:
+ self.argument_types += typing.cast(
+ "FunctionalPseudoElement", pseudo_element
+ ).argument_types()
+ return xpath
+
+ def argument_types(css: str) -> list[str]:
+ translator = CustomTranslator()
+ translator.css_to_xpath(css)
+ return translator.argument_types
+
+ mappings: list[tuple[str, list[str]]] = [
+ ("", []),
+ ("ident", ["IDENT"]),
+ ('"string"', ["STRING"]),
+ ("1", ["NUMBER"]),
+ ]
+ for argument_string, argument_list in mappings:
+ css = f"::pseudo_element({argument_string})"
+ assert argument_types(css) == argument_list
+
+ def test_select(self) -> None:
+ document = etree.fromstring(HTML_IDS)
+ sort_key = {el: count for count, el in enumerate(document.iter())}.__getitem__
+ css_to_xpath = GenericTranslator().css_to_xpath
+ html_css_to_xpath = HTMLTranslator().css_to_xpath
+
+ def select_ids(selector: str, html_only: bool) -> list[str]:
+ xpath = css_to_xpath(selector)
+ items = typing.cast("list[etree._Element]", document.xpath(xpath))
+ if html_only:
+ assert items == []
+ xpath = html_css_to_xpath(selector)
+ items = typing.cast("list[etree._Element]", document.xpath(xpath))
+ items.sort(key=sort_key)
+ return [element.get("id", "nil") for element in items]
+
+ def pcss(main: str, *selectors: str, **kwargs: bool) -> list[str]:
+ html_only = kwargs.pop("html_only", False)
+ result = select_ids(main, html_only)
+ for selector in selectors:
+ assert select_ids(selector, html_only) == result
+ return result
+
+ all_ids = pcss("*")
+ assert all_ids[:6] == [
+ "html",
+ "nil",
+ "link-href",
+ "link-nohref",
+ "nil",
+ "outer-div",
+ ]
+ assert all_ids[-1:] == ["foobar-span"]
+ assert pcss("div") == ["outer-div", "li-div", "foobar-div"]
+ assert pcss("DIV", html_only=True) == [
+ "outer-div",
+ "li-div",
+ "foobar-div",
+ ] # case-insensitive in HTML
+ assert pcss("div div") == ["li-div"]
+ assert pcss("div, div div") == ["outer-div", "li-div", "foobar-div"]
+ assert pcss("a[name]") == ["name-anchor"]
+ assert pcss("a[NAme]", html_only=True) == [
+ "name-anchor"
+ ] # case-insensitive in HTML:
+ assert pcss("a[rel]") == ["tag-anchor", "nofollow-anchor"]
+ assert pcss('a[rel="tag"]') == ["tag-anchor"]
+ assert pcss('a[href*="localhost"]') == ["tag-anchor"]
+ assert pcss('a[href*=""]') == []
+ assert pcss('a[href^="http"]') == ["tag-anchor", "nofollow-anchor"]
+ assert pcss('a[href^="http:"]') == ["tag-anchor"]
+ assert pcss('a[href^=""]') == []
+ assert pcss('a[href$="org"]') == ["nofollow-anchor"]
+ assert pcss('a[href$=""]') == []
+ assert pcss('div[foobar~="bc"]', 'div[foobar~="cde"]') == ["foobar-div"]
+ assert pcss('[foobar~="ab bc"]', '[foobar~=""]', '[foobar~=" \t"]') == []
+ assert pcss('div[foobar~="cd"]') == []
+ assert pcss('*[lang|="En"]', '[lang|="En-us"]') == ["second-li"]
+ # Attribute values are case sensitive
+ assert pcss('*[lang|="en"]', '[lang|="en-US"]') == []
+ assert pcss('*[lang|="e"]') == []
+ # ... :lang() is not.
+ assert pcss(':lang("EN")', "*:lang(en-US)", html_only=True) == [
+ "second-li",
+ "li-div",
+ ]
+ assert pcss(':lang("e")', html_only=True) == []
+ assert pcss(":scope > div") == []
+ assert pcss(":scope body") == ["nil"]
+ assert pcss(":scope body > div") == ["outer-div", "foobar-div"]
+ assert pcss(":scope head") == ["nil"]
+ assert pcss(":scope html") == []
+
+ # --- nth-* and nth-last-* -------------------------------------
+
+ # select nothing
+ assert pcss("li:nth-child(-n)") == []
+ # select all children
+ assert pcss("li:nth-child(n)") == [
+ "first-li",
+ "second-li",
+ "third-li",
+ "fourth-li",
+ "fifth-li",
+ "sixth-li",
+ "seventh-li",
+ ]
+
+ assert pcss("li:nth-child(3)", "#first-li ~ :nth-child(3)") == ["third-li"]
+ assert pcss("li:nth-child(10)") == []
+ assert pcss("li:nth-child(2n)", "li:nth-child(even)", "li:nth-child(2n+0)") == [
+ "second-li",
+ "fourth-li",
+ "sixth-li",
+ ]
+ assert pcss("li:nth-child(+2n+1)", "li:nth-child(odd)") == [
+ "first-li",
+ "third-li",
+ "fifth-li",
+ "seventh-li",
+ ]
+ assert pcss("li:nth-child(2n+4)") == ["fourth-li", "sixth-li"]
+ assert pcss("li:nth-child(3n+1)") == ["first-li", "fourth-li", "seventh-li"]
+ assert pcss("li:nth-child(-n+3)") == ["first-li", "second-li", "third-li"]
+ assert pcss("li:nth-child(-2n+4)") == ["second-li", "fourth-li"]
+ assert pcss("li:nth-last-child(0)") == []
+ assert pcss("li:nth-last-child(1)") == ["seventh-li"]
+ assert pcss("li:nth-last-child(2n)", "li:nth-last-child(even)") == [
+ "second-li",
+ "fourth-li",
+ "sixth-li",
+ ]
+ assert pcss("li:nth-last-child(2n+1)") == [
+ "first-li",
+ "third-li",
+ "fifth-li",
+ "seventh-li",
+ ]
+ assert pcss("li:nth-last-child(2n+2)") == ["second-li", "fourth-li", "sixth-li"]
+ assert pcss("li:nth-last-child(3n+1)") == [
+ "first-li",
+ "fourth-li",
+ "seventh-li",
+ ]
+ assert pcss("ol:first-of-type") == ["first-ol"]
+ assert pcss("ol:nth-child(1)") == []
+ assert pcss("ol:nth-of-type(2)") == ["second-ol"]
+ assert pcss("ol:nth-last-of-type(1)") == ["second-ol"]
+
+ # "+" and "~" tests
+ assert pcss("ol#first-ol li + li:nth-child(4)") == ["fourth-li"]
+ assert pcss("li + li:nth-child(1)") == []
+ assert pcss("li ~ li:nth-child(2n+1)") == [
+ "third-li",
+ "fifth-li",
+ "seventh-li",
+ ] # all but the first
+ assert pcss("li ~ li:nth-last-child(2n+1)") == [
+ "third-li",
+ "fifth-li",
+ "seventh-li",
+ ] # all but the first
+
+ assert pcss("span:only-child") == ["foobar-span"]
+ assert pcss("li div:only-child") == ["li-div"]
+ assert pcss("div *:only-child") == ["li-div", "foobar-span"]
+ with pytest.raises(ExpressionError):
+ pcss("p *:only-of-type")
+ assert pcss("p:only-of-type") == ["paragraph"]
+ assert pcss("a:empty", "a:EMpty") == ["name-anchor"]
+ assert pcss("li:empty") == ["third-li", "fourth-li", "fifth-li", "sixth-li"]
+ assert pcss(":root", "html:root") == ["html"]
+ assert pcss("li:root", "* :root") == []
+ assert pcss('*:contains("link")', ':CONtains("link")') == [
+ "html",
+ "nil",
+ "outer-div",
+ "tag-anchor",
+ "nofollow-anchor",
+ ]
+ assert pcss('*:contains("LInk")') == [] # case sensitive
+ assert pcss('*:contains("e")') == [
+ "html",
+ "nil",
+ "outer-div",
+ "first-ol",
+ "first-li",
+ "paragraph",
+ "p-em",
+ ]
+ assert pcss('*:contains("E")') == [] # case-sensitive
+ assert pcss(".a", ".b", "*.a", "ol.a") == ["first-ol"]
+ assert pcss(".c", "*.c") == ["first-ol", "third-li", "fourth-li"]
+ assert pcss("ol *.c", "ol li.c", "li ~ li.c", "ol > li.c") == [
+ "third-li",
+ "fourth-li",
+ ]
+ assert pcss("#first-li", "li#first-li", "*#first-li") == ["first-li"]
+ assert pcss("li div", "li > div", "div div") == ["li-div"]
+ assert pcss("div > div") == []
+ assert pcss("div>.c", "div > .c") == ["first-ol"]
+ assert pcss("div + div") == ["foobar-div"]
+ assert pcss("a ~ a") == ["tag-anchor", "nofollow-anchor"]
+ assert pcss('a[rel="tag"] ~ a') == ["nofollow-anchor"]
+ assert pcss("ol#first-ol li:last-child") == ["seventh-li"]
+ assert pcss("ol#first-ol *:last-child") == ["li-div", "seventh-li"]
+ assert pcss("#outer-div:first-child") == ["outer-div"]
+ assert pcss("#outer-div :first-child") == [
+ "name-anchor",
+ "first-li",
+ "li-div",
+ "p-b",
+ "checkbox-fieldset-disabled",
+ "area-href",
+ ]
+ assert pcss("a[href]") == ["tag-anchor", "nofollow-anchor"]
+ assert pcss(":not(*)") == []
+ assert pcss("a:not([href])") == ["name-anchor"]
+ assert pcss("ol :Not(li[class])") == [
+ "first-li",
+ "second-li",
+ "li-div",
+ "fifth-li",
+ "sixth-li",
+ "seventh-li",
+ ]
+ assert pcss("link:has(*)") == []
+ assert pcss("ol:has(div)") == ["first-ol"]
+ assert pcss(":is(#first-li, #second-li)") == ["first-li", "second-li"]
+ assert pcss("a:is(#name-anchor, #tag-anchor)") == ["name-anchor", "tag-anchor"]
+ assert pcss(":is(.c)") == ["first-ol", "third-li", "fourth-li"]
+ assert pcss("ol.a.b.c > li.c:nth-child(3)") == ["third-li"]
+
+ # Invalid characters in XPath element names, should not crash
+ assert pcss(r"di\a0 v", r"div\[") == []
+ assert pcss(r"[h\a0 ref]", r"[h\]ref]") == []
+
+ # HTML-specific
+ assert pcss(":link", html_only=True) == [
+ "link-href",
+ "tag-anchor",
+ "nofollow-anchor",
+ "area-href",
+ ]
+ assert pcss(":visited", html_only=True) == []
+ assert pcss(":enabled", html_only=True) == [
+ "link-href",
+ "tag-anchor",
+ "nofollow-anchor",
+ "checkbox-unchecked",
+ "text-checked",
+ "checkbox-checked",
+ "area-href",
+ ]
+ assert pcss(":disabled", html_only=True) == [
+ "checkbox-disabled",
+ "checkbox-disabled-checked",
+ "fieldset",
+ "checkbox-fieldset-disabled",
+ ]
+ assert pcss(":checked", html_only=True) == [
+ "checkbox-checked",
+ "checkbox-disabled-checked",
+ ]
+
+ def test_select_shakespeare(self) -> None:
+ document = html.document_fromstring(HTML_SHAKESPEARE)
+ body = typing.cast("list[etree._Element]", document.xpath("//body"))[0]
+ css_to_xpath = GenericTranslator().css_to_xpath
+
+ basestring_ = (str, bytes)
+
+ def count(selector: str) -> int:
+ xpath = css_to_xpath(selector)
+ results = typing.cast("list[etree._Element]", body.xpath(xpath))
+ assert not isinstance(results, basestring_)
+ found = set()
+ for item in results:
+ assert item not in found
+ found.add(item)
+ assert not isinstance(item, basestring_)
+ return len(results)
+
+ # Data borrowed from http://mootools.net/slickspeed/
+
+ ## Changed from original; probably because I'm only
+ ## searching the body.
+ # assert count('*') == 252
+ assert count("*") == 246
+ assert count("div:contains(CELIA)") == 26
+ assert count("div:only-child") == 22 # ?
+ assert count("div:nth-child(even)") == 106
+ assert count("div:nth-child(2n)") == 106
+ assert count("div:nth-child(odd)") == 137
+ assert count("div:nth-child(2n+1)") == 137
+ assert count("div:nth-child(n)") == 243
+ assert count("div:last-child") == 53
+ assert count("div:first-child") == 51
+ assert count("div > div") == 242
+ assert count("div + div") == 190
+ assert count("div ~ div") == 190
+ assert count("body") == 1
+ assert count("body div") == 243
+ assert count("div") == 243
+ assert count("div div") == 242
+ assert count("div div div") == 241
+ assert count("div, div, div") == 243
+ assert count("div, a, span") == 243
+ assert count(".dialog") == 51
+ assert count("div.dialog") == 51
+ assert count("div .dialog") == 51
+ assert count("div.character, div.dialog") == 99
+ assert count("div.direction.dialog") == 0
+ assert count("div.dialog.direction") == 0
+ assert count("div.dialog.scene") == 1
+ assert count("div.scene.scene") == 1
+ assert count("div.scene .scene") == 0
+ assert count("div.direction .dialog ") == 0
+ assert count("div .dialog .direction") == 4
+ assert count("div.dialog .dialog .direction") == 4
+ assert count("#speech5") == 1
+ assert count("div#speech5") == 1
+ assert count("div #speech5") == 1
+ assert count("div.scene div.dialog") == 49
+ assert count("div#scene1 div.dialog div") == 142
+ assert count("#scene1 #speech1") == 1
+ assert count("div[class]") == 103
+ assert count("div[class=dialog]") == 50
+ assert count("div[class^=dia]") == 51
+ assert count("div[class$=log]") == 50
+ assert count("div[class*=sce]") == 1
+ assert count("div[class|=dialog]") == 50 # ? Seems right
+ assert count("div[class!=madeup]") == 243 # ? Seems right
+ assert count("div[class~=dialog]") == 51 # ? Seems right
+ assert count(":scope > div") == 1
+ assert count(":scope > div > div[class=dialog]") == 1
+ assert count(":scope > div div") == 242
+
+
+# XML fixture parsed with etree.fromstring() by test_xpath_pseudo_elements.
+# NOTE(review): in this copy the literal holds only blank lines, yet that test
+# expects elements with ids "first" and "second" -- the markup looks stripped;
+# restore it from the upstream file.
+OPERATOR_PRECEDENCE_IDS = """
+
+
+
+
+
+"""
+
+# XML fixture parsed with etree.fromstring() by test_lang.
+# NOTE(review): only the text nodes "a".."f" remain in this copy, while
+# test_lang expects elements with ids such as "first" and "eighth" -- the tags
+# and attributes look stripped; restore them from the upstream file.
+XMLLANG_IDS = """
+
+ a
+ b
+ c
+ d
+ e
+ f
+
+
+
+
+"""
+
+# Document fixture parsed with etree.fromstring() by test_select.
+# NOTE(review): the literal holds only blank lines in this copy, yet
+# test_select expects ids such as "outer-div" and "first-li" -- the markup
+# looks stripped; restore it from the upstream file.
+HTML_IDS = """
+
+
+
+
+
+
+
+"""
+
+
+HTML_SHAKESPEARE = """
+
+
+
+
+
+
+
+
+
As You Like It
+
+ by William Shakespeare
+
+
+
ACT I, SCENE III. A room in the palace.
+
+
Enter CELIA and ROSALIND
+
+
CELIA
+
+
Why, cousin! why, Rosalind! Cupid have mercy! not a word?
+
+
ROSALIND
+
+
Not one to throw at a dog.
+
+
CELIA
+
+
No, thy words are too precious to be cast away upon
+
curs; throw some of them at me; come, lame me with reasons.
+
+
ROSALIND
+
CELIA
+
+
But is all this for your father?
+
+
+
Then there were two cousins laid up; when the one
+
should be lamed with reasons and the other mad
+
without any.
+
+
ROSALIND
+
+
No, some of it is for my child's father. O, how
+
full of briers is this working-day world!
+
+
CELIA
+
+
They are but burs, cousin, thrown upon thee in
+
holiday foolery: if we walk not in the trodden
+
paths our very petticoats will catch them.
+
+
ROSALIND
+
+
I could shake them off my coat: these burs are in my heart.
+
+
CELIA
+
+
ROSALIND
+
+
I would try, if I could cry 'hem' and have him.
+
+
CELIA
+
+
Come, come, wrestle with thy affections.
+
+
ROSALIND
+
+
O, they take the part of a better wrestler than myself!
+
+
CELIA
+
+
O, a good wish upon you! you will try in time, in
+
despite of a fall. But, turning these jests out of
+
service, let us talk in good earnest: is it
+
possible, on such a sudden, you should fall into so
+
strong a liking with old Sir Rowland's youngest son?
+
+
ROSALIND
+
+
The duke my father loved his father dearly.
+
+
CELIA
+
+
Doth it therefore ensue that you should love his son
+
dearly? By this kind of chase, I should hate him,
+
for my father hated his father dearly; yet I hate
+
not Orlando.
+
+
ROSALIND
+
+
No, faith, hate him not, for my sake.
+
+
CELIA
+
+
Why should I not? doth he not deserve well?
+
+
ROSALIND
+
+
Let me love him for that, and do you love him
+
because I do. Look, here comes the duke.
+
+
CELIA
+
+
With his eyes full of anger.
+
Enter DUKE FREDERICK, with Lords
+
+
DUKE FREDERICK
+
+
Mistress, dispatch you with your safest haste
+
And get you from our court.
+
+
ROSALIND
+
+
DUKE FREDERICK
+
+
You, cousin
+
Within these ten days if that thou be'st found
+
So near our public court as twenty miles,
+
Thou diest for it.
+
+
ROSALIND
+
+
I do beseech your grace,
+
Let me the knowledge of my fault bear with me:
+
If with myself I hold intelligence
+
Or have acquaintance with mine own desires,
+
If that I do not dream or be not frantic,--
+
As I do trust I am not--then, dear uncle,
+
Never so much as in a thought unborn
+
Did I offend your highness.
+
+
DUKE FREDERICK
+
+
Thus do all traitors:
+
If their purgation did consist in words,
+
They are as innocent as grace itself:
+
Let it suffice thee that I trust thee not.
+
+
ROSALIND
+
+
Yet your mistrust cannot make me a traitor:
+
Tell me whereon the likelihood depends.
+
+
DUKE FREDERICK
+
+
Thou art thy father's daughter; there's enough.
+
+
ROSALIND
+
+
So was I when your highness took his dukedom;
+
So was I when your highness banish'd him:
+
Treason is not inherited, my lord;
+
Or, if we did derive it from our friends,
+
What's that to me? my father was no traitor:
+
Then, good my liege, mistake me not so much
+
To think my poverty is treacherous.
+
+
CELIA
+
+
Dear sovereign, hear me speak.
+
+
DUKE FREDERICK
+
+
Ay, Celia; we stay'd her for your sake,
+
Else had she with her father ranged along.
+
+
CELIA
+
+
I did not then entreat to have her stay;
+
It was your pleasure and your own remorse:
+
I was too young that time to value her;
+
But now I know her: if she be a traitor,
+
Why so am I; we still have slept together,
+
Rose at an instant, learn'd, play'd, eat together,
+
And wheresoever we went, like Juno's swans,
+
Still we went coupled and inseparable.
+
+
DUKE FREDERICK
+
+
She is too subtle for thee; and her smoothness,
+
Her very silence and her patience
+
Speak to the people, and they pity her.
+
Thou art a fool: she robs thee of thy name;
+
And thou wilt show more bright and seem more virtuous
+
When she is gone. Then open not thy lips:
+
Firm and irrevocable is my doom
+
Which I have pass'd upon her; she is banish'd.
+
+
CELIA
+
+
Pronounce that sentence then on me, my liege:
+
I cannot live out of her company.
+
+
DUKE FREDERICK
+
+
You are a fool. You, niece, provide yourself:
+
If you outstay the time, upon mine honour,
+
And in the greatness of my word, you die.
+
Exeunt DUKE FREDERICK and Lords
+
+
CELIA
+
+
O my poor Rosalind, whither wilt thou go?
+
Wilt thou change fathers? I will give thee mine.
+
I charge thee, be not thou more grieved than I am.
+
+
ROSALIND
+
+
CELIA
+
+
Thou hast not, cousin;
+
Prithee be cheerful: know'st thou not, the duke
+
Hath banish'd me, his daughter?
+
+
ROSALIND
+
+
CELIA
+
+
No, hath not? Rosalind lacks then the love
+
Which teacheth thee that thou and I am one:
+
Shall we be sunder'd? shall we part, sweet girl?
+
No: let my father seek another heir.
+
Therefore devise with me how we may fly,
+
Whither to go and what to bear with us;
+
And do not seek to take your change upon you,
+
To bear your griefs yourself and leave me out;
+
For, by this heaven, now at our sorrows pale,
+
Say what thou canst, I'll go along with thee.
+
+
ROSALIND
+
+
Why, whither shall we go?
+
+
CELIA
+
+
To seek my uncle in the forest of Arden.
+
+
ROSALIND
+
+
Alas, what danger will it be to us,
+
Maids as we are, to travel forth so far!
+
Beauty provoketh thieves sooner than gold.
+
+
CELIA
+
+
I'll put myself in poor and mean attire
+
And with a kind of umber smirch my face;
+
The like do you: so shall we pass along
+
And never stir assailants.
+
+
ROSALIND
+
+
Were it not better,
+
Because that I am more than common tall,
+
That I did suit me all points like a man?
+
A gallant curtle-axe upon my thigh,
+
A boar-spear in my hand; and--in my heart
+
Lie there what hidden woman's fear there will--
+
We'll have a swashing and a martial outside,
+
As many other mannish cowards have
+
That do outface it with their semblances.
+
+
CELIA
+
+
What shall I call thee when thou art a man?
+
+
ROSALIND
+
+
I'll have no worse a name than Jove's own page;
+
And therefore look you call me Ganymede.
+
But what will you be call'd?
+
+
CELIA
+
+
Something that hath a reference to my state
+
No longer Celia, but Aliena.
+
+
ROSALIND
+
+
But, cousin, what if we assay'd to steal
+
The clownish fool out of your father's court?
+
Would he not be a comfort to our travel?
+
+
CELIA
+
+
He'll go along o'er the wide world with me;
+
Leave me alone to woo him. Let's away,
+
And get our jewels and our wealth together,
+
Devise the fittest time and safest way
+
To hide us from pursuit that will be made
+
After my flight. Now go we in content
+
To liberty and not to banishment.
+
Exeunt
+
+
+
+
+
+
+"""
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tox.ini b/tox.ini
index 9a552c2..9ff54cf 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,6 +1,49 @@
[tox]
-envlist = py24,py25,py26,py27,py31,py32
+envlist = pre-commit,pylint,py,docs,typing
[testenv]
-deps=lxml
-commands = python cssselect/tests.py
+deps =
+ lxml>=4.4
+ pytest-cov>=7.0.0
+ pytest>=5.4
+ sybil
+commands =
+ pytest --cov=cssselect \
+ --cov-report=term-missing --cov-report=html --cov-report=xml \
+ {posargs: cssselect tests docs}
+
+[testenv:pylint]
+deps =
+ {[testenv]deps}
+ pylint==4.0.4
+commands =
+ pylint {posargs: cssselect tests docs}
+
+[testenv:docs]
+changedir = docs
+deps =
+ -r docs/requirements.txt
+commands =
+ sphinx-build -W -b html . {envtmpdir}/html
+
+[testenv:typing]
+deps =
+ {[testenv]deps}
+ mypy==1.19.1
+ types-lxml==2026.1.1
+commands =
+ mypy {posargs: cssselect tests}
+
+[testenv:pre-commit]
+deps = pre-commit
+commands = pre-commit run --all-files --show-diff-on-failure
+skip_install = true
+
+[testenv:twinecheck]
+basepython = python3
+deps =
+ twine==6.2.0
+ build==1.4.0
+commands =
+ python -m build --sdist
+ twine check dist/*