diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..38558bf --- /dev/null +++ b/.editorconfig @@ -0,0 +1,11 @@ +root = true + +[*] +charset = utf-8 +indent_style = space +indent_size = 4 +insert_final_newline = true +end_of_line = lf + +[*.{yml,yaml}] +indent_size = 2 diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000..bb4f6e1 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# applying pre-commit hooks to the project +e91101b37f82558db84a6b8ee9a6dba1fd2ae0bb diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml new file mode 100644 index 0000000..41ff7e1 --- /dev/null +++ b/.github/workflows/checks.yml @@ -0,0 +1,43 @@ +name: Checks +on: [push, pull_request] + +jobs: + checks: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - python-version: 3.14 + env: + TOXENV: pylint + - python-version: 3.14 # Keep in sync with .readthedocs.yml + env: + TOXENV: docs + - python-version: 3.14 + env: + TOXENV: typing + - python-version: 3.14 + env: + TOXENV: twinecheck + + steps: + - uses: actions/checkout@v6 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Run check + env: ${{ matrix.env }} + run: | + pip install -U pip + pip install -U tox + tox + + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: pre-commit/action@v3.0.1 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..526c458 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,32 @@ +name: Publish +on: + push: + tags: + - 'v[0-9]+.[0-9]+.[0-9]+' + +jobs: + publish: + runs-on: ubuntu-latest + + environment: + name: pypi + url: https://pypi.org/p/cssselect + + permissions: + id-token: write + + steps: + - uses: actions/checkout@v6 + + - name: Set up Python + uses: actions/setup-python@v6 + 
with: + python-version: 3.14 + + - name: Build + run: | + python -m pip install --upgrade build + python -m build + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/tests-macos.yml b/.github/workflows/tests-macos.yml new file mode 100644 index 0000000..4947937 --- /dev/null +++ b/.github/workflows/tests-macos.yml @@ -0,0 +1,27 @@ +name: macOS +on: [push, pull_request] + +jobs: + tests: + runs-on: macos-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] + + steps: + - uses: actions/checkout@v6 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Run tests + run: | + pip install -U pip + pip install -U tox + tox -e py + + - name: Upload coverage report + uses: codecov/codecov-action@v5 diff --git a/.github/workflows/tests-ubuntu.yml b/.github/workflows/tests-ubuntu.yml new file mode 100644 index 0000000..1ef905b --- /dev/null +++ b/.github/workflows/tests-ubuntu.yml @@ -0,0 +1,33 @@ +name: Ubuntu +on: [push, pull_request] + +jobs: + tests: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14", "pypy3.11"] + + steps: + - uses: actions/checkout@v6 + + - name: Install system libraries + if: contains(matrix.python-version, 'pypy') + run: | + sudo apt-get update + sudo apt-get install libxml2-dev libxslt-dev + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Run tests + run: | + pip install -U pip + pip install -U tox + tox -e py + + - name: Upload coverage report + uses: codecov/codecov-action@v5 diff --git a/.github/workflows/tests-windows.yml b/.github/workflows/tests-windows.yml new file mode 100644 index 0000000..24d7ee8 --- /dev/null +++ b/.github/workflows/tests-windows.yml @@ -0,0 +1,27 
@@ +name: Windows +on: [push, pull_request] + +jobs: + tests: + runs-on: windows-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] + + steps: + - uses: actions/checkout@v6 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Run tests + run: | + pip install -U pip + pip install -U tox + tox -e py + + - name: Upload coverage report + uses: codecov/codecov-action@v5 diff --git a/.gitignore b/.gitignore index 36120ab..c276bd1 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,7 @@ /MANIFEST /dist /docs/_build +/.coverage +.idea +htmlcov/ +coverage.xml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..81ca890 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,26 @@ +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.14.4 + hooks: + - id: ruff-check + args: [ --fix ] + - id: ruff-format +- repo: https://github.com/adamchainz/blacken-docs + rev: 1.20.0 + hooks: + - id: blacken-docs + additional_dependencies: + - black==26.1.0 +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: end-of-file-fixer + - id: trailing-whitespace +- repo: https://github.com/sphinx-contrib/sphinx-lint + rev: v1.0.0 + hooks: + - id: sphinx-lint +- repo: https://github.com/rhysd/actionlint + rev: v1.7.10 + hooks: + - id: actionlint diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..b91642a --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,15 @@ +version: 2 +formats: all +sphinx: + configuration: docs/conf.py + fail_on_warning: true +build: + os: ubuntu-24.04 + tools: + # For available versions, see: + # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python + python: "3.14" # Keep in sync with .github/workflows/checks.yml +python: + install: + - requirements: docs/requirements.txt + 
- path: . diff --git a/AUTHORS b/AUTHORS index 8c69e8f..66dcc22 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,4 +1,13 @@ +Daniel Graña Ian Bicking +James Salter Laurence Rowe +Mikhail Korobov +Nik Nyby +Paul Tremberth +Simon Potter Simon Sapin Stefan Behnel +Thomas Grainger +Varialus +Arthur Darcet diff --git a/CHANGES b/CHANGES index 4583cef..5ca2959 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,251 @@ Changelog ========= +Version 1.4.0 +------------- + +Released on 2026-01-29. + +* Dropped support for Python 3.9 and PyPy 3.10. + +* Added support for Python 3.14 and PyPy 3.11. + +* Switched the build system to ``hatchling``. + +* CI fixes and improvements. + +Version 1.3.0 +------------- + +Released on 2025-03-10. + +* Dropped support for Python 3.7-3.8, added support for Python 3.12-3.13 and + PyPy 3.10. + +* Removed ``_unicode_safe_getattr()``, deprecated in 1.2.0. + +* Added ``pre-commit`` and formatted the code with ``ruff``. + +* Many CI additions and improvements. + + +Version 1.2.0 +------------- + +Released on 2022-10-27. + +* Drop support for Python 2.7, 3.4-3.6, add support for Python 3.7-3.11. + +* Add type annotations (PEP 484 and PEP 561). + +* More features from the CSS Selectors Level 4: + + * The ``:is()`` pseudo-class. + + * The ``:where()`` pseudo-class. + + * The ``:has()`` pseudo-class, with some limitations. + +* Fix parsing ``:scope`` after a comma. + +* Add parentheses to fix condition precedence in some cases. + +* Private API changes related to the removal of the Python 2 support: + + * Remove ``_unicode`` and ``_unichr`` aliases from ``cssselect.parser``. + + * Remove ``_basestring`` and ``_unicode`` aliases from ``cssselect.xpath``. + + * Deprecate ``cssselect.xpath._unicode_safe_getattr()`` and change it to just + call ``getattr()``. + +* Include tests in the PyPI tarball. + +* Many CI additions and improvements. + +* Improve the test coverage. + + +Version 1.1.0 +------------- + +Released on 2019-08-09. 
+ +* Support for the ``:scope`` selector, which allows access to the immediate + children of a selector. + +* Support for the ``|E`` syntax for type selectors without a namespace. + +* A new selector method, ``canonical``, returns the CSS expression of the + selector, as a string. + + +Version 1.0.3 +------------- + +Released on 2017-12-27. + +* Fix artifact uploads to PyPI. + + +Version 1.0.2 +------------- + +Released on 2017-12-26. + +* Drop support for Python 2.6 and Python 3.3. +* Fix deprecation warning in Python 3.6. +* Minor cleanups. + + +Version 1.0.1 +------------- + +Released on 2017-01-10. + +* Add support for Python 3.6. +* Documentation hosted `on Read the Docs <https://cssselect.readthedocs.io/>`_ + + +Version 1.0.0 +------------- + +Released on 2016-10-21. + +* Add code coverage reports. +* Fix ``:nth-*(an+b)`` pseudo-class selectors. + (except ``*:nth-child()`` which looks untranslatable to XPath 1.0.) + + +Version 0.9.2 +------------- + +Released on 2016-06-15. + +* Distribute as universal wheel. +* Add support for Python 3.3, 3.4 and 3.5. +* Drop support for Python 2.5 as testing is getting difficult. +* Improve tests on pseudo-elements. + + +Version 0.9.1 +------------- + +Released on 2013-10-17. + +* **Backward incompatible change from 0.9**: + :meth:`~GenericTranslator.selector_to_xpath` defaults to + ignoring pseudo-elements, + as it did in 0.8 and previous versions. + (:meth:`~GenericTranslator.css_to_xpath` doesn’t change.) +* Drop official support for Python 2.4 and 3.1, + as testing was becoming difficult. + Nothing will break overnight, + but future releases may or may not work on these versions. + Older releases will remain available on PyPI. + + +Version 0.9 +----------- + +Released on 2013-10-11. + +Add parser support for :attr:`functional +pseudo-elements <Selector.pseudo_element>`. + +*Update:* +This version accidentally introduced a **backward incompatible** change: +:meth:`~GenericTranslator.selector_to_xpath` defaults to +rejecting pseudo-elements instead of ignoring them. 
+ + +Version 0.8 +----------- + +Released on 2013-03-15. + +Improvements: + +* `#22 <https://github.com/scrapy/cssselect/issues/22>`_ + Let extended translators override what XPathExpr class is used +* `#19 <https://github.com/scrapy/cssselect/issues/19>`_ + Use the built-in ``lang()`` XPath function + for implementing the ``:lang()`` pseudo-class + with XML documents. + This is probably faster than ``ancestor-or-self::``. + +Bug fixes: + +* `#14 <https://github.com/scrapy/cssselect/issues/14>`_ + Fix non-ASCII pseudo-classes. (Invalid selector instead of crash.) +* `#20 <https://github.com/scrapy/cssselect/issues/20>`_ + As per the spec, elements containing only whitespace are not considered empty + for the ``:empty`` pseudo-class. + + +Version 0.7.1 +------------- + +Released on 2012-06-14. Code name *remember-to-test-with-tox*. + +0.7 broke the parser in Python 2.4 and 2.5; the tests in 2.x. +Now all is well again. + +Also, pseudo-elements are now correctly made lower-case. (They are supposed +to be case-insensitive.) + + +Version 0.7 +----------- + +Released on 2012-06-14. + +Bug fix release: see #2, #7 and #10 on GitHub. + +* The tokenizer and parser have been rewritten to be much closer to the + specified grammar. In particular, non-ASCII characters and backslash-escapes + are now handled correctly. +* Special characters are protected in the output so that generated XPath + expressions should always be valid. +* The ``~=``, ``^=`` and ``*=`` attribute operators now correctly never match + when used with an empty string. + + +Version 0.6.1 +------------- + +Released on 2012-04-25. + +Make sure that internal token objects do not "leak" into the public API and +:attr:`Selector.pseudo_element` is a Unicode string. + + +Version 0.6 +----------- + +Released on 2012-04-24. + +* In ``setup.py`` use setuptools/distribute if available, but fall back + on distutils. +* Implement the ``:lang()`` pseudo-class, although it is only based on + ``xml:lang`` or ``lang`` attributes. 
If the document language is known from + some other meta-data (like a ``Content-Language`` HTTP header or ```` + element), a workaround is to set a lang attribute on the root element. + + +Version 0.5 +----------- + +Released on 2012-04-20. + +* Fix case sensitivity issues. +* Implement :class:`HTMLTranslator` based on the `HTML5 specification`_ + rather than guessing; add the ``xhtml`` parameter. +* Several bug fixes and better test coverage. + +.. _HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors + + Version 0.4 ----------- @@ -19,14 +264,14 @@ Version 0.3 Released on 2012-04-17. * Fix many parsing bugs. -* Rename the :class:`Translator` class to :class:`GenericTranslator` +* Rename the ``Translator`` class to :class:`GenericTranslator` * There, implement ``:target``, ``:hover``, ``:focus``, ``:active`` ``:checked``, ``:enabled``, ``:disabled``, ``:link`` and ``:visited`` as never matching. * Make a new HTML-specific ``HTMLTranslator`` subclass. There, implement ``:checked``, ``:enabled``, ``:disabled``, ``:link`` and ``:visited`` as appropriate for HTML, with all links "not visited". -* Remove the :func:`css_to_xpath` function. The translator classes +* Remove the ``css_to_xpath`` function. The translator classes are the new API. * Add support for ``:contains()`` back, but case-sensitive. lxml will override it to be case-insensitive for backward-compatibility. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index c8f5dc3..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include AUTHORS CHANGES LICENSE README.rst tox.ini -recursive-include docs * -prune docs/_build diff --git a/README.rst b/README.rst index fa53a5b..c055295 100644 --- a/README.rst +++ b/README.rst @@ -1,25 +1,40 @@ + =================================== cssselect: CSS Selectors for Python =================================== -*cssselect* parses `CSS3 Selectors`_ and translate them to `XPath 1.0`_ -expressions. 
Such expressions can be used in lxml_ or another XPath engine -to find the matching elements in an XML or HTML document. +.. image:: https://img.shields.io/pypi/v/cssselect.svg + :target: https://pypi.python.org/pypi/cssselect + :alt: PyPI Version + +.. image:: https://img.shields.io/pypi/pyversions/cssselect.svg + :target: https://pypi.python.org/pypi/cssselect + :alt: Supported Python Versions + +.. image:: https://github.com/scrapy/cssselect/actions/workflows/tests-ubuntu.yml/badge.svg + :target: https://github.com/scrapy/cssselect/actions/workflows/tests-ubuntu.yml + :alt: Tests -This module used to live inside of lxml as ``lxml.cssselect`` before it was -extracted as a stand-alone project. +.. image:: https://img.shields.io/codecov/c/github/scrapy/cssselect/master.svg + :target: https://codecov.io/github/scrapy/cssselect?branch=master + :alt: Coverage report -.. _CSS3 Selectors: http://www.w3.org/TR/2011/REC-css3-selectors-20110929/ -.. _XPath 1.0: http://www.w3.org/TR/xpath/ -.. _lxml: http://lxml.de/ +**cssselect** is a BSD-licensed Python library to parse `CSS3 selectors`_ and +translate them to `XPath 1.0`_ expressions. +`XPath 1.0`_ expressions can be used in lxml_ or another XPath engine to find +the matching elements in an XML or HTML document. + +Find the cssselect online documentation at https://cssselect.readthedocs.io. Quick facts: -* Free software: BSD licensed -* Compatible with Python 2.4+ and 3.x -* Latest documentation `on python.org `_ -* Source, issues and pull requests `on Github - `_ -* Releases `on PyPI `_ +* Source, issues and pull requests `on GitHub + `_ +* Releases `on PyPI `_ * Install with ``pip install cssselect`` + + +.. _CSS3 selectors: https://www.w3.org/TR/selectors-3/ +.. _XPath 1.0: https://www.w3.org/TR/xpath/all/ +.. 
_lxml: https://lxml.de/ diff --git a/cssselect/__init__.py b/cssselect/__init__.py index 3129a42..59d62df 100644 --- a/cssselect/__init__.py +++ b/cssselect/__init__.py @@ -1,21 +1,36 @@ """ - CSS Selectors based on XPath - ============================ +CSS Selectors based on XPath +============================ - This module supports selecting XML/HTML elements based on CSS selectors. - See the `CSSSelector` class for details. +This module supports selecting XML/HTML elements based on CSS selectors. +See the `CSSSelector` class for details. - :copyright: (c) 2007-2012 Ian Bicking and contributors. - See AUTHORS for more details. - :license: BSD, see LICENSE for more details. +:copyright: (c) 2007-2012 Ian Bicking and contributors. +See AUTHORS for more details. +:license: BSD, see LICENSE for more details. """ -from cssselect.parser import (parse, Selector, SelectorError, - SelectorSyntaxError) -from cssselect.xpath import GenericTranslator, HTMLTranslator, ExpressionError +from cssselect.parser import ( + FunctionalPseudoElement, + Selector, + SelectorError, + SelectorSyntaxError, + parse, +) +from cssselect.xpath import ExpressionError, GenericTranslator, HTMLTranslator +__all__ = ( + "ExpressionError", + "FunctionalPseudoElement", + "GenericTranslator", + "HTMLTranslator", + "Selector", + "SelectorError", + "SelectorSyntaxError", + "parse", +) -VERSION = '0.4' +VERSION = "1.4.0" __version__ = VERSION diff --git a/cssselect/parser.py b/cssselect/parser.py index f6b42c8..f969769 100644 --- a/cssselect/parser.py +++ b/cssselect/parser.py @@ -1,26 +1,33 @@ """ - cssselect.parser - ================ +cssselect.parser +================ - Tokenizer, parser and parsed objects for CSS selectors. +Tokenizer, parser and parsed objects for CSS selectors. - :copyright: (c) 2007-2012 Ian Bicking and contributors. - See AUTHORS for more details. - :license: BSD, see LICENSE for more details. +:copyright: (c) 2007-2012 Ian Bicking and contributors. +See AUTHORS for more details. 
+:license: BSD, see LICENSE for more details. """ +from __future__ import annotations + +import operator import re +import sys +from typing import TYPE_CHECKING, Literal, Protocol, TypeAlias, Union, cast, overload + +if TYPE_CHECKING: + from collections.abc import Iterable, Iterator, Sequence + + # typing.Self requires Python 3.11 + from typing_extensions import Self -try: - _unicode = unicode - _unichr = unichr -except NameError: - # Python 3 - _unicode = str - _unichr = chr +def ascii_lower(string: str) -> str: + """Lower-case, but only in the ASCII range.""" + return string.encode("utf8").lower().decode("utf8") class SelectorError(Exception): @@ -32,205 +39,444 @@ class SelectorError(Exception): """ + class SelectorSyntaxError(SelectorError, SyntaxError): """Parsing a selector that does not match the grammar.""" #### Parsed objects -class Selector(object): +Tree: TypeAlias = Union[ + "Element", + "Hash", + "Class", + "Function", + "Pseudo", + "Attrib", + "Negation", + "Relation", + "Matching", + "SpecificityAdjustment", + "CombinedSelector", +] +PseudoElement: TypeAlias = Union["FunctionalPseudoElement", str] + + +class Selector: """ - Represents a selector with an optional pseudo element. + Represents a parsed selector. + + :meth:`~GenericTranslator.selector_to_xpath` accepts this object, + but ignores :attr:`pseudo_element`. It is the user’s responsibility + to account for pseudo-elements and reject selectors with unknown + or unsupported pseudo-elements. + """ - def __init__(self, tree, pseudo_element=None): - self._tree = tree - #: If the selector has a pseudo-element: a string like ``'after'``. - #: Otherwise, ``None``. - #: Any identifier preceded by ``::`` is accepted as a pseudo-element. - #: It is the user’s responsibility to reject selectors with - #: unknown or unsupported pseudo-elements. 
+ + def __init__(self, tree: Tree, pseudo_element: PseudoElement | None = None) -> None: + self.parsed_tree = tree + if pseudo_element is not None and not isinstance( + pseudo_element, FunctionalPseudoElement + ): + pseudo_element = ascii_lower(pseudo_element) + #: A :class:`FunctionalPseudoElement`, + #: or the identifier for the pseudo-element as a string, + # or ``None``. + #: + #: +-------------------------+----------------+--------------------------------+ + #: | | Selector | Pseudo-element | + #: +=========================+================+================================+ + #: | CSS3 syntax | ``a::before`` | ``'before'`` | + #: +-------------------------+----------------+--------------------------------+ + #: | Older syntax | ``a:before`` | ``'before'`` | + #: +-------------------------+----------------+--------------------------------+ + #: | From the Lists3_ draft, | ``li::marker`` | ``'marker'`` | + #: | not in Selectors3 | | | + #: +-------------------------+----------------+--------------------------------+ + #: | Invalid pseudo-class | ``li:marker`` | ``None`` | + #: +-------------------------+----------------+--------------------------------+ + #: | Functional | ``a::foo(2)`` | ``FunctionalPseudoElement(…)`` | + #: +-------------------------+----------------+--------------------------------+ + #: + #: .. 
_Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement self.pseudo_element = pseudo_element - def __repr__(self): - if self.pseudo_element: - pseudo_element = '::%s' % self.pseudo_element + def __repr__(self) -> str: + if isinstance(self.pseudo_element, FunctionalPseudoElement): + pseudo_element = repr(self.pseudo_element) + elif self.pseudo_element: + pseudo_element = f"::{self.pseudo_element}" + else: + pseudo_element = "" + return f"{self.__class__.__name__}[{self.parsed_tree!r}{pseudo_element}]" + + def canonical(self) -> str: + """Return a CSS representation for this selector (a string)""" + if isinstance(self.pseudo_element, FunctionalPseudoElement): + pseudo_element = f"::{self.pseudo_element.canonical()}" + elif self.pseudo_element: + pseudo_element = f"::{self.pseudo_element}" else: - pseudo_element = '' - return '%s[%r%s]' % ( - self.__class__.__name__, self._tree, pseudo_element) + pseudo_element = "" + res = f"{self.parsed_tree.canonical()}{pseudo_element}" + if len(res) > 1: + res = res.lstrip("*") + return res - def specificity(self): + def specificity(self) -> tuple[int, int, int]: """Return the specificity_ of this selector as a tuple of 3 integers. .. 
_specificity: http://www.w3.org/TR/selectors/#specificity """ - a, b, c = self._tree.specificity() + a, b, c = self.parsed_tree.specificity() if self.pseudo_element: c += 1 return a, b, c -class Class(object): +class Class: """ Represents selector.class_name """ - def __init__(self, selector, class_name): + + def __init__(self, selector: Tree, class_name: str) -> None: self.selector = selector self.class_name = class_name - def __repr__(self): - return '%s[%r.%s]' % ( - self.__class__.__name__, self.selector, self.class_name) + def __repr__(self) -> str: + return f"{self.__class__.__name__}[{self.selector!r}.{self.class_name}]" + + def canonical(self) -> str: + return f"{self.selector.canonical()}.{self.class_name}" - def specificity(self): + def specificity(self) -> tuple[int, int, int]: a, b, c = self.selector.specificity() b += 1 return a, b, c -class Function(object): +class FunctionalPseudoElement: + """ + Represents selector::name(arguments) + + .. attribute:: name + + The name (identifier) of the pseudo-element, as a string. + + .. attribute:: arguments + + The arguments of the pseudo-element, as a list of tokens. + + **Note:** tokens are not part of the public API, + and may change between cssselect versions. + Use at your own risks. 
+ + """ + + def __init__(self, name: str, arguments: Sequence[Token]): + self.name = ascii_lower(name) + self.arguments = arguments + + def __repr__(self) -> str: + token_values = [token.value for token in self.arguments] + return f"{self.__class__.__name__}[::{self.name}({token_values!r})]" + + def argument_types(self) -> list[str]: + return [token.type for token in self.arguments] + + def canonical(self) -> str: + args = "".join(token.css() for token in self.arguments) + return f"{self.name}({args})" + + +class Function: """ Represents selector:name(expr) """ - def __init__(self, selector, name, arguments): + + def __init__(self, selector: Tree, name: str, arguments: Sequence[Token]) -> None: self.selector = selector - self.name = name + self.name = ascii_lower(name) self.arguments = arguments - def __repr__(self): - return '%s[%r:%s(%r)]' % ( - self.__class__.__name__, self.selector, self.name, self.arguments) + def __repr__(self) -> str: + token_values = [token.value for token in self.arguments] + return f"{self.__class__.__name__}[{self.selector!r}:{self.name}({token_values!r})]" - def specificity(self): + def argument_types(self) -> list[str]: + return [token.type for token in self.arguments] + + def canonical(self) -> str: + args = "".join(token.css() for token in self.arguments) + return f"{self.selector.canonical()}:{self.name}({args})" + + def specificity(self) -> tuple[int, int, int]: a, b, c = self.selector.specificity() b += 1 return a, b, c -class Pseudo(object): +class Pseudo: """ Represents selector:ident """ - def __init__(self, selector, ident): + + def __init__(self, selector: Tree, ident: str) -> None: self.selector = selector - self.ident = ident + self.ident = ascii_lower(ident) - def __repr__(self): - return '%s[%r:%s]' % ( - self.__class__.__name__, self.selector, self.ident) + def __repr__(self) -> str: + return f"{self.__class__.__name__}[{self.selector!r}:{self.ident}]" - def specificity(self): + def canonical(self) -> str: + return 
f"{self.selector.canonical()}:{self.ident}" + + def specificity(self) -> tuple[int, int, int]: a, b, c = self.selector.specificity() b += 1 return a, b, c -class Negation(object): +class Negation: """ Represents selector:not(subselector) """ - def __init__(self, selector, subselector): + + def __init__(self, selector: Tree, subselector: Tree) -> None: self.selector = selector self.subselector = subselector - def __repr__(self): - return '%s[%r:not(%r)]' % ( - self.__class__.__name__, self.selector, self.subselector) + def __repr__(self) -> str: + return f"{self.__class__.__name__}[{self.selector!r}:not({self.subselector!r})]" + + def canonical(self) -> str: + subsel = self.subselector.canonical() + if len(subsel) > 1: + subsel = subsel.lstrip("*") + return f"{self.selector.canonical()}:not({subsel})" - def specificity(self): + def specificity(self) -> tuple[int, int, int]: a1, b1, c1 = self.selector.specificity() - a2, b2, c2 = self.sub_selector.specificity() + a2, b2, c2 = self.subselector.specificity() return a1 + a2, b1 + b2, c1 + c2 -class Attrib(object): +class Relation: + """ + Represents selector:has(subselector) + """ + + def __init__(self, selector: Tree, combinator: Token, subselector: Selector): + self.selector = selector + self.combinator = combinator + self.subselector = subselector + + def __repr__(self) -> str: + return f"{self.__class__.__name__}[{self.selector!r}:has({self.subselector!r})]" + + def canonical(self) -> str: + try: + subsel = self.subselector[0].canonical() # type: ignore[index] + except TypeError: + subsel = self.subselector.canonical() + if len(subsel) > 1: + subsel = subsel.lstrip("*") + return f"{self.selector.canonical()}:has({subsel})" + + def specificity(self) -> tuple[int, int, int]: + a1, b1, c1 = self.selector.specificity() + try: + a2, b2, c2 = self.subselector[-1].specificity() # type: ignore[index] + except TypeError: + a2, b2, c2 = self.subselector.specificity() + return a1 + a2, b1 + b2, c1 + c2 + + +class Matching: + 
""" + Represents selector:is(selector_list) + """ + + def __init__(self, selector: Tree, selector_list: Iterable[Tree]): + self.selector = selector + self.selector_list = selector_list + + def __repr__(self) -> str: + args_str = ", ".join(repr(s) for s in self.selector_list) + return f"{self.__class__.__name__}[{self.selector!r}:is({args_str})]" + + def canonical(self) -> str: + selector_arguments = [] + for s in self.selector_list: + selarg = s.canonical() + selector_arguments.append(selarg.lstrip("*")) + args_str = ", ".join(str(s) for s in selector_arguments) + return f"{self.selector.canonical()}:is({args_str})" + + def specificity(self) -> tuple[int, int, int]: + return max(x.specificity() for x in self.selector_list) + + +class SpecificityAdjustment: + """ + Represents selector:where(selector_list) + Same as selector:is(selector_list), but its specificity is always 0 + """ + + def __init__(self, selector: Tree, selector_list: list[Tree]): + self.selector = selector + self.selector_list = selector_list + + def __repr__(self) -> str: + args_str = ", ".join(repr(s) for s in self.selector_list) + return f"{self.__class__.__name__}[{self.selector!r}:where({args_str})]" + + def canonical(self) -> str: + selector_arguments = [] + for s in self.selector_list: + selarg = s.canonical() + selector_arguments.append(selarg.lstrip("*")) + args_str = ", ".join(str(s) for s in selector_arguments) + return f"{self.selector.canonical()}:where({args_str})" + + def specificity(self) -> tuple[int, int, int]: + return 0, 0, 0 + + +class Attrib: """ Represents selector[namespace|attrib operator value] """ - def __init__(self, selector, namespace, attrib, operator, value): + + @overload + def __init__( + self, + selector: Tree, + namespace: str | None, + attrib: str, + operator: Literal["exists"], + value: None, + ) -> None: ... + + @overload + def __init__( + self, + selector: Tree, + namespace: str | None, + attrib: str, + operator: str, + value: Token, + ) -> None: ... 
+ + def __init__( + self, + selector: Tree, + namespace: str | None, + attrib: str, + operator: str, + value: Token | None, + ) -> None: self.selector = selector self.namespace = namespace self.attrib = attrib self.operator = operator self.value = value - def __repr__(self): - if self.namespace == '*': - attrib = self.attrib - else: - attrib = '%s|%s' % (self.namespace, self.attrib) - if self.operator == 'exists': - return '%s[%r[%s]]' % ( - self.__class__.__name__, self.selector, attrib) + def __repr__(self) -> str: + attrib = f"{self.namespace}|{self.attrib}" if self.namespace else self.attrib + if self.operator == "exists": + return f"{self.__class__.__name__}[{self.selector!r}[{attrib}]]" + assert self.value is not None + return f"{self.__class__.__name__}[{self.selector!r}[{attrib} {self.operator} {self.value.value!r}]]" + + def canonical(self) -> str: + attrib = f"{self.namespace}|{self.attrib}" if self.namespace else self.attrib + + if self.operator == "exists": + op = attrib else: - return '%s[%r[%s %s %r]]' % ( - self.__class__.__name__, self.selector, attrib, - self.operator, self.value) + assert self.value is not None + op = f"{attrib}{self.operator}{self.value.css()}" - def specificity(self): + return f"{self.selector.canonical()}[{op}]" + + def specificity(self) -> tuple[int, int, int]: a, b, c = self.selector.specificity() b += 1 return a, b, c -class Element(object): +class Element: """ Represents namespace|element + + `None` is for the universal selector '*' + """ - def __init__(self, namespace, element): + + def __init__( + self, namespace: str | None = None, element: str | None = None + ) -> None: self.namespace = namespace self.element = element - def __repr__(self): - if self.namespace == '*': - element = self.element - else: - element = '%s|%s' % (self.namespace, self.element) - return '%s[%s]' % ( - self.__class__.__name__, element) + def __repr__(self) -> str: + return f"{self.__class__.__name__}[{self.canonical()}]" - def specificity(self): 
- if self.element == '*': - return 0, 0, 0 - else: + def canonical(self) -> str: + element = self.element or "*" + if self.namespace: + element = f"{self.namespace}|{element}" + return element + + def specificity(self) -> tuple[int, int, int]: + if self.element: return 0, 0, 1 + return 0, 0, 0 -class Hash(object): +class Hash: """ Represents selector#id """ - def __init__(self, selector, id): + + def __init__(self, selector: Tree, id: str) -> None: # noqa: A002 self.selector = selector self.id = id - def __repr__(self): - return '%s[%r#%s]' % ( - self.__class__.__name__, self.selector, self.id) + def __repr__(self) -> str: + return f"{self.__class__.__name__}[{self.selector!r}#{self.id}]" + + def canonical(self) -> str: + return f"{self.selector.canonical()}#{self.id}" - def specificity(self): + def specificity(self) -> tuple[int, int, int]: a, b, c = self.selector.specificity() a += 1 return a, b, c -class CombinedSelector(object): - def __init__(self, selector, combinator, subselector): +class CombinedSelector: + def __init__(self, selector: Tree, combinator: str, subselector: Tree) -> None: assert selector is not None self.selector = selector self.combinator = combinator self.subselector = subselector - def __repr__(self): - if self.combinator == ' ': - comb = '' - else: - comb = self.combinator - return '%s[%r %s %r]' % ( - self.__class__.__name__, self.selector, comb, self.subselector) + def __repr__(self) -> str: + comb = "" if self.combinator == " " else self.combinator + return ( + f"{self.__class__.__name__}[{self.selector!r} {comb} {self.subselector!r}]" + ) + + def canonical(self) -> str: + subsel = self.subselector.canonical() + if len(subsel) > 1: + subsel = subsel.lstrip("*") + return f"{self.selector.canonical()} {self.combinator} {subsel}" - def specificity(self): + def specificity(self) -> tuple[int, int, int]: a1, b1, c1 = self.selector.specificity() a2, b2, c2 = self.subselector.specificity() return a1 + a2, b1 + b2, c1 + c2 @@ -238,19 +484,26 @@ 
def specificity(self): #### Parser -_el_re = re.compile(r'^\s*(\w+)$') -_id_re = re.compile(r'^\s*(\w*)#(\w+)\s*$') -_class_re = re.compile(r'^\s*(\w*)\.(\w+)\s*$') +# foo +_el_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$") + +# foo#bar or #bar +_id_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$") +# foo.bar or .bar +_class_re = re.compile( + r"^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$" +) -def parse(css): + +def parse(css: str) -> list[Selector]: """Parse a CSS *group of selectors*. If you don't care about pseudo-elements or selector specificity, you can skip this and use :meth:`~GenericTranslator.css_to_xpath`. :param css: - A *group of selectors* as an Unicode string. + A *group of selectors* as a string. :raises: :class:`SelectorSyntaxError` on invalid selectors. :returns: @@ -261,415 +514,533 @@ def parse(css): # Fast path for simple cases match = _el_re.match(css) if match: - return [Selector(Element('*', match.group(1)))] + return [Selector(Element(element=match.group(1)))] match = _id_re.match(css) if match is not None: - return [Selector(Hash(Element( - '*', match.group(1) or '*'), match.group(2)))] + return [Selector(Hash(Element(element=match.group(1) or None), match.group(2)))] match = _class_re.match(css) if match is not None: - return [Selector(Class(Element( - '*', match.group(1) or '*'), match.group(2)))] + return [ + Selector(Class(Element(element=match.group(1) or None), match.group(2))) + ] stream = TokenStream(tokenize(css)) stream.source = css - try: - return list(parse_selector_group(stream)) - except SelectorSyntaxError: - import sys - e = sys.exc_info()[1] - message = "%s at %s -> %r" % ( - e, stream.used, stream.peek()) - e.msg = message - if sys.version_info < (2,6): - e.message = message - e.args = tuple([message]) - raise - - -def parse_selector_group(stream): + return list(parse_selector_group(stream)) + + +# except SelectorSyntaxError: +# e = sys.exc_info()[1] +# message = "%s 
at %s -> %r" % ( +# e, stream.used, stream.peek()) +# e.msg = message +# e.args = tuple([message]) +# raise + + +def parse_selector_group(stream: TokenStream) -> Iterator[Selector]: stream.skip_whitespace() while 1: yield Selector(*parse_selector(stream)) - if stream.peek() == ',': + if stream.peek() == ("DELIM", ","): stream.next() stream.skip_whitespace() else: break -def parse_selector(stream): + +def parse_selector(stream: TokenStream) -> tuple[Tree, PseudoElement | None]: result, pseudo_element = parse_simple_selector(stream) while 1: stream.skip_whitespace() peek = stream.peek() - if peek == ',' or peek is None: + if peek in (("EOF", None), ("DELIM", ",")): break if pseudo_element: raise SelectorSyntaxError( - 'A pseudo-element must be at the end of a selector') - if peek in ('+', '>', '~'): + f"Got pseudo-element ::{pseudo_element} not at the end of a selector" + ) + if peek.is_delim("+", ">", "~"): # A combinator - combinator = stream.next() + combinator = cast("str", stream.next().value) stream.skip_whitespace() else: # By exclusion, the last parse_simple_selector() ended # at peek == ' ' - combinator = ' ' + combinator = " " next_selector, pseudo_element = parse_simple_selector(stream) result = CombinedSelector(result, combinator, next_selector) return result, pseudo_element -def parse_simple_selector(stream, inside_negation=False): +def parse_simple_selector( + stream: TokenStream, inside_negation: bool = False +) -> tuple[Tree, PseudoElement | None]: stream.skip_whitespace() + selector_start = len(stream.used) peek = stream.peek() - consumed = len(stream.used) - if peek == '*' or isinstance(peek, Symbol): - next = stream.next() - if stream.peek() == '|': - namespace = next + if peek.type == "IDENT" or peek == ("DELIM", "*"): + if peek.type == "IDENT": + namespace = stream.next().value + else: + stream.next() + namespace = None + if stream.peek() == ("DELIM", "|"): stream.next() - element = stream.next_symbol_or_star() + element = 
stream.next_ident_or_star() else: - namespace = '*' - element = next + element = namespace + namespace = None else: - element = namespace = '*' - result = Element(namespace, element) - pseudo_element = None + element = namespace = None + result: Tree = Element(namespace, element) + pseudo_element: PseudoElement | None = None while 1: peek = stream.peek() - if peek in (None, ' ', ',', '+', '>', '~') or ( - inside_negation and peek == ')'): + if ( + peek.type in ("S", "EOF") + or peek.is_delim(",", "+", ">", "~") + or (inside_negation and peek == ("DELIM", ")")) + ): break if pseudo_element: raise SelectorSyntaxError( - 'A pseudo-element must be at the end of a selector') - if peek == '#': + f"Got pseudo-element ::{pseudo_element} not at the end of a selector" + ) + if peek.type == "HASH": + result = Hash(result, cast("str", stream.next().value)) + elif peek == ("DELIM", "."): stream.next() - result = Hash(result, stream.next_symbol()) - continue - elif peek == '.': + result = Class(result, stream.next_ident()) + elif peek == ("DELIM", "|"): stream.next() - result = Class(result, stream.next_symbol()) - continue - elif peek == '[': + result = Element(None, stream.next_ident()) + elif peek == ("DELIM", "["): stream.next() result = parse_attrib(result, stream) - next = stream.next() - if next != ']': - raise SelectorSyntaxError( - "] expected, got '%s'" % next) - continue - elif peek == '::': + elif peek == ("DELIM", ":"): stream.next() - pseudo_element = stream.next_symbol() - continue - elif peek == ':': - stream.next() - ident = stream.next_symbol() - if ident in ('first-line', 'first-letter', 'before', 'after'): + if stream.peek() == ("DELIM", ":"): + stream.next() + pseudo_element = stream.next_ident() + if stream.peek() == ("DELIM", "("): + stream.next() + pseudo_element = FunctionalPseudoElement( + pseudo_element, parse_arguments(stream) + ) + continue + ident = stream.next_ident() + if ident.lower() in ("first-line", "first-letter", "before", "after"): # 
Special case: CSS 2.1 pseudo-elements can have a single ':' # Any new pseudo-element must have two. - pseudo_element = ident + pseudo_element = str(ident) continue - if stream.peek() == '(': - stream.next() - stream.skip_whitespace() - if ident == 'not': - if inside_negation: - raise SelectorSyntaxError('Got nested :not()') - argument, argument_pseudo_element = parse_simple_selector( - stream, inside_negation=True) - if argument_pseudo_element: - raise SelectorSyntaxError( - 'Pseudo-elements are not allowed inside :not()') - else: - peek = stream.peek() - if isinstance(peek, (Symbol, String)): - argument = stream.next() - else: - raise SelectorSyntaxError( - "Expected argument, got '%s'" % peek) - stream.skip_whitespace() - next = stream.next() - if not next == ')': + if stream.peek() != ("DELIM", "("): + result = Pseudo(result, ident) + if repr(result) == "Pseudo[Element[*]:scope]" and not ( + len(stream.used) == 2 + or (len(stream.used) == 3 and stream.used[0].type == "S") + or (len(stream.used) >= 3 and stream.used[-3].is_delim(",")) + or ( + len(stream.used) >= 4 + and stream.used[-3].type == "S" + and stream.used[-4].is_delim(",") + ) + ): raise SelectorSyntaxError( - "Expected ')', got '%s'" % next) - if ident == 'not': - result = Negation(result, argument) - else: - result = Function(result, ident, argument) + 'Got immediate child pseudo-element ":scope" ' + "not at the start of a selector" + ) + continue + stream.next() + stream.skip_whitespace() + if ident.lower() == "not": + if inside_negation: + raise SelectorSyntaxError("Got nested :not()") + argument, argument_pseudo_element = parse_simple_selector( + stream, inside_negation=True + ) + next_ = stream.next() + if argument_pseudo_element: + raise SelectorSyntaxError( + f"Got pseudo-element ::{argument_pseudo_element} inside :not() at {next_.pos}" + ) + if next_ != ("DELIM", ")"): + raise SelectorSyntaxError(f"Expected ')', got {next_}") + result = Negation(result, argument) + elif ident.lower() == "has": 
+ combinator, arguments = parse_relative_selector(stream) + result = Relation(result, combinator, arguments) + + elif ident.lower() in ("matches", "is"): + selectors = parse_simple_selector_arguments(stream) + result = Matching(result, selectors) + elif ident.lower() == "where": + selectors = parse_simple_selector_arguments(stream) + result = SpecificityAdjustment(result, selectors) else: - result = Pseudo(result, ident) - continue + result = Function(result, ident, parse_arguments(stream)) else: - raise SelectorSyntaxError( - "Expected selector, got '%s'" % peek) - if consumed == len(stream.used): - raise SelectorSyntaxError( - "Expected selector, got '%s'" % stream.peek()) + raise SelectorSyntaxError(f"Expected selector, got {peek}") + if len(stream.used) == selector_start: + raise SelectorSyntaxError(f"Expected selector, got {stream.peek()}") return result, pseudo_element -def parse_attrib(selector, stream): +def parse_arguments(stream: TokenStream) -> list[Token]: # noqa: RET503 + arguments: list[Token] = [] + while 1: + stream.skip_whitespace() + next_ = stream.next() + if next_.type in ("IDENT", "STRING", "NUMBER") or next_ in [ + ("DELIM", "+"), + ("DELIM", "-"), + ]: + arguments.append(next_) + elif next_ == ("DELIM", ")"): + return arguments + else: + raise SelectorSyntaxError(f"Expected an argument, got {next_}") + + +def parse_relative_selector(stream: TokenStream) -> tuple[Token, Selector]: # noqa: RET503 stream.skip_whitespace() - attrib = stream.next_symbol_or_star() - if attrib == '*' and stream.peek() != '|': - raise SelectorSyntaxError( - "Expected '|', got '%s'" % stream.peek()) - if stream.peek() == '|': - namespace = attrib - stream.next() - attrib = stream.next_symbol() + subselector = "" + next_ = stream.next() + + if next_ in [("DELIM", "+"), ("DELIM", "-"), ("DELIM", ">"), ("DELIM", "~")]: + combinator = next_ + stream.skip_whitespace() + next_ = stream.next() else: - namespace = '*' + combinator = Token("DELIM", " ", pos=0) + + while 1: + 
if next_.type in ("IDENT", "STRING", "NUMBER") or next_ in [ + ("DELIM", "."), + ("DELIM", "*"), + ]: + subselector += cast("str", next_.value) + elif next_ == ("DELIM", ")"): + result = parse(subselector) + return combinator, result[0] + else: + raise SelectorSyntaxError(f"Expected an argument, got {next_}") + next_ = stream.next() + + +def parse_simple_selector_arguments(stream: TokenStream) -> list[Tree]: + arguments = [] + while 1: + result, pseudo_element = parse_simple_selector(stream, True) + if pseudo_element: + raise SelectorSyntaxError( + f"Got pseudo-element ::{pseudo_element} inside function" + ) + stream.skip_whitespace() + next_ = stream.next() + if next_ in (("EOF", None), ("DELIM", ",")): + stream.next() + stream.skip_whitespace() + arguments.append(result) + elif next_ == ("DELIM", ")"): + arguments.append(result) + break + else: + raise SelectorSyntaxError(f"Expected an argument, got {next_}") + return arguments + + +def parse_attrib(selector: Tree, stream: TokenStream) -> Attrib: stream.skip_whitespace() - if stream.peek() == ']': - return Attrib(selector, namespace, attrib, 'exists', None) - op = stream.next() - if not op in ('^=', '$=', '*=', '=', '~=', '|=', '!='): - raise SelectorSyntaxError( - "Operator expected, got '%s'" % op) + attrib = stream.next_ident_or_star() + if attrib is None and stream.peek() != ("DELIM", "|"): + raise SelectorSyntaxError(f"Expected '|', got {stream.peek()}") + namespace: str | None + op: str | None + if stream.peek() == ("DELIM", "|"): + stream.next() + if stream.peek() == ("DELIM", "="): + namespace = None + stream.next() + op = "|=" + else: + namespace = attrib + attrib = stream.next_ident() + op = None + else: + namespace = op = None + if op is None: + stream.skip_whitespace() + next_ = stream.next() + if next_ == ("DELIM", "]"): + return Attrib(selector, namespace, cast("str", attrib), "exists", None) + if next_ == ("DELIM", "="): + op = "=" + elif next_.is_delim("^", "$", "*", "~", "|", "!") and ( + 
stream.peek() == ("DELIM", "=") + ): + op = cast("str", next_.value) + "=" + stream.next() + else: + raise SelectorSyntaxError(f"Operator expected, got {next_}") stream.skip_whitespace() value = stream.next() - if not isinstance(value, (Symbol, String)): - raise SelectorSyntaxError( - "Expected string or symbol, got '%s'" % value) + if value.type not in ("IDENT", "STRING"): + raise SelectorSyntaxError(f"Expected string or ident, got {value}") stream.skip_whitespace() - return Attrib(selector, namespace, attrib, op, value) + next_ = stream.next() + if next_ != ("DELIM", "]"): + raise SelectorSyntaxError(f"Expected ']', got {next_}") + return Attrib(selector, namespace, cast("str", attrib), op, value) -def parse_series(s): +def parse_series(tokens: Iterable[Token]) -> tuple[int, int]: """ - Parses things like '1n+2', or 'an+b' generally, returning (a, b) + Parses the arguments for :nth-child() and friends. + + :raises: A list of tokens + :returns: :``(a, b)`` + """ - if isinstance(s, Element): - s = s._format_element() - if not s or s == '*': - # Happens when there's nothing, which the CSS parser thinks of as * - return (0, 0) - if isinstance(s, int): - # Happens when you just get a number - return (0, s) - if s == 'odd': - return (2, 1) - elif s == 'even': - return (2, 0) - elif s == 'n': - return (1, 0) - if 'n' not in s: - # Just a b - return (0, int(s)) - a, b = s.split('n', 1) + for token in tokens: + if token.type == "STRING": + raise ValueError("String tokens not allowed in series.") + s = "".join(cast("str", token.value) for token in tokens).strip() + if s == "odd": + return 2, 1 + if s == "even": + return 2, 0 + if s == "n": + return 1, 0 + if "n" not in s: + # Just b + return 0, int(s) + a, b = s.split("n", 1) + a_as_int: int if not a: - a = 1 - elif a == '-' or a == '+': - a = int(a+'1') - else: - a = int(a) - if not b: - b = 0 - elif b == '-' or b == '+': - b = int(b+'1') + a_as_int = 1 + elif a in {"-", "+"}: + a_as_int = int(a + "1") else: - b = int(b) 
- return (a, b) + a_as_int = int(a) + b_as_int = int(b) if b else 0 + return a_as_int, b_as_int #### Token objects -class _UniToken(_unicode): - def __new__(cls, contents, pos): - obj = _unicode.__new__(cls, contents) + +class Token(tuple[str, str | None]): # noqa: SLOT001 + @overload + def __new__( + cls, + type_: Literal["IDENT", "HASH", "STRING", "S", "DELIM", "NUMBER"], + value: str, + pos: int, + ) -> Self: ... + + @overload + def __new__(cls, type_: Literal["EOF"], value: None, pos: int) -> Self: ... + + def __new__(cls, type_: str, value: str | None, pos: int) -> Self: + obj = tuple.__new__(cls, (type_, value)) obj.pos = pos return obj - def __repr__(self): - return '%s(%s, %r)' % ( - self.__class__.__name__, - _unicode.__repr__(self), - self.pos) + def __repr__(self) -> str: + return f"<{self.type} '{self.value}' at {self.pos}>" + + def is_delim(self, *values: str) -> bool: + return self.type == "DELIM" and self.value in values + + pos: int + + @property + def type(self) -> str: + return self[0] -class Symbol(_UniToken): - pass + @property + def value(self) -> str | None: + return self[1] -class String(_UniToken): - pass + def css(self) -> str: + if self.type == "STRING": + return repr(self.value) + return cast("str", self.value) -class Token(_UniToken): - pass + +class EOFToken(Token): + def __new__(cls, pos: int) -> Self: + return Token.__new__(cls, "EOF", None, pos) + + def __repr__(self) -> str: + return f"<{self.type} at {self.pos}>" #### Tokenizer -_match_whitespace = re.compile(r'\s+', re.UNICODE).match -_replace_comments = re.compile(r'/\*.*?\*/', re.DOTALL).sub +class TokenMacros: + unicode_escape = r"\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?" 
+ escape = unicode_escape + r"|\\[^\n\r\f0-9a-f]" + string_escape = r"\\(?:\n|\r\n|\r|\f)|" + escape + nonascii = r"[^\0-\177]" + nmchar = f"[_a-z0-9-]|{escape}|{nonascii}" + nmstart = f"[_a-z]|{escape}|{nonascii}" + + +class MatchFunc(Protocol): + def __call__( + self, string: str, pos: int = ..., endpos: int = ... + ) -> re.Match[str] | None: ... + + +def _compile(pattern: str) -> MatchFunc: + return re.compile(pattern % vars(TokenMacros), re.IGNORECASE).match -_match_count_number = re.compile(r'[+-]?\d*n(?:[+-]\d+)?').match -def tokenize(s): +_match_whitespace = _compile(r"[ \t\r\n\f]+") +_match_number = _compile(r"[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)") +_match_hash = _compile("#(?:%(nmchar)s)+") +_match_ident = _compile("-?(?:%(nmstart)s)(?:%(nmchar)s)*") +_match_string_by_quote = { + "'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"), + '"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'), +} + +_sub_simple_escape = re.compile(r"\\(.)").sub +_sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.IGNORECASE).sub +_sub_newline_escape = re.compile(r"\\(?:\n|\r\n|\r|\f)").sub + +# Same as r'\1', but faster on CPython +_replace_simple = operator.methodcaller("group", 1) + + +def _replace_unicode(match: re.Match[str]) -> str: + codepoint = int(match.group(1), 16) + if codepoint > sys.maxunicode: + codepoint = 0xFFFD + return chr(codepoint) + + +def unescape_ident(value: str) -> str: + value = _sub_unicode_escape(_replace_unicode, value) + return _sub_simple_escape(_replace_simple, value) + + +def tokenize(s: str) -> Iterator[Token]: pos = 0 - s = _replace_comments('', s) len_s = len(s) while pos < len_s: match = _match_whitespace(s, pos=pos) if match: - yield Token(' ', pos) + yield Token("S", " ", pos) pos = match.end() continue - match = _match_count_number(s, pos=pos) - if match and match.group() != 'n': - sym = s[pos:match.end()] - yield Symbol(sym, pos) + + match = _match_ident(s, pos=pos) + if match: + value = _sub_simple_escape( + _replace_simple, 
_sub_unicode_escape(_replace_unicode, match.group()) + ) + yield Token("IDENT", value, pos) pos = match.end() continue - c = s[pos] - c2 = s[pos:pos+2] - if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='): - yield Token(c2, pos) - pos += 2 - continue - if c in '>+~,.*=[]()|:#': - yield Token(c, pos) - pos += 1 - continue - if c == '"' or c == "'": - # Quoted string - old_pos = pos - sym, pos = tokenize_escaped_string(s, pos) - yield String(sym, old_pos) - continue - old_pos = pos - sym, pos = tokenize_symbol(s, pos) - yield Symbol(sym, old_pos) - continue -split_at_string_escapes = re.compile(r'(\\(?:%s))' - % '|'.join(['[A-Fa-f0-9]{1,6}(?:\r\n|\s)?', - '[^A-Fa-f0-9]'])).split + match = _match_hash(s, pos=pos) + if match: + value = _sub_simple_escape( + _replace_simple, + _sub_unicode_escape(_replace_unicode, match.group()[1:]), + ) + yield Token("HASH", value, pos) + pos = match.end() + continue + quote = s[pos] + if quote in _match_string_by_quote: + match = _match_string_by_quote[quote](s, pos=pos + 1) + assert match, "Should have found at least an empty match" + end_pos = match.end() + if end_pos == len_s: + raise SelectorSyntaxError(f"Unclosed string at {pos}") + if s[end_pos] != quote: + raise SelectorSyntaxError(f"Invalid string at {pos}") + value = _sub_simple_escape( + _replace_simple, + _sub_unicode_escape( + _replace_unicode, _sub_newline_escape("", match.group()) + ), + ) + yield Token("STRING", value, pos) + pos = end_pos + 1 + continue -def unescape_string_literal(literal): - substrings = [] - for substring in split_at_string_escapes(literal): - if not substring: + match = _match_number(s, pos=pos) + if match: + value = match.group() + yield Token("NUMBER", value, pos) + pos = match.end() continue - elif '\\' in substring: - if substring[0] == '\\' and len(substring) > 1: - substring = substring[1:] - if substring[0] in '0123456789ABCDEFabcdef': - # int() correctly ignores the potentially trailing whitespace - substring = _unichr(int(substring, 16)) 
+ + pos2 = pos + 2 + if s[pos:pos2] == "/*": + pos = s.find("*/", pos2) + if pos == -1: + pos = len_s else: - raise SelectorSyntaxError( - "Invalid escape sequence %r in string %r" - % (substring.split('\\')[1], literal)) - substrings.append(substring) - return ''.join(substrings) - - -def tokenize_escaped_string(s, pos): - quote = s[pos] - assert quote in ('"', "'") - pos = pos+1 - start = pos - while 1: - next = s.find(quote, pos) - if next == -1: - raise SelectorSyntaxError( - "Expected closing %s for string in: %r" - % (quote, s[start:])) - result = s[start:next] - if result.endswith('\\'): - # next quote character is escaped - pos = next+1 + pos += 2 continue - if '\\' in result: - result = unescape_string_literal(result) - return result, next+1 - - -_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE) - -def tokenize_symbol(s, pos): - start = pos - match = _illegal_symbol.search(s, pos=pos) - if not match: - # Goes to end of s - return s[start:], len(s) - if match.start() == pos: - raise SelectorSyntaxError( - "Unexpected symbol: %r" % s[pos]) - if not match: - result = s[start:] - pos = len(s) - else: - result = s[start:match.start()] - pos = match.start() - try: - result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape') - except UnicodeDecodeError: - import sys - e = sys.exc_info()[1] - raise SelectorSyntaxError( - "Bad symbol %r: %s" % (result, e)) - return result, pos - - -class TokenStream(object): - def __init__(self, tokens, source=None): - self.used = [] + + yield Token("DELIM", s[pos], pos) + pos += 1 + + assert pos == len_s + yield EOFToken(pos) + + +class TokenStream: + def __init__(self, tokens: Iterable[Token], source: str | None = None) -> None: + self.used: list[Token] = [] self.tokens = iter(tokens) self.source = source - self.peeked = None + self.peeked: Token | None = None self._peeking = False - try: - self.next_token = self.tokens.next - except AttributeError: - # Python 3 - self.next_token = self.tokens.__next__ + 
self.next_token = self.tokens.__next__ - def next(self): + def next(self) -> Token: if self._peeking: self._peeking = False + assert self.peeked is not None self.used.append(self.peeked) return self.peeked - else: - try: - next = self.next_token() - self.used.append(next) - return next - except StopIteration: - return None + next_ = self.next_token() + self.used.append(next_) + return next_ - def __iter__(self): - return iter(self.next, None) - - def peek(self): + def peek(self) -> Token: if not self._peeking: - try: - self.peeked = self.next_token() - except StopIteration: - return None + self.peeked = self.next_token() self._peeking = True + assert self.peeked is not None return self.peeked - def next_symbol(self): - next = self.next() - if not isinstance(next, Symbol): - raise SelectorSyntaxError( - "Expected symbol, got '%s'" % next) - return next - - def next_symbol_or_star(self): - next = self.next() - if next != '*' and not isinstance(next, Symbol): - raise SelectorSyntaxError( - "Expected symbol or '*', got '%s'" % next) - return next - - def skip_whitespace(self): - if self.peek() == ' ': + def next_ident(self) -> str: + next_ = self.next() + if next_.type != "IDENT": + raise SelectorSyntaxError(f"Expected ident, got {next_}") + return cast("str", next_.value) + + def next_ident_or_star(self) -> str | None: + next_ = self.next() + if next_.type == "IDENT": + return next_.value + if next_ == ("DELIM", "*"): + return None + raise SelectorSyntaxError(f"Expected ident or '*', got {next_}") + + def skip_whitespace(self) -> None: + peek = self.peek() + if peek.type == "S": self.next() diff --git a/cssselect/py.typed b/cssselect/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/cssselect/tests.py b/cssselect/tests.py deleted file mode 100755 index 086f01f..0000000 --- a/cssselect/tests.py +++ /dev/null @@ -1,935 +0,0 @@ -#!/usr/bin/env python -""" - Tests for cssselect - =================== - - These tests can be run either by py.test or by the 
standard library's - unittest. They use plain ``assert`` statements and do little reporting - themselves in case of failure. - - Use py.test to get fancy error reporting and assert introspection. - - - :copyright: (c) 2007-2012 Ian Bicking and contributors. - See AUTHORS for more details. - :license: BSD, see LICENSE for more details. - -""" - -import sys -import operator -import unittest - -from lxml import html -from cssselect import (parse, GenericTranslator, HTMLTranslator, - SelectorSyntaxError, ExpressionError) -from cssselect.parser import tokenize, parse_series - - -class TestCssselect(unittest.TestCase): - def test_tokenizer(self): - tokens = [repr(item).replace("u'", "'") - for item in tokenize('E > f[a~="y\\"x"]')] - assert tokens == [ - "Symbol('E', 0)", - "Token(' ', 1)", - "Token('>', 2)", - "Token(' ', 3)", - "Symbol('f', 4)", - "Token('[', 5)", - "Symbol('a', 6)", - "Token('~=', 7)", - "String('y\"x', 9)", - "Token(']', 15)"] - - def test_parser(self): - def repr_parse(css): - selectors = parse(css) - for selector in selectors: - assert selector.pseudo_element is None - return [repr(selector._tree).replace("(u'", "('") - for selector in selectors] - - def parse_many(first, *others): - result = repr_parse(first) - for other in others: - assert repr_parse(other) == result - return result - - assert parse_many('*') == ['Element[*]'] - assert parse_many('*|*') == ['Element[*]'] - assert parse_many('*|foo') == ['Element[foo]'] - assert parse_many('foo|*') == ['Element[foo|*]'] - assert parse_many('foo|bar') == ['Element[foo|bar]'] - # This will never match, but it is valid: - assert parse_many('#foo#bar') == ['Hash[Hash[Element[*]#foo]#bar]'] - assert parse_many( - 'div>.foo', - 'div> .foo', - 'div >.foo', - 'div > .foo', - 'div \n> \t \t .foo', 'div\r>\n\n\n.foo', 'div\f>\f.foo' - ) == ['CombinedSelector[Element[div] > Class[Element[*].foo]]'] - assert parse_many('td.foo,.bar', - 'td.foo, .bar', - 'td.foo\t\r\n\f ,\t\r\n\f .bar' - ) == [ - 
'Class[Element[td].foo]', - 'Class[Element[*].bar]' - ] - assert parse_many('div, td.foo, div.bar span') == [ - 'Element[div]', - 'Class[Element[td].foo]', - 'CombinedSelector[Class[Element[div].bar] ' - ' Element[span]]'] - assert parse_many('div > p') == [ - 'CombinedSelector[Element[div] > Element[p]]'] - assert parse_many('td:first') == [ - 'Pseudo[Element[td]:first]'] - assert parse_many('td:first') == [ - 'Pseudo[Element[td]:first]'] - assert parse_many('td :first') == [ - 'CombinedSelector[Element[td] ' - ' Pseudo[Element[*]:first]]'] - assert parse_many('td :first') == [ - 'CombinedSelector[Element[td] ' - ' Pseudo[Element[*]:first]]'] - assert parse_many('a[name]', 'a[ name\t]') == [ - 'Attrib[Element[a][name]]'] - assert parse_many('a [name]') == [ - 'CombinedSelector[Element[a] Attrib[Element[*][name]]]'] - assert parse_many('a[rel="include"]') == [ - "Attrib[Element[a][rel = String('include', 6)]]"] - assert parse_many('a[rel = include]') == [ - "Attrib[Element[a][rel = Symbol('include', 8)]]"] - assert parse_many("a[hreflang |= 'en']") == [ - "Attrib[Element[a][hreflang |= String('en', 14)]]"] - assert parse_many('div:nth-child(10)') == [ - "Function[Element[div]:nth-child(Symbol('10', 14))]"] - assert parse_many(':nth-child(2n+2)') == [ - "Function[Element[*]:nth-child(Symbol('2n+2', 11))]"] - assert parse_many('div:nth-of-type(10)') == [ - "Function[Element[div]:nth-of-type(Symbol('10', 16))]"] - assert parse_many('div div:nth-of-type(10) .aclass') == [ - 'CombinedSelector[CombinedSelector[Element[div] ' - "Function[Element[div]:nth-of-type(Symbol('10', 20))]] " - ' Class[Element[*].aclass]]'] - assert parse_many('label:only') == [ - 'Pseudo[Element[label]:only]'] - assert parse_many('a:lang(fr)') == [ - "Function[Element[a]:lang(Symbol('fr', 7))]"] - assert parse_many('div:contains("foo")') == [ - "Function[Element[div]:contains(String('foo', 13))]"] - assert parse_many('div#foobar') == [ - 'Hash[Element[div]#foobar]'] - assert 
parse_many('div:not(div.foo)') == [ - 'Negation[Element[div]:not(Class[Element[div].foo])]'] - assert parse_many('td ~ th') == [ - 'CombinedSelector[Element[td] ~ Element[th]]'] - - def test_pseudo_elements(self): - def parse_pseudo(css): - result = [] - for selector in parse(css): - result.append(( - repr(selector._tree).replace("(u'", "('"), - selector.pseudo_element)) - return result - - def parse_one(css): - result = parse_pseudo(css) - assert len(result) == 1 - return result[0] - - assert parse_one('foo') == ('Element[foo]', None) - assert parse_one('*') == ('Element[*]', None) - assert parse_one(':empty') == ('Pseudo[Element[*]:empty]', None) - - # Special cases for CSS 2.1 pseudo-elements - assert parse_one(':before') == ('Element[*]', 'before') - assert parse_one(':after') == ('Element[*]', 'after') - assert parse_one(':first-line') == ('Element[*]', 'first-line') - assert parse_one(':first-letter') == ('Element[*]', 'first-letter') - - assert parse_one('::before') == ('Element[*]', 'before') - assert parse_one('::after') == ('Element[*]', 'after') - assert parse_one('::first-line') == ('Element[*]', 'first-line') - assert parse_one('::first-letter') == ('Element[*]', 'first-letter') - - assert parse_one('::selection') == ('Element[*]', 'selection') - assert parse_one('foo:after') == ('Element[foo]', 'after') - assert parse_one('foo::selection') == ('Element[foo]', 'selection') - assert parse_one('lorem#ipsum ~ a#b.c[href]:empty::selection') == ( - 'CombinedSelector[Hash[Element[lorem]#ipsum] ~ ' - 'Pseudo[Attrib[Class[Hash[Element[a]#b].c][href]]:empty]]', - 'selection') - - parse_pseudo('foo:before, bar, baz:after') == [ - ('Element[foo]', 'before'), - ('Element[bar]', None), - ('Element[baz]', 'after')] - - def test_specificity(self): - def specificity(css): - selectors = parse(css) - assert len(selectors) == 1 - return selectors[0].specificity() - - assert specificity('*') == (0, 0, 0) - assert specificity(' foo') == (0, 0, 1) - assert 
specificity(':empty ') == (0, 1, 0) - assert specificity(':before') == (0, 0, 1) - assert specificity('*:before') == (0, 0, 1) - assert specificity(':nth-child(2)') == (0, 1, 0) - assert specificity('.bar') == (0, 1, 0) - assert specificity('[baz]') == (0, 1, 0) - assert specificity('[baz="4"]') == (0, 1, 0) - assert specificity('[baz^="4"]') == (0, 1, 0) - assert specificity('#lipsum') == (1, 0, 0) - - assert specificity('foo:empty') == (0, 1, 1) - assert specificity('foo:before') == (0, 0, 2) - assert specificity('foo::before') == (0, 0, 2) - assert specificity('foo:empty::before') == (0, 1, 2) - - assert specificity('#lorem + foo#ipsum:first-child > bar:first-line' - ) == (2, 1, 3) - - def test_parse_errors(self): - def get_error(css): - try: - parse(css) - except SelectorSyntaxError: - # Py2, Py3, ... - return str(sys.exc_info()[1]).replace("(u'", "('") - - assert get_error('attributes(href)/html/body/a') == ( - "Expected selector, got '(' at " - "[Symbol('attributes', 0)] -> Token('(', 10)") - assert get_error('attributes(href)') == ( - "Expected selector, got '(' at " - "[Symbol('attributes', 0)] -> Token('(', 10)") - assert get_error('html/body/a') == ( - "Unexpected symbol: '/' at [Symbol('html', 0)] -> None") - assert get_error(' ') == ( - "Expected selector, got 'None' at [Token(' ', 0)] -> None") - assert get_error('div, ') == ( - "Expected selector, got 'None' at " - "[Symbol('div', 0), Token(',', 3), Token(' ', 4)] -> None") - assert get_error(' , div') == ( - "Expected selector, got ',' at " - "[Token(' ', 0)] -> Token(',', 1)") - assert get_error('p, , div') == ( - "Expected selector, got ',' at " - "[Symbol('p', 0), Token(',', 1), Token(' ', 2)] -> Token(',', 3)") - assert get_error('div > ') == ( - "Expected selector, got 'None' at " - "[Symbol('div', 0), Token(' ', 3), Token('>', 4), Token(' ', 5)]" - " -> None") - assert get_error(' > div') == ( - "Expected selector, got '>' at [Token(' ', 0)] -> Token('>', 2)") - assert get_error('foo|#bar') == 
( - "Expected symbol or '*', got '#' at " - "[Symbol('foo', 0), Token('|', 3), " - "Token('#', 4)] -> Symbol('bar', 5)") - assert get_error('#.foo') == ( - "Expected symbol, got '.' at " - "[Token('#', 0), Token('.', 1)] -> Symbol('foo', 2)") - assert get_error('.#foo') == ( - "Expected symbol, got '#' at " - "[Token('.', 0), Token('#', 1)] -> Symbol('foo', 2)") - assert get_error(':#foo') == ( - "Expected symbol, got '#' at " - "[Token(':', 0), Token('#', 1)] -> Symbol('foo', 2)") - assert get_error('[*]') == ( - "Expected '|', got ']' at " - "[Token('[', 0), Token('*', 1)] -> Token(']', 2)") - assert get_error('[foo|]') == ( - "Expected symbol, got ']' at " - "[Token('[', 0), Symbol('foo', 1), Token('|', 4), Token(']', 5)]" - " -> None") - assert get_error('[#]') == ( - "Expected symbol or '*', got '#' at " - "[Token('[', 0), Token('#', 1)] -> Token(']', 2)") - assert get_error('[foo=#]') == ( - "Expected string or symbol, got '#' at " - "[Token('[', 0), Symbol('foo', 1), Token('=', 4), Token('#', 5)]" - " -> Token(']', 6)") - assert get_error(':nth-child()') == ( - "Expected argument, got ')' at " - "[Token(':', 0), Symbol('nth-child', 1), Token('(', 10)]" - " -> Token(')', 11)") - assert get_error('[href]a') == ( - "Expected selector, got 'a' at " - "[Token('[', 0), Symbol('href', 1), Token(']', 5)]" - " -> Symbol('a', 6)") - - # Mis-placed pseudo-elements - assert get_error('a:before:empty') == ( - "A pseudo-element must be at the end of a selector at " - "[Symbol('a', 0), Token(':', 1), Symbol('before', 2)] " - "-> Token(':', 8)") - assert get_error('li:before a') == ( - "A pseudo-element must be at the end of a selector at " - "[Symbol('li', 0), Token(':', 2), Symbol('before', 3), " - "Token(' ', 9)] -> Symbol('a', 10)") - assert get_error(':not(:before)') == ( - "Pseudo-elements are not allowed inside :not() at " - "[Token(':', 0), Symbol('not', 1), Token('(', 4), Token(':', 5)," - " Symbol('before', 6)] -> Token(')', 12)") - - - def test_translation(self): 
- def xpath(css): - return str(GenericTranslator().css_to_xpath(css, prefix='')) - - assert xpath('*') == "*" - assert xpath('E') == "e" - assert xpath('E[foo]') == "e[@foo]" - assert xpath('E[foo="bar"]') == "e[@foo = 'bar']" - assert xpath('E[foo~="bar"]') == ( - "e[@foo and contains(" - "concat(' ', normalize-space(@foo), ' '), ' bar ')]") - assert xpath('E[foo^="bar"]') == ( - "e[@foo and starts-with(@foo, 'bar')]") - assert xpath('E[foo$="bar"]') == ( - "e[@foo and substring(@foo, string-length(@foo)-2) = 'bar']") - assert xpath('E[foo*="bar"]') == ( - "e[@foo and contains(@foo, 'bar')]") - assert xpath('E[hreflang|="en"]') == ( - "e[@hreflang and (" - "@hreflang = 'en' or starts-with(@hreflang, 'en-'))]") - assert xpath('E:nth-child(1)') == ( - "*/*[name() = 'e' and (position() = 1)]") - assert xpath('E:nth-last-child(1)') == ( - "*/*[name() = 'e' and (position() = last() - 1)]") - assert xpath('E:nth-last-child(2n+2)') == ( - "*/*[name() = 'e' and (" - "(position() +2) mod -2 = 0 and position() < (last() -2))]") - assert xpath('E:nth-of-type(1)') == ( - "*/e[position() = 1]") - assert xpath('E:nth-last-of-type(1)') == ( - "*/e[position() = last() - 1]") - assert xpath('E:nth-last-of-type(1)') == ( - "*/e[position() = last() - 1]") - assert xpath('div E:nth-last-of-type(1) .aclass') == ( - "div/descendant-or-self::*/e[position() = last() - 1]" - "/descendant-or-self::*/*[@class and contains(" - "concat(' ', normalize-space(@class), ' '), ' aclass ')]") - assert xpath('E:first-child') == ( - "*/*[name() = 'e' and (position() = 1)]") - assert xpath('E:last-child') == ( - "*/*[name() = 'e' and (position() = last())]") - assert xpath('E:first-of-type') == ( - "*/e[position() = 1]") - assert xpath('E:last-of-type') == ( - "*/e[position() = last()]") - assert xpath('E:only-child') == ( - "*/*[name() = 'e' and (last() = 1)]") - assert xpath('E:only-of-type') == ( - "e[last() = 1]") - assert xpath('E:empty') == ( - "e[not(*) and not(normalize-space())]") - assert 
xpath('E:root') == ( - "e[not(parent::*)]") - assert xpath('E:contains("foo")') == ( - "e[contains(string(.), 'foo')]") - assert xpath('E:contains(foo)') == ( - "e[contains(string(.), 'foo')]") - assert xpath('E.warning') == ( - "e[@class and contains(" - "concat(' ', normalize-space(@class), ' '), ' warning ')]") - assert xpath('E#myid') == ( - "e[@id = 'myid']") - assert xpath('E:not(:nth-child(odd))') == ( - "e[not((position() -1) mod 2 = 0 and position() >= 1)]") - assert xpath('E F') == ( - "e/descendant-or-self::*/f") - assert xpath('E > F') == ( - "e/f") - assert xpath('E + F') == ( - "e/following-sibling::*[name() = 'f' and (position() = 1)]") - assert xpath('E ~ F') == ( - "e/following-sibling::f") - assert xpath('div#container p') == ( - "div[@id = 'container']/descendant-or-self::*/p") - self.assertRaises(ExpressionError, xpath, 'p *:only-of-type') - - def test_unicode(self): - if sys.version_info[0] >= 3: - css = '.a\xc1b' - else: - css = '.a\xc1b'.decode('ISO-8859-1') - - xpath = GenericTranslator().css_to_xpath(css) - assert css[1:] in xpath - xpath = xpath.encode('ascii', 'xmlcharrefreplace').decode('ASCII') - assert xpath == ( - "descendant-or-self::*[@class and contains(" - "concat(' ', normalize-space(@class), ' '), ' aÁb ')]") - - def test_quoting(self): - css_to_xpath = GenericTranslator().css_to_xpath - assert css_to_xpath('*[aval="\'"]') == ( - '''descendant-or-self::*[@aval = "'"]''') - assert css_to_xpath('*[aval="\'\'\'"]') == ( - """descendant-or-self::*[@aval = "'''"]""") - assert css_to_xpath('*[aval=\'"\']') == ( - '''descendant-or-self::*[@aval = '"']''') - assert css_to_xpath('*[aval=\'"""\']') == ( - '''descendant-or-self::*[@aval = '"""']''') - - def test_unicode_escapes(self): - # \22 == '"' \20 == ' ' - css_to_xpath = GenericTranslator().css_to_xpath - assert css_to_xpath(r'*[aval="\'\22\'"]') == ( - '''descendant-or-self::*[@aval = concat("'",'"',"'")]''') - assert css_to_xpath(r'*[aval="\'\22 2\'"]') == ( - 
'''descendant-or-self::*[@aval = concat("'",'"2',"'")]''') - assert css_to_xpath(r'*[aval="\'\20 \'"]') == ( - '''descendant-or-self::*[@aval = "' '"]''') - assert css_to_xpath('*[aval="\'\\20\r\n \'"]') == ( - '''descendant-or-self::*[@aval = "' '"]''') - - def test_series(self): - assert parse_series('1n+3') == (1, 3) - assert parse_series('n-5') == (1, -5) - assert parse_series('odd') == (2, 1) - assert parse_series('even') == (2, 0) - assert parse_series('3n') == (3, 0) - assert parse_series('n') == (1, 0) - assert parse_series('5') == (0, 5) - - def test_select(self): - document = html.document_fromstring(HTML_IDS) - sort_key = dict( - (el, count) for count, el in enumerate(document.getiterator()) - ).__getitem__ - css_to_xpath = GenericTranslator().css_to_xpath - html_css_to_xpath = HTMLTranslator().css_to_xpath - - def select_ids(selector, html_only): - xpath = css_to_xpath(selector) - items = document.xpath(xpath) - if html_only: - assert items == [] - xpath = html_css_to_xpath(selector) - items = document.xpath(xpath) - items.sort(key=sort_key) - return [element.get('id', 'nil') for element in items] - - def pcss(main, *selectors, **kwargs): - html_only = kwargs.pop('html_only', False) - result = select_ids(main, html_only) - for selector in selectors: - assert select_ids(selector, html_only) == result - return result - - all_ids = pcss('*') - assert all_ids[:4] == ['html', 'nil', 'nil', 'outer-div'] - assert all_ids[-1:] == ['foobar-span'] - assert pcss('div') == ['outer-div', 'li-div', 'foobar-div'] - assert pcss('div div') == ['li-div'] - assert pcss('div, div div') == ['outer-div', 'li-div', 'foobar-div'] - assert pcss('a[name]') == ['name-anchor'] - assert pcss('a[rel]') == ['tag-anchor', 'nofollow-anchor'] - assert pcss('a[rel="tag"]') == ['tag-anchor'] - assert pcss('a[href*="localhost"]') == ['tag-anchor'] - assert pcss('a[href^="http"]') == ['tag-anchor', 'nofollow-anchor'] - assert pcss('a[href^="http:"]') == ['tag-anchor'] - assert 
pcss('a[href$="org"]') == ['nofollow-anchor'] - assert pcss('div[foobar~="bc"]', 'div[foobar~="cde"]') == [ - 'foobar-div'] - assert pcss('div[foobar~="cd"]') == [] - assert pcss('*[lang|="en"]', '*[lang|="en-US"]') == ['second-li'] - assert pcss('*[lang|="e"]') == [] - assert pcss('li:nth-child(3)') == ['third-li'] - assert pcss('li:nth-child(10)') == [] - assert pcss('li:nth-child(2n)', 'li:nth-child(even)', - 'li:nth-child(2n+0)') == [ - 'second-li', 'fourth-li', 'sixth-li'] - assert pcss('li:nth-child(+2n+1)', 'li:nth-child(odd)') == [ - 'first-li', 'third-li', 'fifth-li', 'seventh-li'] - assert pcss('li:nth-child(2n+4)') == ['fourth-li', 'sixth-li'] - # FIXME: I'm not 100% sure this is right: - assert pcss('li:nth-child(3n+1)') == [ - 'first-li', 'fourth-li', 'seventh-li'] - assert pcss('li:nth-last-child(0)') == [ - 'seventh-li'] - assert pcss('li:nth-last-child(2n)', 'li:nth-last-child(even)') == [ - 'second-li', 'fourth-li', 'sixth-li'] - assert pcss('li:nth-last-child(2n+2)') == ['second-li', 'fourth-li'] - assert pcss('ol:first-of-type') == ['first-ol'] - assert pcss('ol:nth-child(1)') == [] - assert pcss('ol:nth-of-type(2)') == ['second-ol'] - # FIXME: like above', '(1) or (2)? 
- assert pcss('ol:nth-last-of-type(1)') == ['first-ol'] - assert pcss('span:only-child') == ['foobar-span'] - assert pcss('li div:only-child') == ['li-div'] - assert pcss('div *:only-child') == [ - 'li-div', 'checkbox-disabled', 'foobar-span'] - self.assertRaises(ExpressionError, pcss, 'p *:only-of-type') - self.assertRaises(ExpressionError, pcss, 'p:lang(fr)') - assert pcss('p:only-of-type') == ['paragraph'] - assert pcss('a:empty') == ['name-anchor'] - assert pcss('li:empty') == [ - 'third-li', 'fourth-li', 'fifth-li', 'sixth-li', 'seventh-li'] - assert pcss(':root', 'html:root') == ['html'] - assert pcss('li:root', '* :root') == [] - assert pcss('*:contains("link")') == [ - 'html', 'nil', 'outer-div', 'tag-anchor', 'nofollow-anchor'] - assert pcss('*:contains("LInk")') == [] # case sensitive - assert pcss('*:contains("e")') == [ - 'html', 'nil', 'outer-div', 'first-ol', 'first-li', - 'paragraph', 'p-em'] - assert pcss('*:contains("E")') == [] # case-sensitive - assert pcss('.a', '.b', '*.a', 'ol.a') == ['first-ol'] - assert pcss('.c', '*.c') == ['first-ol', 'third-li', 'fourth-li'] - assert pcss('ol *.c', 'ol li.c', 'li ~ li.c', 'ol > li.c') == [ - 'third-li', 'fourth-li'] - assert pcss('#first-li', 'li#first-li', '*#first-li') == ['first-li'] - # Need some tests of :not()'] - assert pcss('li div', 'li > div', 'div div') == ['li-div'] - assert pcss('div > div') == [] - assert pcss('div>.c', 'div > .c') == ['first-ol'] - assert pcss('div + div') == ['foobar-div'] - assert pcss('a ~ a') == ['tag-anchor', 'nofollow-anchor'] - assert pcss('a[rel="tag"] ~ a') == ['nofollow-anchor'] - assert pcss('ol#first-ol li:last-child') == ['seventh-li'] - assert pcss('ol#first-ol *:last-child') == ['li-div', 'seventh-li'] - assert pcss('#outer-div:first-child') == ['outer-div'] - assert pcss('#outer-div :first-child') == [ - 'name-anchor', 'first-li', 'li-div', 'p-b', 'checkbox-disabled'] - assert pcss('a[href]') == ['tag-anchor', 'nofollow-anchor'] - assert pcss(':link', 
html_only=True) == pcss('a[href]') - assert pcss(':checked', html_only=True) == ['checkbox-checked'] - assert pcss(':disabled', html_only=True) == [ - 'fieldset', 'checkbox-disabled'] - assert pcss(':enabled', html_only=True) == [ - 'checkbox-unchecked', 'checkbox-checked'] - - def test_select_shakespeare(self): - document = html.document_fromstring(HTML_SHAKESPEARE) - body = document.xpath('//body')[0] - css_to_xpath = GenericTranslator().css_to_xpath - - try: - basestring_ = basestring - except NameError: - basestring_ = (str, bytes) - - def count(selector): - xpath = css_to_xpath(selector) - results = body.xpath(xpath) - assert not isinstance(results, basestring_) - found = set() - for item in results: - assert item not in found - found.add(item) - assert not isinstance(item, basestring_) - return len(results) - - # Data borrowed from http://mootools.net/slickspeed/ - - ## Changed from original; probably because I'm only - ## searching the body. - #assert count('*') == 252 - assert count('*') == 246 - assert count('div:contains(CELIA)') == 26 - assert count('div:only-child') == 22 # ? 
- assert count('div:nth-child(even)') == 106 - assert count('div:nth-child(2n)') == 106 - assert count('div:nth-child(odd)') == 137 - assert count('div:nth-child(2n+1)') == 137 - assert count('div:nth-child(n)') == 243 - assert count('div:last-child') == 53 - assert count('div:first-child') == 51 - assert count('div > div') == 242 - assert count('div + div') == 190 - assert count('div ~ div') == 190 - assert count('body') == 1 - assert count('body div') == 243 - assert count('div') == 243 - assert count('div div') == 242 - assert count('div div div') == 241 - assert count('div, div, div') == 243 - assert count('div, a, span') == 243 - assert count('.dialog') == 51 - assert count('div.dialog') == 51 - assert count('div .dialog') == 51 - assert count('div.character, div.dialog') == 99 - assert count('div.direction.dialog') == 0 - assert count('div.dialog.direction') == 0 - assert count('div.dialog.scene') == 1 - assert count('div.scene.scene') == 1 - assert count('div.scene .scene') == 0 - assert count('div.direction .dialog ') == 0 - assert count('div .dialog .direction') == 4 - assert count('div.dialog .dialog .direction') == 4 - assert count('#speech5') == 1 - assert count('div#speech5') == 1 - assert count('div #speech5') == 1 - assert count('div.scene div.dialog') == 49 - assert count('div#scene1 div.dialog div') == 142 - assert count('#scene1 #speech1') == 1 - assert count('div[class]') == 103 - assert count('div[class=dialog]') == 50 - assert count('div[class^=dia]') == 51 - assert count('div[class$=log]') == 50 - assert count('div[class*=sce]') == 1 - assert count('div[class|=dialog]') == 50 # ? Seems right - assert count('div[class!=madeup]') == 243 # ? Seems right - assert count('div[class~=dialog]') == 51 # ? Seems right - -HTML_IDS = ''' - -

- - link - - link -

content
-
-
-

- hi there - guy - - -

- -

- -''' - - -HTML_SHAKESPEARE = ''' - - - - - - -

As You Like It

- by William Shakespeare -

ACT I, SCENE III. A room in the palace.

Enter CELIA and ROSALIND

CELIA

Why, cousin! why, Rosalind! Cupid have mercy! not a word?

ROSALIND

Not one to throw at a dog.

CELIA

No, thy words are too precious to be cast away upon

curs; throw some of them at me; come, lame me with reasons.

ROSALIND

CELIA

But is all this for your father?

Then there were two cousins laid up; when the one

should be lamed with reasons and the other mad

without any.

ROSALIND

No, some of it is for my child's father. O, how

full of briers is this working-day world!

CELIA

They are but burs, cousin, thrown upon thee in

holiday foolery: if we walk not in the trodden

paths our very petticoats will catch them.

ROSALIND

I could shake them off my coat: these burs are in my heart.

CELIA

Hem them away.

ROSALIND

I would try, if I could cry 'hem' and have him.

CELIA

Come, come, wrestle with thy affections.

ROSALIND

O, they take the part of a better wrestler than myself!

CELIA

O, a good wish upon you! you will try in time, in

despite of a fall. But, turning these jests out of

service, let us talk in good earnest: is it

possible, on such a sudden, you should fall into so

strong a liking with old Sir Rowland's youngest son?

ROSALIND

The duke my father loved his father dearly.

CELIA

Doth it therefore ensue that you should love his son

dearly? By this kind of chase, I should hate him,

for my father hated his father dearly; yet I hate

not Orlando.

ROSALIND

No, faith, hate him not, for my sake.

CELIA

Why should I not? doth he not deserve well?

ROSALIND

Let me love him for that, and do you love him

because I do. Look, here comes the duke.

CELIA

With his eyes full of anger.

Enter DUKE FREDERICK, with Lords

DUKE FREDERICK

Mistress, dispatch you with your safest haste

And get you from our court.

ROSALIND

Me, uncle?

DUKE FREDERICK

You, cousin

Within these ten days if that thou be'st found

So near our public court as twenty miles,

Thou diest for it.

ROSALIND

I do beseech your grace,

Let me the knowledge of my fault bear with me:

If with myself I hold intelligence

Or have acquaintance with mine own desires,

If that I do not dream or be not frantic,--

As I do trust I am not--then, dear uncle,

Never so much as in a thought unborn

Did I offend your highness.

DUKE FREDERICK

Thus do all traitors:

If their purgation did consist in words,

They are as innocent as grace itself:

Let it suffice thee that I trust thee not.

ROSALIND

Yet your mistrust cannot make me a traitor:

Tell me whereon the likelihood depends.

DUKE FREDERICK

Thou art thy father's daughter; there's enough.

ROSALIND

So was I when your highness took his dukedom;

So was I when your highness banish'd him:

Treason is not inherited, my lord;

Or, if we did derive it from our friends,

What's that to me? my father was no traitor:

Then, good my liege, mistake me not so much

To think my poverty is treacherous.

CELIA

Dear sovereign, hear me speak.

DUKE FREDERICK

Ay, Celia; we stay'd her for your sake,

Else had she with her father ranged along.

CELIA

I did not then entreat to have her stay;

It was your pleasure and your own remorse:

I was too young that time to value her;

But now I know her: if she be a traitor,

Why so am I; we still have slept together,

Rose at an instant, learn'd, play'd, eat together,

And wheresoever we went, like Juno's swans,

Still we went coupled and inseparable.

DUKE FREDERICK

She is too subtle for thee; and her smoothness,

Her very silence and her patience

Speak to the people, and they pity her.

Thou art a fool: she robs thee of thy name;

And thou wilt show more bright and seem more virtuous

When she is gone. Then open not thy lips:

Firm and irrevocable is my doom

Which I have pass'd upon her; she is banish'd.

CELIA

Pronounce that sentence then on me, my liege:

I cannot live out of her company.

DUKE FREDERICK

You are a fool. You, niece, provide yourself:

If you outstay the time, upon mine honour,

And in the greatness of my word, you die.

Exeunt DUKE FREDERICK and Lords

CELIA

O my poor Rosalind, whither wilt thou go?

Wilt thou change fathers? I will give thee mine.

I charge thee, be not thou more grieved than I am.

ROSALIND

I have more cause.

CELIA

Thou hast not, cousin;

Prithee be cheerful: know'st thou not, the duke

Hath banish'd me, his daughter?

ROSALIND

That he hath not.

CELIA

No, hath not? Rosalind lacks then the love

Which teacheth thee that thou and I am one:

Shall we be sunder'd? shall we part, sweet girl?

No: let my father seek another heir.

Therefore devise with me how we may fly,

Whither to go and what to bear with us;

And do not seek to take your change upon you,

To bear your griefs yourself and leave me out;

For, by this heaven, now at our sorrows pale,

Say what thou canst, I'll go along with thee.

ROSALIND

Why, whither shall we go?

CELIA

To seek my uncle in the forest of Arden.

ROSALIND

Alas, what danger will it be to us,

Maids as we are, to travel forth so far!

Beauty provoketh thieves sooner than gold.

CELIA

I'll put myself in poor and mean attire

And with a kind of umber smirch my face;

The like do you: so shall we pass along

And never stir assailants.

ROSALIND

Were it not better,

Because that I am more than common tall,

That I did suit me all points like a man?

A gallant curtle-axe upon my thigh,

A boar-spear in my hand; and--in my heart

Lie there what hidden woman's fear there will--

We'll have a swashing and a martial outside,

As many other mannish cowards have

That do outface it with their semblances.

CELIA

What shall I call thee when thou art a man?

ROSALIND

I'll have no worse a name than Jove's own page;

And therefore look you call me Ganymede.

But what will you be call'd?

CELIA

Something that hath a reference to my state

No longer Celia, but Aliena.

ROSALIND

But, cousin, what if we assay'd to steal

The clownish fool out of your father's court?

Would he not be a comfort to our travel?

CELIA

He'll go along o'er the wide world with me;

Leave me alone to woo him. Let's away,

And get our jewels and our wealth together,

Devise the fittest time and safest way

To hide us from pursuit that will be made

After my flight. Now go we in content

To liberty and not to banishment.

Exeunt

- - -''' - - -if __name__ == '__main__': - unittest.main() diff --git a/cssselect/xpath.py b/cssselect/xpath.py index 23a165c..96eac3f 100644 --- a/cssselect/xpath.py +++ b/cssselect/xpath.py @@ -1,27 +1,46 @@ """ - cssselect.xpath - =============== +cssselect.xpath +=============== - Translation of parsed CSS selectors to XPath expressions. +Translation of parsed CSS selectors to XPath expressions. - :copyright: (c) 2007-2012 Ian Bicking and contributors. - See AUTHORS for more details. - :license: BSD, see LICENSE for more details. +:copyright: (c) 2007-2012 Ian Bicking and contributors. +See AUTHORS for more details. +:license: BSD, see LICENSE for more details. """ -import re -from cssselect.parser import parse, parse_series, SelectorError - +from __future__ import annotations -try: - _basestring = basestring - _unicode = unicode -except NameError: - # Python 3 - _basestring = str - _unicode = str +import re +from typing import TYPE_CHECKING, cast + +from cssselect.parser import ( + Attrib, + Class, + CombinedSelector, + Element, + Function, + Hash, + Matching, + Negation, + Pseudo, + PseudoElement, + Relation, + Selector, + SelectorError, + SpecificityAdjustment, + Tree, + parse, + parse_series, +) + +if TYPE_CHECKING: + from collections.abc import Callable + + # typing.Self requires Python 3.11 + from typing_extensions import Self class ExpressionError(SelectorError, RuntimeError): @@ -30,368 +49,656 @@ class ExpressionError(SelectorError, RuntimeError): #### XPath Helpers -class XPathExpr(object): - def __init__(self, path='', element='*', condition='', star_prefix=False): +class XPathExpr: + def __init__( + self, + path: str = "", + element: str = "*", + condition: str = "", + star_prefix: bool = False, + ) -> None: self.path = path self.element = element self.condition = condition - self.star_prefix = star_prefix - def __str__(self): - path = _unicode(self.path) + _unicode(self.element) + def __str__(self) -> str: + path = str(self.path) + 
str(self.element) if self.condition: - path += '[%s]' % self.condition + path += f"[{self.condition}]" return path - def __repr__(self): - return '%s[%s]' % (self.__class__.__name__, self) + def __repr__(self) -> str: + return f"{self.__class__.__name__}[{self}]" - def add_condition(self, condition): + def add_condition(self, condition: str, conjuction: str = "and") -> Self: if self.condition: - self.condition = '%s and (%s)' % (self.condition, condition) + self.condition = f"({self.condition}) {conjuction} ({condition})" else: self.condition = condition return self - def add_name_test(self): - if self.element == '*': + def add_name_test(self) -> None: + if self.element == "*": # We weren't doing a test anyway return - self.add_condition( - "name() = %s" % GenericTranslator.xpath_literal(self.element)) - self.element = '*' + self.add_condition(f"name() = {GenericTranslator.xpath_literal(self.element)}") + self.element = "*" - def add_star_prefix(self): + def add_star_prefix(self) -> None: """ - Adds a /* prefix if there is no prefix. This is when you need - to keep context's constrained to a single parent. + Append '*/' to the path to keep the context constrained + to a single parent. """ - if self.path: - self.path += '*/' - else: - self.path = '*/' - self.star_prefix = True - - def join(self, combiner, other): - path = _unicode(self) + combiner - # We don't need a star prefix if we are joining to this other - # prefix; so we'll get rid of it - if not(other.star_prefix and other.path == '*/'): + self.path += "*/" + + def join( + self, + combiner: str, + other: XPathExpr, + closing_combiner: str | None = None, + has_inner_condition: bool = False, + ) -> Self: + path = str(self) + combiner + # Any "star prefix" is redundant when joining. 
+ if other.path != "*/": path += other.path self.path = path - self.element = other.element - self.condition = other.condition + if not has_inner_condition: + self.element = ( + other.element + closing_combiner if closing_combiner else other.element + ) + self.condition = other.condition + else: + self.element = other.element + if other.condition: + self.element += "[" + other.condition + "]" + if closing_combiner: + self.element += closing_combiner return self split_at_single_quotes = re.compile("('+)").split +# The spec is actually more permissive than that, but don’t bother. +# This is just for the fast path. +# http://www.w3.org/TR/REC-xml/#NT-NameStartChar +is_safe_name = re.compile("^[a-zA-Z_][a-zA-Z0-9_.-]*$").match + +# Test that the string is not empty and does not contain whitespace +is_non_whitespace = re.compile(r"^[^ \t\r\n\f]+$").match + #### Translation -class GenericTranslator(object): + +class GenericTranslator: """ Translator for "generic" XML documents. + + Everything is case-sensitive, no assumption is made on the meaning + of element names and attribute names. + """ + + #### + #### HERE BE DRAGONS + #### + #### You are welcome to hook into this to change some behavior, + #### but do so at your own risk. + #### Until it has received a lot more work and review, + #### I reserve the right to change this API in backward-incompatible ways + #### with any minor version of cssselect. + #### See https://github.com/scrapy/cssselect/pull/22 + #### -- Simon Sapin. 
+ #### + combinator_mapping = { - ' ': 'descendant', - '>': 'child', - '+': 'direct_adjacent', - '~': 'indirect_adjacent', + " ": "descendant", + ">": "child", + "+": "direct_adjacent", + "~": "indirect_adjacent", } attribute_operator_mapping = { - 'exists': 'exists', - '=': 'equals', - '~=': 'includes', - '|=': 'dashmatch', - '^=': 'prefixmatch', - '$=': 'suffixmatch', - '*=': 'substringmatch', - '!=': 'different', # XXX Not in Level 3 but meh + "exists": "exists", + "=": "equals", + "~=": "includes", + "|=": "dashmatch", + "^=": "prefixmatch", + "$=": "suffixmatch", + "*=": "substringmatch", + "!=": "different", # XXX Not in Level 3 but meh } #: The attribute used for ID selectors depends on the document language: #: http://www.w3.org/TR/selectors/#id-selectors - id_attribute = 'id' - - def css_to_xpath(self, css, prefix='descendant-or-self::'): + id_attribute = "id" + + #: The attribute used for ``:lang()`` depends on the document language: + #: http://www.w3.org/TR/selectors/#lang-pseudo + lang_attribute = "xml:lang" + + #: The case sensitivity of document language element names, + #: attribute names, and attribute values in selectors depends + #: on the document language. + #: http://www.w3.org/TR/selectors/#casesens + #: + #: When a document language defines one of these as case-insensitive, + #: cssselect assumes that the document parser makes the parsed values + #: lower-case. Making the selector lower-case too makes the comparison + #: case-insensitive. + #: + #: In HTML, element names and attribute names (but not attribute values) + #: are case-insensitive. All of lxml.html, html5lib, BeautifulSoup4 + #: and HTMLParser make them lower-case in their parse result, so + #: the assumption holds. 
+ lower_case_element_names = False + lower_case_attribute_names = False + lower_case_attribute_values = False + + # class used to represent an XPath expression + xpathexpr_cls = XPathExpr + + def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str: """Translate a *group of selectors* to XPath. Pseudo-elements are not supported here since XPath only knows about "real" elements. :param css: - A *group of selectors* as an Unicode string. + A *group of selectors* as a string. + :param prefix: + This string is prepended to the XPath expression for each selector. + The default makes selectors scoped to the context node’s subtree. :raises: - :class:`SelectorSyntaxError` on invalid selectors, + :class:`~cssselect.SelectorSyntaxError` on invalid selectors, :class:`ExpressionError` on unknown/unsupported selectors, including pseudo-elements. :returns: - The equivalent XPath 1.0 expression as an Unicode string. + The equivalent XPath 1.0 expression as a string. """ - selectors = parse(css) - for selector in selectors: - if selector.pseudo_element: - raise ExpressionError('Pseudo-elements are not supported.') - - return ' | '.join( - self.selector_to_xpath(selector, prefix) - for selector in selectors) + return " | ".join( + self.selector_to_xpath(selector, prefix, translate_pseudo_elements=True) + for selector in parse(css) + ) - def selector_to_xpath(self, selector, prefix='descendant-or-self::'): + def selector_to_xpath( + self, + selector: Selector, + prefix: str = "descendant-or-self::", + translate_pseudo_elements: bool = False, + ) -> str: """Translate a parsed selector to XPath. - The :attr:`~Selector.pseudo_element` attribute of the selector - is ignored. It is the caller's responsibility to reject selectors - with pseudo-elements, or to account for them somehow. :param selector: A parsed :class:`Selector` object. + :param prefix: + This string is prepended to the resulting XPath expression. 
+ The default makes selectors scoped to the context node’s subtree. + :param translate_pseudo_elements: + Unless this is set to ``True`` (as :meth:`css_to_xpath` does), + the :attr:`~Selector.pseudo_element` attribute of the selector + is ignored. + It is the caller's responsibility to reject selectors + with pseudo-elements, or to account for them somehow. :raises: :class:`ExpressionError` on unknown/unsupported selectors. :returns: - The equivalent XPath 1.0 expression as an Unicode string. + The equivalent XPath 1.0 expression as a string. + + """ + tree = getattr(selector, "parsed_tree", None) + if not tree: + raise TypeError(f"Expected a parsed selector, got {selector!r}") + xpath = self.xpath(tree) + assert isinstance(xpath, self.xpathexpr_cls) # help debug a missing 'return' + if translate_pseudo_elements and selector.pseudo_element: + xpath = self.xpath_pseudo_element(xpath, selector.pseudo_element) + return (prefix or "") + str(xpath) + + def xpath_pseudo_element( + self, xpath: XPathExpr, pseudo_element: PseudoElement + ) -> XPathExpr: + """Translate a pseudo-element. + + Defaults to not supporting pseudo-elements at all, + but can be overridden by sub-classes. 
""" - return (prefix or '') + _unicode(self.xpath(selector._tree)) + raise ExpressionError("Pseudo-elements are not supported.") @staticmethod - def xpath_literal(s): - s = _unicode(s) + def xpath_literal(s: str) -> str: + s = str(s) if "'" not in s: - s = "'%s'" % s + s = f"'{s}'" elif '"' not in s: - s = '"%s"' % s + s = f'"{s}"' else: - s = "concat(%s)" % ','.join([ - (("'" in part) and '"%s"' or "'%s'") % part - for part in split_at_single_quotes(s) if part - ]) + parts_quoted = [ + f'"{part}"' if "'" in part else f"'{part}'" + for part in split_at_single_quotes(s) + if part + ] + s = "concat({})".format(",".join(parts_quoted)) return s - def xpath(self, parsed_selector): + def xpath(self, parsed_selector: Tree) -> XPathExpr: """Translate any parsed selector object.""" type_name = type(parsed_selector).__name__ - method = getattr(self, 'xpath_%s' % type_name.lower(), None) - if not method: - raise TypeError('Expected a parsed selector, got %s' % type_name) + method = cast( + "Callable[[Tree], XPathExpr] | None", + getattr(self, f"xpath_{type_name.lower()}", None), + ) + if method is None: + raise ExpressionError(f"{type_name} is not supported.") return method(parsed_selector) - # Dispatched by parsed object type - def xpath_combinedselector(self, combined): + def xpath_combinedselector(self, combined: CombinedSelector) -> XPathExpr: """Translate a combined selector.""" - combinator = self.combinator_mapping.get(combined.combinator) - if not combinator: - raise ExpressionError( - "Unknown combinator: %r" % combined.combinator) - method = getattr(self, 'xpath_%s_combinator' % combinator) - return method(self.xpath(combined.selector), - self.xpath(combined.subselector)) + combinator = self.combinator_mapping[combined.combinator] + method = cast( + "Callable[[XPathExpr, XPathExpr], XPathExpr]", + getattr(self, f"xpath_{combinator}_combinator"), + ) + return method(self.xpath(combined.selector), self.xpath(combined.subselector)) - def xpath_negation(self, negation): 
+ def xpath_negation(self, negation: Negation) -> XPathExpr: xpath = self.xpath(negation.selector) sub_xpath = self.xpath(negation.subselector) sub_xpath.add_name_test() - return xpath.add_condition('not(%s)' % sub_xpath.condition) + if sub_xpath.condition: + return xpath.add_condition(f"not({sub_xpath.condition})") + return xpath.add_condition("0") + + def xpath_relation(self, relation: Relation) -> XPathExpr: + xpath = self.xpath(relation.selector) + combinator = relation.combinator + subselector = relation.subselector + right = self.xpath(subselector.parsed_tree) + method = cast( + "Callable[[XPathExpr, XPathExpr], XPathExpr]", + getattr( + self, + f"xpath_relation_{self.combinator_mapping[cast('str', combinator.value)]}_combinator", + ), + ) + return method(xpath, right) + + def xpath_matching(self, matching: Matching) -> XPathExpr: + xpath = self.xpath(matching.selector) + exprs = [self.xpath(selector) for selector in matching.selector_list] + for e in exprs: + e.add_name_test() + if e.condition: + xpath.add_condition(e.condition, "or") + return xpath - def xpath_function(self, function): + def xpath_specificityadjustment(self, matching: SpecificityAdjustment) -> XPathExpr: + xpath = self.xpath(matching.selector) + exprs = [self.xpath(selector) for selector in matching.selector_list] + for e in exprs: + e.add_name_test() + if e.condition: + xpath.add_condition(e.condition, "or") + return xpath + + def xpath_function(self, function: Function) -> XPathExpr: """Translate a functional pseudo-class.""" - method = 'xpath_%s_function' % function.name.replace('-', '_') - method = getattr(self, method, None) + method_name = "xpath_{}_function".format(function.name.replace("-", "_")) + method = cast( + "Callable[[XPathExpr, Function], XPathExpr] | None", + getattr(self, method_name, None), + ) if not method: - raise ExpressionError( - "The pseudo-class :%s() is unknown" % function.name) + raise ExpressionError(f"The pseudo-class :{function.name}() is unknown") return 
method(self.xpath(function.selector), function) - def xpath_pseudo(self, pseudo): + def xpath_pseudo(self, pseudo: Pseudo) -> XPathExpr: """Translate a pseudo-class.""" - method = 'xpath_%s_pseudo' % pseudo.ident.replace('-', '_') - method = getattr(self, method, None) + method_name = "xpath_{}_pseudo".format(pseudo.ident.replace("-", "_")) + method = cast( + "Callable[[XPathExpr], XPathExpr] | None", + getattr(self, method_name, None), + ) if not method: # TODO: better error message for pseudo-elements? - raise ExpressionError( - "The pseudo-class :%s is unknown" % pseudo.ident) + raise ExpressionError(f"The pseudo-class :{pseudo.ident} is unknown") return method(self.xpath(pseudo.selector)) - - def xpath_attrib(self, selector): + def xpath_attrib(self, selector: Attrib) -> XPathExpr: """Translate an attribute selector.""" - operator = self.attribute_operator_mapping.get(selector.operator) - if not operator: - raise ExpressionError( - "Unknown attribute operator: %r" % selector.operator) - method = getattr(self, 'xpath_attrib_%s' % operator) - # FIXME: what if attrib is *? 
- if selector.namespace == '*': - name = '@' + selector.attrib + operator = self.attribute_operator_mapping[selector.operator] + method = cast( + "Callable[[XPathExpr, str, str | None], XPathExpr]", + getattr(self, f"xpath_attrib_{operator}"), + ) + if self.lower_case_attribute_names: + name = selector.attrib.lower() + else: + name = selector.attrib + safe = is_safe_name(name) + if selector.namespace: + name = f"{selector.namespace}:{name}" + safe = safe and is_safe_name(selector.namespace) + if safe: + attrib = "@" + name + else: + attrib = f"attribute::*[name() = {self.xpath_literal(name)}]" + if selector.value is None: + value = None + elif self.lower_case_attribute_values: + value = cast("str", selector.value.value).lower() else: - name = '@%s:%s' % (selector.namespace, selector.attrib) - return method(self.xpath(selector.selector), name, selector.value) + value = selector.value.value + return method(self.xpath(selector.selector), attrib, value) - def xpath_class(self, class_selector): + def xpath_class(self, class_selector: Class) -> XPathExpr: """Translate a class selector.""" # .foo is defined as [class~=foo] in the spec. 
xpath = self.xpath(class_selector.selector) - return self.xpath_attrib_includes( - xpath, '@class', class_selector.class_name) + return self.xpath_attrib_includes(xpath, "@class", class_selector.class_name) - def xpath_hash(self, id_selector): + def xpath_hash(self, id_selector: Hash) -> XPathExpr: """Translate an ID selector.""" xpath = self.xpath(id_selector.selector) - return xpath.add_condition('@%s = %s' % ( - self.id_attribute, self.xpath_literal(id_selector.id))) + return self.xpath_attrib_equals(xpath, "@id", id_selector.id) - def xpath_element(self, selector): + def xpath_element(self, selector: Element) -> XPathExpr: """Translate a type or universal selector.""" - if selector.namespace == '*': - element = selector.element.lower() + element = selector.element + if not element: + element = "*" + safe = True else: - # FIXME: Should we lowercase here? - element = '%s:%s' % (selector.namespace, selector.element) - return XPathExpr(element=element) - + safe = bool(is_safe_name(element)) + if self.lower_case_element_names: + element = element.lower() + if selector.namespace: + # Namespace prefixes are case-sensitive. 
+ # http://www.w3.org/TR/css3-namespace/#prefixes + element = f"{selector.namespace}:{element}" + safe = safe and bool(is_safe_name(selector.namespace)) + xpath = self.xpathexpr_cls(element=element) + if not safe: + xpath.add_name_test() + return xpath # CombinedSelector: dispatch by combinator - def xpath_descendant_combinator(self, left, right): + def xpath_descendant_combinator( + self, left: XPathExpr, right: XPathExpr + ) -> XPathExpr: """right is a child, grand-child or further descendant of left""" - return left.join('/descendant-or-self::*/', right) + return left.join("/descendant-or-self::*/", right) - def xpath_child_combinator(self, left, right): + def xpath_child_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr: """right is an immediate child of left""" - return left.join('/', right) + return left.join("/", right) - def xpath_direct_adjacent_combinator(self, left, right): + def xpath_direct_adjacent_combinator( + self, left: XPathExpr, right: XPathExpr + ) -> XPathExpr: """right is a sibling immediately after left""" - xpath = left.join('/following-sibling::', right) + xpath = left.join("/following-sibling::", right) xpath.add_name_test() - return xpath.add_condition('position() = 1') + return xpath.add_condition("position() = 1") - def xpath_indirect_adjacent_combinator(self, left, right): + def xpath_indirect_adjacent_combinator( + self, left: XPathExpr, right: XPathExpr + ) -> XPathExpr: """right is a sibling after left, immediately or not""" - return left.join('/following-sibling::', right) + return left.join("/following-sibling::", right) + + def xpath_relation_descendant_combinator( + self, left: XPathExpr, right: XPathExpr + ) -> XPathExpr: + """right is a child, grand-child or further descendant of left; select left""" + return left.join( + "[descendant::", right, closing_combiner="]", has_inner_condition=True + ) + def xpath_relation_child_combinator( + self, left: XPathExpr, right: XPathExpr + ) -> XPathExpr: + """right is an 
immediate child of left; select left""" + return left.join("[./", right, closing_combiner="]") + + def xpath_relation_direct_adjacent_combinator( + self, left: XPathExpr, right: XPathExpr + ) -> XPathExpr: + """right is a sibling immediately after left; select left""" + return left.add_condition( + f"following-sibling::*[(name() = '{right.element}') and (position() = 1)]" + ) + + def xpath_relation_indirect_adjacent_combinator( + self, left: XPathExpr, right: XPathExpr + ) -> XPathExpr: + """right is a sibling after left, immediately or not; select left""" + return left.join("[following-sibling::", right, closing_combiner="]") # Function: dispatch by function/pseudo-class name - def xpath_nth_child_function(self, xpath, function, last=False, - add_name_test=True): - a, b = parse_series(function.arguments) - if not a and not b and not last: - # a=0 means nothing is returned... - return xpath.add_condition('false() and position() = 0') - if add_name_test: - xpath.add_name_test() - xpath.add_star_prefix() - if a == 0: - if last: - b = 'last() - %s' % b - return xpath.add_condition('position() = %s' % b) - if last: - # FIXME: I'm not sure if this is right - a = -a - b = -b - if b > 0: - b_neg = str(-b) + def xpath_nth_child_function( + self, + xpath: XPathExpr, + function: Function, + last: bool = False, + add_name_test: bool = True, + ) -> XPathExpr: + try: + a, b = parse_series(function.arguments) + except ValueError as ex: + raise ExpressionError(f"Invalid series: '{function.arguments!r}'") from ex + + # From https://www.w3.org/TR/css3-selectors/#structural-pseudos: + # + # :nth-child(an+b) + # an+b-1 siblings before + # + # :nth-last-child(an+b) + # an+b-1 siblings after + # + # :nth-of-type(an+b) + # an+b-1 siblings with the same expanded element name before + # + # :nth-last-of-type(an+b) + # an+b-1 siblings with the same expanded element name after + # + # So, + # for :nth-child and :nth-of-type + # + # count(preceding-sibling::) = an+b-1 + # + # for 
:nth-last-child and :nth-last-of-type + # + # count(following-sibling::) = an+b-1 + # + # therefore, + # count(...) - (b-1) ≡ 0 (mod a) + # + # if a == 0: + # ~~~~~~~~~~ + # count(...) = b-1 + # + # if a < 0: + # ~~~~~~~~~ + # count(...) - b +1 <= 0 + # -> count(...) <= b-1 + # + # if a > 0: + # ~~~~~~~~~ + # count(...) - b +1 >= 0 + # -> count(...) >= b-1 + + # work with b-1 instead + b_min_1 = b - 1 + + # early-exit condition 1: + # ~~~~~~~~~~~~~~~~~~~~~~~ + # for a == 1, nth-*(an+b) means n+b-1 siblings before/after, + # and since n ∈ {0, 1, 2, ...}, if b-1<=0, + # there is always an "n" matching any number of siblings (maybe none) + if a == 1 and b_min_1 <= 0: + return xpath + + # early-exit condition 2: + # ~~~~~~~~~~~~~~~~~~~~~~~ + # an+b-1 siblings with a<0 and (b-1)<0 is not possible + if a < 0 and b_min_1 < 0: + return xpath.add_condition("0") + + # `add_name_test` boolean is inverted and somewhat counter-intuitive: + # + # nth_of_type() calls nth_child(add_name_test=False) + nodetest = "*" if add_name_test else f"{xpath.element}" + + # count siblings before or after the element + if not last: + siblings_count = f"count(preceding-sibling::{nodetest})" else: - b_neg = '+%s' % (-b) - if a != 1: - expr = ['(position() %s) mod %s = 0' % (b_neg, a)] + siblings_count = f"count(following-sibling::{nodetest})" + + # special case of fixed position: nth-*(0n+b) + # if a == 0: + # ~~~~~~~~~~ + # count(***-sibling::***) = b-1 + if a == 0: + return xpath.add_condition(f"{siblings_count} = {b_min_1}") + + expressions = [] + + if a > 0: + # siblings count, an+b-1, is always >= 0, + # so if a>0, and (b-1)<=0, an "n" exists to satisfy this, + # therefore, the predicate is only interesting if (b-1)>0 + if b_min_1 > 0: + expressions.append(f"{siblings_count} >= {b_min_1}") else: - expr = [] - if b >= 0: - expr.append('position() >= %s' % b) - elif b < 0 and last: - expr.append('position() < (last() %s)' % b) - expr = ' and '.join(expr) - if expr: - xpath.add_condition(expr) 
+ # if a<0, and (b-1)<0, no "n" satisfies this, + # this is tested above as an early exit condition + # otherwise, + expressions.append(f"{siblings_count} <= {b_min_1}") + + # operations modulo 1 or -1 are simpler, one only needs to verify: + # + # - either: + # count(***-sibling::***) - (b-1) = n = 0, 1, 2, 3, etc., + # i.e. count(***-sibling::***) >= (b-1) + # + # - or: + # count(***-sibling::***) - (b-1) = -n = 0, -1, -2, -3, etc., + # i.e. count(***-sibling::***) <= (b-1) + # as we just did above. + # + if abs(a) != 1: + # count(***-sibling::***) - (b-1) ≡ 0 (mod a) + left = siblings_count + + # apply "modulo a" on 2nd term, -(b-1), + # to simplify things like "(... +6) % -3", + # and also make it positive with |a| + b_neg = (-b_min_1) % abs(a) + + if b_neg != 0: + left = f"({left} +{b_neg})" + + expressions.append(f"{left} mod {a} = 0") + + template = "(%s)" if len(expressions) > 1 else "%s" + xpath.add_condition( + " and ".join(template % expression for expression in expressions) + ) return xpath - # FIXME: handle an+b, odd, even - # an+b means every-a, plus b, e.g., 2n+1 means odd - # 0n+b means b - # n+0 means a=1, i.e., all elements - # an means every a elements, i.e., 2n means even - # -n means -1n - # -1n+6 means elements 6 and previous - - def xpath_nth_last_child_function(self, xpath, function): + + def xpath_nth_last_child_function( + self, xpath: XPathExpr, function: Function + ) -> XPathExpr: return self.xpath_nth_child_function(xpath, function, last=True) - def xpath_nth_of_type_function(self, xpath, function): - if xpath.element == '*': - raise ExpressionError( - "*:nth-of-type() is not implemented") - return self.xpath_nth_child_function(xpath, function, - add_name_test=False) + def xpath_nth_of_type_function( + self, xpath: XPathExpr, function: Function + ) -> XPathExpr: + if xpath.element == "*": + raise ExpressionError("*:nth-of-type() is not implemented") + return self.xpath_nth_child_function(xpath, function, add_name_test=False) + + def
xpath_nth_last_of_type_function( + self, xpath: XPathExpr, function: Function + ) -> XPathExpr: + if xpath.element == "*": + raise ExpressionError("*:nth-of-type() is not implemented") + return self.xpath_nth_child_function( + xpath, function, last=True, add_name_test=False + ) - def xpath_nth_last_of_type_function(self, xpath, function): - if xpath.element == '*': + def xpath_contains_function( + self, xpath: XPathExpr, function: Function + ) -> XPathExpr: + # Defined there, removed in later drafts: + # http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors + if function.argument_types() not in (["STRING"], ["IDENT"]): raise ExpressionError( - "*:nth-of-type() is not implemented") - return self.xpath_nth_child_function(xpath, function, last=True, - add_name_test=False) - - def xpath_contains_function(self, xpath, function): - return xpath.add_condition('contains(string(.), %s)' - % self.xpath_literal(function.arguments)) - - def function_unsupported(self, xpath, pseudo): - raise ExpressionError( - "The pseudo-class :%s() is not supported" % pseudo.name) - - xpath_lang_function = function_unsupported + f"Expected a single string or ident for :contains(), got {function.arguments!r}" + ) + value = cast("str", function.arguments[0].value) + return xpath.add_condition(f"contains(., {self.xpath_literal(value)})") + def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr: + if function.argument_types() not in (["STRING"], ["IDENT"]): + raise ExpressionError( + f"Expected a single string or ident for :lang(), got {function.arguments!r}" + ) + value = cast("str", function.arguments[0].value) + return xpath.add_condition(f"lang({self.xpath_literal(value)})") # Pseudo: dispatch by pseudo-class name - def xpath_root_pseudo(self, xpath): + def xpath_root_pseudo(self, xpath: XPathExpr) -> XPathExpr: return xpath.add_condition("not(parent::*)") - def xpath_first_child_pseudo(self, xpath): - xpath.add_star_prefix() - xpath.add_name_test() 
- return xpath.add_condition('position() = 1') + # CSS immediate children (CSS ":scope > div" to XPath "child::div" or "./div") + # Works only at the start of a selector + # Needed to get immediate children of a processed selector in Scrapy + # for product in response.css('.product'): + # description = product.css(':scope > div::text').get() + def xpath_scope_pseudo(self, xpath: XPathExpr) -> XPathExpr: + return xpath.add_condition("1") - def xpath_last_child_pseudo(self, xpath): - xpath.add_star_prefix() - xpath.add_name_test() - return xpath.add_condition('position() = last()') + def xpath_first_child_pseudo(self, xpath: XPathExpr) -> XPathExpr: + return xpath.add_condition("count(preceding-sibling::*) = 0") - def xpath_first_of_type_pseudo(self, xpath): - if xpath.element == '*': - raise ExpressionError( - "*:first-of-type is not implemented") - xpath.add_star_prefix() - return xpath.add_condition('position() = 1') + def xpath_last_child_pseudo(self, xpath: XPathExpr) -> XPathExpr: + return xpath.add_condition("count(following-sibling::*) = 0") - def xpath_last_of_type_pseudo(self, xpath): - if xpath.element == '*': - raise ExpressionError( - "*:last-of-type is not implemented") - xpath.add_star_prefix() - return xpath.add_condition('position() = last()') + def xpath_first_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr: + if xpath.element == "*": + raise ExpressionError("*:first-of-type is not implemented") + return xpath.add_condition(f"count(preceding-sibling::{xpath.element}) = 0") - def xpath_only_child_pseudo(self, xpath): - xpath.add_name_test() - xpath.add_star_prefix() - return xpath.add_condition('last() = 1') + def xpath_last_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr: + if xpath.element == "*": + raise ExpressionError("*:last-of-type is not implemented") + return xpath.add_condition(f"count(following-sibling::{xpath.element}) = 0") - def xpath_only_of_type_pseudo(self, xpath): - if xpath.element == '*': - raise ExpressionError( - 
"*:only-of-type is not implemented") - return xpath.add_condition('last() = 1') + def xpath_only_child_pseudo(self, xpath: XPathExpr) -> XPathExpr: + return xpath.add_condition("count(parent::*/child::*) = 1") - def xpath_empty_pseudo(self, xpath): - return xpath.add_condition("not(*) and not(normalize-space())") + def xpath_only_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr: + if xpath.element == "*": + raise ExpressionError("*:only-of-type is not implemented") + return xpath.add_condition(f"count(parent::*/child::{xpath.element}) = 1") - def pseudo_never_matches(self, xpath): + def xpath_empty_pseudo(self, xpath: XPathExpr) -> XPathExpr: + return xpath.add_condition("not(*) and not(string-length())") + + def pseudo_never_matches(self, xpath: XPathExpr) -> XPathExpr: """Common implementation for pseudo-classes that never match.""" return xpath.add_condition("0") @@ -407,82 +714,161 @@ def pseudo_never_matches(self, xpath): # Attrib: dispatch by attribute operator - def xpath_attrib_exists(self, xpath, name, value): + def xpath_attrib_exists( + self, xpath: XPathExpr, name: str, value: str | None + ) -> XPathExpr: assert not value xpath.add_condition(name) return xpath - def xpath_attrib_equals(self, xpath, name, value): - xpath.add_condition('%s = %s' % (name, self.xpath_literal(value))) + def xpath_attrib_equals( + self, xpath: XPathExpr, name: str, value: str | None + ) -> XPathExpr: + assert value is not None + xpath.add_condition(f"{name} = {self.xpath_literal(value)}") return xpath - def xpath_attrib_different(self, xpath, name, value): + def xpath_attrib_different( + self, xpath: XPathExpr, name: str, value: str | None + ) -> XPathExpr: + assert value is not None # FIXME: this seems like a weird hack... 
if value: - xpath.add_condition('not(%s) or %s != %s' - % (name, name, self.xpath_literal(value))) + xpath.add_condition(f"not({name}) or {name} != {self.xpath_literal(value)}") else: - xpath.add_condition('%s != %s' - % (name, self.xpath_literal(value))) + xpath.add_condition(f"{name} != {self.xpath_literal(value)}") return xpath - def xpath_attrib_includes(self, xpath, name, value): - xpath.add_condition( - "%s and contains(concat(' ', normalize-space(%s), ' '), %s)" - % (name, name, self.xpath_literal(' '+value+' '))) + def xpath_attrib_includes( + self, xpath: XPathExpr, name: str, value: str | None + ) -> XPathExpr: + if value and is_non_whitespace(value): + arg = self.xpath_literal(" " + value + " ") + xpath.add_condition( + f"{name} and contains(concat(' ', normalize-space({name}), ' '), {arg})" + ) + else: + xpath.add_condition("0") return xpath - def xpath_attrib_dashmatch(self, xpath, name, value): + def xpath_attrib_dashmatch( + self, xpath: XPathExpr, name: str, value: str | None + ) -> XPathExpr: + assert value is not None + arg = self.xpath_literal(value) + arg_dash = self.xpath_literal(value + "-") # Weird, but true... 
- xpath.add_condition('%s and (%s = %s or starts-with(%s, %s))' % ( - name, - name, self.xpath_literal(value), - name, self.xpath_literal(value + '-'))) + xpath.add_condition( + f"{name} and ({name} = {arg} or starts-with({name}, {arg_dash}))" + ) return xpath - def xpath_attrib_prefixmatch(self, xpath, name, value): - return xpath.add_condition('%s and starts-with(%s, %s)' % ( - name, name, self.xpath_literal(value))) + def xpath_attrib_prefixmatch( + self, xpath: XPathExpr, name: str, value: str | None + ) -> XPathExpr: + if value: + xpath.add_condition( + f"{name} and starts-with({name}, {self.xpath_literal(value)})" + ) + else: + xpath.add_condition("0") + return xpath - def xpath_attrib_suffixmatch(self, xpath, name, value): - # Oddly there is a starts-with in XPath 1.0, but not ends-with - return xpath.add_condition( - '%s and substring(%s, string-length(%s)-%s) = %s' - % (name, name, name, len(value)-1, self.xpath_literal(value))) + def xpath_attrib_suffixmatch( + self, xpath: XPathExpr, name: str, value: str | None + ) -> XPathExpr: + if value: + # Oddly there is a starts-with in XPath 1.0, but not ends-with + xpath.add_condition( + f"{name} and substring({name}, string-length({name})-{len(value) - 1}) = {self.xpath_literal(value)}" + ) + else: + xpath.add_condition("0") + return xpath - def xpath_attrib_substringmatch(self, xpath, name, value): - # Attribute selectors are case sensitive - return xpath.add_condition('%s and contains(%s, %s)' % ( - name, name, self.xpath_literal(value))) + def xpath_attrib_substringmatch( + self, xpath: XPathExpr, name: str, value: str | None + ) -> XPathExpr: + if value: + # Attribute selectors are case sensitive + xpath.add_condition( + f"{name} and contains({name}, {self.xpath_literal(value)})" + ) + else: + xpath.add_condition("0") + return xpath class HTMLTranslator(GenericTranslator): """ - Translator for HTML documents. + Translator for (X)HTML documents. 
+ + Has a more useful implementation of some pseudo-classes based on + HTML-specific element names and attribute names, as described in + the `HTML5 specification`_. It assumes no-quirks mode. + The API is the same as :class:`GenericTranslator`. + + .. _HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors + + :param xhtml: + If false (the default), element names and attribute names + are case-insensitive. + """ - def xpath_checked_pseudo(self, xpath): + + lang_attribute = "lang" + + def __init__(self, xhtml: bool = False) -> None: + self.xhtml = xhtml # Might be useful for sub-classes? + if not xhtml: + # See their definition in GenericTranslator. + self.lower_case_element_names = True + self.lower_case_attribute_names = True + + def xpath_checked_pseudo(self, xpath: XPathExpr) -> XPathExpr: # FIXME: is this really all the elements? return xpath.add_condition( "(@selected and name(.) = 'option') or " - "(@checked and name(.) = 'input')") + "(@checked " + "and (name(.) = 'input' or name(.) = 'command')" + "and (@type = 'checkbox' or @type = 'radio'))" + ) - def xpath_link_pseudo(self, xpath): - return xpath.add_condition("@href and name(.) = 'a'") + def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr: + if function.argument_types() not in (["STRING"], ["IDENT"]): + raise ExpressionError( + f"Expected a single string or ident for :lang(), got {function.arguments!r}" + ) + value = function.arguments[0].value + assert value + arg = self.xpath_literal(value.lower() + "-") + return xpath.add_condition( + "ancestor-or-self::*[@lang][1][starts-with(concat(" + # XPath 1.0 has no lower-case function... + f"translate(@{self.lang_attribute}, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', " + "'abcdefghijklmnopqrstuvwxyz'), " + f"'-'), {arg})]" + ) + + def xpath_link_pseudo(self, xpath: XPathExpr) -> XPathExpr: + return xpath.add_condition( + "@href and (name(.) = 'a' or name(.) = 'link' or name(.) 
= 'area')" + ) # Links are never visited, the implementation for :visited is the same # as in GenericTranslator - def xpath_disabled_pseudo(self, xpath): + def xpath_disabled_pseudo(self, xpath: XPathExpr) -> XPathExpr: # http://www.w3.org/TR/html5/section-index.html#attributes-1 - return xpath.add_condition(''' + return xpath.add_condition( + """ ( @disabled and ( - name(.) = 'input' or + (name(.) = 'input' and @type != 'hidden') or name(.) = 'button' or name(.) = 'select' or name(.) = 'textarea' or - name(.) = 'keygen' or name(.) = 'command' or name(.) = 'fieldset' or name(.) = 'optgroup' or @@ -490,39 +876,54 @@ def xpath_disabled_pseudo(self, xpath): ) ) or ( ( - name(.) = 'input' or + (name(.) = 'input' and @type != 'hidden') or name(.) = 'button' or name(.) = 'select' or - name(.) = 'textarea' or - name(.) = 'keygen' + name(.) = 'textarea' ) and ancestor::fieldset[@disabled] ) - ''') + """ + ) # FIXME: in the second half, add "and is not a descendant of that # fieldset element's first legend element child, if any." - def xpath_enabled_pseudo(self, xpath): + def xpath_enabled_pseudo(self, xpath: XPathExpr) -> XPathExpr: # http://www.w3.org/TR/html5/section-index.html#attributes-1 - return xpath.add_condition(''' + return xpath.add_condition( + """ ( + @href and ( + name(.) = 'a' or + name(.) = 'link' or + name(.) = 'area' + ) + ) or ( ( name(.) = 'command' or name(.) = 'fieldset' or - name(.) = 'optgroup' or - name(.) = 'option' + name(.) = 'optgroup' ) and not(@disabled) ) or ( ( - name(.) = 'input' or + (name(.) = 'input' and @type != 'hidden') or name(.) = 'button' or name(.) = 'select' or name(.) = 'textarea' or name(.) = 'keygen' ) and not (@disabled or ancestor::fieldset[@disabled]) + ) or ( + name(.) = 'option' and not( + @disabled or ancestor::optgroup[@disabled] + ) ) - ''') - # FIXME: in the second half, add "and is not a descendant of that - # fieldset element's first legend element child, if any." + """ + ) + # FIXME: ... 
or "li elements that are children of menu elements, + # and that have a child element that defines a command, if the first + # such element's Disabled State facet is false (not disabled)". + # FIXME: after ancestor::fieldset[@disabled], add "and is not a + # descendant of that fieldset element's first legend element child, + # if any." diff --git a/docs/conf.py b/docs/conf.py index 22e6032..da3f023 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # # cssselect documentation build configuration file, created by # sphinx-quickstart on Tue Mar 27 14:20:34 2012. @@ -12,217 +11,210 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys, os, re +import re +from pathlib import Path # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) +# sys.path.insert(0, os.path.abspath('.')) # -- General configuration ----------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', - 'sphinx.ext.doctest'] +extensions = ["sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.doctest"] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = {".rst": "restructuredtext"} # The encoding of source files. 
-#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'cssselect' -copyright = '2012, Simon Sapin' +project = "cssselect" +project_copyright = "2012-2017, Simon Sapin, Scrapy developers" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The full version, including alpha/beta/rc tags. -init_py = open(os.path.join(os.path.dirname(__file__), - '..', 'cssselect', '__init__.py')).read() -release = re.search("VERSION = '([^']+)'", init_py).group(1) +init_py = (Path(__file__).parent.parent / "cssselect" / "__init__.py").read_text() +release = re.search('VERSION = "([^"]+)"', init_py).group(1) # The short X.Y version. -version = release.rstrip('dev') +version = release.rstrip("dev") # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). 
-#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # -- Options for HTML output --------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -#html_theme = 'agogo' +html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # "<project> v<release> documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css".
-#html_static_path = ['_static'] +# html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a <link> tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Output file base name for HTML help builder.
-htmlhelp_basename = 'cssselectdoc' +htmlhelp_basename = "cssselectdoc" # -- Options for LaTeX output -------------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'cssselect.tex', 'cssselect Documentation', - 'Simon Sapin', 'manual'), + ("index", "cssselect.tex", "cssselect Documentation", "Simon Sapin", "manual"), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output -------------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
-man_pages = [ - ('index', 'cssselect', 'cssselect Documentation', - ['Simon Sapin'], 1) -] +man_pages = [("index", "cssselect", "cssselect Documentation", ["Simon Sapin"], 1)] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------------ @@ -231,20 +223,35 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'cssselect', 'cssselect Documentation', - 'Simon Sapin', 'cssselect', 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "cssselect", + "cssselect Documentation", + "Simon Sapin", + "cssselect", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # Example configuration for intersphinx: refer to the Python standard library. 
-intersphinx_mapping = {'http://docs.python.org/': None} +intersphinx_mapping = {"python": ("https://docs.python.org/3", None)} + + +# --- Nitpicking options ------------------------------------------------------ + +nitpicky = True +nitpick_ignore = [ + # explicitly not a part of the public API + ("py:class", "Token"), +] diff --git a/docs/conftest.py b/docs/conftest.py new file mode 100644 index 0000000..a71d108 --- /dev/null +++ b/docs/conftest.py @@ -0,0 +1,21 @@ +from doctest import ELLIPSIS, NORMALIZE_WHITESPACE + +from sybil import Sybil +from sybil.parsers.doctest import DocTestParser +from sybil.parsers.skip import skip + +try: + # sybil 3.0.0+ + from sybil.parsers.codeblock import PythonCodeBlockParser +except ImportError: + from sybil.parsers.codeblock import CodeBlockParser as PythonCodeBlockParser + + +pytest_collect_file = Sybil( + parsers=[ + DocTestParser(optionflags=ELLIPSIS | NORMALIZE_WHITESPACE), + PythonCodeBlockParser(future_imports=["print_function"]), + skip, + ], + pattern="*.rst", +).pytest() diff --git a/docs/index.rst b/docs/index.rst index 0c060fc..a024f20 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -44,8 +44,9 @@ The resulting expression can be used with lxml's `XPath engine`_: User API ======== -In CSS3 terms, a `group of selectors`_ is a sequence of comma-separated -selectors. For example, ``div, h1.title + p`` is a group of 2 selectors. +In CSS3 Selectors terms, the top-level object is a `group of selectors`_, a +sequence of comma-separated selectors. For example, ``div, h1.title + p`` +is a group of two selectors. .. _group of selectors: http://www.w3.org/TR/selectors/#grouping @@ -53,12 +54,15 @@ selectors. For example, ``div, h1.title + p`` is a group of 2 selectors. .. autoclass:: Selector() :members: +.. autoclass:: FunctionalPseudoElement + .. autoclass:: GenericTranslator :members: css_to_xpath, selector_to_xpath .. autoclass:: HTMLTranslator - The API is the same as :class:`GenericTranslator`. 
+Exceptions +---------- .. autoexception:: SelectorError .. autoexception:: SelectorSyntaxError @@ -90,18 +94,29 @@ they never match: These applicable pseudo-classes are not yet implemented: -* ``:lang(language)`` * ``*:first-of-type``, ``*:last-of-type``, ``*:nth-of-type``, ``*:nth-last-of-type``, ``*:only-of-type``. All of these work when you specify an element type, but not with ``*`` On the other hand, *cssselect* supports some selectors that are not -in the Level 3 specification: +in the Level 3 specification. + +These parts of the Level 4 specification are supported (note that a large part +of the Level 4 additions is not applicable to cssselect similarly to ``:hover`` +or not representable in XPath 1.0 so the complete specification is unlikely to +be implemented): + +* The ``:scope`` pseudo-class. Limitation: it can only be used at a start of a + selector. +* The ``:is()``, ``:where()`` and ``:has()`` pseudo-classes. Limitation: + ``:has()`` cannot contain nested ``:has()`` or ``:not()``. + +These are non-standard extensions: * The ``:contains(text)`` pseudo-class that existed in `an early draft`_ but was then removed. * The ``!=`` attribute operator. ``[foo!=bar]`` is the same as - ``:not([foo=bar])`` + ``:not([foo=bar])``. * ``:not()`` accepts a *sequence of simple selectors*, not just single *simple selector*. For example, ``:not(a.important[rel])`` is allowed, even though the negation contains 3 *simple selectors*. @@ -134,9 +149,9 @@ implemented without forking or monkey-patching cssselect. The "customization API" is the set of methods in translation classes and their signature. You can look at the `source code`_ to see how it works. However, be aware that this API is not very stable yet. It might change -and break you sub-class. +and break your sub-class. -.. _source code: https://github.com/SimonSapin/cssselect/blob/master/cssselect/xpath.py +.. 
_source code: https://github.com/scrapy/cssselect/blob/master/cssselect/xpath.py Namespaces diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..21cb2eb --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,2 @@ +sphinx==8.2.3 +sphinx-rtd-theme==3.0.2 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c7c54a0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,239 @@ +[build-system] +build-backend = "hatchling.build" +requires = ["hatchling>=1.27.0"] + +[project] +name = "cssselect" +license = "BSD-3-Clause" +license-files = ["LICENSE", "AUTHORS"] +description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0" +readme = "README.rst" +authors = [{ name = "Ian Bicking", email = "ianb@colorstudy.com" }] +maintainers = [{ name = "Paul Tremberth", email = "paul.tremberth@gmail.com" }] +requires-python = ">=3.10" +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dynamic = ["version"] + +[project.urls] +"Homepage" = "https://github.com/scrapy/cssselect" + +[tool.hatch.version] +path = "cssselect/__init__.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/cssselect", + "/docs", + "/tests", + "/CHANGES", + "/README.rst", + "/tox.ini", +] +exclude = [ + "/docs/_build", +] + +[tool.hatch.build.targets.wheel] +packages = ["cssselect"] + +[tool.bumpversion] +current_version = "1.4.0" +commit = true +tag = true + +[[tool.bumpversion.files]] +filename = "cssselect/__init__.py" + +[[tool.bumpversion.files]] +filename = "CHANGES" +search = "^Unreleased\\.$" +replace = 
"Released on {now:%Y-%m-%d}." +regex = true + +[tool.coverage.run] +branch = true +source = ["cssselect"] + +[tool.coverage.report] +exclude_also = [ + "def __repr__", + "if sys.version_info", + "if __name__ == '__main__':", +] + +[tool.mypy] +strict = true + +[tool.pylint.MASTER] +persistent = "no" +extension-pkg-allow-list = ["lxml"] + +[tool.pylint."MESSAGES CONTROL"] +enable = [ + "useless-suppression", +] +disable = [ + "consider-using-f-string", + "fixme", + "invalid-name", + "line-too-long", + "missing-class-docstring", + "missing-function-docstring", + "missing-module-docstring", + "no-member", + "not-callable", + "redefined-builtin", + "redefined-outer-name", + "too-few-public-methods", + "too-many-arguments", + "too-many-branches", + "too-many-function-args", + "too-many-lines", + "too-many-locals", + "too-many-positional-arguments", + "too-many-public-methods", + "too-many-statements", + "unused-argument", +] + +[tool.pytest.ini_options] +testpaths = ["tests"] + +[tool.ruff.lint] +extend-select = [ + # flake8-builtins + "A", + # flake8-async + "ASYNC", + # flake8-bugbear + "B", + # flake8-comprehensions + "C4", + # flake8-commas + "COM", + # pydocstyle + "D", + # flake8-future-annotations + "FA", + # flynt + "FLY", + # refurb + "FURB", + # isort + "I", + # flake8-implicit-str-concat + "ISC", + # flake8-logging + "LOG", + # Perflint + "PERF", + # pygrep-hooks + "PGH", + # flake8-pie + "PIE", + # pylint + "PL", + # flake8-pytest-style + "PT", + # flake8-use-pathlib + "PTH", + # flake8-pyi + "PYI", + # flake8-quotes + "Q", + # flake8-return + "RET", + # flake8-raise + "RSE", + # Ruff-specific rules + "RUF", + # flake8-bandit + "S", + # flake8-simplify + "SIM", + # flake8-slots + "SLOT", + # flake8-debugger + "T10", + # flake8-type-checking + "TC", + # pyupgrade + "UP", + # pycodestyle warnings + "W", + # flake8-2020 + "YTT", +] +ignore = [ + # Trailing comma missing + "COM812", + # Missing docstring in public module + "D100", + # Missing docstring in public 
class + "D101", + # Missing docstring in public method + "D102", + # Missing docstring in public function + "D103", + # Missing docstring in public package + "D104", + # Missing docstring in magic method + "D105", + # Missing docstring in public nested class + "D106", + # Missing docstring in __init__ + "D107", + # One-line docstring should fit on one line with quotes + "D200", + # No blank lines allowed after function docstring + "D202", + # 1 blank line required between summary line and description + "D205", + # Multi-line docstring closing quotes should be on a separate line + "D209", + # First line should end with a period + "D400", + # First line should be in imperative mood; try rephrasing + "D401", + # First line should not be the function's "signature" + "D402", + # First word of the first line should be properly capitalized + "D403", + # Too many return statements + "PLR0911", + # Too many branches + "PLR0912", + # Too many arguments in function definition + "PLR0913", + # Too many statements + "PLR0915", + # Magic value used in comparison + "PLR2004", + # String contains ambiguous {}. + "RUF001", + # Docstring contains ambiguous {}. + "RUF002", + # Comment contains ambiguous {}. 
+ "RUF003", + # Mutable class attributes should be annotated with `typing.ClassVar` + "RUF012", + # Use of `assert` detected + "S101", +] + +[tool.ruff.lint.isort] +split-on-trailing-comma = false + +[tool.ruff.lint.pydocstyle] +convention = "pep257" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index ccddf11..0000000 --- a/setup.cfg +++ /dev/null @@ -1,10 +0,0 @@ -[build_sphinx] -source-dir = docs -build-dir = docs/_build -#all_files = 1 - -[upload_sphinx] # Sphinx-PyPI-upload -upload-dir = docs/_build/html - -[pytest] -python_files=tests.py diff --git a/setup.py b/setup.py deleted file mode 100644 index df95379..0000000 --- a/setup.py +++ /dev/null @@ -1,39 +0,0 @@ -import re -import os.path -from setuptools import setup - - -ROOT = os.path.dirname(__file__) -README = open(os.path.join(ROOT, 'README.rst')).read() -INIT_PY = open(os.path.join(ROOT, 'cssselect', '__init__.py')).read() -VERSION = re.search("VERSION = '([^']+)'", INIT_PY).group(1) - - -setup( - name='cssselect', - version=VERSION, - author='Ian Bicking', - author_email='ianb@colorstudy.com', - maintainer='Simon Sapin', - maintainer_email='simon.sapin@exyr.org', - description= - 'cssselect parses CSS3 Selectors and translates them to XPath 1.0', - long_description=README, - url='http://packages.python.org/cssselect/', - license='BSD', - packages=['cssselect'], - test_suite='cssselect.tests', - classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: BSD License', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.4', - 'Programming Language :: Python :: 2.5', - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.1', - 'Programming Language :: Python :: 3.2', - ], -) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/tests/test_cssselect.py b/tests/test_cssselect.py new file mode 100644 index 0000000..dc67bb7 --- /dev/null +++ b/tests/test_cssselect.py @@ -0,0 +1,1540 @@ +#!/usr/bin/env python +""" +Tests for cssselect +=================== + +These tests can be run either by py.test or by the standard library's +unittest. They use plain ``assert`` statements and do little reporting +themselves in case of failure. + +Use py.test to get fancy error reporting and assert introspection. + + +:copyright: (c) 2007-2012 Ian Bicking and contributors. +See AUTHORS for more details. +:license: BSD, see LICENSE for more details. + +""" + +from __future__ import annotations + +import sys +import typing +import unittest +from typing import TYPE_CHECKING + +import pytest +from lxml import etree, html + +from cssselect import ( + ExpressionError, + GenericTranslator, + HTMLTranslator, + SelectorSyntaxError, + parse, +) +from cssselect.parser import ( + Function, + FunctionalPseudoElement, + PseudoElement, + Token, + parse_series, + tokenize, +) +from cssselect.xpath import XPathExpr + +if TYPE_CHECKING: + from collections.abc import Sequence + + +class TestCssselect(unittest.TestCase): + def test_tokenizer(self) -> None: + tokens = [ + str(item) + for item in tokenize(r'E\ é > f [a~="y\"x"]:nth(/* fu /]* */-3.7)') + ] + assert tokens == [ + "", + "", + "' at 5>", + "", + # the no-break space is not whitespace in CSS + "", # f\xa0 + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ] + + def test_parser(self) -> None: + def repr_parse(css: str) -> list[str]: + selectors = parse(css) + for selector in selectors: + assert selector.pseudo_element is None + return [repr(selector.parsed_tree) for selector in selectors] + + def parse_many(first: str, *others: str) -> list[str]: + result = repr_parse(first) + for other in others: + assert repr_parse(other) == result + return result + + assert parse_many("*") == ["Element[*]"] + assert parse_many("*|*") == ["Element[*]"] + assert 
parse_many("*|foo") == ["Element[foo]"] + assert parse_many("|foo") == ["Element[foo]"] + assert parse_many("foo|*") == ["Element[foo|*]"] + assert parse_many("foo|bar") == ["Element[foo|bar]"] + # This will never match, but it is valid: + assert parse_many("#foo#bar") == ["Hash[Hash[Element[*]#foo]#bar]"] + assert parse_many( + "div>.foo", + "div> .foo", + "div >.foo", + "div > .foo", + "div \n> \t \t .foo", + "div\r>\n\n\n.foo", + "div\f>\f.foo", + ) == ["CombinedSelector[Element[div] > Class[Element[*].foo]]"] + assert parse_many( + "td.foo,.bar", "td.foo, .bar", "td.foo\t\r\n\f ,\t\r\n\f .bar" + ) == [ + "Class[Element[td].foo]", + "Class[Element[*].bar]", + ] + assert parse_many("div, td.foo, div.bar span") == [ + "Element[div]", + "Class[Element[td].foo]", + "CombinedSelector[Class[Element[div].bar] Element[span]]", + ] + assert parse_many("div > p") == ["CombinedSelector[Element[div] > Element[p]]"] + assert parse_many("td:first") == ["Pseudo[Element[td]:first]"] + assert parse_many("td:first") == ["Pseudo[Element[td]:first]"] + assert parse_many("td :first") == [ + "CombinedSelector[Element[td] Pseudo[Element[*]:first]]" + ] + assert parse_many("td :first") == [ + "CombinedSelector[Element[td] Pseudo[Element[*]:first]]" + ] + assert parse_many("a[name]", "a[ name\t]") == ["Attrib[Element[a][name]]"] + assert parse_many("a [name]") == [ + "CombinedSelector[Element[a] Attrib[Element[*][name]]]" + ] + assert parse_many('a[rel="include"]', "a[rel = include]") == [ + "Attrib[Element[a][rel = 'include']]" + ] + assert parse_many("a[hreflang |= 'en']", "a[hreflang|=en]") == [ + "Attrib[Element[a][hreflang |= 'en']]" + ] + assert parse_many("div:nth-child(10)") == [ + "Function[Element[div]:nth-child(['10'])]" + ] + assert parse_many(":nth-child(2n+2)") == [ + "Function[Element[*]:nth-child(['2', 'n', '+2'])]" + ] + assert parse_many("div:nth-of-type(10)") == [ + "Function[Element[div]:nth-of-type(['10'])]" + ] + assert parse_many("div div:nth-of-type(10) .aclass") 
== [ + "CombinedSelector[CombinedSelector[Element[div] " + "Function[Element[div]:nth-of-type(['10'])]] " + " Class[Element[*].aclass]]" + ] + assert parse_many("label:only") == ["Pseudo[Element[label]:only]"] + assert parse_many("a:lang(fr)") == ["Function[Element[a]:lang(['fr'])]"] + assert parse_many('div:contains("foo")') == [ + "Function[Element[div]:contains(['foo'])]" + ] + assert parse_many("div#foobar") == ["Hash[Element[div]#foobar]"] + assert parse_many("div:not(div.foo)") == [ + "Negation[Element[div]:not(Class[Element[div].foo])]" + ] + assert parse_many("div:has(div.foo)") == [ + "Relation[Element[div]:has(Selector[Class[Element[div].foo]])]" + ] + assert parse_many("div:is(.foo, #bar)") == [ + "Matching[Element[div]:is(Class[Element[*].foo], Hash[Element[*]#bar])]" + ] + assert parse_many(":is(:hover, :visited)") == [ + "Matching[Element[*]:is(Pseudo[Element[*]:hover], Pseudo[Element[*]:visited])]" + ] + assert parse_many(":where(:hover, :visited)") == [ + "SpecificityAdjustment[Element[*]:where(Pseudo[Element[*]:hover]," + " Pseudo[Element[*]:visited])]" + ] + assert parse_many("td ~ th") == ["CombinedSelector[Element[td] ~ Element[th]]"] + assert parse_many(":scope > foo") == [ + "CombinedSelector[Pseudo[Element[*]:scope] > Element[foo]]" + ] + assert parse_many(" :scope > foo") == [ + "CombinedSelector[Pseudo[Element[*]:scope] > Element[foo]]" + ] + assert parse_many(":scope > foo bar > div") == [ + "CombinedSelector[CombinedSelector[CombinedSelector[Pseudo[Element[*]:scope] > " + "Element[foo]] Element[bar]] > Element[div]]" + ] + assert parse_many(":scope > #foo #bar") == [ + "CombinedSelector[CombinedSelector[Pseudo[Element[*]:scope] > " + "Hash[Element[*]#foo]] Hash[Element[*]#bar]]" + ] + + def test_pseudo_elements(self) -> None: + def parse_pseudo(css: str) -> list[tuple[str, str | None]]: + result: list[tuple[str, str | None]] = [] + for selector in parse(css): + pseudo = selector.pseudo_element + pseudo = str(pseudo) if pseudo else pseudo 
+ # No Symbol here + assert pseudo is None or isinstance(pseudo, str) + selector_as_str = repr(selector.parsed_tree) + result.append((selector_as_str, pseudo)) + return result + + def parse_one(css: str) -> tuple[str, str | None]: + result = parse_pseudo(css) + assert len(result) == 1 + return result[0] + + def test_pseudo_repr(css: str) -> str: + result = parse(css) + assert len(result) == 1 + selector = result[0] + return repr(selector.parsed_tree) + + assert parse_one("foo") == ("Element[foo]", None) + assert parse_one("*") == ("Element[*]", None) + assert parse_one(":empty") == ("Pseudo[Element[*]:empty]", None) + assert parse_one(":scope") == ("Pseudo[Element[*]:scope]", None) + + # Special cases for CSS 2.1 pseudo-elements + assert parse_one(":BEfore") == ("Element[*]", "before") + assert parse_one(":aftER") == ("Element[*]", "after") + assert parse_one(":First-Line") == ("Element[*]", "first-line") + assert parse_one(":First-Letter") == ("Element[*]", "first-letter") + + assert parse_one("::befoRE") == ("Element[*]", "before") + assert parse_one("::AFter") == ("Element[*]", "after") + assert parse_one("::firsT-linE") == ("Element[*]", "first-line") + assert parse_one("::firsT-letteR") == ("Element[*]", "first-letter") + + assert parse_one("::text-content") == ("Element[*]", "text-content") + assert parse_one("::attr(name)") == ( + "Element[*]", + "FunctionalPseudoElement[::attr(['name'])]", + ) + + assert parse_one("::Selection") == ("Element[*]", "selection") + assert parse_one("foo:after") == ("Element[foo]", "after") + assert parse_one("foo::selection") == ("Element[foo]", "selection") + assert parse_one("lorem#ipsum ~ a#b.c[href]:empty::selection") == ( + "CombinedSelector[Hash[Element[lorem]#ipsum] ~ " + "Pseudo[Attrib[Class[Hash[Element[a]#b].c][href]]:empty]]", + "selection", + ) + assert parse_pseudo(":scope > div, foo bar") == [ + ("CombinedSelector[Pseudo[Element[*]:scope] > Element[div]]", None), + ("CombinedSelector[Element[foo] Element[bar]]", 
None), + ] + assert parse_pseudo("foo bar, :scope > div") == [ + ("CombinedSelector[Element[foo] Element[bar]]", None), + ("CombinedSelector[Pseudo[Element[*]:scope] > Element[div]]", None), + ] + assert parse_pseudo("foo bar,:scope > div") == [ + ("CombinedSelector[Element[foo] Element[bar]]", None), + ("CombinedSelector[Pseudo[Element[*]:scope] > Element[div]]", None), + ] + assert parse_pseudo("foo:before, bar, baz:after") == [ + ("Element[foo]", "before"), + ("Element[bar]", None), + ("Element[baz]", "after"), + ] + + # Special cases for CSS 2.1 pseudo-elements are ignored by default + for pseudo in ("after", "before", "first-line", "first-letter"): + (selector,) = parse(f"e:{pseudo}") + assert selector.pseudo_element == pseudo + assert GenericTranslator().selector_to_xpath(selector, prefix="") == "e" + + # Pseudo Elements are ignored by default, but if allowed they are not + # supported by GenericTranslator + tr = GenericTranslator() + (selector,) = parse("e::foo") + assert selector.pseudo_element == "foo" + assert tr.selector_to_xpath(selector, prefix="") == "e" + with pytest.raises(ExpressionError): + tr.selector_to_xpath(selector, translate_pseudo_elements=True) + + # Special test for the unicode symbols and ':scope' element if check + # Errors if use repr() instead of __repr__() + assert test_pseudo_repr(":fİrst-child") == "Pseudo[Element[*]:fİrst-child]" + assert test_pseudo_repr(":scope") == "Pseudo[Element[*]:scope]" + + def test_specificity(self) -> None: + def specificity(css: str) -> tuple[int, int, int]: + selectors = parse(css) + assert len(selectors) == 1 + return selectors[0].specificity() + + assert specificity("*") == (0, 0, 0) + assert specificity(" foo") == (0, 0, 1) + assert specificity(":empty ") == (0, 1, 0) + assert specificity(":before") == (0, 0, 1) + assert specificity("*:before") == (0, 0, 1) + assert specificity(":nth-child(2)") == (0, 1, 0) + assert specificity(".bar") == (0, 1, 0) + assert specificity("[baz]") == (0, 1, 0) + assert 
specificity('[baz="4"]') == (0, 1, 0) + assert specificity('[baz^="4"]') == (0, 1, 0) + assert specificity("#lipsum") == (1, 0, 0) + assert specificity("::attr(name)") == (0, 0, 1) + + assert specificity(":not(*)") == (0, 0, 0) + assert specificity(":not(foo)") == (0, 0, 1) + assert specificity(":not(.foo)") == (0, 1, 0) + assert specificity(":not([foo])") == (0, 1, 0) + assert specificity(":not(:empty)") == (0, 1, 0) + assert specificity(":not(#foo)") == (1, 0, 0) + + assert specificity(":has(*)") == (0, 0, 0) + assert specificity(":has(foo)") == (0, 0, 1) + assert specificity(":has(.foo)") == (0, 1, 0) + assert specificity(":has(> foo)") == (0, 0, 1) + + assert specificity(":is(.foo, #bar)") == (1, 0, 0) + assert specificity(":is(:hover, :visited)") == (0, 1, 0) + assert specificity(":where(:hover, :visited)") == (0, 0, 0) + + assert specificity("foo:empty") == (0, 1, 1) + assert specificity("foo:before") == (0, 0, 2) + assert specificity("foo::before") == (0, 0, 2) + assert specificity("foo:empty::before") == (0, 1, 2) + + assert specificity("#lorem + foo#ipsum:first-child > bar:first-line") == ( + 2, + 1, + 3, + ) + + def test_css_export(self) -> None: + def css2css(css: str, res: str | None = None) -> None: + selectors = parse(css) + assert len(selectors) == 1 + assert selectors[0].canonical() == (res or css) + + css2css("*") + css2css(" foo", "foo") + css2css("Foo", "Foo") + css2css(":empty ", ":empty") + css2css(":before", "::before") + css2css(":beFOre", "::before") + css2css("*:before", "::before") + css2css(":nth-child(2)") + css2css(".bar") + css2css("[baz]") + css2css('[baz="4"]', "[baz='4']") + css2css('[baz^="4"]', "[baz^='4']") + css2css("[ns|attr='4']") + css2css("#lipsum") + css2css(":not(*)") + css2css(":not(foo)") + css2css(":not(*.foo)", ":not(.foo)") + css2css(":not(*[foo])", ":not([foo])") + css2css(":not(:empty)") + css2css(":not(#foo)") + css2css(":has(*)") + css2css(":has(foo)") + css2css(":has(*.foo)", ":has(.foo)") + css2css(":is(#bar, 
.foo)") + css2css(":is(:focused, :visited)") + css2css(":where(:focused, :visited)") + css2css("foo:empty") + css2css("foo::before") + css2css("foo:empty::before") + css2css('::name(arg + "val" - 3)', "::name(arg+'val'-3)") + css2css("#lorem + foo#ipsum:first-child > bar::first-line") + css2css("foo > *") + + def test_parse_errors(self) -> None: + def get_error(css: str) -> str | None: + try: + parse(css) + except SelectorSyntaxError: + return str(sys.exc_info()[1]) + return None + + assert get_error("attributes(href)/html/body/a") == ( + "Expected selector, got " + ) + assert get_error("attributes(href)") == ( + "Expected selector, got " + ) + assert get_error("html/body/a") == ("Expected selector, got ") + assert get_error(" ") == ("Expected selector, got ") + assert get_error("div, ") == ("Expected selector, got ") + assert get_error(" , div") == ("Expected selector, got ") + assert get_error("p, , div") == ("Expected selector, got ") + assert get_error("div > ") == ("Expected selector, got ") + assert get_error(" > div") == ("Expected selector, got ' at 2>") + assert get_error("foo|#bar") == ("Expected ident or '*', got ") + assert get_error("#.foo") == ("Expected selector, got ") + assert get_error(".#foo") == ("Expected ident, got ") + assert get_error(":#foo") == ("Expected ident, got ") + assert get_error("[*]") == ("Expected '|', got ") + assert get_error("[foo|]") == ("Expected ident, got ") + assert get_error("[#]") == ("Expected ident or '*', got ") + assert get_error("[foo=#]") == ( + "Expected string or ident, got " + ) + assert get_error("[href]a") == ("Expected selector, got ") + assert get_error("[rel=stylesheet]") is None + assert get_error("[rel:stylesheet]") == ( + "Operator expected, got " + ) + assert get_error("[rel=stylesheet") == ("Expected ']', got ") + assert get_error(":lang(fr)") is None + assert get_error(":lang(fr") == ("Expected an argument, got ") + assert get_error(':contains("foo') == ("Unclosed string at 10") + assert 
get_error("foo!") == ("Expected selector, got ") + + # Mis-placed pseudo-elements + assert get_error("a:before:empty") == ( + "Got pseudo-element ::before not at the end of a selector" + ) + assert get_error("li:before a") == ( + "Got pseudo-element ::before not at the end of a selector" + ) + assert get_error(":not(:before)") == ( + "Got pseudo-element ::before inside :not() at 12" + ) + assert get_error(":not(:not(a))") == ("Got nested :not()") + assert get_error(":is(:before)") == ( + "Got pseudo-element ::before inside function" + ) + assert get_error(":is(a b)") == ("Expected an argument, got ") + assert get_error(":where(:before)") == ( + "Got pseudo-element ::before inside function" + ) + assert get_error(":where(a b)") == ( + "Expected an argument, got " + ) + assert get_error(":scope > div :scope header") == ( + 'Got immediate child pseudo-element ":scope" not at the start of a selector' + ) + assert get_error("div :scope header") == ( + 'Got immediate child pseudo-element ":scope" not at the start of a selector' + ) + assert get_error("> div p") == ("Expected selector, got ' at 0>") + + # Unsupported :has() with several arguments + assert get_error(":has(a, b)") == ("Expected an argument, got ") + assert get_error(":has()") == ("Expected selector, got ") + + def test_translation(self) -> None: + def xpath(css: str) -> str: + return str(GenericTranslator().css_to_xpath(css, prefix="")) + + assert xpath("*") == "*" + assert xpath("e") == "e" + assert xpath("*|e") == "e" + assert xpath("e|f") == "e:f" + assert xpath("e[foo]") == "e[@foo]" + assert xpath("e[foo|bar]") == "e[@foo:bar]" + assert xpath('e[foo="bar"]') == "e[@foo = 'bar']" + assert xpath('e[foo~="bar"]') == ( + "e[@foo and contains(concat(' ', normalize-space(@foo), ' '), ' bar ')]" + ) + assert xpath('e[foo^="bar"]') == ("e[@foo and starts-with(@foo, 'bar')]") + assert xpath('e[foo$="bar"]') == ( + "e[@foo and substring(@foo, string-length(@foo)-2) = 'bar']" + ) + assert xpath('e[foo*="bar"]') 
== ("e[@foo and contains(@foo, 'bar')]") + assert xpath('e[hreflang|="en"]') == ( + "e[@hreflang and (@hreflang = 'en' or starts-with(@hreflang, 'en-'))]" + ) + + # --- nth-* and nth-last-* ------------------------------------- + assert xpath("e:nth-child(1)") == ("e[count(preceding-sibling::*) = 0]") + + # always true + assert xpath("e:nth-child(n)") == ("e") + assert xpath("e:nth-child(n+1)") == ("e") + # always true too + assert xpath("e:nth-child(n-10)") == ("e") + # b=2 is the limit... + assert xpath("e:nth-child(n+2)") == ("e[count(preceding-sibling::*) >= 1]") + # always false + assert xpath("e:nth-child(-n)") == ("e[0]") + # equivalent to first child + assert xpath("e:nth-child(-n+1)") == ("e[count(preceding-sibling::*) <= 0]") + + assert xpath("e:nth-child(3n+2)") == ( + "e[(count(preceding-sibling::*) >= 1) and " + "((count(preceding-sibling::*) +2) mod 3 = 0)]" + ) + assert xpath("e:nth-child(3n-2)") == ( + "e[count(preceding-sibling::*) mod 3 = 0]" + ) + assert xpath("e:nth-child(-n+6)") == ("e[count(preceding-sibling::*) <= 5]") + + assert xpath("e:nth-last-child(1)") == ("e[count(following-sibling::*) = 0]") + assert xpath("e:nth-last-child(2n)") == ( + "e[(count(following-sibling::*) +1) mod 2 = 0]" + ) + assert xpath("e:nth-last-child(2n+1)") == ( + "e[count(following-sibling::*) mod 2 = 0]" + ) + assert xpath("e:nth-last-child(2n+2)") == ( + "e[(count(following-sibling::*) >= 1) and " + "((count(following-sibling::*) +1) mod 2 = 0)]" + ) + assert xpath("e:nth-last-child(3n+1)") == ( + "e[count(following-sibling::*) mod 3 = 0]" + ) + # represents the two last e elements + assert xpath("e:nth-last-child(-n+2)") == ( + "e[count(following-sibling::*) <= 1]" + ) + + assert xpath("e:nth-of-type(1)") == ("e[count(preceding-sibling::e) = 0]") + assert xpath("e:nth-last-of-type(1)") == ("e[count(following-sibling::e) = 0]") + assert xpath("div e:nth-last-of-type(1) .aclass") == ( + "div/descendant-or-self::*/e[count(following-sibling::e) = 0]" + 
"/descendant-or-self::*/*[@class and contains(" + "concat(' ', normalize-space(@class), ' '), ' aclass ')]" + ) + + assert xpath("e:first-child") == ("e[count(preceding-sibling::*) = 0]") + assert xpath("e:last-child") == ("e[count(following-sibling::*) = 0]") + assert xpath("e:first-of-type") == ("e[count(preceding-sibling::e) = 0]") + assert xpath("e:last-of-type") == ("e[count(following-sibling::e) = 0]") + assert xpath("e:only-child") == ("e[count(parent::*/child::*) = 1]") + assert xpath("e:only-of-type") == ("e[count(parent::*/child::e) = 1]") + assert xpath("e:empty") == ("e[not(*) and not(string-length())]") + assert xpath("e:EmPTY") == ("e[not(*) and not(string-length())]") + assert xpath("e:root") == ("e[not(parent::*)]") + assert xpath("e:hover") == ("e[0]") # never matches + assert ( + xpath("div:has(bar.foo)") == "div[descendant::bar" + "[@class and contains(concat(' ', normalize-space(@class), ' '), ' foo ')]]" + ) + assert xpath("e:has(> f)") == "e[./f]" + assert xpath("e:has(f)") == "e[descendant::f]" + assert xpath("e:has(~ f)") == "e[following-sibling::f]" + assert ( + xpath("e:has(+ f)") + == "e[following-sibling::*[(name() = 'f') and (position() = 1)]]" + ) + assert xpath('e:contains("foo")') == ("e[contains(., 'foo')]") + assert xpath("e:ConTains(foo)") == ("e[contains(., 'foo')]") + assert xpath("e.warning") == ( + "e[@class and contains(" + "concat(' ', normalize-space(@class), ' '), ' warning ')]" + ) + assert xpath("e#myid") == ("e[@id = 'myid']") + assert xpath("e:not(:nth-child(odd))") == ( + "e[not(count(preceding-sibling::*) mod 2 = 0)]" + ) + assert xpath("e:nOT(*)") == ("e[0]") # never matches + assert xpath("e f") == ("e/descendant-or-self::*/f") + assert xpath("e > f") == ("e/f") + assert xpath("e + f") == ( + "e/following-sibling::*[(name() = 'f') and (position() = 1)]" + ) + assert xpath("e ~ f") == ("e/following-sibling::f") + assert xpath("e ~ f:nth-child(3)") == ( + "e/following-sibling::f[count(preceding-sibling::*) = 2]" + ) 
+ assert xpath("div#container p") == ( + "div[@id = 'container']/descendant-or-self::*/p" + ) + assert xpath("e:where(foo)") == "e[name() = 'foo']" + assert xpath("e:where(foo, bar)") == "e[(name() = 'foo') or (name() = 'bar')]" + + # Invalid characters in XPath element names + assert xpath(r"di\a0 v") == ("*[name() = 'di v']") # di\xa0v + assert xpath(r"di\[v") == ("*[name() = 'di[v']") + assert xpath(r"[h\a0 ref]") == ("*[attribute::*[name() = 'h ref']]") # h\xa0ref + assert xpath(r"[h\]ref]") == ("*[attribute::*[name() = 'h]ref']]") + + with pytest.raises(ExpressionError): + xpath(":fİrst-child") + with pytest.raises(ExpressionError): + xpath(":first-of-type") + with pytest.raises(ExpressionError): + xpath(":only-of-type") + with pytest.raises(ExpressionError): + xpath(":last-of-type") + with pytest.raises(ExpressionError): + xpath(":nth-of-type(1)") + with pytest.raises(ExpressionError): + xpath(":nth-last-of-type(1)") + with pytest.raises(ExpressionError): + xpath(":nth-child(n-)") + with pytest.raises(ExpressionError): + xpath(":after") + with pytest.raises(ExpressionError): + xpath(":lorem-ipsum") + with pytest.raises(ExpressionError): + xpath(":lorem(ipsum)") + with pytest.raises(ExpressionError): + xpath("::lorem-ipsum") + with pytest.raises(TypeError): + GenericTranslator().css_to_xpath(4) # type: ignore[arg-type] + with pytest.raises(TypeError): + GenericTranslator().selector_to_xpath("foo") # type: ignore[arg-type] + + def test_unicode(self) -> None: + css = ".a\xc1b" + xpath = GenericTranslator().css_to_xpath(css) + assert css[1:] in xpath + xpath = xpath.encode("ascii", "xmlcharrefreplace").decode("ASCII") + assert xpath == ( + "descendant-or-self::*[@class and contains(" + "concat(' ', normalize-space(@class), ' '), ' aÁb ')]" + ) + + def test_quoting(self) -> None: + css_to_xpath = GenericTranslator().css_to_xpath + assert css_to_xpath('*[aval="\'"]') == ( + """descendant-or-self::*[@aval = "'"]""" + ) + assert css_to_xpath("*[aval=\"'''\"]") == ( + 
"""descendant-or-self::*[@aval = "'''"]""" + ) + assert css_to_xpath("*[aval='\"']") == ( + """descendant-or-self::*[@aval = '"']""" + ) + assert css_to_xpath('*[aval=\'"""\']') == ( + '''descendant-or-self::*[@aval = '"""']''' + ) + assert css_to_xpath(':scope > div[dataimg=""]') == ( + "descendant-or-self::*[1]/div[@dataimg = '']" + ) + + def test_unicode_escapes(self) -> None: + # \22 == '"' \20 == ' ' + css_to_xpath = GenericTranslator().css_to_xpath + assert css_to_xpath(r'*[aval="\'\22\'"]') == ( + """descendant-or-self::*[@aval = concat("'",'"',"'")]""" + ) + assert css_to_xpath(r'*[aval="\'\22 2\'"]') == ( + """descendant-or-self::*[@aval = concat("'",'"2',"'")]""" + ) + assert css_to_xpath(r'*[aval="\'\20 \'"]') == ( + """descendant-or-self::*[@aval = "' '"]""" + ) + assert css_to_xpath("*[aval=\"'\\20\r\n '\"]") == ( + """descendant-or-self::*[@aval = "' '"]""" + ) + + def test_xpath_pseudo_elements(self) -> None: + class CustomTranslator(GenericTranslator): + def xpath_pseudo_element( + self, xpath: XPathExpr, pseudo_element: PseudoElement + ) -> XPathExpr: + if isinstance(pseudo_element, FunctionalPseudoElement): + method_name = "xpath_{}_functional_pseudo_element".format( + pseudo_element.name.replace("-", "_") + ) + method = getattr(self, method_name, None) + if not method: + raise ExpressionError( + f"The functional pseudo-element ::{pseudo_element.name}() is unknown" + ) + xpath = method(xpath, pseudo_element.arguments) + else: + method_name = "xpath_{}_simple_pseudo_element".format( + pseudo_element.replace("-", "_") + ) + method = getattr(self, method_name, None) + if not method: + raise ExpressionError( + f"The pseudo-element ::{pseudo_element} is unknown" + ) + xpath = method(xpath) + return xpath + + # functional pseudo-class: + # elements that have a certain number of attributes + def xpath_nb_attr_function( + self, xpath: XPathExpr, function: Function + ) -> XPathExpr: + assert function.arguments[0].value + nb_attributes = 
int(function.arguments[0].value) + return xpath.add_condition(f"count(@*)={nb_attributes}") + + # pseudo-class: + # elements that have 5 attributes + def xpath_five_attributes_pseudo(self, xpath: XPathExpr) -> XPathExpr: + return xpath.add_condition("count(@*)=5") + + # functional pseudo-element: + # element's attribute by name + def xpath_attr_functional_pseudo_element( + self, xpath: XPathExpr, arguments: Sequence[Token] + ) -> XPathExpr: + attribute_name = arguments[0].value + other = XPathExpr( + f"@{attribute_name}", + "", + ) + return xpath.join("/", other) + + # pseudo-element: + # element's text() nodes + def xpath_text_node_simple_pseudo_element( + self, xpath: XPathExpr + ) -> XPathExpr: + other = XPathExpr( + "text()", + "", + ) + return xpath.join("/", other) + + # pseudo-element: + # element's href attribute + def xpath_attr_href_simple_pseudo_element( + self, xpath: XPathExpr + ) -> XPathExpr: + other = XPathExpr( + "@href", + "", + ) + return xpath.join("/", other) + + # pseudo-element: + # used to demonstrate operator precedence + def xpath_first_or_second_pseudo(self, xpath: XPathExpr) -> XPathExpr: + return xpath.add_condition("@id = 'first' or @id = 'second'") + + def xpath(css: str) -> str: + return str(CustomTranslator().css_to_xpath(css)) + + assert xpath(":five-attributes") == "descendant-or-self::*[count(@*)=5]" + assert xpath(":nb-attr(3)") == "descendant-or-self::*[count(@*)=3]" + assert xpath("::attr(href)") == "descendant-or-self::*/@href" + assert xpath("::text-node") == "descendant-or-self::*/text()" + assert xpath("::attr-href") == "descendant-or-self::*/@href" + assert xpath("p img::attr(src)") == ( + "descendant-or-self::p/descendant-or-self::*/img/@src" + ) + assert xpath(":scope") == "descendant-or-self::*[1]" + assert xpath(":first-or-second[href]") == ( + "descendant-or-self::*[(@id = 'first' or @id = 'second') and (@href)]" + ) + + assert str(XPathExpr("", "", condition="@href")) == "[@href]" + + document = 
etree.fromstring(OPERATOR_PRECEDENCE_IDS) + sort_key = {el: count for count, el in enumerate(document.iter())}.__getitem__ + + def operator_id(selector: str) -> list[str]: + xpath = CustomTranslator().css_to_xpath(selector) + items = typing.cast("list[etree._Element]", document.xpath(xpath)) + items.sort(key=sort_key) + return [element.get("id", "nil") for element in items] + + assert operator_id(":first-or-second") == ["first", "second"] + assert operator_id(":first-or-second[href]") == ["second"] + assert operator_id("[href]:first-or-second") == ["second"] + + def test_series(self) -> None: + def series(css: str) -> tuple[int, int] | None: + (selector,) = parse(f":nth-child({css})") + args = typing.cast( + "FunctionalPseudoElement", selector.parsed_tree + ).arguments + try: + return parse_series(args) + except ValueError: + return None + + assert series("1n+3") == (1, 3) + assert series("1n +3") == (1, 3) + assert series("1n + 3") == (1, 3) + assert series("1n+ 3") == (1, 3) + assert series("1n-3") == (1, -3) + assert series("1n -3") == (1, -3) + assert series("1n - 3") == (1, -3) + assert series("1n- 3") == (1, -3) + assert series("n-5") == (1, -5) + assert series("odd") == (2, 1) + assert series("even") == (2, 0) + assert series("3n") == (3, 0) + assert series("n") == (1, 0) + assert series("+n") == (1, 0) + assert series("-n") == (-1, 0) + assert series("5") == (0, 5) + assert series("foo") is None + assert series("n+") is None + + def test_lang(self) -> None: + document = etree.fromstring(XMLLANG_IDS) + sort_key = {el: count for count, el in enumerate(document.iter())}.__getitem__ + css_to_xpath = GenericTranslator().css_to_xpath + + def langid(selector: str) -> list[str]: + xpath = css_to_xpath(selector) + items = typing.cast("list[etree._Element]", document.xpath(xpath)) + items.sort(key=sort_key) + return [element.get("id", "nil") for element in items] + + assert langid(':lang("EN")') == ["first", "second", "third", "fourth"] + assert 
langid(':lang("en-us")') == ["second", "fourth"] + assert langid(":lang(en-nz)") == ["third"] + assert langid(":lang(fr)") == ["fifth"] + assert langid(":lang(ru)") == ["sixth"] + assert langid(":lang('ZH')") == ["eighth"] + assert langid(":lang(de) :lang(zh)") == ["eighth"] + assert langid(":lang(en), :lang(zh)") == [ + "first", + "second", + "third", + "fourth", + "eighth", + ] + assert langid(":lang(es)") == [] + + def test_argument_types(self) -> None: + class CustomTranslator(GenericTranslator): + def __init__(self) -> None: + self.argument_types: list[str] = [] + + def xpath_pseudo_element( + self, xpath: XPathExpr, pseudo_element: PseudoElement + ) -> XPathExpr: + self.argument_types += typing.cast( + "FunctionalPseudoElement", pseudo_element + ).argument_types() + return xpath + + def argument_types(css: str) -> list[str]: + translator = CustomTranslator() + translator.css_to_xpath(css) + return translator.argument_types + + mappings: list[tuple[str, list[str]]] = [ + ("", []), + ("ident", ["IDENT"]), + ('"string"', ["STRING"]), + ("1", ["NUMBER"]), + ] + for argument_string, argument_list in mappings: + css = f"::pseudo_element({argument_string})" + assert argument_types(css) == argument_list + + def test_select(self) -> None: + document = etree.fromstring(HTML_IDS) + sort_key = {el: count for count, el in enumerate(document.iter())}.__getitem__ + css_to_xpath = GenericTranslator().css_to_xpath + html_css_to_xpath = HTMLTranslator().css_to_xpath + + def select_ids(selector: str, html_only: bool) -> list[str]: + xpath = css_to_xpath(selector) + items = typing.cast("list[etree._Element]", document.xpath(xpath)) + if html_only: + assert items == [] + xpath = html_css_to_xpath(selector) + items = typing.cast("list[etree._Element]", document.xpath(xpath)) + items.sort(key=sort_key) + return [element.get("id", "nil") for element in items] + + def pcss(main: str, *selectors: str, **kwargs: bool) -> list[str]: + html_only = kwargs.pop("html_only", False) + result 
= select_ids(main, html_only) + for selector in selectors: + assert select_ids(selector, html_only) == result + return result + + all_ids = pcss("*") + assert all_ids[:6] == [ + "html", + "nil", + "link-href", + "link-nohref", + "nil", + "outer-div", + ] + assert all_ids[-1:] == ["foobar-span"] + assert pcss("div") == ["outer-div", "li-div", "foobar-div"] + assert pcss("DIV", html_only=True) == [ + "outer-div", + "li-div", + "foobar-div", + ] # case-insensitive in HTML + assert pcss("div div") == ["li-div"] + assert pcss("div, div div") == ["outer-div", "li-div", "foobar-div"] + assert pcss("a[name]") == ["name-anchor"] + assert pcss("a[NAme]", html_only=True) == [ + "name-anchor" + ] # case-insensitive in HTML: + assert pcss("a[rel]") == ["tag-anchor", "nofollow-anchor"] + assert pcss('a[rel="tag"]') == ["tag-anchor"] + assert pcss('a[href*="localhost"]') == ["tag-anchor"] + assert pcss('a[href*=""]') == [] + assert pcss('a[href^="http"]') == ["tag-anchor", "nofollow-anchor"] + assert pcss('a[href^="http:"]') == ["tag-anchor"] + assert pcss('a[href^=""]') == [] + assert pcss('a[href$="org"]') == ["nofollow-anchor"] + assert pcss('a[href$=""]') == [] + assert pcss('div[foobar~="bc"]', 'div[foobar~="cde"]') == ["foobar-div"] + assert pcss('[foobar~="ab bc"]', '[foobar~=""]', '[foobar~=" \t"]') == [] + assert pcss('div[foobar~="cd"]') == [] + assert pcss('*[lang|="En"]', '[lang|="En-us"]') == ["second-li"] + # Attribute values are case sensitive + assert pcss('*[lang|="en"]', '[lang|="en-US"]') == [] + assert pcss('*[lang|="e"]') == [] + # ... :lang() is not. 
+ assert pcss(':lang("EN")', "*:lang(en-US)", html_only=True) == [ + "second-li", + "li-div", + ] + assert pcss(':lang("e")', html_only=True) == [] + assert pcss(":scope > div") == [] + assert pcss(":scope body") == ["nil"] + assert pcss(":scope body > div") == ["outer-div", "foobar-div"] + assert pcss(":scope head") == ["nil"] + assert pcss(":scope html") == [] + + # --- nth-* and nth-last-* ------------------------------------- + + # select nothing + assert pcss("li:nth-child(-n)") == [] + # select all children + assert pcss("li:nth-child(n)") == [ + "first-li", + "second-li", + "third-li", + "fourth-li", + "fifth-li", + "sixth-li", + "seventh-li", + ] + + assert pcss("li:nth-child(3)", "#first-li ~ :nth-child(3)") == ["third-li"] + assert pcss("li:nth-child(10)") == [] + assert pcss("li:nth-child(2n)", "li:nth-child(even)", "li:nth-child(2n+0)") == [ + "second-li", + "fourth-li", + "sixth-li", + ] + assert pcss("li:nth-child(+2n+1)", "li:nth-child(odd)") == [ + "first-li", + "third-li", + "fifth-li", + "seventh-li", + ] + assert pcss("li:nth-child(2n+4)") == ["fourth-li", "sixth-li"] + assert pcss("li:nth-child(3n+1)") == ["first-li", "fourth-li", "seventh-li"] + assert pcss("li:nth-child(-n+3)") == ["first-li", "second-li", "third-li"] + assert pcss("li:nth-child(-2n+4)") == ["second-li", "fourth-li"] + assert pcss("li:nth-last-child(0)") == [] + assert pcss("li:nth-last-child(1)") == ["seventh-li"] + assert pcss("li:nth-last-child(2n)", "li:nth-last-child(even)") == [ + "second-li", + "fourth-li", + "sixth-li", + ] + assert pcss("li:nth-last-child(2n+1)") == [ + "first-li", + "third-li", + "fifth-li", + "seventh-li", + ] + assert pcss("li:nth-last-child(2n+2)") == ["second-li", "fourth-li", "sixth-li"] + assert pcss("li:nth-last-child(3n+1)") == [ + "first-li", + "fourth-li", + "seventh-li", + ] + assert pcss("ol:first-of-type") == ["first-ol"] + assert pcss("ol:nth-child(1)") == [] + assert pcss("ol:nth-of-type(2)") == ["second-ol"] + assert 
pcss("ol:nth-last-of-type(1)") == ["second-ol"] + + # "+" and "~" tests + assert pcss("ol#first-ol li + li:nth-child(4)") == ["fourth-li"] + assert pcss("li + li:nth-child(1)") == [] + assert pcss("li ~ li:nth-child(2n+1)") == [ + "third-li", + "fifth-li", + "seventh-li", + ] # all but the first + assert pcss("li ~ li:nth-last-child(2n+1)") == [ + "third-li", + "fifth-li", + "seventh-li", + ] # all but the first + + assert pcss("span:only-child") == ["foobar-span"] + assert pcss("li div:only-child") == ["li-div"] + assert pcss("div *:only-child") == ["li-div", "foobar-span"] + with pytest.raises(ExpressionError): + pcss("p *:only-of-type") + assert pcss("p:only-of-type") == ["paragraph"] + assert pcss("a:empty", "a:EMpty") == ["name-anchor"] + assert pcss("li:empty") == ["third-li", "fourth-li", "fifth-li", "sixth-li"] + assert pcss(":root", "html:root") == ["html"] + assert pcss("li:root", "* :root") == [] + assert pcss('*:contains("link")', ':CONtains("link")') == [ + "html", + "nil", + "outer-div", + "tag-anchor", + "nofollow-anchor", + ] + assert pcss('*:contains("LInk")') == [] # case sensitive + assert pcss('*:contains("e")') == [ + "html", + "nil", + "outer-div", + "first-ol", + "first-li", + "paragraph", + "p-em", + ] + assert pcss('*:contains("E")') == [] # case-sensitive + assert pcss(".a", ".b", "*.a", "ol.a") == ["first-ol"] + assert pcss(".c", "*.c") == ["first-ol", "third-li", "fourth-li"] + assert pcss("ol *.c", "ol li.c", "li ~ li.c", "ol > li.c") == [ + "third-li", + "fourth-li", + ] + assert pcss("#first-li", "li#first-li", "*#first-li") == ["first-li"] + assert pcss("li div", "li > div", "div div") == ["li-div"] + assert pcss("div > div") == [] + assert pcss("div>.c", "div > .c") == ["first-ol"] + assert pcss("div + div") == ["foobar-div"] + assert pcss("a ~ a") == ["tag-anchor", "nofollow-anchor"] + assert pcss('a[rel="tag"] ~ a') == ["nofollow-anchor"] + assert pcss("ol#first-ol li:last-child") == ["seventh-li"] + assert pcss("ol#first-ol 
*:last-child") == ["li-div", "seventh-li"] + assert pcss("#outer-div:first-child") == ["outer-div"] + assert pcss("#outer-div :first-child") == [ + "name-anchor", + "first-li", + "li-div", + "p-b", + "checkbox-fieldset-disabled", + "area-href", + ] + assert pcss("a[href]") == ["tag-anchor", "nofollow-anchor"] + assert pcss(":not(*)") == [] + assert pcss("a:not([href])") == ["name-anchor"] + assert pcss("ol :Not(li[class])") == [ + "first-li", + "second-li", + "li-div", + "fifth-li", + "sixth-li", + "seventh-li", + ] + assert pcss("link:has(*)") == [] + assert pcss("ol:has(div)") == ["first-ol"] + assert pcss(":is(#first-li, #second-li)") == ["first-li", "second-li"] + assert pcss("a:is(#name-anchor, #tag-anchor)") == ["name-anchor", "tag-anchor"] + assert pcss(":is(.c)") == ["first-ol", "third-li", "fourth-li"] + assert pcss("ol.a.b.c > li.c:nth-child(3)") == ["third-li"] + + # Invalid characters in XPath element names, should not crash + assert pcss(r"di\a0 v", r"div\[") == [] + assert pcss(r"[h\a0 ref]", r"[h\]ref]") == [] + + # HTML-specific + assert pcss(":link", html_only=True) == [ + "link-href", + "tag-anchor", + "nofollow-anchor", + "area-href", + ] + assert pcss(":visited", html_only=True) == [] + assert pcss(":enabled", html_only=True) == [ + "link-href", + "tag-anchor", + "nofollow-anchor", + "checkbox-unchecked", + "text-checked", + "checkbox-checked", + "area-href", + ] + assert pcss(":disabled", html_only=True) == [ + "checkbox-disabled", + "checkbox-disabled-checked", + "fieldset", + "checkbox-fieldset-disabled", + ] + assert pcss(":checked", html_only=True) == [ + "checkbox-checked", + "checkbox-disabled-checked", + ] + + def test_select_shakespeare(self) -> None: + document = html.document_fromstring(HTML_SHAKESPEARE) + body = typing.cast("list[etree._Element]", document.xpath("//body"))[0] + css_to_xpath = GenericTranslator().css_to_xpath + + basestring_ = (str, bytes) + + def count(selector: str) -> int: + xpath = css_to_xpath(selector) + results 
= typing.cast("list[etree._Element]", body.xpath(xpath)) + assert not isinstance(results, basestring_) + found = set() + for item in results: + assert item not in found + found.add(item) + assert not isinstance(item, basestring_) + return len(results) + + # Data borrowed from http://mootools.net/slickspeed/ + + ## Changed from original; probably because I'm only + ## searching the body. + # assert count('*') == 252 + assert count("*") == 246 + assert count("div:contains(CELIA)") == 26 + assert count("div:only-child") == 22 # ? + assert count("div:nth-child(even)") == 106 + assert count("div:nth-child(2n)") == 106 + assert count("div:nth-child(odd)") == 137 + assert count("div:nth-child(2n+1)") == 137 + assert count("div:nth-child(n)") == 243 + assert count("div:last-child") == 53 + assert count("div:first-child") == 51 + assert count("div > div") == 242 + assert count("div + div") == 190 + assert count("div ~ div") == 190 + assert count("body") == 1 + assert count("body div") == 243 + assert count("div") == 243 + assert count("div div") == 242 + assert count("div div div") == 241 + assert count("div, div, div") == 243 + assert count("div, a, span") == 243 + assert count(".dialog") == 51 + assert count("div.dialog") == 51 + assert count("div .dialog") == 51 + assert count("div.character, div.dialog") == 99 + assert count("div.direction.dialog") == 0 + assert count("div.dialog.direction") == 0 + assert count("div.dialog.scene") == 1 + assert count("div.scene.scene") == 1 + assert count("div.scene .scene") == 0 + assert count("div.direction .dialog ") == 0 + assert count("div .dialog .direction") == 4 + assert count("div.dialog .dialog .direction") == 4 + assert count("#speech5") == 1 + assert count("div#speech5") == 1 + assert count("div #speech5") == 1 + assert count("div.scene div.dialog") == 49 + assert count("div#scene1 div.dialog div") == 142 + assert count("#scene1 #speech1") == 1 + assert count("div[class]") == 103 + assert count("div[class=dialog]") == 50 + 
assert count("div[class^=dia]") == 51 + assert count("div[class$=log]") == 50 + assert count("div[class*=sce]") == 1 + assert count("div[class|=dialog]") == 50 # ? Seems right + assert count("div[class!=madeup]") == 243 # ? Seems right + assert count("div[class~=dialog]") == 51 # ? Seems right + assert count(":scope > div") == 1 + assert count(":scope > div > div[class=dialog]") == 1 + assert count(":scope > div div") == 242 + + +OPERATOR_PRECEDENCE_IDS = """ + + + + + +""" + +XMLLANG_IDS = """ + + a + b + c + d + e + f + + + + +""" + +HTML_IDS = """ + + + + +
+ + link + + link +
+
content
+
+
+
+
+
+
+
+
+

+
+
+ hi there + guy + + + + + + + +
+ + +
+
+
+
+ + + + +
+
+ +""" + + +HTML_SHAKESPEARE = """ + + + + + + +
+
+
As You Like It
+
+ by William Shakespeare +
+
+
ACT I, SCENE III. A room in the palace.
+
+
Enter CELIA and ROSALIND
+
+
CELIA
+
+
Why, cousin! why, Rosalind! Cupid have mercy! not a word?
+
+
ROSALIND
+
+
Not one to throw at a dog.
+
+
CELIA
+
+
No, thy words are too precious to be cast away upon
+
curs; throw some of them at me; come, lame me with reasons.
+
+
ROSALIND
+
CELIA
+
+
But is all this for your father?
+
+
+
Then there were two cousins laid up; when the one
+
should be lamed with reasons and the other mad
+
without any.
+
+
ROSALIND
+
+
No, some of it is for my child's father. O, how
+
full of briers is this working-day world!
+
+
CELIA
+
+
They are but burs, cousin, thrown upon thee in
+
holiday foolery: if we walk not in the trodden
+
paths our very petticoats will catch them.
+
+
ROSALIND
+
+
I could shake them off my coat: these burs are in my heart.
+
+
CELIA
+
+
Hem them away.
+
+
ROSALIND
+
+
I would try, if I could cry 'hem' and have him.
+
+
CELIA
+
+
Come, come, wrestle with thy affections.
+
+
ROSALIND
+
+
O, they take the part of a better wrestler than myself!
+
+
CELIA
+
+
O, a good wish upon you! you will try in time, in
+
despite of a fall. But, turning these jests out of
+
service, let us talk in good earnest: is it
+
possible, on such a sudden, you should fall into so
+
strong a liking with old Sir Rowland's youngest son?
+
+
ROSALIND
+
+
The duke my father loved his father dearly.
+
+
CELIA
+
+
Doth it therefore ensue that you should love his son
+
dearly? By this kind of chase, I should hate him,
+
for my father hated his father dearly; yet I hate
+
not Orlando.
+
+
ROSALIND
+
+
No, faith, hate him not, for my sake.
+
+
CELIA
+
+
Why should I not? doth he not deserve well?
+
+
ROSALIND
+
+
Let me love him for that, and do you love him
+
because I do. Look, here comes the duke.
+
+
CELIA
+
+
With his eyes full of anger.
+
Enter DUKE FREDERICK, with Lords
+
+
DUKE FREDERICK
+
+
Mistress, dispatch you with your safest haste
+
And get you from our court.
+
+
ROSALIND
+
+
Me, uncle?
+
+
DUKE FREDERICK
+
+
You, cousin
+
Within these ten days if that thou be'st found
+
So near our public court as twenty miles,
+
Thou diest for it.
+
+
ROSALIND
+
+
I do beseech your grace,
+
Let me the knowledge of my fault bear with me:
+
If with myself I hold intelligence
+
Or have acquaintance with mine own desires,
+
If that I do not dream or be not frantic,--
+
As I do trust I am not--then, dear uncle,
+
Never so much as in a thought unborn
+
Did I offend your highness.
+
+
DUKE FREDERICK
+
+
Thus do all traitors:
+
If their purgation did consist in words,
+
They are as innocent as grace itself:
+
Let it suffice thee that I trust thee not.
+
+
ROSALIND
+
+
Yet your mistrust cannot make me a traitor:
+
Tell me whereon the likelihood depends.
+
+
DUKE FREDERICK
+
+
Thou art thy father's daughter; there's enough.
+
+
ROSALIND
+
+
So was I when your highness took his dukedom;
+
So was I when your highness banish'd him:
+
Treason is not inherited, my lord;
+
Or, if we did derive it from our friends,
+
What's that to me? my father was no traitor:
+
Then, good my liege, mistake me not so much
+
To think my poverty is treacherous.
+
+
CELIA
+
+
Dear sovereign, hear me speak.
+
+
DUKE FREDERICK
+
+
Ay, Celia; we stay'd her for your sake,
+
Else had she with her father ranged along.
+
+
CELIA
+
+
I did not then entreat to have her stay;
+
It was your pleasure and your own remorse:
+
I was too young that time to value her;
+
But now I know her: if she be a traitor,
+
Why so am I; we still have slept together,
+
Rose at an instant, learn'd, play'd, eat together,
+
And wheresoever we went, like Juno's swans,
+
Still we went coupled and inseparable.
+
+
DUKE FREDERICK
+
+
She is too subtle for thee; and her smoothness,
+
Her very silence and her patience
+
Speak to the people, and they pity her.
+
Thou art a fool: she robs thee of thy name;
+
And thou wilt show more bright and seem more virtuous
+
When she is gone. Then open not thy lips:
+
Firm and irrevocable is my doom
+
Which I have pass'd upon her; she is banish'd.
+
+
CELIA
+
+
Pronounce that sentence then on me, my liege:
+
I cannot live out of her company.
+
+
DUKE FREDERICK
+
+
You are a fool. You, niece, provide yourself:
+
If you outstay the time, upon mine honour,
+
And in the greatness of my word, you die.
+
Exeunt DUKE FREDERICK and Lords
+
+
CELIA
+
+
O my poor Rosalind, whither wilt thou go?
+
Wilt thou change fathers? I will give thee mine.
+
I charge thee, be not thou more grieved than I am.
+
+
ROSALIND
+
+
I have more cause.
+
+
CELIA
+
+
Thou hast not, cousin;
+
Prithee be cheerful: know'st thou not, the duke
+
Hath banish'd me, his daughter?
+
+
ROSALIND
+
+
That he hath not.
+
+
CELIA
+
+
No, hath not? Rosalind lacks then the love
+
Which teacheth thee that thou and I am one:
+
Shall we be sunder'd? shall we part, sweet girl?
+
No: let my father seek another heir.
+
Therefore devise with me how we may fly,
+
Whither to go and what to bear with us;
+
And do not seek to take your change upon you,
+
To bear your griefs yourself and leave me out;
+
For, by this heaven, now at our sorrows pale,
+
Say what thou canst, I'll go along with thee.
+
+
ROSALIND
+
+
Why, whither shall we go?
+
+
CELIA
+
+
To seek my uncle in the forest of Arden.
+
+
ROSALIND
+
+
Alas, what danger will it be to us,
+
Maids as we are, to travel forth so far!
+
Beauty provoketh thieves sooner than gold.
+
+
CELIA
+
+
I'll put myself in poor and mean attire
+
And with a kind of umber smirch my face;
+
The like do you: so shall we pass along
+
And never stir assailants.
+
+
ROSALIND
+
+
Were it not better,
+
Because that I am more than common tall,
+
That I did suit me all points like a man?
+
A gallant curtle-axe upon my thigh,
+
A boar-spear in my hand; and--in my heart
+
Lie there what hidden woman's fear there will--
+
We'll have a swashing and a martial outside,
+
As many other mannish cowards have
+
That do outface it with their semblances.
+
+
CELIA
+
+
What shall I call thee when thou art a man?
+
+
ROSALIND
+
+
I'll have no worse a name than Jove's own page;
+
And therefore look you call me Ganymede.
+
But what will you be call'd?
+
+
CELIA
+
+
Something that hath a reference to my state
+
No longer Celia, but Aliena.
+
+
ROSALIND
+
+
But, cousin, what if we assay'd to steal
+
The clownish fool out of your father's court?
+
Would he not be a comfort to our travel?
+
+
CELIA
+
+
He'll go along o'er the wide world with me;
+
Leave me alone to woo him. Let's away,
+
And get our jewels and our wealth together,
+
Devise the fittest time and safest way
+
To hide us from pursuit that will be made
+
After my flight. Now go we in content
+
To liberty and not to banishment.
+
Exeunt
+
+
+
+
+ + +""" + + +if __name__ == "__main__": + unittest.main() diff --git a/tox.ini b/tox.ini index 9a552c2..9ff54cf 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,49 @@ [tox] -envlist = py24,py25,py26,py27,py31,py32 +envlist = pre-commit,pylint,py,docs,typing [testenv] -deps=lxml -commands = python cssselect/tests.py +deps = + lxml>=4.4 + pytest-cov>=7.0.0 + pytest>=5.4 + sybil +commands = + pytest --cov=cssselect \ + --cov-report=term-missing --cov-report=html --cov-report=xml \ + {posargs: cssselect tests docs} + +[testenv:pylint] +deps = + {[testenv]deps} + pylint==4.0.4 +commands = + pylint {posargs: cssselect tests docs} + +[testenv:docs] +changedir = docs +deps = + -r docs/requirements.txt +commands = + sphinx-build -W -b html . {envtmpdir}/html + +[testenv:typing] +deps = + {[testenv]deps} + mypy==1.19.1 + types-lxml==2026.1.1 +commands = + mypy {posargs: cssselect tests} + +[testenv:pre-commit] +deps = pre-commit +commands = pre-commit run --all-files --show-diff-on-failure +skip_install = true + +[testenv:twinecheck] +basepython = python3 +deps = + twine==6.2.0 + build==1.4.0 +commands = + python -m build --sdist + twine check dist/*