diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..38558bf --- /dev/null +++ b/.editorconfig @@ -0,0 +1,11 @@ +root = true + +[*] +charset = utf-8 +indent_style = space +indent_size = 4 +insert_final_newline = true +end_of_line = lf + +[*.{yml,yaml}] +indent_size = 2 diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000..bb4f6e1 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# applying pre-commit hooks to the project +e91101b37f82558db84a6b8ee9a6dba1fd2ae0bb diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml new file mode 100644 index 0000000..41ff7e1 --- /dev/null +++ b/.github/workflows/checks.yml @@ -0,0 +1,43 @@ +name: Checks +on: [push, pull_request] + +jobs: + checks: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - python-version: 3.14 + env: + TOXENV: pylint + - python-version: 3.14 # Keep in sync with .readthedocs.yml + env: + TOXENV: docs + - python-version: 3.14 + env: + TOXENV: typing + - python-version: 3.14 + env: + TOXENV: twinecheck + + steps: + - uses: actions/checkout@v6 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Run check + env: ${{ matrix.env }} + run: | + pip install -U pip + pip install -U tox + tox + + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: pre-commit/action@v3.0.1 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..526c458 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,32 @@ +name: Publish +on: + push: + tags: + - 'v[0-9]+.[0-9]+.[0-9]+' + +jobs: + publish: + runs-on: ubuntu-latest + + environment: + name: pypi + url: https://pypi.org/p/cssselect + + permissions: + id-token: write + + steps: + - uses: actions/checkout@v6 + + - name: Set up Python + uses: actions/setup-python@v6 + 
with: + python-version: 3.14 + + - name: Build + run: | + python -m pip install --upgrade build + python -m build + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/tests-macos.yml b/.github/workflows/tests-macos.yml new file mode 100644 index 0000000..4947937 --- /dev/null +++ b/.github/workflows/tests-macos.yml @@ -0,0 +1,27 @@ +name: macOS +on: [push, pull_request] + +jobs: + tests: + runs-on: macos-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] + + steps: + - uses: actions/checkout@v6 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Run tests + run: | + pip install -U pip + pip install -U tox + tox -e py + + - name: Upload coverage report + uses: codecov/codecov-action@v5 diff --git a/.github/workflows/tests-ubuntu.yml b/.github/workflows/tests-ubuntu.yml new file mode 100644 index 0000000..1ef905b --- /dev/null +++ b/.github/workflows/tests-ubuntu.yml @@ -0,0 +1,33 @@ +name: Ubuntu +on: [push, pull_request] + +jobs: + tests: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14", "pypy3.11"] + + steps: + - uses: actions/checkout@v6 + + - name: Install system libraries + if: contains(matrix.python-version, 'pypy') + run: | + sudo apt-get update + sudo apt-get install libxml2-dev libxslt-dev + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Run tests + run: | + pip install -U pip + pip install -U tox + tox -e py + + - name: Upload coverage report + uses: codecov/codecov-action@v5 diff --git a/.github/workflows/tests-windows.yml b/.github/workflows/tests-windows.yml new file mode 100644 index 0000000..24d7ee8 --- /dev/null +++ b/.github/workflows/tests-windows.yml @@ -0,0 +1,27 
@@ +name: Windows +on: [push, pull_request] + +jobs: + tests: + runs-on: windows-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] + + steps: + - uses: actions/checkout@v6 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Run tests + run: | + pip install -U pip + pip install -U tox + tox -e py + + - name: Upload coverage report + uses: codecov/codecov-action@v5 diff --git a/.gitignore b/.gitignore index 36120ab..c276bd1 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,7 @@ /MANIFEST /dist /docs/_build +/.coverage +.idea +htmlcov/ +coverage.xml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..81ca890 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,26 @@ +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.14.4 + hooks: + - id: ruff-check + args: [ --fix ] + - id: ruff-format +- repo: https://github.com/adamchainz/blacken-docs + rev: 1.20.0 + hooks: + - id: blacken-docs + additional_dependencies: + - black==26.1.0 +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: end-of-file-fixer + - id: trailing-whitespace +- repo: https://github.com/sphinx-contrib/sphinx-lint + rev: v1.0.0 + hooks: + - id: sphinx-lint +- repo: https://github.com/rhysd/actionlint + rev: v1.7.10 + hooks: + - id: actionlint diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..b91642a --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,15 @@ +version: 2 +formats: all +sphinx: + configuration: docs/conf.py + fail_on_warning: true +build: + os: ubuntu-24.04 + tools: + # For available versions, see: + # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python + python: "3.14" # Keep in sync with .github/workflows/checks.yml +python: + install: + - requirements: docs/requirements.txt + 
- path: . diff --git a/AUTHORS b/AUTHORS index 8c69e8f..66dcc22 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,4 +1,13 @@ +Daniel Graña Ian Bicking +James Salter Laurence Rowe +Mikhail Korobov +Nik Nyby +Paul Tremberth +Simon Potter Simon Sapin Stefan Behnel +Thomas Grainger +Varialus +Arthur Darcet diff --git a/CHANGES b/CHANGES index 4583cef..5ca2959 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,251 @@ Changelog ========= +Version 1.4.0 +------------- + +Released on 2026-01-29. + +* Dropped support for Python 3.9 and PyPy 3.10. + +* Added support for Python 3.14 and PyPy 3.11. + +* Switched the build system to ``hatchling``. + +* CI fixes and improvements. + +Version 1.3.0 +------------- + +Released on 2025-03-10. + +* Dropped support for Python 3.7-3.8, added support for Python 3.12-3.13 and + PyPy 3.10. + +* Removed ``_unicode_safe_getattr()``, deprecated in 1.2.0. + +* Added ``pre-commit`` and formatted the code with ``ruff``. + +* Many CI additions and improvements. + + +Version 1.2.0 +------------- + +Released on 2022-10-27. + +* Drop support for Python 2.7, 3.4-3.6, add support for Python 3.7-3.11. + +* Add type annotations (PEP 484 and PEP 561). + +* More features from the CSS Selectors Level 4: + + * The ``:is()`` pseudo-class. + + * The ``:where()`` pseudo-class. + + * The ``:has()`` pseudo-class, with some limitations. + +* Fix parsing ``:scope`` after a comma. + +* Add parentheses to fix condition precedence in some cases. + +* Private API changes related to the removal of the Python 2 support: + + * Remove ``_unicode`` and ``_unichr`` aliases from ``cssselect.parser``. + + * Remove ``_basestring`` and ``_unicode`` aliases from ``cssselect.xpath``. + + * Deprecate ``cssselect.xpath._unicode_safe_getattr()`` and change it to just + call ``getattr()``. + +* Include tests in the PyPI tarball. + +* Many CI additions and improvements. + +* Improve the test coverage. + + +Version 1.1.0 +------------- + +Released on 2019-08-09. 
+ +* Support for the ``:scope`` selector, which allows accessing immediate + children of a selector. + +* Support for the ``|E`` syntax for type selectors without a namespace. + +* A new selector method, ``canonical``, returns the CSS expression of the + selector, as a string. + + +Version 1.0.3 +------------- + +Released on 2017-12-27. + +* Fix artifact uploads to pypi + + +Version 1.0.2 +------------- + +Released on 2017-12-26. + +* Drop support for Python 2.6 and Python 3.3. +* Fix deprecation warning in Python 3.6. +* Minor cleanups. + + +Version 1.0.1 +------------- + +Released on 2017-01-10. + +* Add support for Python 3.6. +* Documentation hosted `on Read the Docs `_ + + +Version 1.0.0 +------------- + +Released on 2016-10-21. + +* Add code coverage reports. +* Fix ``:nth-*(an+b)`` pseudo-classes selectors. + (except ``*:nth-child()`` which looks untranslatable to XPath 1.0.) + + +Version 0.9.2 +------------- + +Released on 2016-06-15. + +* Distribute as universal wheel. +* Add support for Python 3.3, 3.4 and 3.5. +* Drop support for Python 2.5 as testing is getting difficult. +* Improve tests on pseudo-elements. + + +Version 0.9.1 +------------- + +Released on 2013-10-17. + +* **Backward incompatible change from 0.9**: + :meth:`~GenericTranslator.selector_to_xpath` defaults to + ignoring pseudo-elements, + as it did in 0.8 and previous versions. + (:meth:`~GenericTranslator.css_to_xpath` doesn’t change.) +* Drop official support for Python 2.4 and 3.1, + as testing was becoming difficult. + Nothing will break overnight, + but future releases may or may not work on these versions. + Older releases will remain available on PyPI. + + +Version 0.9 +----------- + +Released on 2013-10-11. + +Add parser support for :attr:`functional +pseudo-elements `. + +*Update:* +This version accidentally introduced a **backward incompatible** change: +:meth:`~GenericTranslator.selector_to_xpath` defaults to +rejecting pseudo-elements instead of ignoring them. 
+ + +Version 0.8 +----------- + +Released on 2013-03-15. + +Improvements: + +* `#22 `_ + Let extended translators override what XPathExpr class is used +* `#19 `_ + Use the built-in ``lang()`` XPath function + for implementing the ``:lang()`` pseudo-class + with XML documents. + This is probably faster than ``ancestor-or-self::``. + +Bug fixes: + +* `#14 `_ + Fix non-ASCII pseudo-classes. (Invalid selector instead of crash.) +* `#20 `_ + As per the spec, elements containing only whitespace are not considered empty + for the ``:empty`` pseudo-class. + + +Version 0.7.1 +------------- + +Released on 2012-06-14. Code name *remember-to-test-with-tox*. + +0.7 broke the parser in Python 2.4 and 2.5; the tests in 2.x. +Now all is well again. + +Also, pseudo-elements are now correctly made lower-case. (They are supposed +to be case-insensitive.) + + +Version 0.7 +----------- + +Released on 2012-06-14. + +Bug fix release: see #2, #7 and #10 on GitHub. + +* The tokenizer and parser have been rewritten to be much closer to the + specified grammar. In particular, non-ASCII characters and backslash-escapes + are now handled correctly. +* Special characters are protected in the output so that generated XPath + expressions should always be valid +* The ``~=``, ``^=`` and ``*=`` attribute operators now correctly never match + when used with an empty string. + + +Version 0.6.1 +------------- + +Released on 2012-04-25. + +Make sure that internal token objects do not "leak" into the public API and +:attr:`Selector.pseudo_element` is a unicode string. + + +Version 0.6 +----------- + +Released on 2012-04-24. + +* In ``setup.py`` use setuptools/distribute if available, but fall back + on distutils. +* Implement the ``:lang()`` pseudo-class, although it is only based on + ``xml:lang`` or ``lang`` attributes. 
If the document language is known from + some other meta-data (like a ``Content-Language`` HTTP header or ```` + element), a workaround is to set a lang attribute on the root element. + + +Version 0.5 +----------- + +Released on 2012-04-20. + +* Fix case sensitivity issues. +* Implement :class:`HTMLTranslator` based on the `HTML5 specification`_ + rather than guessing; add the ``xhtml`` parameter. +* Several bug fixes and better test coverage. + +.. _HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors + + Version 0.4 ----------- @@ -19,14 +264,14 @@ Version 0.3 Released on 2012-04-17. * Fix many parsing bugs. -* Rename the :class:`Translator` class to :class:`GenericTranslator` +* Rename the ``Translator`` class to :class:`GenericTranslator` * There, implement ``:target``, ``:hover``, ``:focus``, ``:active`` ``:checked``, ``:enabled``, ``:disabled``, ``:link`` and ``:visited`` as never matching. * Make a new HTML-specific ``HTMLTranslator`` subclass. There, implement ``:checked``, ``:enabled``, ``:disabled``, ``:link`` and ``:visited`` as appropriate for HTML, with all links "not visited". -* Remove the :func:`css_to_xpath` function. The translator classes +* Remove the ``css_to_xpath`` function. The translator classes are the new API. * Add support for ``:contains()`` back, but case-sensitive. lxml will override it to be case-insensitive for backward-compatibility. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index c8f5dc3..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include AUTHORS CHANGES LICENSE README.rst tox.ini -recursive-include docs * -prune docs/_build diff --git a/README.rst b/README.rst index fa53a5b..c055295 100644 --- a/README.rst +++ b/README.rst @@ -1,25 +1,40 @@ + =================================== cssselect: CSS Selectors for Python =================================== -*cssselect* parses `CSS3 Selectors`_ and translate them to `XPath 1.0`_ -expressions. 
Such expressions can be used in lxml_ or another XPath engine -to find the matching elements in an XML or HTML document. +.. image:: https://img.shields.io/pypi/v/cssselect.svg + :target: https://pypi.python.org/pypi/cssselect + :alt: PyPI Version + +.. image:: https://img.shields.io/pypi/pyversions/cssselect.svg + :target: https://pypi.python.org/pypi/cssselect + :alt: Supported Python Versions + +.. image:: https://github.com/scrapy/cssselect/actions/workflows/tests-ubuntu.yml/badge.svg + :target: https://github.com/scrapy/cssselect/actions/workflows/tests-ubuntu.yml + :alt: Tests -This module used to live inside of lxml as ``lxml.cssselect`` before it was -extracted as a stand-alone project. +.. image:: https://img.shields.io/codecov/c/github/scrapy/cssselect/master.svg + :target: https://codecov.io/github/scrapy/cssselect?branch=master + :alt: Coverage report -.. _CSS3 Selectors: http://www.w3.org/TR/2011/REC-css3-selectors-20110929/ -.. _XPath 1.0: http://www.w3.org/TR/xpath/ -.. _lxml: http://lxml.de/ +**cssselect** is a BSD-licensed Python library to parse `CSS3 selectors`_ and +translate them to `XPath 1.0`_ expressions. +`XPath 1.0`_ expressions can be used in lxml_ or another XPath engine to find +the matching elements in an XML or HTML document. + +Find the cssselect online documentation at https://cssselect.readthedocs.io. Quick facts: -* Free software: BSD licensed -* Compatible with Python 2.4+ and 3.x -* Latest documentation `on python.org `_ -* Source, issues and pull requests `on Github - `_ -* Releases `on PyPI `_ +* Source, issues and pull requests `on GitHub + `_ +* Releases `on PyPI `_ * Install with ``pip install cssselect`` + + +.. _CSS3 selectors: https://www.w3.org/TR/selectors-3/ +.. _XPath 1.0: https://www.w3.org/TR/xpath/all/ +.. 
_lxml: https://lxml.de/ diff --git a/cssselect/__init__.py b/cssselect/__init__.py index 3129a42..59d62df 100644 --- a/cssselect/__init__.py +++ b/cssselect/__init__.py @@ -1,21 +1,36 @@ """ - CSS Selectors based on XPath - ============================ +CSS Selectors based on XPath +============================ - This module supports selecting XML/HTML elements based on CSS selectors. - See the `CSSSelector` class for details. +This module supports selecting XML/HTML elements based on CSS selectors. +See the `CSSSelector` class for details. - :copyright: (c) 2007-2012 Ian Bicking and contributors. - See AUTHORS for more details. - :license: BSD, see LICENSE for more details. +:copyright: (c) 2007-2012 Ian Bicking and contributors. +See AUTHORS for more details. +:license: BSD, see LICENSE for more details. """ -from cssselect.parser import (parse, Selector, SelectorError, - SelectorSyntaxError) -from cssselect.xpath import GenericTranslator, HTMLTranslator, ExpressionError +from cssselect.parser import ( + FunctionalPseudoElement, + Selector, + SelectorError, + SelectorSyntaxError, + parse, +) +from cssselect.xpath import ExpressionError, GenericTranslator, HTMLTranslator +__all__ = ( + "ExpressionError", + "FunctionalPseudoElement", + "GenericTranslator", + "HTMLTranslator", + "Selector", + "SelectorError", + "SelectorSyntaxError", + "parse", +) -VERSION = '0.4' +VERSION = "1.4.0" __version__ = VERSION diff --git a/cssselect/parser.py b/cssselect/parser.py index f6b42c8..f969769 100644 --- a/cssselect/parser.py +++ b/cssselect/parser.py @@ -1,26 +1,33 @@ """ - cssselect.parser - ================ +cssselect.parser +================ - Tokenizer, parser and parsed objects for CSS selectors. +Tokenizer, parser and parsed objects for CSS selectors. - :copyright: (c) 2007-2012 Ian Bicking and contributors. - See AUTHORS for more details. - :license: BSD, see LICENSE for more details. +:copyright: (c) 2007-2012 Ian Bicking and contributors. +See AUTHORS for more details. 
+:license: BSD, see LICENSE for more details. """ +from __future__ import annotations + +import operator import re +import sys +from typing import TYPE_CHECKING, Literal, Protocol, TypeAlias, Union, cast, overload + +if TYPE_CHECKING: + from collections.abc import Iterable, Iterator, Sequence + + # typing.Self requires Python 3.11 + from typing_extensions import Self -try: - _unicode = unicode - _unichr = unichr -except NameError: - # Python 3 - _unicode = str - _unichr = chr +def ascii_lower(string: str) -> str: + """Lower-case, but only in the ASCII range.""" + return string.encode("utf8").lower().decode("utf8") class SelectorError(Exception): @@ -32,205 +39,444 @@ class SelectorError(Exception): """ + class SelectorSyntaxError(SelectorError, SyntaxError): """Parsing a selector that does not match the grammar.""" #### Parsed objects -class Selector(object): +Tree: TypeAlias = Union[ + "Element", + "Hash", + "Class", + "Function", + "Pseudo", + "Attrib", + "Negation", + "Relation", + "Matching", + "SpecificityAdjustment", + "CombinedSelector", +] +PseudoElement: TypeAlias = Union["FunctionalPseudoElement", str] + + +class Selector: """ - Represents a selector with an optional pseudo element. + Represents a parsed selector. + + :meth:`~GenericTranslator.selector_to_xpath` accepts this object, + but ignores :attr:`pseudo_element`. It is the user’s responsibility + to account for pseudo-elements and reject selectors with unknown + or unsupported pseudo-elements. + """ - def __init__(self, tree, pseudo_element=None): - self._tree = tree - #: If the selector has a pseudo-element: a string like ``'after'``. - #: Otherwise, ``None``. - #: Any identifier preceded by ``::`` is accepted as a pseudo-element. - #: It is the user’s responsibility to reject selectors with - #: unknown or unsupported pseudo-elements. 
+ + def __init__(self, tree: Tree, pseudo_element: PseudoElement | None = None) -> None: + self.parsed_tree = tree + if pseudo_element is not None and not isinstance( + pseudo_element, FunctionalPseudoElement + ): + pseudo_element = ascii_lower(pseudo_element) + #: A :class:`FunctionalPseudoElement`, + #: or the identifier for the pseudo-element as a string, + #: or ``None``. + #: + #: +-------------------------+----------------+--------------------------------+ + #: | | Selector | Pseudo-element | + #: +=========================+================+================================+ + #: | CSS3 syntax | ``a::before`` | ``'before'`` | + #: +-------------------------+----------------+--------------------------------+ + #: | Older syntax | ``a:before`` | ``'before'`` | + #: +-------------------------+----------------+--------------------------------+ + #: | From the Lists3_ draft, | ``li::marker`` | ``'marker'`` | + #: | not in Selectors3 | | | + #: +-------------------------+----------------+--------------------------------+ + #: | Invalid pseudo-class | ``li:marker`` | ``None`` | + #: +-------------------------+----------------+--------------------------------+ + #: | Functional | ``a::foo(2)`` | ``FunctionalPseudoElement(…)`` | + #: +-------------------------+----------------+--------------------------------+ + #: + #: .. 
_Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement self.pseudo_element = pseudo_element - def __repr__(self): - if self.pseudo_element: - pseudo_element = '::%s' % self.pseudo_element + def __repr__(self) -> str: + if isinstance(self.pseudo_element, FunctionalPseudoElement): + pseudo_element = repr(self.pseudo_element) + elif self.pseudo_element: + pseudo_element = f"::{self.pseudo_element}" + else: + pseudo_element = "" + return f"{self.__class__.__name__}[{self.parsed_tree!r}{pseudo_element}]" + + def canonical(self) -> str: + """Return a CSS representation for this selector (a string)""" + if isinstance(self.pseudo_element, FunctionalPseudoElement): + pseudo_element = f"::{self.pseudo_element.canonical()}" + elif self.pseudo_element: + pseudo_element = f"::{self.pseudo_element}" else: - pseudo_element = '' - return '%s[%r%s]' % ( - self.__class__.__name__, self._tree, pseudo_element) + pseudo_element = "" + res = f"{self.parsed_tree.canonical()}{pseudo_element}" + if len(res) > 1: + res = res.lstrip("*") + return res - def specificity(self): + def specificity(self) -> tuple[int, int, int]: """Return the specificity_ of this selector as a tuple of 3 integers. .. 
_specificity: http://www.w3.org/TR/selectors/#specificity """ - a, b, c = self._tree.specificity() + a, b, c = self.parsed_tree.specificity() if self.pseudo_element: c += 1 return a, b, c -class Class(object): +class Class: """ Represents selector.class_name """ - def __init__(self, selector, class_name): + + def __init__(self, selector: Tree, class_name: str) -> None: self.selector = selector self.class_name = class_name - def __repr__(self): - return '%s[%r.%s]' % ( - self.__class__.__name__, self.selector, self.class_name) + def __repr__(self) -> str: + return f"{self.__class__.__name__}[{self.selector!r}.{self.class_name}]" + + def canonical(self) -> str: + return f"{self.selector.canonical()}.{self.class_name}" - def specificity(self): + def specificity(self) -> tuple[int, int, int]: a, b, c = self.selector.specificity() b += 1 return a, b, c -class Function(object): +class FunctionalPseudoElement: + """ + Represents selector::name(arguments) + + .. attribute:: name + + The name (identifier) of the pseudo-element, as a string. + + .. attribute:: arguments + + The arguments of the pseudo-element, as a list of tokens. + + **Note:** tokens are not part of the public API, + and may change between cssselect versions. + Use at your own risk. 
+ + """ + + def __init__(self, name: str, arguments: Sequence[Token]): + self.name = ascii_lower(name) + self.arguments = arguments + + def __repr__(self) -> str: + token_values = [token.value for token in self.arguments] + return f"{self.__class__.__name__}[::{self.name}({token_values!r})]" + + def argument_types(self) -> list[str]: + return [token.type for token in self.arguments] + + def canonical(self) -> str: + args = "".join(token.css() for token in self.arguments) + return f"{self.name}({args})" + + +class Function: """ Represents selector:name(expr) """ - def __init__(self, selector, name, arguments): + + def __init__(self, selector: Tree, name: str, arguments: Sequence[Token]) -> None: self.selector = selector - self.name = name + self.name = ascii_lower(name) self.arguments = arguments - def __repr__(self): - return '%s[%r:%s(%r)]' % ( - self.__class__.__name__, self.selector, self.name, self.arguments) + def __repr__(self) -> str: + token_values = [token.value for token in self.arguments] + return f"{self.__class__.__name__}[{self.selector!r}:{self.name}({token_values!r})]" - def specificity(self): + def argument_types(self) -> list[str]: + return [token.type for token in self.arguments] + + def canonical(self) -> str: + args = "".join(token.css() for token in self.arguments) + return f"{self.selector.canonical()}:{self.name}({args})" + + def specificity(self) -> tuple[int, int, int]: a, b, c = self.selector.specificity() b += 1 return a, b, c -class Pseudo(object): +class Pseudo: """ Represents selector:ident """ - def __init__(self, selector, ident): + + def __init__(self, selector: Tree, ident: str) -> None: self.selector = selector - self.ident = ident + self.ident = ascii_lower(ident) - def __repr__(self): - return '%s[%r:%s]' % ( - self.__class__.__name__, self.selector, self.ident) + def __repr__(self) -> str: + return f"{self.__class__.__name__}[{self.selector!r}:{self.ident}]" - def specificity(self): + def canonical(self) -> str: + return 
f"{self.selector.canonical()}:{self.ident}" + + def specificity(self) -> tuple[int, int, int]: a, b, c = self.selector.specificity() b += 1 return a, b, c -class Negation(object): +class Negation: """ Represents selector:not(subselector) """ - def __init__(self, selector, subselector): + + def __init__(self, selector: Tree, subselector: Tree) -> None: self.selector = selector self.subselector = subselector - def __repr__(self): - return '%s[%r:not(%r)]' % ( - self.__class__.__name__, self.selector, self.subselector) + def __repr__(self) -> str: + return f"{self.__class__.__name__}[{self.selector!r}:not({self.subselector!r})]" + + def canonical(self) -> str: + subsel = self.subselector.canonical() + if len(subsel) > 1: + subsel = subsel.lstrip("*") + return f"{self.selector.canonical()}:not({subsel})" - def specificity(self): + def specificity(self) -> tuple[int, int, int]: a1, b1, c1 = self.selector.specificity() - a2, b2, c2 = self.sub_selector.specificity() + a2, b2, c2 = self.subselector.specificity() return a1 + a2, b1 + b2, c1 + c2 -class Attrib(object): +class Relation: + """ + Represents selector:has(subselector) + """ + + def __init__(self, selector: Tree, combinator: Token, subselector: Selector): + self.selector = selector + self.combinator = combinator + self.subselector = subselector + + def __repr__(self) -> str: + return f"{self.__class__.__name__}[{self.selector!r}:has({self.subselector!r})]" + + def canonical(self) -> str: + try: + subsel = self.subselector[0].canonical() # type: ignore[index] + except TypeError: + subsel = self.subselector.canonical() + if len(subsel) > 1: + subsel = subsel.lstrip("*") + return f"{self.selector.canonical()}:has({subsel})" + + def specificity(self) -> tuple[int, int, int]: + a1, b1, c1 = self.selector.specificity() + try: + a2, b2, c2 = self.subselector[-1].specificity() # type: ignore[index] + except TypeError: + a2, b2, c2 = self.subselector.specificity() + return a1 + a2, b1 + b2, c1 + c2 + + +class Matching: + 
""" + Represents selector:is(selector_list) + """ + + def __init__(self, selector: Tree, selector_list: Iterable[Tree]): + self.selector = selector + self.selector_list = selector_list + + def __repr__(self) -> str: + args_str = ", ".join(repr(s) for s in self.selector_list) + return f"{self.__class__.__name__}[{self.selector!r}:is({args_str})]" + + def canonical(self) -> str: + selector_arguments = [] + for s in self.selector_list: + selarg = s.canonical() + selector_arguments.append(selarg.lstrip("*")) + args_str = ", ".join(str(s) for s in selector_arguments) + return f"{self.selector.canonical()}:is({args_str})" + + def specificity(self) -> tuple[int, int, int]: + return max(x.specificity() for x in self.selector_list) + + +class SpecificityAdjustment: + """ + Represents selector:where(selector_list) + Same as selector:is(selector_list), but its specificity is always 0 + """ + + def __init__(self, selector: Tree, selector_list: list[Tree]): + self.selector = selector + self.selector_list = selector_list + + def __repr__(self) -> str: + args_str = ", ".join(repr(s) for s in self.selector_list) + return f"{self.__class__.__name__}[{self.selector!r}:where({args_str})]" + + def canonical(self) -> str: + selector_arguments = [] + for s in self.selector_list: + selarg = s.canonical() + selector_arguments.append(selarg.lstrip("*")) + args_str = ", ".join(str(s) for s in selector_arguments) + return f"{self.selector.canonical()}:where({args_str})" + + def specificity(self) -> tuple[int, int, int]: + return 0, 0, 0 + + +class Attrib: """ Represents selector[namespace|attrib operator value] """ - def __init__(self, selector, namespace, attrib, operator, value): + + @overload + def __init__( + self, + selector: Tree, + namespace: str | None, + attrib: str, + operator: Literal["exists"], + value: None, + ) -> None: ... + + @overload + def __init__( + self, + selector: Tree, + namespace: str | None, + attrib: str, + operator: str, + value: Token, + ) -> None: ... 
+ + def __init__( + self, + selector: Tree, + namespace: str | None, + attrib: str, + operator: str, + value: Token | None, + ) -> None: self.selector = selector self.namespace = namespace self.attrib = attrib self.operator = operator self.value = value - def __repr__(self): - if self.namespace == '*': - attrib = self.attrib - else: - attrib = '%s|%s' % (self.namespace, self.attrib) - if self.operator == 'exists': - return '%s[%r[%s]]' % ( - self.__class__.__name__, self.selector, attrib) + def __repr__(self) -> str: + attrib = f"{self.namespace}|{self.attrib}" if self.namespace else self.attrib + if self.operator == "exists": + return f"{self.__class__.__name__}[{self.selector!r}[{attrib}]]" + assert self.value is not None + return f"{self.__class__.__name__}[{self.selector!r}[{attrib} {self.operator} {self.value.value!r}]]" + + def canonical(self) -> str: + attrib = f"{self.namespace}|{self.attrib}" if self.namespace else self.attrib + + if self.operator == "exists": + op = attrib else: - return '%s[%r[%s %s %r]]' % ( - self.__class__.__name__, self.selector, attrib, - self.operator, self.value) + assert self.value is not None + op = f"{attrib}{self.operator}{self.value.css()}" - def specificity(self): + return f"{self.selector.canonical()}[{op}]" + + def specificity(self) -> tuple[int, int, int]: a, b, c = self.selector.specificity() b += 1 return a, b, c -class Element(object): +class Element: """ Represents namespace|element + + `None` is for the universal selector '*' + """ - def __init__(self, namespace, element): + + def __init__( + self, namespace: str | None = None, element: str | None = None + ) -> None: self.namespace = namespace self.element = element - def __repr__(self): - if self.namespace == '*': - element = self.element - else: - element = '%s|%s' % (self.namespace, self.element) - return '%s[%s]' % ( - self.__class__.__name__, element) + def __repr__(self) -> str: + return f"{self.__class__.__name__}[{self.canonical()}]" - def specificity(self): 
- if self.element == '*': - return 0, 0, 0 - else: + def canonical(self) -> str: + element = self.element or "*" + if self.namespace: + element = f"{self.namespace}|{element}" + return element + + def specificity(self) -> tuple[int, int, int]: + if self.element: return 0, 0, 1 + return 0, 0, 0 -class Hash(object): +class Hash: """ Represents selector#id """ - def __init__(self, selector, id): + + def __init__(self, selector: Tree, id: str) -> None: # noqa: A002 self.selector = selector self.id = id - def __repr__(self): - return '%s[%r#%s]' % ( - self.__class__.__name__, self.selector, self.id) + def __repr__(self) -> str: + return f"{self.__class__.__name__}[{self.selector!r}#{self.id}]" + + def canonical(self) -> str: + return f"{self.selector.canonical()}#{self.id}" - def specificity(self): + def specificity(self) -> tuple[int, int, int]: a, b, c = self.selector.specificity() a += 1 return a, b, c -class CombinedSelector(object): - def __init__(self, selector, combinator, subselector): +class CombinedSelector: + def __init__(self, selector: Tree, combinator: str, subselector: Tree) -> None: assert selector is not None self.selector = selector self.combinator = combinator self.subselector = subselector - def __repr__(self): - if self.combinator == ' ': - comb = '' - else: - comb = self.combinator - return '%s[%r %s %r]' % ( - self.__class__.__name__, self.selector, comb, self.subselector) + def __repr__(self) -> str: + comb = "" if self.combinator == " " else self.combinator + return ( + f"{self.__class__.__name__}[{self.selector!r} {comb} {self.subselector!r}]" + ) + + def canonical(self) -> str: + subsel = self.subselector.canonical() + if len(subsel) > 1: + subsel = subsel.lstrip("*") + return f"{self.selector.canonical()} {self.combinator} {subsel}" - def specificity(self): + def specificity(self) -> tuple[int, int, int]: a1, b1, c1 = self.selector.specificity() a2, b2, c2 = self.subselector.specificity() return a1 + a2, b1 + b2, c1 + c2 @@ -238,19 +484,26 @@ 
def specificity(self): #### Parser -_el_re = re.compile(r'^\s*(\w+)$') -_id_re = re.compile(r'^\s*(\w*)#(\w+)\s*$') -_class_re = re.compile(r'^\s*(\w*)\.(\w+)\s*$') +# foo +_el_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$") + +# foo#bar or #bar +_id_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$") +# foo.bar or .bar +_class_re = re.compile( + r"^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$" +) -def parse(css): + +def parse(css: str) -> list[Selector]: """Parse a CSS *group of selectors*. If you don't care about pseudo-elements or selector specificity, you can skip this and use :meth:`~GenericTranslator.css_to_xpath`. :param css: - A *group of selectors* as an Unicode string. + A *group of selectors* as a string. :raises: :class:`SelectorSyntaxError` on invalid selectors. :returns: @@ -261,415 +514,533 @@ def parse(css): # Fast path for simple cases match = _el_re.match(css) if match: - return [Selector(Element('*', match.group(1)))] + return [Selector(Element(element=match.group(1)))] match = _id_re.match(css) if match is not None: - return [Selector(Hash(Element( - '*', match.group(1) or '*'), match.group(2)))] + return [Selector(Hash(Element(element=match.group(1) or None), match.group(2)))] match = _class_re.match(css) if match is not None: - return [Selector(Class(Element( - '*', match.group(1) or '*'), match.group(2)))] + return [ + Selector(Class(Element(element=match.group(1) or None), match.group(2))) + ] stream = TokenStream(tokenize(css)) stream.source = css - try: - return list(parse_selector_group(stream)) - except SelectorSyntaxError: - import sys - e = sys.exc_info()[1] - message = "%s at %s -> %r" % ( - e, stream.used, stream.peek()) - e.msg = message - if sys.version_info < (2,6): - e.message = message - e.args = tuple([message]) - raise - - -def parse_selector_group(stream): + return list(parse_selector_group(stream)) + + +# except SelectorSyntaxError: +# e = sys.exc_info()[1] +# message = "%s 
at %s -> %r" % ( +# e, stream.used, stream.peek()) +# e.msg = message +# e.args = tuple([message]) +# raise + + +def parse_selector_group(stream: TokenStream) -> Iterator[Selector]: stream.skip_whitespace() while 1: yield Selector(*parse_selector(stream)) - if stream.peek() == ',': + if stream.peek() == ("DELIM", ","): stream.next() stream.skip_whitespace() else: break -def parse_selector(stream): + +def parse_selector(stream: TokenStream) -> tuple[Tree, PseudoElement | None]: result, pseudo_element = parse_simple_selector(stream) while 1: stream.skip_whitespace() peek = stream.peek() - if peek == ',' or peek is None: + if peek in (("EOF", None), ("DELIM", ",")): break if pseudo_element: raise SelectorSyntaxError( - 'A pseudo-element must be at the end of a selector') - if peek in ('+', '>', '~'): + f"Got pseudo-element ::{pseudo_element} not at the end of a selector" + ) + if peek.is_delim("+", ">", "~"): # A combinator - combinator = stream.next() + combinator = cast("str", stream.next().value) stream.skip_whitespace() else: # By exclusion, the last parse_simple_selector() ended # at peek == ' ' - combinator = ' ' + combinator = " " next_selector, pseudo_element = parse_simple_selector(stream) result = CombinedSelector(result, combinator, next_selector) return result, pseudo_element -def parse_simple_selector(stream, inside_negation=False): +def parse_simple_selector( + stream: TokenStream, inside_negation: bool = False +) -> tuple[Tree, PseudoElement | None]: stream.skip_whitespace() + selector_start = len(stream.used) peek = stream.peek() - consumed = len(stream.used) - if peek == '*' or isinstance(peek, Symbol): - next = stream.next() - if stream.peek() == '|': - namespace = next + if peek.type == "IDENT" or peek == ("DELIM", "*"): + if peek.type == "IDENT": + namespace = stream.next().value + else: + stream.next() + namespace = None + if stream.peek() == ("DELIM", "|"): stream.next() - element = stream.next_symbol_or_star() + element = 
stream.next_ident_or_star() else: - namespace = '*' - element = next + element = namespace + namespace = None else: - element = namespace = '*' - result = Element(namespace, element) - pseudo_element = None + element = namespace = None + result: Tree = Element(namespace, element) + pseudo_element: PseudoElement | None = None while 1: peek = stream.peek() - if peek in (None, ' ', ',', '+', '>', '~') or ( - inside_negation and peek == ')'): + if ( + peek.type in ("S", "EOF") + or peek.is_delim(",", "+", ">", "~") + or (inside_negation and peek == ("DELIM", ")")) + ): break if pseudo_element: raise SelectorSyntaxError( - 'A pseudo-element must be at the end of a selector') - if peek == '#': + f"Got pseudo-element ::{pseudo_element} not at the end of a selector" + ) + if peek.type == "HASH": + result = Hash(result, cast("str", stream.next().value)) + elif peek == ("DELIM", "."): stream.next() - result = Hash(result, stream.next_symbol()) - continue - elif peek == '.': + result = Class(result, stream.next_ident()) + elif peek == ("DELIM", "|"): stream.next() - result = Class(result, stream.next_symbol()) - continue - elif peek == '[': + result = Element(None, stream.next_ident()) + elif peek == ("DELIM", "["): stream.next() result = parse_attrib(result, stream) - next = stream.next() - if next != ']': - raise SelectorSyntaxError( - "] expected, got '%s'" % next) - continue - elif peek == '::': + elif peek == ("DELIM", ":"): stream.next() - pseudo_element = stream.next_symbol() - continue - elif peek == ':': - stream.next() - ident = stream.next_symbol() - if ident in ('first-line', 'first-letter', 'before', 'after'): + if stream.peek() == ("DELIM", ":"): + stream.next() + pseudo_element = stream.next_ident() + if stream.peek() == ("DELIM", "("): + stream.next() + pseudo_element = FunctionalPseudoElement( + pseudo_element, parse_arguments(stream) + ) + continue + ident = stream.next_ident() + if ident.lower() in ("first-line", "first-letter", "before", "after"): # 
Special case: CSS 2.1 pseudo-elements can have a single ':' # Any new pseudo-element must have two. - pseudo_element = ident + pseudo_element = str(ident) continue - if stream.peek() == '(': - stream.next() - stream.skip_whitespace() - if ident == 'not': - if inside_negation: - raise SelectorSyntaxError('Got nested :not()') - argument, argument_pseudo_element = parse_simple_selector( - stream, inside_negation=True) - if argument_pseudo_element: - raise SelectorSyntaxError( - 'Pseudo-elements are not allowed inside :not()') - else: - peek = stream.peek() - if isinstance(peek, (Symbol, String)): - argument = stream.next() - else: - raise SelectorSyntaxError( - "Expected argument, got '%s'" % peek) - stream.skip_whitespace() - next = stream.next() - if not next == ')': + if stream.peek() != ("DELIM", "("): + result = Pseudo(result, ident) + if repr(result) == "Pseudo[Element[*]:scope]" and not ( + len(stream.used) == 2 + or (len(stream.used) == 3 and stream.used[0].type == "S") + or (len(stream.used) >= 3 and stream.used[-3].is_delim(",")) + or ( + len(stream.used) >= 4 + and stream.used[-3].type == "S" + and stream.used[-4].is_delim(",") + ) + ): raise SelectorSyntaxError( - "Expected ')', got '%s'" % next) - if ident == 'not': - result = Negation(result, argument) - else: - result = Function(result, ident, argument) + 'Got immediate child pseudo-element ":scope" ' + "not at the start of a selector" + ) + continue + stream.next() + stream.skip_whitespace() + if ident.lower() == "not": + if inside_negation: + raise SelectorSyntaxError("Got nested :not()") + argument, argument_pseudo_element = parse_simple_selector( + stream, inside_negation=True + ) + next_ = stream.next() + if argument_pseudo_element: + raise SelectorSyntaxError( + f"Got pseudo-element ::{argument_pseudo_element} inside :not() at {next_.pos}" + ) + if next_ != ("DELIM", ")"): + raise SelectorSyntaxError(f"Expected ')', got {next_}") + result = Negation(result, argument) + elif ident.lower() == "has": 
+ combinator, arguments = parse_relative_selector(stream) + result = Relation(result, combinator, arguments) + + elif ident.lower() in ("matches", "is"): + selectors = parse_simple_selector_arguments(stream) + result = Matching(result, selectors) + elif ident.lower() == "where": + selectors = parse_simple_selector_arguments(stream) + result = SpecificityAdjustment(result, selectors) else: - result = Pseudo(result, ident) - continue + result = Function(result, ident, parse_arguments(stream)) else: - raise SelectorSyntaxError( - "Expected selector, got '%s'" % peek) - if consumed == len(stream.used): - raise SelectorSyntaxError( - "Expected selector, got '%s'" % stream.peek()) + raise SelectorSyntaxError(f"Expected selector, got {peek}") + if len(stream.used) == selector_start: + raise SelectorSyntaxError(f"Expected selector, got {stream.peek()}") return result, pseudo_element -def parse_attrib(selector, stream): +def parse_arguments(stream: TokenStream) -> list[Token]: # noqa: RET503 + arguments: list[Token] = [] + while 1: + stream.skip_whitespace() + next_ = stream.next() + if next_.type in ("IDENT", "STRING", "NUMBER") or next_ in [ + ("DELIM", "+"), + ("DELIM", "-"), + ]: + arguments.append(next_) + elif next_ == ("DELIM", ")"): + return arguments + else: + raise SelectorSyntaxError(f"Expected an argument, got {next_}") + + +def parse_relative_selector(stream: TokenStream) -> tuple[Token, Selector]: # noqa: RET503 stream.skip_whitespace() - attrib = stream.next_symbol_or_star() - if attrib == '*' and stream.peek() != '|': - raise SelectorSyntaxError( - "Expected '|', got '%s'" % stream.peek()) - if stream.peek() == '|': - namespace = attrib - stream.next() - attrib = stream.next_symbol() + subselector = "" + next_ = stream.next() + + if next_ in [("DELIM", "+"), ("DELIM", "-"), ("DELIM", ">"), ("DELIM", "~")]: + combinator = next_ + stream.skip_whitespace() + next_ = stream.next() else: - namespace = '*' + combinator = Token("DELIM", " ", pos=0) + + while 1: + 
if next_.type in ("IDENT", "STRING", "NUMBER") or next_ in [ + ("DELIM", "."), + ("DELIM", "*"), + ]: + subselector += cast("str", next_.value) + elif next_ == ("DELIM", ")"): + result = parse(subselector) + return combinator, result[0] + else: + raise SelectorSyntaxError(f"Expected an argument, got {next_}") + next_ = stream.next() + + +def parse_simple_selector_arguments(stream: TokenStream) -> list[Tree]: + arguments = [] + while 1: + result, pseudo_element = parse_simple_selector(stream, True) + if pseudo_element: + raise SelectorSyntaxError( + f"Got pseudo-element ::{pseudo_element} inside function" + ) + stream.skip_whitespace() + next_ = stream.next() + if next_ in (("EOF", None), ("DELIM", ",")): + stream.next() + stream.skip_whitespace() + arguments.append(result) + elif next_ == ("DELIM", ")"): + arguments.append(result) + break + else: + raise SelectorSyntaxError(f"Expected an argument, got {next_}") + return arguments + + +def parse_attrib(selector: Tree, stream: TokenStream) -> Attrib: stream.skip_whitespace() - if stream.peek() == ']': - return Attrib(selector, namespace, attrib, 'exists', None) - op = stream.next() - if not op in ('^=', '$=', '*=', '=', '~=', '|=', '!='): - raise SelectorSyntaxError( - "Operator expected, got '%s'" % op) + attrib = stream.next_ident_or_star() + if attrib is None and stream.peek() != ("DELIM", "|"): + raise SelectorSyntaxError(f"Expected '|', got {stream.peek()}") + namespace: str | None + op: str | None + if stream.peek() == ("DELIM", "|"): + stream.next() + if stream.peek() == ("DELIM", "="): + namespace = None + stream.next() + op = "|=" + else: + namespace = attrib + attrib = stream.next_ident() + op = None + else: + namespace = op = None + if op is None: + stream.skip_whitespace() + next_ = stream.next() + if next_ == ("DELIM", "]"): + return Attrib(selector, namespace, cast("str", attrib), "exists", None) + if next_ == ("DELIM", "="): + op = "=" + elif next_.is_delim("^", "$", "*", "~", "|", "!") and ( + 
stream.peek() == ("DELIM", "=") + ): + op = cast("str", next_.value) + "=" + stream.next() + else: + raise SelectorSyntaxError(f"Operator expected, got {next_}") stream.skip_whitespace() value = stream.next() - if not isinstance(value, (Symbol, String)): - raise SelectorSyntaxError( - "Expected string or symbol, got '%s'" % value) + if value.type not in ("IDENT", "STRING"): + raise SelectorSyntaxError(f"Expected string or ident, got {value}") stream.skip_whitespace() - return Attrib(selector, namespace, attrib, op, value) + next_ = stream.next() + if next_ != ("DELIM", "]"): + raise SelectorSyntaxError(f"Expected ']', got {next_}") + return Attrib(selector, namespace, cast("str", attrib), op, value) -def parse_series(s): +def parse_series(tokens: Iterable[Token]) -> tuple[int, int]: """ - Parses things like '1n+2', or 'an+b' generally, returning (a, b) + Parses the arguments for :nth-child() and friends. + + :raises: A list of tokens + :returns: :``(a, b)`` + """ - if isinstance(s, Element): - s = s._format_element() - if not s or s == '*': - # Happens when there's nothing, which the CSS parser thinks of as * - return (0, 0) - if isinstance(s, int): - # Happens when you just get a number - return (0, s) - if s == 'odd': - return (2, 1) - elif s == 'even': - return (2, 0) - elif s == 'n': - return (1, 0) - if 'n' not in s: - # Just a b - return (0, int(s)) - a, b = s.split('n', 1) + for token in tokens: + if token.type == "STRING": + raise ValueError("String tokens not allowed in series.") + s = "".join(cast("str", token.value) for token in tokens).strip() + if s == "odd": + return 2, 1 + if s == "even": + return 2, 0 + if s == "n": + return 1, 0 + if "n" not in s: + # Just b + return 0, int(s) + a, b = s.split("n", 1) + a_as_int: int if not a: - a = 1 - elif a == '-' or a == '+': - a = int(a+'1') - else: - a = int(a) - if not b: - b = 0 - elif b == '-' or b == '+': - b = int(b+'1') + a_as_int = 1 + elif a in {"-", "+"}: + a_as_int = int(a + "1") else: - b = int(b) 
- return (a, b) + a_as_int = int(a) + b_as_int = int(b) if b else 0 + return a_as_int, b_as_int #### Token objects -class _UniToken(_unicode): - def __new__(cls, contents, pos): - obj = _unicode.__new__(cls, contents) + +class Token(tuple[str, str | None]): # noqa: SLOT001 + @overload + def __new__( + cls, + type_: Literal["IDENT", "HASH", "STRING", "S", "DELIM", "NUMBER"], + value: str, + pos: int, + ) -> Self: ... + + @overload + def __new__(cls, type_: Literal["EOF"], value: None, pos: int) -> Self: ... + + def __new__(cls, type_: str, value: str | None, pos: int) -> Self: + obj = tuple.__new__(cls, (type_, value)) obj.pos = pos return obj - def __repr__(self): - return '%s(%s, %r)' % ( - self.__class__.__name__, - _unicode.__repr__(self), - self.pos) + def __repr__(self) -> str: + return f"<{self.type} '{self.value}' at {self.pos}>" + + def is_delim(self, *values: str) -> bool: + return self.type == "DELIM" and self.value in values + + pos: int + + @property + def type(self) -> str: + return self[0] -class Symbol(_UniToken): - pass + @property + def value(self) -> str | None: + return self[1] -class String(_UniToken): - pass + def css(self) -> str: + if self.type == "STRING": + return repr(self.value) + return cast("str", self.value) -class Token(_UniToken): - pass + +class EOFToken(Token): + def __new__(cls, pos: int) -> Self: + return Token.__new__(cls, "EOF", None, pos) + + def __repr__(self) -> str: + return f"<{self.type} at {self.pos}>" #### Tokenizer -_match_whitespace = re.compile(r'\s+', re.UNICODE).match -_replace_comments = re.compile(r'/\*.*?\*/', re.DOTALL).sub +class TokenMacros: + unicode_escape = r"\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?" 
+ escape = unicode_escape + r"|\\[^\n\r\f0-9a-f]" + string_escape = r"\\(?:\n|\r\n|\r|\f)|" + escape + nonascii = r"[^\0-\177]" + nmchar = f"[_a-z0-9-]|{escape}|{nonascii}" + nmstart = f"[_a-z]|{escape}|{nonascii}" + + +class MatchFunc(Protocol): + def __call__( + self, string: str, pos: int = ..., endpos: int = ... + ) -> re.Match[str] | None: ... + + +def _compile(pattern: str) -> MatchFunc: + return re.compile(pattern % vars(TokenMacros), re.IGNORECASE).match -_match_count_number = re.compile(r'[+-]?\d*n(?:[+-]\d+)?').match -def tokenize(s): +_match_whitespace = _compile(r"[ \t\r\n\f]+") +_match_number = _compile(r"[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)") +_match_hash = _compile("#(?:%(nmchar)s)+") +_match_ident = _compile("-?(?:%(nmstart)s)(?:%(nmchar)s)*") +_match_string_by_quote = { + "'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"), + '"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'), +} + +_sub_simple_escape = re.compile(r"\\(.)").sub +_sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.IGNORECASE).sub +_sub_newline_escape = re.compile(r"\\(?:\n|\r\n|\r|\f)").sub + +# Same as r'\1', but faster on CPython +_replace_simple = operator.methodcaller("group", 1) + + +def _replace_unicode(match: re.Match[str]) -> str: + codepoint = int(match.group(1), 16) + if codepoint > sys.maxunicode: + codepoint = 0xFFFD + return chr(codepoint) + + +def unescape_ident(value: str) -> str: + value = _sub_unicode_escape(_replace_unicode, value) + return _sub_simple_escape(_replace_simple, value) + + +def tokenize(s: str) -> Iterator[Token]: pos = 0 - s = _replace_comments('', s) len_s = len(s) while pos < len_s: match = _match_whitespace(s, pos=pos) if match: - yield Token(' ', pos) + yield Token("S", " ", pos) pos = match.end() continue - match = _match_count_number(s, pos=pos) - if match and match.group() != 'n': - sym = s[pos:match.end()] - yield Symbol(sym, pos) + + match = _match_ident(s, pos=pos) + if match: + value = _sub_simple_escape( + _replace_simple, 
_sub_unicode_escape(_replace_unicode, match.group()) + ) + yield Token("IDENT", value, pos) pos = match.end() continue - c = s[pos] - c2 = s[pos:pos+2] - if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='): - yield Token(c2, pos) - pos += 2 - continue - if c in '>+~,.*=[]()|:#': - yield Token(c, pos) - pos += 1 - continue - if c == '"' or c == "'": - # Quoted string - old_pos = pos - sym, pos = tokenize_escaped_string(s, pos) - yield String(sym, old_pos) - continue - old_pos = pos - sym, pos = tokenize_symbol(s, pos) - yield Symbol(sym, old_pos) - continue -split_at_string_escapes = re.compile(r'(\\(?:%s))' - % '|'.join(['[A-Fa-f0-9]{1,6}(?:\r\n|\s)?', - '[^A-Fa-f0-9]'])).split + match = _match_hash(s, pos=pos) + if match: + value = _sub_simple_escape( + _replace_simple, + _sub_unicode_escape(_replace_unicode, match.group()[1:]), + ) + yield Token("HASH", value, pos) + pos = match.end() + continue + quote = s[pos] + if quote in _match_string_by_quote: + match = _match_string_by_quote[quote](s, pos=pos + 1) + assert match, "Should have found at least an empty match" + end_pos = match.end() + if end_pos == len_s: + raise SelectorSyntaxError(f"Unclosed string at {pos}") + if s[end_pos] != quote: + raise SelectorSyntaxError(f"Invalid string at {pos}") + value = _sub_simple_escape( + _replace_simple, + _sub_unicode_escape( + _replace_unicode, _sub_newline_escape("", match.group()) + ), + ) + yield Token("STRING", value, pos) + pos = end_pos + 1 + continue -def unescape_string_literal(literal): - substrings = [] - for substring in split_at_string_escapes(literal): - if not substring: + match = _match_number(s, pos=pos) + if match: + value = match.group() + yield Token("NUMBER", value, pos) + pos = match.end() continue - elif '\\' in substring: - if substring[0] == '\\' and len(substring) > 1: - substring = substring[1:] - if substring[0] in '0123456789ABCDEFabcdef': - # int() correctly ignores the potentially trailing whitespace - substring = _unichr(int(substring, 16)) 
+ + pos2 = pos + 2 + if s[pos:pos2] == "/*": + pos = s.find("*/", pos2) + if pos == -1: + pos = len_s else: - raise SelectorSyntaxError( - "Invalid escape sequence %r in string %r" - % (substring.split('\\')[1], literal)) - substrings.append(substring) - return ''.join(substrings) - - -def tokenize_escaped_string(s, pos): - quote = s[pos] - assert quote in ('"', "'") - pos = pos+1 - start = pos - while 1: - next = s.find(quote, pos) - if next == -1: - raise SelectorSyntaxError( - "Expected closing %s for string in: %r" - % (quote, s[start:])) - result = s[start:next] - if result.endswith('\\'): - # next quote character is escaped - pos = next+1 + pos += 2 continue - if '\\' in result: - result = unescape_string_literal(result) - return result, next+1 - - -_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE) - -def tokenize_symbol(s, pos): - start = pos - match = _illegal_symbol.search(s, pos=pos) - if not match: - # Goes to end of s - return s[start:], len(s) - if match.start() == pos: - raise SelectorSyntaxError( - "Unexpected symbol: %r" % s[pos]) - if not match: - result = s[start:] - pos = len(s) - else: - result = s[start:match.start()] - pos = match.start() - try: - result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape') - except UnicodeDecodeError: - import sys - e = sys.exc_info()[1] - raise SelectorSyntaxError( - "Bad symbol %r: %s" % (result, e)) - return result, pos - - -class TokenStream(object): - def __init__(self, tokens, source=None): - self.used = [] + + yield Token("DELIM", s[pos], pos) + pos += 1 + + assert pos == len_s + yield EOFToken(pos) + + +class TokenStream: + def __init__(self, tokens: Iterable[Token], source: str | None = None) -> None: + self.used: list[Token] = [] self.tokens = iter(tokens) self.source = source - self.peeked = None + self.peeked: Token | None = None self._peeking = False - try: - self.next_token = self.tokens.next - except AttributeError: - # Python 3 - self.next_token = self.tokens.__next__ + 
self.next_token = self.tokens.__next__ - def next(self): + def next(self) -> Token: if self._peeking: self._peeking = False + assert self.peeked is not None self.used.append(self.peeked) return self.peeked - else: - try: - next = self.next_token() - self.used.append(next) - return next - except StopIteration: - return None + next_ = self.next_token() + self.used.append(next_) + return next_ - def __iter__(self): - return iter(self.next, None) - - def peek(self): + def peek(self) -> Token: if not self._peeking: - try: - self.peeked = self.next_token() - except StopIteration: - return None + self.peeked = self.next_token() self._peeking = True + assert self.peeked is not None return self.peeked - def next_symbol(self): - next = self.next() - if not isinstance(next, Symbol): - raise SelectorSyntaxError( - "Expected symbol, got '%s'" % next) - return next - - def next_symbol_or_star(self): - next = self.next() - if next != '*' and not isinstance(next, Symbol): - raise SelectorSyntaxError( - "Expected symbol or '*', got '%s'" % next) - return next - - def skip_whitespace(self): - if self.peek() == ' ': + def next_ident(self) -> str: + next_ = self.next() + if next_.type != "IDENT": + raise SelectorSyntaxError(f"Expected ident, got {next_}") + return cast("str", next_.value) + + def next_ident_or_star(self) -> str | None: + next_ = self.next() + if next_.type == "IDENT": + return next_.value + if next_ == ("DELIM", "*"): + return None + raise SelectorSyntaxError(f"Expected ident or '*', got {next_}") + + def skip_whitespace(self) -> None: + peek = self.peek() + if peek.type == "S": self.next() diff --git a/cssselect/py.typed b/cssselect/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/cssselect/tests.py b/cssselect/tests.py deleted file mode 100755 index 086f01f..0000000 --- a/cssselect/tests.py +++ /dev/null @@ -1,935 +0,0 @@ -#!/usr/bin/env python -""" - Tests for cssselect - =================== - - These tests can be run either by py.test or by the 
standard library's - unittest. They use plain ``assert`` statements and do little reporting - themselves in case of failure. - - Use py.test to get fancy error reporting and assert introspection. - - - :copyright: (c) 2007-2012 Ian Bicking and contributors. - See AUTHORS for more details. - :license: BSD, see LICENSE for more details. - -""" - -import sys -import operator -import unittest - -from lxml import html -from cssselect import (parse, GenericTranslator, HTMLTranslator, - SelectorSyntaxError, ExpressionError) -from cssselect.parser import tokenize, parse_series - - -class TestCssselect(unittest.TestCase): - def test_tokenizer(self): - tokens = [repr(item).replace("u'", "'") - for item in tokenize('E > f[a~="y\\"x"]')] - assert tokens == [ - "Symbol('E', 0)", - "Token(' ', 1)", - "Token('>', 2)", - "Token(' ', 3)", - "Symbol('f', 4)", - "Token('[', 5)", - "Symbol('a', 6)", - "Token('~=', 7)", - "String('y\"x', 9)", - "Token(']', 15)"] - - def test_parser(self): - def repr_parse(css): - selectors = parse(css) - for selector in selectors: - assert selector.pseudo_element is None - return [repr(selector._tree).replace("(u'", "('") - for selector in selectors] - - def parse_many(first, *others): - result = repr_parse(first) - for other in others: - assert repr_parse(other) == result - return result - - assert parse_many('*') == ['Element[*]'] - assert parse_many('*|*') == ['Element[*]'] - assert parse_many('*|foo') == ['Element[foo]'] - assert parse_many('foo|*') == ['Element[foo|*]'] - assert parse_many('foo|bar') == ['Element[foo|bar]'] - # This will never match, but it is valid: - assert parse_many('#foo#bar') == ['Hash[Hash[Element[*]#foo]#bar]'] - assert parse_many( - 'div>.foo', - 'div> .foo', - 'div >.foo', - 'div > .foo', - 'div \n> \t \t .foo', 'div\r>\n\n\n.foo', 'div\f>\f.foo' - ) == ['CombinedSelector[Element[div] > Class[Element[*].foo]]'] - assert parse_many('td.foo,.bar', - 'td.foo, .bar', - 'td.foo\t\r\n\f ,\t\r\n\f .bar' - ) == [ - 
'Class[Element[td].foo]', - 'Class[Element[*].bar]' - ] - assert parse_many('div, td.foo, div.bar span') == [ - 'Element[div]', - 'Class[Element[td].foo]', - 'CombinedSelector[Class[Element[div].bar] ' - ' Element[span]]'] - assert parse_many('div > p') == [ - 'CombinedSelector[Element[div] > Element[p]]'] - assert parse_many('td:first') == [ - 'Pseudo[Element[td]:first]'] - assert parse_many('td:first') == [ - 'Pseudo[Element[td]:first]'] - assert parse_many('td :first') == [ - 'CombinedSelector[Element[td] ' - ' Pseudo[Element[*]:first]]'] - assert parse_many('td :first') == [ - 'CombinedSelector[Element[td] ' - ' Pseudo[Element[*]:first]]'] - assert parse_many('a[name]', 'a[ name\t]') == [ - 'Attrib[Element[a][name]]'] - assert parse_many('a [name]') == [ - 'CombinedSelector[Element[a] Attrib[Element[*][name]]]'] - assert parse_many('a[rel="include"]') == [ - "Attrib[Element[a][rel = String('include', 6)]]"] - assert parse_many('a[rel = include]') == [ - "Attrib[Element[a][rel = Symbol('include', 8)]]"] - assert parse_many("a[hreflang |= 'en']") == [ - "Attrib[Element[a][hreflang |= String('en', 14)]]"] - assert parse_many('div:nth-child(10)') == [ - "Function[Element[div]:nth-child(Symbol('10', 14))]"] - assert parse_many(':nth-child(2n+2)') == [ - "Function[Element[*]:nth-child(Symbol('2n+2', 11))]"] - assert parse_many('div:nth-of-type(10)') == [ - "Function[Element[div]:nth-of-type(Symbol('10', 16))]"] - assert parse_many('div div:nth-of-type(10) .aclass') == [ - 'CombinedSelector[CombinedSelector[Element[div] ' - "Function[Element[div]:nth-of-type(Symbol('10', 20))]] " - ' Class[Element[*].aclass]]'] - assert parse_many('label:only') == [ - 'Pseudo[Element[label]:only]'] - assert parse_many('a:lang(fr)') == [ - "Function[Element[a]:lang(Symbol('fr', 7))]"] - assert parse_many('div:contains("foo")') == [ - "Function[Element[div]:contains(String('foo', 13))]"] - assert parse_many('div#foobar') == [ - 'Hash[Element[div]#foobar]'] - assert 
parse_many('div:not(div.foo)') == [ - 'Negation[Element[div]:not(Class[Element[div].foo])]'] - assert parse_many('td ~ th') == [ - 'CombinedSelector[Element[td] ~ Element[th]]'] - - def test_pseudo_elements(self): - def parse_pseudo(css): - result = [] - for selector in parse(css): - result.append(( - repr(selector._tree).replace("(u'", "('"), - selector.pseudo_element)) - return result - - def parse_one(css): - result = parse_pseudo(css) - assert len(result) == 1 - return result[0] - - assert parse_one('foo') == ('Element[foo]', None) - assert parse_one('*') == ('Element[*]', None) - assert parse_one(':empty') == ('Pseudo[Element[*]:empty]', None) - - # Special cases for CSS 2.1 pseudo-elements - assert parse_one(':before') == ('Element[*]', 'before') - assert parse_one(':after') == ('Element[*]', 'after') - assert parse_one(':first-line') == ('Element[*]', 'first-line') - assert parse_one(':first-letter') == ('Element[*]', 'first-letter') - - assert parse_one('::before') == ('Element[*]', 'before') - assert parse_one('::after') == ('Element[*]', 'after') - assert parse_one('::first-line') == ('Element[*]', 'first-line') - assert parse_one('::first-letter') == ('Element[*]', 'first-letter') - - assert parse_one('::selection') == ('Element[*]', 'selection') - assert parse_one('foo:after') == ('Element[foo]', 'after') - assert parse_one('foo::selection') == ('Element[foo]', 'selection') - assert parse_one('lorem#ipsum ~ a#b.c[href]:empty::selection') == ( - 'CombinedSelector[Hash[Element[lorem]#ipsum] ~ ' - 'Pseudo[Attrib[Class[Hash[Element[a]#b].c][href]]:empty]]', - 'selection') - - parse_pseudo('foo:before, bar, baz:after') == [ - ('Element[foo]', 'before'), - ('Element[bar]', None), - ('Element[baz]', 'after')] - - def test_specificity(self): - def specificity(css): - selectors = parse(css) - assert len(selectors) == 1 - return selectors[0].specificity() - - assert specificity('*') == (0, 0, 0) - assert specificity(' foo') == (0, 0, 1) - assert 
specificity(':empty ') == (0, 1, 0) - assert specificity(':before') == (0, 0, 1) - assert specificity('*:before') == (0, 0, 1) - assert specificity(':nth-child(2)') == (0, 1, 0) - assert specificity('.bar') == (0, 1, 0) - assert specificity('[baz]') == (0, 1, 0) - assert specificity('[baz="4"]') == (0, 1, 0) - assert specificity('[baz^="4"]') == (0, 1, 0) - assert specificity('#lipsum') == (1, 0, 0) - - assert specificity('foo:empty') == (0, 1, 1) - assert specificity('foo:before') == (0, 0, 2) - assert specificity('foo::before') == (0, 0, 2) - assert specificity('foo:empty::before') == (0, 1, 2) - - assert specificity('#lorem + foo#ipsum:first-child > bar:first-line' - ) == (2, 1, 3) - - def test_parse_errors(self): - def get_error(css): - try: - parse(css) - except SelectorSyntaxError: - # Py2, Py3, ... - return str(sys.exc_info()[1]).replace("(u'", "('") - - assert get_error('attributes(href)/html/body/a') == ( - "Expected selector, got '(' at " - "[Symbol('attributes', 0)] -> Token('(', 10)") - assert get_error('attributes(href)') == ( - "Expected selector, got '(' at " - "[Symbol('attributes', 0)] -> Token('(', 10)") - assert get_error('html/body/a') == ( - "Unexpected symbol: '/' at [Symbol('html', 0)] -> None") - assert get_error(' ') == ( - "Expected selector, got 'None' at [Token(' ', 0)] -> None") - assert get_error('div, ') == ( - "Expected selector, got 'None' at " - "[Symbol('div', 0), Token(',', 3), Token(' ', 4)] -> None") - assert get_error(' , div') == ( - "Expected selector, got ',' at " - "[Token(' ', 0)] -> Token(',', 1)") - assert get_error('p, , div') == ( - "Expected selector, got ',' at " - "[Symbol('p', 0), Token(',', 1), Token(' ', 2)] -> Token(',', 3)") - assert get_error('div > ') == ( - "Expected selector, got 'None' at " - "[Symbol('div', 0), Token(' ', 3), Token('>', 4), Token(' ', 5)]" - " -> None") - assert get_error(' > div') == ( - "Expected selector, got '>' at [Token(' ', 0)] -> Token('>', 2)") - assert get_error('foo|#bar') == 
( - "Expected symbol or '*', got '#' at " - "[Symbol('foo', 0), Token('|', 3), " - "Token('#', 4)] -> Symbol('bar', 5)") - assert get_error('#.foo') == ( - "Expected symbol, got '.' at " - "[Token('#', 0), Token('.', 1)] -> Symbol('foo', 2)") - assert get_error('.#foo') == ( - "Expected symbol, got '#' at " - "[Token('.', 0), Token('#', 1)] -> Symbol('foo', 2)") - assert get_error(':#foo') == ( - "Expected symbol, got '#' at " - "[Token(':', 0), Token('#', 1)] -> Symbol('foo', 2)") - assert get_error('[*]') == ( - "Expected '|', got ']' at " - "[Token('[', 0), Token('*', 1)] -> Token(']', 2)") - assert get_error('[foo|]') == ( - "Expected symbol, got ']' at " - "[Token('[', 0), Symbol('foo', 1), Token('|', 4), Token(']', 5)]" - " -> None") - assert get_error('[#]') == ( - "Expected symbol or '*', got '#' at " - "[Token('[', 0), Token('#', 1)] -> Token(']', 2)") - assert get_error('[foo=#]') == ( - "Expected string or symbol, got '#' at " - "[Token('[', 0), Symbol('foo', 1), Token('=', 4), Token('#', 5)]" - " -> Token(']', 6)") - assert get_error(':nth-child()') == ( - "Expected argument, got ')' at " - "[Token(':', 0), Symbol('nth-child', 1), Token('(', 10)]" - " -> Token(')', 11)") - assert get_error('[href]a') == ( - "Expected selector, got 'a' at " - "[Token('[', 0), Symbol('href', 1), Token(']', 5)]" - " -> Symbol('a', 6)") - - # Mis-placed pseudo-elements - assert get_error('a:before:empty') == ( - "A pseudo-element must be at the end of a selector at " - "[Symbol('a', 0), Token(':', 1), Symbol('before', 2)] " - "-> Token(':', 8)") - assert get_error('li:before a') == ( - "A pseudo-element must be at the end of a selector at " - "[Symbol('li', 0), Token(':', 2), Symbol('before', 3), " - "Token(' ', 9)] -> Symbol('a', 10)") - assert get_error(':not(:before)') == ( - "Pseudo-elements are not allowed inside :not() at " - "[Token(':', 0), Symbol('not', 1), Token('(', 4), Token(':', 5)," - " Symbol('before', 6)] -> Token(')', 12)") - - - def test_translation(self): 
- def xpath(css): - return str(GenericTranslator().css_to_xpath(css, prefix='')) - - assert xpath('*') == "*" - assert xpath('E') == "e" - assert xpath('E[foo]') == "e[@foo]" - assert xpath('E[foo="bar"]') == "e[@foo = 'bar']" - assert xpath('E[foo~="bar"]') == ( - "e[@foo and contains(" - "concat(' ', normalize-space(@foo), ' '), ' bar ')]") - assert xpath('E[foo^="bar"]') == ( - "e[@foo and starts-with(@foo, 'bar')]") - assert xpath('E[foo$="bar"]') == ( - "e[@foo and substring(@foo, string-length(@foo)-2) = 'bar']") - assert xpath('E[foo*="bar"]') == ( - "e[@foo and contains(@foo, 'bar')]") - assert xpath('E[hreflang|="en"]') == ( - "e[@hreflang and (" - "@hreflang = 'en' or starts-with(@hreflang, 'en-'))]") - assert xpath('E:nth-child(1)') == ( - "*/*[name() = 'e' and (position() = 1)]") - assert xpath('E:nth-last-child(1)') == ( - "*/*[name() = 'e' and (position() = last() - 1)]") - assert xpath('E:nth-last-child(2n+2)') == ( - "*/*[name() = 'e' and (" - "(position() +2) mod -2 = 0 and position() < (last() -2))]") - assert xpath('E:nth-of-type(1)') == ( - "*/e[position() = 1]") - assert xpath('E:nth-last-of-type(1)') == ( - "*/e[position() = last() - 1]") - assert xpath('E:nth-last-of-type(1)') == ( - "*/e[position() = last() - 1]") - assert xpath('div E:nth-last-of-type(1) .aclass') == ( - "div/descendant-or-self::*/e[position() = last() - 1]" - "/descendant-or-self::*/*[@class and contains(" - "concat(' ', normalize-space(@class), ' '), ' aclass ')]") - assert xpath('E:first-child') == ( - "*/*[name() = 'e' and (position() = 1)]") - assert xpath('E:last-child') == ( - "*/*[name() = 'e' and (position() = last())]") - assert xpath('E:first-of-type') == ( - "*/e[position() = 1]") - assert xpath('E:last-of-type') == ( - "*/e[position() = last()]") - assert xpath('E:only-child') == ( - "*/*[name() = 'e' and (last() = 1)]") - assert xpath('E:only-of-type') == ( - "e[last() = 1]") - assert xpath('E:empty') == ( - "e[not(*) and not(normalize-space())]") - assert 
xpath('E:root') == ( - "e[not(parent::*)]") - assert xpath('E:contains("foo")') == ( - "e[contains(string(.), 'foo')]") - assert xpath('E:contains(foo)') == ( - "e[contains(string(.), 'foo')]") - assert xpath('E.warning') == ( - "e[@class and contains(" - "concat(' ', normalize-space(@class), ' '), ' warning ')]") - assert xpath('E#myid') == ( - "e[@id = 'myid']") - assert xpath('E:not(:nth-child(odd))') == ( - "e[not((position() -1) mod 2 = 0 and position() >= 1)]") - assert xpath('E F') == ( - "e/descendant-or-self::*/f") - assert xpath('E > F') == ( - "e/f") - assert xpath('E + F') == ( - "e/following-sibling::*[name() = 'f' and (position() = 1)]") - assert xpath('E ~ F') == ( - "e/following-sibling::f") - assert xpath('div#container p') == ( - "div[@id = 'container']/descendant-or-self::*/p") - self.assertRaises(ExpressionError, xpath, 'p *:only-of-type') - - def test_unicode(self): - if sys.version_info[0] >= 3: - css = '.a\xc1b' - else: - css = '.a\xc1b'.decode('ISO-8859-1') - - xpath = GenericTranslator().css_to_xpath(css) - assert css[1:] in xpath - xpath = xpath.encode('ascii', 'xmlcharrefreplace').decode('ASCII') - assert xpath == ( - "descendant-or-self::*[@class and contains(" - "concat(' ', normalize-space(@class), ' '), ' aÁb ')]") - - def test_quoting(self): - css_to_xpath = GenericTranslator().css_to_xpath - assert css_to_xpath('*[aval="\'"]') == ( - '''descendant-or-self::*[@aval = "'"]''') - assert css_to_xpath('*[aval="\'\'\'"]') == ( - """descendant-or-self::*[@aval = "'''"]""") - assert css_to_xpath('*[aval=\'"\']') == ( - '''descendant-or-self::*[@aval = '"']''') - assert css_to_xpath('*[aval=\'"""\']') == ( - '''descendant-or-self::*[@aval = '"""']''') - - def test_unicode_escapes(self): - # \22 == '"' \20 == ' ' - css_to_xpath = GenericTranslator().css_to_xpath - assert css_to_xpath(r'*[aval="\'\22\'"]') == ( - '''descendant-or-self::*[@aval = concat("'",'"',"'")]''') - assert css_to_xpath(r'*[aval="\'\22 2\'"]') == ( - 
'''descendant-or-self::*[@aval = concat("'",'"2',"'")]''') - assert css_to_xpath(r'*[aval="\'\20 \'"]') == ( - '''descendant-or-self::*[@aval = "' '"]''') - assert css_to_xpath('*[aval="\'\\20\r\n \'"]') == ( - '''descendant-or-self::*[@aval = "' '"]''') - - def test_series(self): - assert parse_series('1n+3') == (1, 3) - assert parse_series('n-5') == (1, -5) - assert parse_series('odd') == (2, 1) - assert parse_series('even') == (2, 0) - assert parse_series('3n') == (3, 0) - assert parse_series('n') == (1, 0) - assert parse_series('5') == (0, 5) - - def test_select(self): - document = html.document_fromstring(HTML_IDS) - sort_key = dict( - (el, count) for count, el in enumerate(document.getiterator()) - ).__getitem__ - css_to_xpath = GenericTranslator().css_to_xpath - html_css_to_xpath = HTMLTranslator().css_to_xpath - - def select_ids(selector, html_only): - xpath = css_to_xpath(selector) - items = document.xpath(xpath) - if html_only: - assert items == [] - xpath = html_css_to_xpath(selector) - items = document.xpath(xpath) - items.sort(key=sort_key) - return [element.get('id', 'nil') for element in items] - - def pcss(main, *selectors, **kwargs): - html_only = kwargs.pop('html_only', False) - result = select_ids(main, html_only) - for selector in selectors: - assert select_ids(selector, html_only) == result - return result - - all_ids = pcss('*') - assert all_ids[:4] == ['html', 'nil', 'nil', 'outer-div'] - assert all_ids[-1:] == ['foobar-span'] - assert pcss('div') == ['outer-div', 'li-div', 'foobar-div'] - assert pcss('div div') == ['li-div'] - assert pcss('div, div div') == ['outer-div', 'li-div', 'foobar-div'] - assert pcss('a[name]') == ['name-anchor'] - assert pcss('a[rel]') == ['tag-anchor', 'nofollow-anchor'] - assert pcss('a[rel="tag"]') == ['tag-anchor'] - assert pcss('a[href*="localhost"]') == ['tag-anchor'] - assert pcss('a[href^="http"]') == ['tag-anchor', 'nofollow-anchor'] - assert pcss('a[href^="http:"]') == ['tag-anchor'] - assert 
pcss('a[href$="org"]') == ['nofollow-anchor'] - assert pcss('div[foobar~="bc"]', 'div[foobar~="cde"]') == [ - 'foobar-div'] - assert pcss('div[foobar~="cd"]') == [] - assert pcss('*[lang|="en"]', '*[lang|="en-US"]') == ['second-li'] - assert pcss('*[lang|="e"]') == [] - assert pcss('li:nth-child(3)') == ['third-li'] - assert pcss('li:nth-child(10)') == [] - assert pcss('li:nth-child(2n)', 'li:nth-child(even)', - 'li:nth-child(2n+0)') == [ - 'second-li', 'fourth-li', 'sixth-li'] - assert pcss('li:nth-child(+2n+1)', 'li:nth-child(odd)') == [ - 'first-li', 'third-li', 'fifth-li', 'seventh-li'] - assert pcss('li:nth-child(2n+4)') == ['fourth-li', 'sixth-li'] - # FIXME: I'm not 100% sure this is right: - assert pcss('li:nth-child(3n+1)') == [ - 'first-li', 'fourth-li', 'seventh-li'] - assert pcss('li:nth-last-child(0)') == [ - 'seventh-li'] - assert pcss('li:nth-last-child(2n)', 'li:nth-last-child(even)') == [ - 'second-li', 'fourth-li', 'sixth-li'] - assert pcss('li:nth-last-child(2n+2)') == ['second-li', 'fourth-li'] - assert pcss('ol:first-of-type') == ['first-ol'] - assert pcss('ol:nth-child(1)') == [] - assert pcss('ol:nth-of-type(2)') == ['second-ol'] - # FIXME: like above', '(1) or (2)? 
- assert pcss('ol:nth-last-of-type(1)') == ['first-ol'] - assert pcss('span:only-child') == ['foobar-span'] - assert pcss('li div:only-child') == ['li-div'] - assert pcss('div *:only-child') == [ - 'li-div', 'checkbox-disabled', 'foobar-span'] - self.assertRaises(ExpressionError, pcss, 'p *:only-of-type') - self.assertRaises(ExpressionError, pcss, 'p:lang(fr)') - assert pcss('p:only-of-type') == ['paragraph'] - assert pcss('a:empty') == ['name-anchor'] - assert pcss('li:empty') == [ - 'third-li', 'fourth-li', 'fifth-li', 'sixth-li', 'seventh-li'] - assert pcss(':root', 'html:root') == ['html'] - assert pcss('li:root', '* :root') == [] - assert pcss('*:contains("link")') == [ - 'html', 'nil', 'outer-div', 'tag-anchor', 'nofollow-anchor'] - assert pcss('*:contains("LInk")') == [] # case sensitive - assert pcss('*:contains("e")') == [ - 'html', 'nil', 'outer-div', 'first-ol', 'first-li', - 'paragraph', 'p-em'] - assert pcss('*:contains("E")') == [] # case-sensitive - assert pcss('.a', '.b', '*.a', 'ol.a') == ['first-ol'] - assert pcss('.c', '*.c') == ['first-ol', 'third-li', 'fourth-li'] - assert pcss('ol *.c', 'ol li.c', 'li ~ li.c', 'ol > li.c') == [ - 'third-li', 'fourth-li'] - assert pcss('#first-li', 'li#first-li', '*#first-li') == ['first-li'] - # Need some tests of :not()'] - assert pcss('li div', 'li > div', 'div div') == ['li-div'] - assert pcss('div > div') == [] - assert pcss('div>.c', 'div > .c') == ['first-ol'] - assert pcss('div + div') == ['foobar-div'] - assert pcss('a ~ a') == ['tag-anchor', 'nofollow-anchor'] - assert pcss('a[rel="tag"] ~ a') == ['nofollow-anchor'] - assert pcss('ol#first-ol li:last-child') == ['seventh-li'] - assert pcss('ol#first-ol *:last-child') == ['li-div', 'seventh-li'] - assert pcss('#outer-div:first-child') == ['outer-div'] - assert pcss('#outer-div :first-child') == [ - 'name-anchor', 'first-li', 'li-div', 'p-b', 'checkbox-disabled'] - assert pcss('a[href]') == ['tag-anchor', 'nofollow-anchor'] - assert pcss(':link', 
html_only=True) == pcss('a[href]') - assert pcss(':checked', html_only=True) == ['checkbox-checked'] - assert pcss(':disabled', html_only=True) == [ - 'fieldset', 'checkbox-disabled'] - assert pcss(':enabled', html_only=True) == [ - 'checkbox-unchecked', 'checkbox-checked'] - - def test_select_shakespeare(self): - document = html.document_fromstring(HTML_SHAKESPEARE) - body = document.xpath('//body')[0] - css_to_xpath = GenericTranslator().css_to_xpath - - try: - basestring_ = basestring - except NameError: - basestring_ = (str, bytes) - - def count(selector): - xpath = css_to_xpath(selector) - results = body.xpath(xpath) - assert not isinstance(results, basestring_) - found = set() - for item in results: - assert item not in found - found.add(item) - assert not isinstance(item, basestring_) - return len(results) - - # Data borrowed from http://mootools.net/slickspeed/ - - ## Changed from original; probably because I'm only - ## searching the body. - #assert count('*') == 252 - assert count('*') == 246 - assert count('div:contains(CELIA)') == 26 - assert count('div:only-child') == 22 # ? 
- assert count('div:nth-child(even)') == 106 - assert count('div:nth-child(2n)') == 106 - assert count('div:nth-child(odd)') == 137 - assert count('div:nth-child(2n+1)') == 137 - assert count('div:nth-child(n)') == 243 - assert count('div:last-child') == 53 - assert count('div:first-child') == 51 - assert count('div > div') == 242 - assert count('div + div') == 190 - assert count('div ~ div') == 190 - assert count('body') == 1 - assert count('body div') == 243 - assert count('div') == 243 - assert count('div div') == 242 - assert count('div div div') == 241 - assert count('div, div, div') == 243 - assert count('div, a, span') == 243 - assert count('.dialog') == 51 - assert count('div.dialog') == 51 - assert count('div .dialog') == 51 - assert count('div.character, div.dialog') == 99 - assert count('div.direction.dialog') == 0 - assert count('div.dialog.direction') == 0 - assert count('div.dialog.scene') == 1 - assert count('div.scene.scene') == 1 - assert count('div.scene .scene') == 0 - assert count('div.direction .dialog ') == 0 - assert count('div .dialog .direction') == 4 - assert count('div.dialog .dialog .direction') == 4 - assert count('#speech5') == 1 - assert count('div#speech5') == 1 - assert count('div #speech5') == 1 - assert count('div.scene div.dialog') == 49 - assert count('div#scene1 div.dialog div') == 142 - assert count('#scene1 #speech1') == 1 - assert count('div[class]') == 103 - assert count('div[class=dialog]') == 50 - assert count('div[class^=dia]') == 51 - assert count('div[class$=log]') == 50 - assert count('div[class*=sce]') == 1 - assert count('div[class|=dialog]') == 50 # ? Seems right - assert count('div[class!=madeup]') == 243 # ? Seems right - assert count('div[class~=dialog]') == 51 # ? Seems right - -HTML_IDS = ''' - -
- - - - link -
    -
  1. content
  2. -
  3. -
    -
    -
  4. -
  5. -
  6. -
  7. -
  8. -
  9. -
-

- hi there - guy - - -

- -
-

-
    -
-
-
- -''' - - -HTML_SHAKESPEARE = ''' - - - - - - -
-
-

As You Like It

-
- by William Shakespeare -
-
-

ACT I, SCENE III. A room in the palace.

-
-
Enter CELIA and ROSALIND
-
-
CELIA
-
-
Why, cousin! why, Rosalind! Cupid have mercy! not a word?
-
-
ROSALIND
-
-
Not one to throw at a dog.
-
-
CELIA
-
-
No, thy words are too precious to be cast away upon
-
curs; throw some of them at me; come, lame me with reasons.
-
-
ROSALIND
-
CELIA
-
-
But is all this for your father?
-
-
-
Then there were two cousins laid up; when the one
-
should be lamed with reasons and the other mad
-
without any.
-
-
ROSALIND
-
-
No, some of it is for my child's father. O, how
-
full of briers is this working-day world!
-
-
CELIA
-
-
They are but burs, cousin, thrown upon thee in
-
holiday foolery: if we walk not in the trodden
-
paths our very petticoats will catch them.
-
-
ROSALIND
-
-
I could shake them off my coat: these burs are in my heart.
-
-
CELIA
-
-
Hem them away.
-
-
ROSALIND
-
-
I would try, if I could cry 'hem' and have him.
-
-
CELIA
-
-
Come, come, wrestle with thy affections.
-
-
ROSALIND
-
-
O, they take the part of a better wrestler than myself!
-
-
CELIA
-
-
O, a good wish upon you! you will try in time, in
-
despite of a fall. But, turning these jests out of
-
service, let us talk in good earnest: is it
-
possible, on such a sudden, you should fall into so
-
strong a liking with old Sir Rowland's youngest son?
-
-
ROSALIND
-
-
The duke my father loved his father dearly.
-
-
CELIA
-
-
Doth it therefore ensue that you should love his son
-
dearly? By this kind of chase, I should hate him,
-
for my father hated his father dearly; yet I hate
-
not Orlando.
-
-
ROSALIND
-
-
No, faith, hate him not, for my sake.
-
-
CELIA
-
-
Why should I not? doth he not deserve well?
-
-
ROSALIND
-
-
Let me love him for that, and do you love him
-
because I do. Look, here comes the duke.
-
-
CELIA
-
-
With his eyes full of anger.
-
Enter DUKE FREDERICK, with Lords
-
-
DUKE FREDERICK
-
-
Mistress, dispatch you with your safest haste
-
And get you from our court.
-
-
ROSALIND
-
-
Me, uncle?
-
-
DUKE FREDERICK
-
-
You, cousin
-
Within these ten days if that thou be'st found
-
So near our public court as twenty miles,
-
Thou diest for it.
-
-
ROSALIND
-
-
I do beseech your grace,
-
Let me the knowledge of my fault bear with me:
-
If with myself I hold intelligence
-
Or have acquaintance with mine own desires,
-
If that I do not dream or be not frantic,--
-
As I do trust I am not--then, dear uncle,
-
Never so much as in a thought unborn
-
Did I offend your highness.
-
-
DUKE FREDERICK
-
-
Thus do all traitors:
-
If their purgation did consist in words,
-
They are as innocent as grace itself:
-
Let it suffice thee that I trust thee not.
-
-
ROSALIND
-
-
Yet your mistrust cannot make me a traitor:
-
Tell me whereon the likelihood depends.
-
-
DUKE FREDERICK
-
-
Thou art thy father's daughter; there's enough.
-
-
ROSALIND
-
-
So was I when your highness took his dukedom;
-
So was I when your highness banish'd him:
-
Treason is not inherited, my lord;
-
Or, if we did derive it from our friends,
-
What's that to me? my father was no traitor:
-
Then, good my liege, mistake me not so much
-
To think my poverty is treacherous.
-
-
CELIA
-
-
Dear sovereign, hear me speak.
-
-
DUKE FREDERICK
-
-
Ay, Celia; we stay'd her for your sake,
-
Else had she with her father ranged along.
-
-
CELIA
-
-
I did not then entreat to have her stay;
-
It was your pleasure and your own remorse:
-
I was too young that time to value her;
-
But now I know her: if she be a traitor,
-
Why so am I; we still have slept together,
-
Rose at an instant, learn'd, play'd, eat together,
-
And wheresoever we went, like Juno's swans,
-
Still we went coupled and inseparable.
-
-
DUKE FREDERICK
-
-
She is too subtle for thee; and her smoothness,
-
Her very silence and her patience
-
Speak to the people, and they pity her.
-
Thou art a fool: she robs thee of thy name;
-
And thou wilt show more bright and seem more virtuous
-
When she is gone. Then open not thy lips:
-
Firm and irrevocable is my doom
-
Which I have pass'd upon her; she is banish'd.
-
-
CELIA
-
-
Pronounce that sentence then on me, my liege:
-
I cannot live out of her company.
-
-
DUKE FREDERICK
-
-
You are a fool. You, niece, provide yourself:
-
If you outstay the time, upon mine honour,
-
And in the greatness of my word, you die.
-
Exeunt DUKE FREDERICK and Lords
-
-
CELIA
-
-
O my poor Rosalind, whither wilt thou go?
-
Wilt thou change fathers? I will give thee mine.
-
I charge thee, be not thou more grieved than I am.
-
-
ROSALIND
-
-
I have more cause.
-
-
CELIA
-
-
Thou hast not, cousin;
-
Prithee be cheerful: know'st thou not, the duke
-
Hath banish'd me, his daughter?
-
-
ROSALIND
-
-
That he hath not.
-
-
CELIA
-
-
No, hath not? Rosalind lacks then the love
-
Which teacheth thee that thou and I am one:
-
Shall we be sunder'd? shall we part, sweet girl?
-
No: let my father seek another heir.
-
Therefore devise with me how we may fly,
-
Whither to go and what to bear with us;
-
And do not seek to take your change upon you,
-
To bear your griefs yourself and leave me out;
-
For, by this heaven, now at our sorrows pale,
-
Say what thou canst, I'll go along with thee.
-
-
ROSALIND
-
-
Why, whither shall we go?
-
-
CELIA
-
-
To seek my uncle in the forest of Arden.
-
-
ROSALIND
-
-
Alas, what danger will it be to us,
-
Maids as we are, to travel forth so far!
-
Beauty provoketh thieves sooner than gold.
-
-
CELIA
-
-
I'll put myself in poor and mean attire
-
And with a kind of umber smirch my face;
-
The like do you: so shall we pass along
-
And never stir assailants.
-
-
ROSALIND
-
-
Were it not better,
-
Because that I am more than common tall,
-
That I did suit me all points like a man?
-
A gallant curtle-axe upon my thigh,
-
A boar-spear in my hand; and--in my heart
-
Lie there what hidden woman's fear there will--
-
We'll have a swashing and a martial outside,
-
As many other mannish cowards have
-
That do outface it with their semblances.
-
-
CELIA
-
-
What shall I call thee when thou art a man?
-
-
ROSALIND
-
-
I'll have no worse a name than Jove's own page;
-
And therefore look you call me Ganymede.
-
But what will you be call'd?
-
-
CELIA
-
-
Something that hath a reference to my state
-
No longer Celia, but Aliena.
-
-
ROSALIND
-
-
But, cousin, what if we assay'd to steal
-
The clownish fool out of your father's court?
-
Would he not be a comfort to our travel?
-
-
CELIA
-
-
He'll go along o'er the wide world with me;
-
Leave me alone to woo him. Let's away,
-
And get our jewels and our wealth together,
-
Devise the fittest time and safest way
-
To hide us from pursuit that will be made
-
After my flight. Now go we in content
-
To liberty and not to banishment.
-
Exeunt
-
-
-
-
- - -''' - - -if __name__ == '__main__': - unittest.main() diff --git a/cssselect/xpath.py b/cssselect/xpath.py index 23a165c..96eac3f 100644 --- a/cssselect/xpath.py +++ b/cssselect/xpath.py @@ -1,27 +1,46 @@ """ - cssselect.xpath - =============== +cssselect.xpath +=============== - Translation of parsed CSS selectors to XPath expressions. +Translation of parsed CSS selectors to XPath expressions. - :copyright: (c) 2007-2012 Ian Bicking and contributors. - See AUTHORS for more details. - :license: BSD, see LICENSE for more details. +:copyright: (c) 2007-2012 Ian Bicking and contributors. +See AUTHORS for more details. +:license: BSD, see LICENSE for more details. """ -import re -from cssselect.parser import parse, parse_series, SelectorError - +from __future__ import annotations -try: - _basestring = basestring - _unicode = unicode -except NameError: - # Python 3 - _basestring = str - _unicode = str +import re +from typing import TYPE_CHECKING, cast + +from cssselect.parser import ( + Attrib, + Class, + CombinedSelector, + Element, + Function, + Hash, + Matching, + Negation, + Pseudo, + PseudoElement, + Relation, + Selector, + SelectorError, + SpecificityAdjustment, + Tree, + parse, + parse_series, +) + +if TYPE_CHECKING: + from collections.abc import Callable + + # typing.Self requires Python 3.11 + from typing_extensions import Self class ExpressionError(SelectorError, RuntimeError): @@ -30,368 +49,656 @@ class ExpressionError(SelectorError, RuntimeError): #### XPath Helpers -class XPathExpr(object): - def __init__(self, path='', element='*', condition='', star_prefix=False): +class XPathExpr: + def __init__( + self, + path: str = "", + element: str = "*", + condition: str = "", + star_prefix: bool = False, + ) -> None: self.path = path self.element = element self.condition = condition - self.star_prefix = star_prefix - def __str__(self): - path = _unicode(self.path) + _unicode(self.element) + def __str__(self) -> str: + path = str(self.path) + 
str(self.element) if self.condition: - path += '[%s]' % self.condition + path += f"[{self.condition}]" return path - def __repr__(self): - return '%s[%s]' % (self.__class__.__name__, self) + def __repr__(self) -> str: + return f"{self.__class__.__name__}[{self}]" - def add_condition(self, condition): + def add_condition(self, condition: str, conjuction: str = "and") -> Self: if self.condition: - self.condition = '%s and (%s)' % (self.condition, condition) + self.condition = f"({self.condition}) {conjuction} ({condition})" else: self.condition = condition return self - def add_name_test(self): - if self.element == '*': + def add_name_test(self) -> None: + if self.element == "*": # We weren't doing a test anyway return - self.add_condition( - "name() = %s" % GenericTranslator.xpath_literal(self.element)) - self.element = '*' + self.add_condition(f"name() = {GenericTranslator.xpath_literal(self.element)}") + self.element = "*" - def add_star_prefix(self): + def add_star_prefix(self) -> None: """ - Adds a /* prefix if there is no prefix. This is when you need - to keep context's constrained to a single parent. + Append '*/' to the path to keep the context constrained + to a single parent. """ - if self.path: - self.path += '*/' - else: - self.path = '*/' - self.star_prefix = True - - def join(self, combiner, other): - path = _unicode(self) + combiner - # We don't need a star prefix if we are joining to this other - # prefix; so we'll get rid of it - if not(other.star_prefix and other.path == '*/'): + self.path += "*/" + + def join( + self, + combiner: str, + other: XPathExpr, + closing_combiner: str | None = None, + has_inner_condition: bool = False, + ) -> Self: + path = str(self) + combiner + # Any "star prefix" is redundant when joining. 
+ if other.path != "*/": path += other.path self.path = path - self.element = other.element - self.condition = other.condition + if not has_inner_condition: + self.element = ( + other.element + closing_combiner if closing_combiner else other.element + ) + self.condition = other.condition + else: + self.element = other.element + if other.condition: + self.element += "[" + other.condition + "]" + if closing_combiner: + self.element += closing_combiner return self split_at_single_quotes = re.compile("('+)").split +# The spec is actually more permissive than that, but don’t bother. +# This is just for the fast path. +# http://www.w3.org/TR/REC-xml/#NT-NameStartChar +is_safe_name = re.compile("^[a-zA-Z_][a-zA-Z0-9_.-]*$").match + +# Test that the string is not empty and does not contain whitespace +is_non_whitespace = re.compile(r"^[^ \t\r\n\f]+$").match + #### Translation -class GenericTranslator(object): + +class GenericTranslator: """ Translator for "generic" XML documents. + + Everything is case-sensitive, no assumption is made on the meaning + of element names and attribute names. + """ + + #### + #### HERE BE DRAGONS + #### + #### You are welcome to hook into this to change some behavior, + #### but do so at your own risks. + #### Until it has received a lot more work and review, + #### I reserve the right to change this API in backward-incompatible ways + #### with any minor version of cssselect. + #### See https://github.com/scrapy/cssselect/pull/22 + #### -- Simon Sapin. 
+ #### + combinator_mapping = { - ' ': 'descendant', - '>': 'child', - '+': 'direct_adjacent', - '~': 'indirect_adjacent', + " ": "descendant", + ">": "child", + "+": "direct_adjacent", + "~": "indirect_adjacent", } attribute_operator_mapping = { - 'exists': 'exists', - '=': 'equals', - '~=': 'includes', - '|=': 'dashmatch', - '^=': 'prefixmatch', - '$=': 'suffixmatch', - '*=': 'substringmatch', - '!=': 'different', # XXX Not in Level 3 but meh + "exists": "exists", + "=": "equals", + "~=": "includes", + "|=": "dashmatch", + "^=": "prefixmatch", + "$=": "suffixmatch", + "*=": "substringmatch", + "!=": "different", # XXX Not in Level 3 but meh } #: The attribute used for ID selectors depends on the document language: #: http://www.w3.org/TR/selectors/#id-selectors - id_attribute = 'id' - - def css_to_xpath(self, css, prefix='descendant-or-self::'): + id_attribute = "id" + + #: The attribute used for ``:lang()`` depends on the document language: + #: http://www.w3.org/TR/selectors/#lang-pseudo + lang_attribute = "xml:lang" + + #: The case sensitivity of document language element names, + #: attribute names, and attribute values in selectors depends + #: on the document language. + #: http://www.w3.org/TR/selectors/#casesens + #: + #: When a document language defines one of these as case-insensitive, + #: cssselect assumes that the document parser makes the parsed values + #: lower-case. Making the selector lower-case too makes the comparaison + #: case-insensitive. + #: + #: In HTML, element names and attributes names (but not attribute values) + #: are case-insensitive. All of lxml.html, html5lib, BeautifulSoup4 + #: and HTMLParser make them lower-case in their parse result, so + #: the assumption holds. 
+ lower_case_element_names = False + lower_case_attribute_names = False + lower_case_attribute_values = False + + # class used to represent and xpath expression + xpathexpr_cls = XPathExpr + + def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str: """Translate a *group of selectors* to XPath. Pseudo-elements are not supported here since XPath only knows about "real" elements. :param css: - A *group of selectors* as an Unicode string. + A *group of selectors* as a string. + :param prefix: + This string is prepended to the XPath expression for each selector. + The default makes selectors scoped to the context node’s subtree. :raises: - :class:`SelectorSyntaxError` on invalid selectors, + :class:`~cssselect.SelectorSyntaxError` on invalid selectors, :class:`ExpressionError` on unknown/unsupported selectors, including pseudo-elements. :returns: - The equivalent XPath 1.0 expression as an Unicode string. + The equivalent XPath 1.0 expression as a string. """ - selectors = parse(css) - for selector in selectors: - if selector.pseudo_element: - raise ExpressionError('Pseudo-elements are not supported.') - - return ' | '.join( - self.selector_to_xpath(selector, prefix) - for selector in selectors) + return " | ".join( + self.selector_to_xpath(selector, prefix, translate_pseudo_elements=True) + for selector in parse(css) + ) - def selector_to_xpath(self, selector, prefix='descendant-or-self::'): + def selector_to_xpath( + self, + selector: Selector, + prefix: str = "descendant-or-self::", + translate_pseudo_elements: bool = False, + ) -> str: """Translate a parsed selector to XPath. - The :attr:`~Selector.pseudo_element` attribute of the selector - is ignored. It is the caller's responsibility to reject selectors - with pseudo-elements, or to account for them somehow. :param selector: A parsed :class:`Selector` object. + :param prefix: + This string is prepended to the resulting XPath expression. 
+ The default makes selectors scoped to the context node’s subtree. + :param translate_pseudo_elements: + Unless this is set to ``True`` (as :meth:`css_to_xpath` does), + the :attr:`~Selector.pseudo_element` attribute of the selector + is ignored. + It is the caller's responsibility to reject selectors + with pseudo-elements, or to account for them somehow. :raises: :class:`ExpressionError` on unknown/unsupported selectors. :returns: - The equivalent XPath 1.0 expression as an Unicode string. + The equivalent XPath 1.0 expression as a string. + + """ + tree = getattr(selector, "parsed_tree", None) + if not tree: + raise TypeError(f"Expected a parsed selector, got {selector!r}") + xpath = self.xpath(tree) + assert isinstance(xpath, self.xpathexpr_cls) # help debug a missing 'return' + if translate_pseudo_elements and selector.pseudo_element: + xpath = self.xpath_pseudo_element(xpath, selector.pseudo_element) + return (prefix or "") + str(xpath) + + def xpath_pseudo_element( + self, xpath: XPathExpr, pseudo_element: PseudoElement + ) -> XPathExpr: + """Translate a pseudo-element. + + Defaults to not supporting pseudo-elements at all, + but can be overridden by sub-classes. 
""" - return (prefix or '') + _unicode(self.xpath(selector._tree)) + raise ExpressionError("Pseudo-elements are not supported.") @staticmethod - def xpath_literal(s): - s = _unicode(s) + def xpath_literal(s: str) -> str: + s = str(s) if "'" not in s: - s = "'%s'" % s + s = f"'{s}'" elif '"' not in s: - s = '"%s"' % s + s = f'"{s}"' else: - s = "concat(%s)" % ','.join([ - (("'" in part) and '"%s"' or "'%s'") % part - for part in split_at_single_quotes(s) if part - ]) + parts_quoted = [ + f'"{part}"' if "'" in part else f"'{part}'" + for part in split_at_single_quotes(s) + if part + ] + s = "concat({})".format(",".join(parts_quoted)) return s - def xpath(self, parsed_selector): + def xpath(self, parsed_selector: Tree) -> XPathExpr: """Translate any parsed selector object.""" type_name = type(parsed_selector).__name__ - method = getattr(self, 'xpath_%s' % type_name.lower(), None) - if not method: - raise TypeError('Expected a parsed selector, got %s' % type_name) + method = cast( + "Callable[[Tree], XPathExpr] | None", + getattr(self, f"xpath_{type_name.lower()}", None), + ) + if method is None: + raise ExpressionError(f"{type_name} is not supported.") return method(parsed_selector) - # Dispatched by parsed object type - def xpath_combinedselector(self, combined): + def xpath_combinedselector(self, combined: CombinedSelector) -> XPathExpr: """Translate a combined selector.""" - combinator = self.combinator_mapping.get(combined.combinator) - if not combinator: - raise ExpressionError( - "Unknown combinator: %r" % combined.combinator) - method = getattr(self, 'xpath_%s_combinator' % combinator) - return method(self.xpath(combined.selector), - self.xpath(combined.subselector)) + combinator = self.combinator_mapping[combined.combinator] + method = cast( + "Callable[[XPathExpr, XPathExpr], XPathExpr]", + getattr(self, f"xpath_{combinator}_combinator"), + ) + return method(self.xpath(combined.selector), self.xpath(combined.subselector)) - def xpath_negation(self, negation): 
+ def xpath_negation(self, negation: Negation) -> XPathExpr: xpath = self.xpath(negation.selector) sub_xpath = self.xpath(negation.subselector) sub_xpath.add_name_test() - return xpath.add_condition('not(%s)' % sub_xpath.condition) + if sub_xpath.condition: + return xpath.add_condition(f"not({sub_xpath.condition})") + return xpath.add_condition("0") + + def xpath_relation(self, relation: Relation) -> XPathExpr: + xpath = self.xpath(relation.selector) + combinator = relation.combinator + subselector = relation.subselector + right = self.xpath(subselector.parsed_tree) + method = cast( + "Callable[[XPathExpr, XPathExpr], XPathExpr]", + getattr( + self, + f"xpath_relation_{self.combinator_mapping[cast('str', combinator.value)]}_combinator", + ), + ) + return method(xpath, right) + + def xpath_matching(self, matching: Matching) -> XPathExpr: + xpath = self.xpath(matching.selector) + exprs = [self.xpath(selector) for selector in matching.selector_list] + for e in exprs: + e.add_name_test() + if e.condition: + xpath.add_condition(e.condition, "or") + return xpath - def xpath_function(self, function): + def xpath_specificityadjustment(self, matching: SpecificityAdjustment) -> XPathExpr: + xpath = self.xpath(matching.selector) + exprs = [self.xpath(selector) for selector in matching.selector_list] + for e in exprs: + e.add_name_test() + if e.condition: + xpath.add_condition(e.condition, "or") + return xpath + + def xpath_function(self, function: Function) -> XPathExpr: """Translate a functional pseudo-class.""" - method = 'xpath_%s_function' % function.name.replace('-', '_') - method = getattr(self, method, None) + method_name = "xpath_{}_function".format(function.name.replace("-", "_")) + method = cast( + "Callable[[XPathExpr, Function], XPathExpr] | None", + getattr(self, method_name, None), + ) if not method: - raise ExpressionError( - "The pseudo-class :%s() is unknown" % function.name) + raise ExpressionError(f"The pseudo-class :{function.name}() is unknown") return 
method(self.xpath(function.selector), function) - def xpath_pseudo(self, pseudo): + def xpath_pseudo(self, pseudo: Pseudo) -> XPathExpr: """Translate a pseudo-class.""" - method = 'xpath_%s_pseudo' % pseudo.ident.replace('-', '_') - method = getattr(self, method, None) + method_name = "xpath_{}_pseudo".format(pseudo.ident.replace("-", "_")) + method = cast( + "Callable[[XPathExpr], XPathExpr] | None", + getattr(self, method_name, None), + ) if not method: # TODO: better error message for pseudo-elements? - raise ExpressionError( - "The pseudo-class :%s is unknown" % pseudo.ident) + raise ExpressionError(f"The pseudo-class :{pseudo.ident} is unknown") return method(self.xpath(pseudo.selector)) - - def xpath_attrib(self, selector): + def xpath_attrib(self, selector: Attrib) -> XPathExpr: """Translate an attribute selector.""" - operator = self.attribute_operator_mapping.get(selector.operator) - if not operator: - raise ExpressionError( - "Unknown attribute operator: %r" % selector.operator) - method = getattr(self, 'xpath_attrib_%s' % operator) - # FIXME: what if attrib is *? 
- if selector.namespace == '*': - name = '@' + selector.attrib + operator = self.attribute_operator_mapping[selector.operator] + method = cast( + "Callable[[XPathExpr, str, str | None], XPathExpr]", + getattr(self, f"xpath_attrib_{operator}"), + ) + if self.lower_case_attribute_names: + name = selector.attrib.lower() + else: + name = selector.attrib + safe = is_safe_name(name) + if selector.namespace: + name = f"{selector.namespace}:{name}" + safe = safe and is_safe_name(selector.namespace) + if safe: + attrib = "@" + name + else: + attrib = f"attribute::*[name() = {self.xpath_literal(name)}]" + if selector.value is None: + value = None + elif self.lower_case_attribute_values: + value = cast("str", selector.value.value).lower() else: - name = '@%s:%s' % (selector.namespace, selector.attrib) - return method(self.xpath(selector.selector), name, selector.value) + value = selector.value.value + return method(self.xpath(selector.selector), attrib, value) - def xpath_class(self, class_selector): + def xpath_class(self, class_selector: Class) -> XPathExpr: """Translate a class selector.""" # .foo is defined as [class~=foo] in the spec. 
xpath = self.xpath(class_selector.selector) - return self.xpath_attrib_includes( - xpath, '@class', class_selector.class_name) + return self.xpath_attrib_includes(xpath, "@class", class_selector.class_name) - def xpath_hash(self, id_selector): + def xpath_hash(self, id_selector: Hash) -> XPathExpr: """Translate an ID selector.""" xpath = self.xpath(id_selector.selector) - return xpath.add_condition('@%s = %s' % ( - self.id_attribute, self.xpath_literal(id_selector.id))) + return self.xpath_attrib_equals(xpath, "@id", id_selector.id) - def xpath_element(self, selector): + def xpath_element(self, selector: Element) -> XPathExpr: """Translate a type or universal selector.""" - if selector.namespace == '*': - element = selector.element.lower() + element = selector.element + if not element: + element = "*" + safe = True else: - # FIXME: Should we lowercase here? - element = '%s:%s' % (selector.namespace, selector.element) - return XPathExpr(element=element) - + safe = bool(is_safe_name(element)) + if self.lower_case_element_names: + element = element.lower() + if selector.namespace: + # Namespace prefixes are case-sensitive. 
+ # http://www.w3.org/TR/css3-namespace/#prefixes + element = f"{selector.namespace}:{element}" + safe = safe and bool(is_safe_name(selector.namespace)) + xpath = self.xpathexpr_cls(element=element) + if not safe: + xpath.add_name_test() + return xpath # CombinedSelector: dispatch by combinator - def xpath_descendant_combinator(self, left, right): + def xpath_descendant_combinator( + self, left: XPathExpr, right: XPathExpr + ) -> XPathExpr: """right is a child, grand-child or further descendant of left""" - return left.join('/descendant-or-self::*/', right) + return left.join("/descendant-or-self::*/", right) - def xpath_child_combinator(self, left, right): + def xpath_child_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr: """right is an immediate child of left""" - return left.join('/', right) + return left.join("/", right) - def xpath_direct_adjacent_combinator(self, left, right): + def xpath_direct_adjacent_combinator( + self, left: XPathExpr, right: XPathExpr + ) -> XPathExpr: """right is a sibling immediately after left""" - xpath = left.join('/following-sibling::', right) + xpath = left.join("/following-sibling::", right) xpath.add_name_test() - return xpath.add_condition('position() = 1') + return xpath.add_condition("position() = 1") - def xpath_indirect_adjacent_combinator(self, left, right): + def xpath_indirect_adjacent_combinator( + self, left: XPathExpr, right: XPathExpr + ) -> XPathExpr: """right is a sibling after left, immediately or not""" - return left.join('/following-sibling::', right) + return left.join("/following-sibling::", right) + + def xpath_relation_descendant_combinator( + self, left: XPathExpr, right: XPathExpr + ) -> XPathExpr: + """right is a child, grand-child or further descendant of left; select left""" + return left.join( + "[descendant::", right, closing_combiner="]", has_inner_condition=True + ) + def xpath_relation_child_combinator( + self, left: XPathExpr, right: XPathExpr + ) -> XPathExpr: + """right is an 
immediate child of left; select left""" + return left.join("[./", right, closing_combiner="]") + + def xpath_relation_direct_adjacent_combinator( + self, left: XPathExpr, right: XPathExpr + ) -> XPathExpr: + """right is a sibling immediately after left; select left""" + return left.add_condition( + f"following-sibling::*[(name() = '{right.element}') and (position() = 1)]" + ) + + def xpath_relation_indirect_adjacent_combinator( + self, left: XPathExpr, right: XPathExpr + ) -> XPathExpr: + """right is a sibling after left, immediately or not; select left""" + return left.join("[following-sibling::", right, closing_combiner="]") # Function: dispatch by function/pseudo-class name - def xpath_nth_child_function(self, xpath, function, last=False, - add_name_test=True): - a, b = parse_series(function.arguments) - if not a and not b and not last: - # a=0 means nothing is returned... - return xpath.add_condition('false() and position() = 0') - if add_name_test: - xpath.add_name_test() - xpath.add_star_prefix() - if a == 0: - if last: - b = 'last() - %s' % b - return xpath.add_condition('position() = %s' % b) - if last: - # FIXME: I'm not sure if this is right - a = -a - b = -b - if b > 0: - b_neg = str(-b) + def xpath_nth_child_function( + self, + xpath: XPathExpr, + function: Function, + last: bool = False, + add_name_test: bool = True, + ) -> XPathExpr: + try: + a, b = parse_series(function.arguments) + except ValueError as ex: + raise ExpressionError(f"Invalid series: '{function.arguments!r}'") from ex + + # From https://www.w3.org/TR/css3-selectors/#structural-pseudos: + # + # :nth-child(an+b) + # an+b-1 siblings before + # + # :nth-last-child(an+b) + # an+b-1 siblings after + # + # :nth-of-type(an+b) + # an+b-1 siblings with the same expanded element name before + # + # :nth-last-of-type(an+b) + # an+b-1 siblings with the same expanded element name after + # + # So, + # for :nth-child and :nth-of-type + # + # count(preceding-sibling::) = an+b-1 + # + # for 
:nth-last-child and :nth-last-of-type + # + # count(following-sibling::) = an+b-1 + # + # therefore, + # count(...) - (b-1) ≡ 0 (mod a) + # + # if a == 0: + # ~~~~~~~~~~ + # count(...) = b-1 + # + # if a < 0: + # ~~~~~~~~~ + # count(...) - b +1 <= 0 + # -> count(...) <= b-1 + # + # if a > 0: + # ~~~~~~~~~ + # count(...) - b +1 >= 0 + # -> count(...) >= b-1 + + # work with b-1 instead + b_min_1 = b - 1 + + # early-exit condition 1: + # ~~~~~~~~~~~~~~~~~~~~~~~ + # for a == 1, nth-*(an+b) means n+b-1 siblings before/after, + # and since n ∈ {0, 1, 2, ...}, if b-1<=0, + # there is always an "n" matching any number of siblings (maybe none) + if a == 1 and b_min_1 <= 0: + return xpath + + # early-exit condition 2: + # ~~~~~~~~~~~~~~~~~~~~~~~ + # an+b-1 siblings with a<0 and (b-1)<0 is not possible + if a < 0 and b_min_1 < 0: + return xpath.add_condition("0") + + # `add_name_test` boolean is inverted and somewhat counter-intuitive: + # + # nth_of_type() calls nth_child(add_name_test=False) + nodetest = "*" if add_name_test else f"{xpath.element}" + + # count siblings before or after the element + if not last: + siblings_count = f"count(preceding-sibling::{nodetest})" else: - b_neg = '+%s' % (-b) - if a != 1: - expr = ['(position() %s) mod %s = 0' % (b_neg, a)] + siblings_count = f"count(following-sibling::{nodetest})" + + # special case of fixed position: nth-*(0n+b) + # if a == 0: + # ~~~~~~~~~~ + # count(***-sibling::***) = b-1 + if a == 0: + return xpath.add_condition(f"{siblings_count} = {b_min_1}") + + expressions = [] + + if a > 0: + # siblings count, an+b-1, is always >= 0, + # so if a>0, and (b-1)<=0, an "n" exists to satisfy this, + # therefore, the predicate is only interesting if (b-1)>0 + if b_min_1 > 0: + expressions.append(f"{siblings_count} >= {b_min_1}") else: - expr = [] - if b >= 0: - expr.append('position() >= %s' % b) - elif b < 0 and last: - expr.append('position() < (last() %s)' % b) - expr = ' and '.join(expr) - if expr: - xpath.add_condition(expr) 
+ # if a<0, and (b-1)<0, no "n" satisfies this, + # this is tested above as an early exist condition + # otherwise, + expressions.append(f"{siblings_count} <= {b_min_1}") + + # operations modulo 1 or -1 are simpler, one only needs to verify: + # + # - either: + # count(***-sibling::***) - (b-1) = n = 0, 1, 2, 3, etc., + # i.e. count(***-sibling::***) >= (b-1) + # + # - or: + # count(***-sibling::***) - (b-1) = -n = 0, -1, -2, -3, etc., + # i.e. count(***-sibling::***) <= (b-1) + # we we just did above. + # + if abs(a) != 1: + # count(***-sibling::***) - (b-1) ≡ 0 (mod a) + left = siblings_count + + # apply "modulo a" on 2nd term, -(b-1), + # to simplify things like "(... +6) % -3", + # and also make it positive with |a| + b_neg = (-b_min_1) % abs(a) + + if b_neg != 0: + left = f"({left} +{b_neg})" + + expressions.append(f"{left} mod {a} = 0") + + template = "(%s)" if len(expressions) > 1 else "%s" + xpath.add_condition( + " and ".join(template % expression for expression in expressions) + ) return xpath - # FIXME: handle an+b, odd, even - # an+b means every-a, plus b, e.g., 2n+1 means odd - # 0n+b means b - # n+0 means a=1, i.e., all elements - # an means every a elements, i.e., 2n means even - # -n means -1n - # -1n+6 means elements 6 and previous - - def xpath_nth_last_child_function(self, xpath, function): + + def xpath_nth_last_child_function( + self, xpath: XPathExpr, function: Function + ) -> XPathExpr: return self.xpath_nth_child_function(xpath, function, last=True) - def xpath_nth_of_type_function(self, xpath, function): - if xpath.element == '*': - raise ExpressionError( - "*:nth-of-type() is not implemented") - return self.xpath_nth_child_function(xpath, function, - add_name_test=False) + def xpath_nth_of_type_function( + self, xpath: XPathExpr, function: Function + ) -> XPathExpr: + if xpath.element == "*": + raise ExpressionError("*:nth-of-type() is not implemented") + return self.xpath_nth_child_function(xpath, function, add_name_test=False) + + def 
xpath_nth_last_of_type_function( + self, xpath: XPathExpr, function: Function + ) -> XPathExpr: + if xpath.element == "*": + raise ExpressionError("*:nth-of-type() is not implemented") + return self.xpath_nth_child_function( + xpath, function, last=True, add_name_test=False + ) - def xpath_nth_last_of_type_function(self, xpath, function): - if xpath.element == '*': + def xpath_contains_function( + self, xpath: XPathExpr, function: Function + ) -> XPathExpr: + # Defined there, removed in later drafts: + # http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors + if function.argument_types() not in (["STRING"], ["IDENT"]): raise ExpressionError( - "*:nth-of-type() is not implemented") - return self.xpath_nth_child_function(xpath, function, last=True, - add_name_test=False) - - def xpath_contains_function(self, xpath, function): - return xpath.add_condition('contains(string(.), %s)' - % self.xpath_literal(function.arguments)) - - def function_unsupported(self, xpath, pseudo): - raise ExpressionError( - "The pseudo-class :%s() is not supported" % pseudo.name) - - xpath_lang_function = function_unsupported + f"Expected a single string or ident for :contains(), got {function.arguments!r}" + ) + value = cast("str", function.arguments[0].value) + return xpath.add_condition(f"contains(., {self.xpath_literal(value)})") + def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr: + if function.argument_types() not in (["STRING"], ["IDENT"]): + raise ExpressionError( + f"Expected a single string or ident for :lang(), got {function.arguments!r}" + ) + value = cast("str", function.arguments[0].value) + return xpath.add_condition(f"lang({self.xpath_literal(value)})") # Pseudo: dispatch by pseudo-class name - def xpath_root_pseudo(self, xpath): + def xpath_root_pseudo(self, xpath: XPathExpr) -> XPathExpr: return xpath.add_condition("not(parent::*)") - def xpath_first_child_pseudo(self, xpath): - xpath.add_star_prefix() - xpath.add_name_test() 
- return xpath.add_condition('position() = 1') + # CSS immediate children (CSS ":scope > div" to XPath "child::div" or "./div") + # Works only at the start of a selector + # Needed to get immediate children of a processed selector in Scrapy + # for product in response.css('.product'): + # description = product.css(':scope > div::text').get() + def xpath_scope_pseudo(self, xpath: XPathExpr) -> XPathExpr: + return xpath.add_condition("1") - def xpath_last_child_pseudo(self, xpath): - xpath.add_star_prefix() - xpath.add_name_test() - return xpath.add_condition('position() = last()') + def xpath_first_child_pseudo(self, xpath: XPathExpr) -> XPathExpr: + return xpath.add_condition("count(preceding-sibling::*) = 0") - def xpath_first_of_type_pseudo(self, xpath): - if xpath.element == '*': - raise ExpressionError( - "*:first-of-type is not implemented") - xpath.add_star_prefix() - return xpath.add_condition('position() = 1') + def xpath_last_child_pseudo(self, xpath: XPathExpr) -> XPathExpr: + return xpath.add_condition("count(following-sibling::*) = 0") - def xpath_last_of_type_pseudo(self, xpath): - if xpath.element == '*': - raise ExpressionError( - "*:last-of-type is not implemented") - xpath.add_star_prefix() - return xpath.add_condition('position() = last()') + def xpath_first_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr: + if xpath.element == "*": + raise ExpressionError("*:first-of-type is not implemented") + return xpath.add_condition(f"count(preceding-sibling::{xpath.element}) = 0") - def xpath_only_child_pseudo(self, xpath): - xpath.add_name_test() - xpath.add_star_prefix() - return xpath.add_condition('last() = 1') + def xpath_last_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr: + if xpath.element == "*": + raise ExpressionError("*:last-of-type is not implemented") + return xpath.add_condition(f"count(following-sibling::{xpath.element}) = 0") - def xpath_only_of_type_pseudo(self, xpath): - if xpath.element == '*': - raise ExpressionError( - 
"*:only-of-type is not implemented") - return xpath.add_condition('last() = 1') + def xpath_only_child_pseudo(self, xpath: XPathExpr) -> XPathExpr: + return xpath.add_condition("count(parent::*/child::*) = 1") - def xpath_empty_pseudo(self, xpath): - return xpath.add_condition("not(*) and not(normalize-space())") + def xpath_only_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr: + if xpath.element == "*": + raise ExpressionError("*:only-of-type is not implemented") + return xpath.add_condition(f"count(parent::*/child::{xpath.element}) = 1") - def pseudo_never_matches(self, xpath): + def xpath_empty_pseudo(self, xpath: XPathExpr) -> XPathExpr: + return xpath.add_condition("not(*) and not(string-length())") + + def pseudo_never_matches(self, xpath: XPathExpr) -> XPathExpr: """Common implementation for pseudo-classes that never match.""" return xpath.add_condition("0") @@ -407,82 +714,161 @@ def pseudo_never_matches(self, xpath): # Attrib: dispatch by attribute operator - def xpath_attrib_exists(self, xpath, name, value): + def xpath_attrib_exists( + self, xpath: XPathExpr, name: str, value: str | None + ) -> XPathExpr: assert not value xpath.add_condition(name) return xpath - def xpath_attrib_equals(self, xpath, name, value): - xpath.add_condition('%s = %s' % (name, self.xpath_literal(value))) + def xpath_attrib_equals( + self, xpath: XPathExpr, name: str, value: str | None + ) -> XPathExpr: + assert value is not None + xpath.add_condition(f"{name} = {self.xpath_literal(value)}") return xpath - def xpath_attrib_different(self, xpath, name, value): + def xpath_attrib_different( + self, xpath: XPathExpr, name: str, value: str | None + ) -> XPathExpr: + assert value is not None # FIXME: this seems like a weird hack... 
if value: - xpath.add_condition('not(%s) or %s != %s' - % (name, name, self.xpath_literal(value))) + xpath.add_condition(f"not({name}) or {name} != {self.xpath_literal(value)}") else: - xpath.add_condition('%s != %s' - % (name, self.xpath_literal(value))) + xpath.add_condition(f"{name} != {self.xpath_literal(value)}") return xpath - def xpath_attrib_includes(self, xpath, name, value): - xpath.add_condition( - "%s and contains(concat(' ', normalize-space(%s), ' '), %s)" - % (name, name, self.xpath_literal(' '+value+' '))) + def xpath_attrib_includes( + self, xpath: XPathExpr, name: str, value: str | None + ) -> XPathExpr: + if value and is_non_whitespace(value): + arg = self.xpath_literal(" " + value + " ") + xpath.add_condition( + f"{name} and contains(concat(' ', normalize-space({name}), ' '), {arg})" + ) + else: + xpath.add_condition("0") return xpath - def xpath_attrib_dashmatch(self, xpath, name, value): + def xpath_attrib_dashmatch( + self, xpath: XPathExpr, name: str, value: str | None + ) -> XPathExpr: + assert value is not None + arg = self.xpath_literal(value) + arg_dash = self.xpath_literal(value + "-") # Weird, but true... 
- xpath.add_condition('%s and (%s = %s or starts-with(%s, %s))' % ( - name, - name, self.xpath_literal(value), - name, self.xpath_literal(value + '-'))) + xpath.add_condition( + f"{name} and ({name} = {arg} or starts-with({name}, {arg_dash}))" + ) return xpath - def xpath_attrib_prefixmatch(self, xpath, name, value): - return xpath.add_condition('%s and starts-with(%s, %s)' % ( - name, name, self.xpath_literal(value))) + def xpath_attrib_prefixmatch( + self, xpath: XPathExpr, name: str, value: str | None + ) -> XPathExpr: + if value: + xpath.add_condition( + f"{name} and starts-with({name}, {self.xpath_literal(value)})" + ) + else: + xpath.add_condition("0") + return xpath - def xpath_attrib_suffixmatch(self, xpath, name, value): - # Oddly there is a starts-with in XPath 1.0, but not ends-with - return xpath.add_condition( - '%s and substring(%s, string-length(%s)-%s) = %s' - % (name, name, name, len(value)-1, self.xpath_literal(value))) + def xpath_attrib_suffixmatch( + self, xpath: XPathExpr, name: str, value: str | None + ) -> XPathExpr: + if value: + # Oddly there is a starts-with in XPath 1.0, but not ends-with + xpath.add_condition( + f"{name} and substring({name}, string-length({name})-{len(value) - 1}) = {self.xpath_literal(value)}" + ) + else: + xpath.add_condition("0") + return xpath - def xpath_attrib_substringmatch(self, xpath, name, value): - # Attribute selectors are case sensitive - return xpath.add_condition('%s and contains(%s, %s)' % ( - name, name, self.xpath_literal(value))) + def xpath_attrib_substringmatch( + self, xpath: XPathExpr, name: str, value: str | None + ) -> XPathExpr: + if value: + # Attribute selectors are case sensitive + xpath.add_condition( + f"{name} and contains({name}, {self.xpath_literal(value)})" + ) + else: + xpath.add_condition("0") + return xpath class HTMLTranslator(GenericTranslator): """ - Translator for HTML documents. + Translator for (X)HTML documents. 
+ + Has a more useful implementation of some pseudo-classes based on + HTML-specific element names and attribute names, as described in + the `HTML5 specification`_. It assumes no-quirks mode. + The API is the same as :class:`GenericTranslator`. + + .. _HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors + + :param xhtml: + If false (the default), element names and attribute names + are case-insensitive. + """ - def xpath_checked_pseudo(self, xpath): + + lang_attribute = "lang" + + def __init__(self, xhtml: bool = False) -> None: + self.xhtml = xhtml # Might be useful for sub-classes? + if not xhtml: + # See their definition in GenericTranslator. + self.lower_case_element_names = True + self.lower_case_attribute_names = True + + def xpath_checked_pseudo(self, xpath: XPathExpr) -> XPathExpr: # FIXME: is this really all the elements? return xpath.add_condition( "(@selected and name(.) = 'option') or " - "(@checked and name(.) = 'input')") + "(@checked " + "and (name(.) = 'input' or name(.) = 'command')" + "and (@type = 'checkbox' or @type = 'radio'))" + ) - def xpath_link_pseudo(self, xpath): - return xpath.add_condition("@href and name(.) = 'a'") + def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr: + if function.argument_types() not in (["STRING"], ["IDENT"]): + raise ExpressionError( + f"Expected a single string or ident for :lang(), got {function.arguments!r}" + ) + value = function.arguments[0].value + assert value + arg = self.xpath_literal(value.lower() + "-") + return xpath.add_condition( + "ancestor-or-self::*[@lang][1][starts-with(concat(" + # XPath 1.0 has no lower-case function... + f"translate(@{self.lang_attribute}, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', " + "'abcdefghijklmnopqrstuvwxyz'), " + f"'-'), {arg})]" + ) + + def xpath_link_pseudo(self, xpath: XPathExpr) -> XPathExpr: + return xpath.add_condition( + "@href and (name(.) = 'a' or name(.) = 'link' or name(.) 
= 'area')" + ) # Links are never visited, the implementation for :visited is the same # as in GenericTranslator - def xpath_disabled_pseudo(self, xpath): + def xpath_disabled_pseudo(self, xpath: XPathExpr) -> XPathExpr: # http://www.w3.org/TR/html5/section-index.html#attributes-1 - return xpath.add_condition(''' + return xpath.add_condition( + """ ( @disabled and ( - name(.) = 'input' or + (name(.) = 'input' and @type != 'hidden') or name(.) = 'button' or name(.) = 'select' or name(.) = 'textarea' or - name(.) = 'keygen' or name(.) = 'command' or name(.) = 'fieldset' or name(.) = 'optgroup' or @@ -490,39 +876,54 @@ def xpath_disabled_pseudo(self, xpath): ) ) or ( ( - name(.) = 'input' or + (name(.) = 'input' and @type != 'hidden') or name(.) = 'button' or name(.) = 'select' or - name(.) = 'textarea' or - name(.) = 'keygen' + name(.) = 'textarea' ) and ancestor::fieldset[@disabled] ) - ''') + """ + ) # FIXME: in the second half, add "and is not a descendant of that # fieldset element's first legend element child, if any." - def xpath_enabled_pseudo(self, xpath): + def xpath_enabled_pseudo(self, xpath: XPathExpr) -> XPathExpr: # http://www.w3.org/TR/html5/section-index.html#attributes-1 - return xpath.add_condition(''' + return xpath.add_condition( + """ ( + @href and ( + name(.) = 'a' or + name(.) = 'link' or + name(.) = 'area' + ) + ) or ( ( name(.) = 'command' or name(.) = 'fieldset' or - name(.) = 'optgroup' or - name(.) = 'option' + name(.) = 'optgroup' ) and not(@disabled) ) or ( ( - name(.) = 'input' or + (name(.) = 'input' and @type != 'hidden') or name(.) = 'button' or name(.) = 'select' or name(.) = 'textarea' or name(.) = 'keygen' ) and not (@disabled or ancestor::fieldset[@disabled]) + ) or ( + name(.) = 'option' and not( + @disabled or ancestor::optgroup[@disabled] + ) ) - ''') - # FIXME: in the second half, add "and is not a descendant of that - # fieldset element's first legend element child, if any." + """ + ) + # FIXME: ... 
or "li elements that are children of menu elements, + # and that have a child element that defines a command, if the first + # such element's Disabled State facet is false (not disabled)". + # FIXME: after ancestor::fieldset[@disabled], add "and is not a + # descendant of that fieldset element's first legend element child, + # if any." diff --git a/docs/conf.py b/docs/conf.py index 22e6032..da3f023 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # # cssselect documentation build configuration file, created by # sphinx-quickstart on Tue Mar 27 14:20:34 2012. @@ -12,217 +11,210 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys, os, re +import re +from pathlib import Path # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) +# sys.path.insert(0, os.path.abspath('.')) # -- General configuration ----------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', - 'sphinx.ext.doctest'] +extensions = ["sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.doctest"] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = {".rst": "restructuredtext"} # The encoding of source files. 
-#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'cssselect' -copyright = '2012, Simon Sapin' +project = "cssselect" +project_copyright = "2012-2017, Simon Sapin, Scrapy developers" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The full version, including alpha/beta/rc tags. -init_py = open(os.path.join(os.path.dirname(__file__), - '..', 'cssselect', '__init__.py')).read() -release = re.search("VERSION = '([^']+)'", init_py).group(1) +init_py = (Path(__file__).parent.parent / "cssselect" / "__init__.py").read_text() +release = re.search('VERSION = "([^"]+)"', init_py).group(1) # The short X.Y version. -version = release.rstrip('dev') +version = release.rstrip("dev") # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). 
-#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # -- Options for HTML output --------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -#html_theme = 'agogo' +html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
-#html_static_path = ['_static'] +# html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Output file base name for HTML help builder. 
-htmlhelp_basename = 'cssselectdoc' +htmlhelp_basename = "cssselectdoc" # -- Options for LaTeX output -------------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'cssselect.tex', 'cssselect Documentation', - 'Simon Sapin', 'manual'), + ("index", "cssselect.tex", "cssselect Documentation", "Simon Sapin", "manual"), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output -------------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
-man_pages = [ - ('index', 'cssselect', 'cssselect Documentation', - ['Simon Sapin'], 1) -] +man_pages = [("index", "cssselect", "cssselect Documentation", ["Simon Sapin"], 1)] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------------ @@ -231,20 +223,35 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'cssselect', 'cssselect Documentation', - 'Simon Sapin', 'cssselect', 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "cssselect", + "cssselect Documentation", + "Simon Sapin", + "cssselect", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # Example configuration for intersphinx: refer to the Python standard library. 
-intersphinx_mapping = {'http://docs.python.org/': None} +intersphinx_mapping = {"python": ("https://docs.python.org/3", None)} + + +# --- Nitpicking options ------------------------------------------------------ + +nitpicky = True +nitpick_ignore = [ + # explicitly not a part of the public API + ("py:class", "Token"), +] diff --git a/docs/conftest.py b/docs/conftest.py new file mode 100644 index 0000000..a71d108 --- /dev/null +++ b/docs/conftest.py @@ -0,0 +1,21 @@ +from doctest import ELLIPSIS, NORMALIZE_WHITESPACE + +from sybil import Sybil +from sybil.parsers.doctest import DocTestParser +from sybil.parsers.skip import skip + +try: + # sybil 3.0.0+ + from sybil.parsers.codeblock import PythonCodeBlockParser +except ImportError: + from sybil.parsers.codeblock import CodeBlockParser as PythonCodeBlockParser + + +pytest_collect_file = Sybil( + parsers=[ + DocTestParser(optionflags=ELLIPSIS | NORMALIZE_WHITESPACE), + PythonCodeBlockParser(future_imports=["print_function"]), + skip, + ], + pattern="*.rst", +).pytest() diff --git a/docs/index.rst b/docs/index.rst index 0c060fc..a024f20 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -44,8 +44,9 @@ The resulting expression can be used with lxml's `XPath engine`_: User API ======== -In CSS3 terms, a `group of selectors`_ is a sequence of comma-separated -selectors. For example, ``div, h1.title + p`` is a group of 2 selectors. +In CSS3 Selectors terms, the top-level object is a `group of selectors`_, a +sequence of comma-separated selectors. For example, ``div, h1.title + p`` +is a group of two selectors. .. _group of selectors: http://www.w3.org/TR/selectors/#grouping @@ -53,12 +54,15 @@ selectors. For example, ``div, h1.title + p`` is a group of 2 selectors. .. autoclass:: Selector() :members: +.. autoclass:: FunctionalPseudoElement + .. autoclass:: GenericTranslator :members: css_to_xpath, selector_to_xpath .. autoclass:: HTMLTranslator - The API is the same as :class:`GenericTranslator`. 
+Exceptions +---------- .. autoexception:: SelectorError .. autoexception:: SelectorSyntaxError @@ -90,18 +94,29 @@ they never match: These applicable pseudo-classes are not yet implemented: -* ``:lang(language)`` * ``*:first-of-type``, ``*:last-of-type``, ``*:nth-of-type``, ``*:nth-last-of-type``, ``*:only-of-type``. All of these work when you specify an element type, but not with ``*`` On the other hand, *cssselect* supports some selectors that are not -in the Level 3 specification: +in the Level 3 specification. + +These parts of the Level 4 specification are supported (note that a large part +of the Level 4 additions is not applicable to cssselect similarly to ``:hover`` +or not representable in XPath 1.0 so the complete specification is unlikely to +be implemented): + +* The ``:scope`` pseudo-class. Limitation: it can only be used at a start of a + selector. +* The ``:is()``, ``:where()`` and ``:has()`` pseudo-classes. Limitation: + ``:has()`` cannot contain nested ``:has()`` or ``:not()``. + +These are non-standard extensions: * The ``:contains(text)`` pseudo-class that existed in `an early draft`_ but was then removed. * The ``!=`` attribute operator. ``[foo!=bar]`` is the same as - ``:not([foo=bar])`` + ``:not([foo=bar])``. * ``:not()`` accepts a *sequence of simple selectors*, not just single *simple selector*. For example, ``:not(a.important[rel])`` is allowed, even though the negation contains 3 *simple selectors*. @@ -134,9 +149,9 @@ implemented without forking or monkey-patching cssselect. The "customization API" is the set of methods in translation classes and their signature. You can look at the `source code`_ to see how it works. However, be aware that this API is not very stable yet. It might change -and break you sub-class. +and break your sub-class. -.. _source code: https://github.com/SimonSapin/cssselect/blob/master/cssselect/xpath.py +.. 
_source code: https://github.com/scrapy/cssselect/blob/master/cssselect/xpath.py Namespaces diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..21cb2eb --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,2 @@ +sphinx==8.2.3 +sphinx-rtd-theme==3.0.2 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c7c54a0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,239 @@ +[build-system] +build-backend = "hatchling.build" +requires = ["hatchling>=1.27.0"] + +[project] +name = "cssselect" +license = "BSD-3-Clause" +license-files = ["LICENSE", "AUTHORS"] +description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0" +readme = "README.rst" +authors = [{ name = "Ian Bicking", email = "ianb@colorstudy.com" }] +maintainers = [{ name = "Paul Tremberth", email = "paul.tremberth@gmail.com" }] +requires-python = ">=3.10" +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dynamic = ["version"] + +[project.urls] +"Homepage" = "https://github.com/scrapy/cssselect" + +[tool.hatch.version] +path = "cssselect/__init__.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/cssselect", + "/docs", + "/tests", + "/CHANGES", + "/README.rst", + "/tox.ini", +] +exclude = [ + "/docs/_build", +] + +[tool.hatch.build.targets.wheel] +packages = ["cssselect"] + +[tool.bumpversion] +current_version = "1.4.0" +commit = true +tag = true + +[[tool.bumpversion.files]] +filename = "cssselect/__init__.py" + +[[tool.bumpversion.files]] +filename = "CHANGES" +search = "^Unreleased\\.$" +replace = 
"Released on {now:%Y-%m-%d}." +regex = true + +[tool.coverage.run] +branch = true +source = ["cssselect"] + +[tool.coverage.report] +exclude_also = [ + "def __repr__", + "if sys.version_info", + "if __name__ == '__main__':", +] + +[tool.mypy] +strict = true + +[tool.pylint.MASTER] +persistent = "no" +extension-pkg-allow-list = ["lxml"] + +[tool.pylint."MESSAGES CONTROL"] +enable = [ + "useless-suppression", +] +disable = [ + "consider-using-f-string", + "fixme", + "invalid-name", + "line-too-long", + "missing-class-docstring", + "missing-function-docstring", + "missing-module-docstring", + "no-member", + "not-callable", + "redefined-builtin", + "redefined-outer-name", + "too-few-public-methods", + "too-many-arguments", + "too-many-branches", + "too-many-function-args", + "too-many-lines", + "too-many-locals", + "too-many-positional-arguments", + "too-many-public-methods", + "too-many-statements", + "unused-argument", +] + +[tool.pytest.ini_options] +testpaths = ["tests"] + +[tool.ruff.lint] +extend-select = [ + # flake8-builtins + "A", + # flake8-async + "ASYNC", + # flake8-bugbear + "B", + # flake8-comprehensions + "C4", + # flake8-commas + "COM", + # pydocstyle + "D", + # flake8-future-annotations + "FA", + # flynt + "FLY", + # refurb + "FURB", + # isort + "I", + # flake8-implicit-str-concat + "ISC", + # flake8-logging + "LOG", + # Perflint + "PERF", + # pygrep-hooks + "PGH", + # flake8-pie + "PIE", + # pylint + "PL", + # flake8-pytest-style + "PT", + # flake8-use-pathlib + "PTH", + # flake8-pyi + "PYI", + # flake8-quotes + "Q", + # flake8-return + "RET", + # flake8-raise + "RSE", + # Ruff-specific rules + "RUF", + # flake8-bandit + "S", + # flake8-simplify + "SIM", + # flake8-slots + "SLOT", + # flake8-debugger + "T10", + # flake8-type-checking + "TC", + # pyupgrade + "UP", + # pycodestyle warnings + "W", + # flake8-2020 + "YTT", +] +ignore = [ + # Trailing comma missing + "COM812", + # Missing docstring in public module + "D100", + # Missing docstring in public 
class + "D101", + # Missing docstring in public method + "D102", + # Missing docstring in public function + "D103", + # Missing docstring in public package + "D104", + # Missing docstring in magic method + "D105", + # Missing docstring in public nested class + "D106", + # Missing docstring in __init__ + "D107", + # One-line docstring should fit on one line with quotes + "D200", + # No blank lines allowed after function docstring + "D202", + # 1 blank line required between summary line and description + "D205", + # Multi-line docstring closing quotes should be on a separate line + "D209", + # First line should end with a period + "D400", + # First line should be in imperative mood; try rephrasing + "D401", + # First line should not be the function's "signature" + "D402", + # First word of the first line should be properly capitalized + "D403", + # Too many return statements + "PLR0911", + # Too many branches + "PLR0912", + # Too many arguments in function definition + "PLR0913", + # Too many statements + "PLR0915", + # Magic value used in comparison + "PLR2004", + # String contains ambiguous {}. + "RUF001", + # Docstring contains ambiguous {}. + "RUF002", + # Comment contains ambiguous {}. 
+ "RUF003", + # Mutable class attributes should be annotated with `typing.ClassVar` + "RUF012", + # Use of `assert` detected + "S101", +] + +[tool.ruff.lint.isort] +split-on-trailing-comma = false + +[tool.ruff.lint.pydocstyle] +convention = "pep257" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index ccddf11..0000000 --- a/setup.cfg +++ /dev/null @@ -1,10 +0,0 @@ -[build_sphinx] -source-dir = docs -build-dir = docs/_build -#all_files = 1 - -[upload_sphinx] # Sphinx-PyPI-upload -upload-dir = docs/_build/html - -[pytest] -python_files=tests.py diff --git a/setup.py b/setup.py deleted file mode 100644 index df95379..0000000 --- a/setup.py +++ /dev/null @@ -1,39 +0,0 @@ -import re -import os.path -from setuptools import setup - - -ROOT = os.path.dirname(__file__) -README = open(os.path.join(ROOT, 'README.rst')).read() -INIT_PY = open(os.path.join(ROOT, 'cssselect', '__init__.py')).read() -VERSION = re.search("VERSION = '([^']+)'", INIT_PY).group(1) - - -setup( - name='cssselect', - version=VERSION, - author='Ian Bicking', - author_email='ianb@colorstudy.com', - maintainer='Simon Sapin', - maintainer_email='simon.sapin@exyr.org', - description= - 'cssselect parses CSS3 Selectors and translates them to XPath 1.0', - long_description=README, - url='http://packages.python.org/cssselect/', - license='BSD', - packages=['cssselect'], - test_suite='cssselect.tests', - classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: BSD License', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.4', - 'Programming Language :: Python :: 2.5', - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.1', - 'Programming Language :: Python :: 3.2', - ], -) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/tests/test_cssselect.py b/tests/test_cssselect.py new file mode 100644 index 0000000..dc67bb7 --- /dev/null +++ b/tests/test_cssselect.py @@ -0,0 +1,1540 @@ +#!/usr/bin/env python +""" +Tests for cssselect +=================== + +These tests can be run either by py.test or by the standard library's +unittest. They use plain ``assert`` statements and do little reporting +themselves in case of failure. + +Use py.test to get fancy error reporting and assert introspection. + + +:copyright: (c) 2007-2012 Ian Bicking and contributors. +See AUTHORS for more details. +:license: BSD, see LICENSE for more details. + +""" + +from __future__ import annotations + +import sys +import typing +import unittest +from typing import TYPE_CHECKING + +import pytest +from lxml import etree, html + +from cssselect import ( + ExpressionError, + GenericTranslator, + HTMLTranslator, + SelectorSyntaxError, + parse, +) +from cssselect.parser import ( + Function, + FunctionalPseudoElement, + PseudoElement, + Token, + parse_series, + tokenize, +) +from cssselect.xpath import XPathExpr + +if TYPE_CHECKING: + from collections.abc import Sequence + + +class TestCssselect(unittest.TestCase): + def test_tokenizer(self) -> None: + tokens = [ + str(item) + for item in tokenize(r'E\ é > f [a~="y\"x"]:nth(/* fu /]* */-3.7)') + ] + assert tokens == [ + "", + "", + "' at 5>", + "", + # the no-break space is not whitespace in CSS + "", # f\xa0 + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ] + + def test_parser(self) -> None: + def repr_parse(css: str) -> list[str]: + selectors = parse(css) + for selector in selectors: + assert selector.pseudo_element is None + return [repr(selector.parsed_tree) for selector in selectors] + + def parse_many(first: str, *others: str) -> list[str]: + result = repr_parse(first) + for other in others: + assert repr_parse(other) == result + return result + + assert parse_many("*") == ["Element[*]"] + assert parse_many("*|*") == ["Element[*]"] + assert 
parse_many("*|foo") == ["Element[foo]"] + assert parse_many("|foo") == ["Element[foo]"] + assert parse_many("foo|*") == ["Element[foo|*]"] + assert parse_many("foo|bar") == ["Element[foo|bar]"] + # This will never match, but it is valid: + assert parse_many("#foo#bar") == ["Hash[Hash[Element[*]#foo]#bar]"] + assert parse_many( + "div>.foo", + "div> .foo", + "div >.foo", + "div > .foo", + "div \n> \t \t .foo", + "div\r>\n\n\n.foo", + "div\f>\f.foo", + ) == ["CombinedSelector[Element[div] > Class[Element[*].foo]]"] + assert parse_many( + "td.foo,.bar", "td.foo, .bar", "td.foo\t\r\n\f ,\t\r\n\f .bar" + ) == [ + "Class[Element[td].foo]", + "Class[Element[*].bar]", + ] + assert parse_many("div, td.foo, div.bar span") == [ + "Element[div]", + "Class[Element[td].foo]", + "CombinedSelector[Class[Element[div].bar] Element[span]]", + ] + assert parse_many("div > p") == ["CombinedSelector[Element[div] > Element[p]]"] + assert parse_many("td:first") == ["Pseudo[Element[td]:first]"] + assert parse_many("td:first") == ["Pseudo[Element[td]:first]"] + assert parse_many("td :first") == [ + "CombinedSelector[Element[td] Pseudo[Element[*]:first]]" + ] + assert parse_many("td :first") == [ + "CombinedSelector[Element[td] Pseudo[Element[*]:first]]" + ] + assert parse_many("a[name]", "a[ name\t]") == ["Attrib[Element[a][name]]"] + assert parse_many("a [name]") == [ + "CombinedSelector[Element[a] Attrib[Element[*][name]]]" + ] + assert parse_many('a[rel="include"]', "a[rel = include]") == [ + "Attrib[Element[a][rel = 'include']]" + ] + assert parse_many("a[hreflang |= 'en']", "a[hreflang|=en]") == [ + "Attrib[Element[a][hreflang |= 'en']]" + ] + assert parse_many("div:nth-child(10)") == [ + "Function[Element[div]:nth-child(['10'])]" + ] + assert parse_many(":nth-child(2n+2)") == [ + "Function[Element[*]:nth-child(['2', 'n', '+2'])]" + ] + assert parse_many("div:nth-of-type(10)") == [ + "Function[Element[div]:nth-of-type(['10'])]" + ] + assert parse_many("div div:nth-of-type(10) .aclass") 
== [ + "CombinedSelector[CombinedSelector[Element[div] " + "Function[Element[div]:nth-of-type(['10'])]] " + " Class[Element[*].aclass]]" + ] + assert parse_many("label:only") == ["Pseudo[Element[label]:only]"] + assert parse_many("a:lang(fr)") == ["Function[Element[a]:lang(['fr'])]"] + assert parse_many('div:contains("foo")') == [ + "Function[Element[div]:contains(['foo'])]" + ] + assert parse_many("div#foobar") == ["Hash[Element[div]#foobar]"] + assert parse_many("div:not(div.foo)") == [ + "Negation[Element[div]:not(Class[Element[div].foo])]" + ] + assert parse_many("div:has(div.foo)") == [ + "Relation[Element[div]:has(Selector[Class[Element[div].foo]])]" + ] + assert parse_many("div:is(.foo, #bar)") == [ + "Matching[Element[div]:is(Class[Element[*].foo], Hash[Element[*]#bar])]" + ] + assert parse_many(":is(:hover, :visited)") == [ + "Matching[Element[*]:is(Pseudo[Element[*]:hover], Pseudo[Element[*]:visited])]" + ] + assert parse_many(":where(:hover, :visited)") == [ + "SpecificityAdjustment[Element[*]:where(Pseudo[Element[*]:hover]," + " Pseudo[Element[*]:visited])]" + ] + assert parse_many("td ~ th") == ["CombinedSelector[Element[td] ~ Element[th]]"] + assert parse_many(":scope > foo") == [ + "CombinedSelector[Pseudo[Element[*]:scope] > Element[foo]]" + ] + assert parse_many(" :scope > foo") == [ + "CombinedSelector[Pseudo[Element[*]:scope] > Element[foo]]" + ] + assert parse_many(":scope > foo bar > div") == [ + "CombinedSelector[CombinedSelector[CombinedSelector[Pseudo[Element[*]:scope] > " + "Element[foo]] Element[bar]] > Element[div]]" + ] + assert parse_many(":scope > #foo #bar") == [ + "CombinedSelector[CombinedSelector[Pseudo[Element[*]:scope] > " + "Hash[Element[*]#foo]] Hash[Element[*]#bar]]" + ] + + def test_pseudo_elements(self) -> None: + def parse_pseudo(css: str) -> list[tuple[str, str | None]]: + result: list[tuple[str, str | None]] = [] + for selector in parse(css): + pseudo = selector.pseudo_element + pseudo = str(pseudo) if pseudo else pseudo 
+ # No Symbol here + assert pseudo is None or isinstance(pseudo, str) + selector_as_str = repr(selector.parsed_tree) + result.append((selector_as_str, pseudo)) + return result + + def parse_one(css: str) -> tuple[str, str | None]: + result = parse_pseudo(css) + assert len(result) == 1 + return result[0] + + def test_pseudo_repr(css: str) -> str: + result = parse(css) + assert len(result) == 1 + selector = result[0] + return repr(selector.parsed_tree) + + assert parse_one("foo") == ("Element[foo]", None) + assert parse_one("*") == ("Element[*]", None) + assert parse_one(":empty") == ("Pseudo[Element[*]:empty]", None) + assert parse_one(":scope") == ("Pseudo[Element[*]:scope]", None) + + # Special cases for CSS 2.1 pseudo-elements + assert parse_one(":BEfore") == ("Element[*]", "before") + assert parse_one(":aftER") == ("Element[*]", "after") + assert parse_one(":First-Line") == ("Element[*]", "first-line") + assert parse_one(":First-Letter") == ("Element[*]", "first-letter") + + assert parse_one("::befoRE") == ("Element[*]", "before") + assert parse_one("::AFter") == ("Element[*]", "after") + assert parse_one("::firsT-linE") == ("Element[*]", "first-line") + assert parse_one("::firsT-letteR") == ("Element[*]", "first-letter") + + assert parse_one("::text-content") == ("Element[*]", "text-content") + assert parse_one("::attr(name)") == ( + "Element[*]", + "FunctionalPseudoElement[::attr(['name'])]", + ) + + assert parse_one("::Selection") == ("Element[*]", "selection") + assert parse_one("foo:after") == ("Element[foo]", "after") + assert parse_one("foo::selection") == ("Element[foo]", "selection") + assert parse_one("lorem#ipsum ~ a#b.c[href]:empty::selection") == ( + "CombinedSelector[Hash[Element[lorem]#ipsum] ~ " + "Pseudo[Attrib[Class[Hash[Element[a]#b].c][href]]:empty]]", + "selection", + ) + assert parse_pseudo(":scope > div, foo bar") == [ + ("CombinedSelector[Pseudo[Element[*]:scope] > Element[div]]", None), + ("CombinedSelector[Element[foo] Element[bar]]", 
None), + ] + assert parse_pseudo("foo bar, :scope > div") == [ + ("CombinedSelector[Element[foo] Element[bar]]", None), + ("CombinedSelector[Pseudo[Element[*]:scope] > Element[div]]", None), + ] + assert parse_pseudo("foo bar,:scope > div") == [ + ("CombinedSelector[Element[foo] Element[bar]]", None), + ("CombinedSelector[Pseudo[Element[*]:scope] > Element[div]]", None), + ] + assert parse_pseudo("foo:before, bar, baz:after") == [ + ("Element[foo]", "before"), + ("Element[bar]", None), + ("Element[baz]", "after"), + ] + + # Special cases for CSS 2.1 pseudo-elements are ignored by default + for pseudo in ("after", "before", "first-line", "first-letter"): + (selector,) = parse(f"e:{pseudo}") + assert selector.pseudo_element == pseudo + assert GenericTranslator().selector_to_xpath(selector, prefix="") == "e" + + # Pseudo Elements are ignored by default, but if allowed they are not + # supported by GenericTranslator + tr = GenericTranslator() + (selector,) = parse("e::foo") + assert selector.pseudo_element == "foo" + assert tr.selector_to_xpath(selector, prefix="") == "e" + with pytest.raises(ExpressionError): + tr.selector_to_xpath(selector, translate_pseudo_elements=True) + + # Special test for the unicode symbols and ':scope' element if check + # Errors if use repr() instead of __repr__() + assert test_pseudo_repr(":fİrst-child") == "Pseudo[Element[*]:fİrst-child]" + assert test_pseudo_repr(":scope") == "Pseudo[Element[*]:scope]" + + def test_specificity(self) -> None: + def specificity(css: str) -> tuple[int, int, int]: + selectors = parse(css) + assert len(selectors) == 1 + return selectors[0].specificity() + + assert specificity("*") == (0, 0, 0) + assert specificity(" foo") == (0, 0, 1) + assert specificity(":empty ") == (0, 1, 0) + assert specificity(":before") == (0, 0, 1) + assert specificity("*:before") == (0, 0, 1) + assert specificity(":nth-child(2)") == (0, 1, 0) + assert specificity(".bar") == (0, 1, 0) + assert specificity("[baz]") == (0, 1, 0) + assert 
specificity('[baz="4"]') == (0, 1, 0) + assert specificity('[baz^="4"]') == (0, 1, 0) + assert specificity("#lipsum") == (1, 0, 0) + assert specificity("::attr(name)") == (0, 0, 1) + + assert specificity(":not(*)") == (0, 0, 0) + assert specificity(":not(foo)") == (0, 0, 1) + assert specificity(":not(.foo)") == (0, 1, 0) + assert specificity(":not([foo])") == (0, 1, 0) + assert specificity(":not(:empty)") == (0, 1, 0) + assert specificity(":not(#foo)") == (1, 0, 0) + + assert specificity(":has(*)") == (0, 0, 0) + assert specificity(":has(foo)") == (0, 0, 1) + assert specificity(":has(.foo)") == (0, 1, 0) + assert specificity(":has(> foo)") == (0, 0, 1) + + assert specificity(":is(.foo, #bar)") == (1, 0, 0) + assert specificity(":is(:hover, :visited)") == (0, 1, 0) + assert specificity(":where(:hover, :visited)") == (0, 0, 0) + + assert specificity("foo:empty") == (0, 1, 1) + assert specificity("foo:before") == (0, 0, 2) + assert specificity("foo::before") == (0, 0, 2) + assert specificity("foo:empty::before") == (0, 1, 2) + + assert specificity("#lorem + foo#ipsum:first-child > bar:first-line") == ( + 2, + 1, + 3, + ) + + def test_css_export(self) -> None: + def css2css(css: str, res: str | None = None) -> None: + selectors = parse(css) + assert len(selectors) == 1 + assert selectors[0].canonical() == (res or css) + + css2css("*") + css2css(" foo", "foo") + css2css("Foo", "Foo") + css2css(":empty ", ":empty") + css2css(":before", "::before") + css2css(":beFOre", "::before") + css2css("*:before", "::before") + css2css(":nth-child(2)") + css2css(".bar") + css2css("[baz]") + css2css('[baz="4"]', "[baz='4']") + css2css('[baz^="4"]', "[baz^='4']") + css2css("[ns|attr='4']") + css2css("#lipsum") + css2css(":not(*)") + css2css(":not(foo)") + css2css(":not(*.foo)", ":not(.foo)") + css2css(":not(*[foo])", ":not([foo])") + css2css(":not(:empty)") + css2css(":not(#foo)") + css2css(":has(*)") + css2css(":has(foo)") + css2css(":has(*.foo)", ":has(.foo)") + css2css(":is(#bar, 
.foo)") + css2css(":is(:focused, :visited)") + css2css(":where(:focused, :visited)") + css2css("foo:empty") + css2css("foo::before") + css2css("foo:empty::before") + css2css('::name(arg + "val" - 3)', "::name(arg+'val'-3)") + css2css("#lorem + foo#ipsum:first-child > bar::first-line") + css2css("foo > *") + + def test_parse_errors(self) -> None: + def get_error(css: str) -> str | None: + try: + parse(css) + except SelectorSyntaxError: + return str(sys.exc_info()[1]) + return None + + assert get_error("attributes(href)/html/body/a") == ( + "Expected selector, got " + ) + assert get_error("attributes(href)") == ( + "Expected selector, got " + ) + assert get_error("html/body/a") == ("Expected selector, got ") + assert get_error(" ") == ("Expected selector, got ") + assert get_error("div, ") == ("Expected selector, got ") + assert get_error(" , div") == ("Expected selector, got ") + assert get_error("p, , div") == ("Expected selector, got ") + assert get_error("div > ") == ("Expected selector, got ") + assert get_error(" > div") == ("Expected selector, got ' at 2>") + assert get_error("foo|#bar") == ("Expected ident or '*', got ") + assert get_error("#.foo") == ("Expected selector, got ") + assert get_error(".#foo") == ("Expected ident, got ") + assert get_error(":#foo") == ("Expected ident, got ") + assert get_error("[*]") == ("Expected '|', got ") + assert get_error("[foo|]") == ("Expected ident, got ") + assert get_error("[#]") == ("Expected ident or '*', got ") + assert get_error("[foo=#]") == ( + "Expected string or ident, got " + ) + assert get_error("[href]a") == ("Expected selector, got ") + assert get_error("[rel=stylesheet]") is None + assert get_error("[rel:stylesheet]") == ( + "Operator expected, got " + ) + assert get_error("[rel=stylesheet") == ("Expected ']', got ") + assert get_error(":lang(fr)") is None + assert get_error(":lang(fr") == ("Expected an argument, got ") + assert get_error(':contains("foo') == ("Unclosed string at 10") + assert 
get_error("foo!") == ("Expected selector, got ") + + # Mis-placed pseudo-elements + assert get_error("a:before:empty") == ( + "Got pseudo-element ::before not at the end of a selector" + ) + assert get_error("li:before a") == ( + "Got pseudo-element ::before not at the end of a selector" + ) + assert get_error(":not(:before)") == ( + "Got pseudo-element ::before inside :not() at 12" + ) + assert get_error(":not(:not(a))") == ("Got nested :not()") + assert get_error(":is(:before)") == ( + "Got pseudo-element ::before inside function" + ) + assert get_error(":is(a b)") == ("Expected an argument, got ") + assert get_error(":where(:before)") == ( + "Got pseudo-element ::before inside function" + ) + assert get_error(":where(a b)") == ( + "Expected an argument, got " + ) + assert get_error(":scope > div :scope header") == ( + 'Got immediate child pseudo-element ":scope" not at the start of a selector' + ) + assert get_error("div :scope header") == ( + 'Got immediate child pseudo-element ":scope" not at the start of a selector' + ) + assert get_error("> div p") == ("Expected selector, got ' at 0>") + + # Unsupported :has() with several arguments + assert get_error(":has(a, b)") == ("Expected an argument, got ") + assert get_error(":has()") == ("Expected selector, got ") + + def test_translation(self) -> None: + def xpath(css: str) -> str: + return str(GenericTranslator().css_to_xpath(css, prefix="")) + + assert xpath("*") == "*" + assert xpath("e") == "e" + assert xpath("*|e") == "e" + assert xpath("e|f") == "e:f" + assert xpath("e[foo]") == "e[@foo]" + assert xpath("e[foo|bar]") == "e[@foo:bar]" + assert xpath('e[foo="bar"]') == "e[@foo = 'bar']" + assert xpath('e[foo~="bar"]') == ( + "e[@foo and contains(concat(' ', normalize-space(@foo), ' '), ' bar ')]" + ) + assert xpath('e[foo^="bar"]') == ("e[@foo and starts-with(@foo, 'bar')]") + assert xpath('e[foo$="bar"]') == ( + "e[@foo and substring(@foo, string-length(@foo)-2) = 'bar']" + ) + assert xpath('e[foo*="bar"]') 
== ("e[@foo and contains(@foo, 'bar')]") + assert xpath('e[hreflang|="en"]') == ( + "e[@hreflang and (@hreflang = 'en' or starts-with(@hreflang, 'en-'))]" + ) + + # --- nth-* and nth-last-* ------------------------------------- + assert xpath("e:nth-child(1)") == ("e[count(preceding-sibling::*) = 0]") + + # always true + assert xpath("e:nth-child(n)") == ("e") + assert xpath("e:nth-child(n+1)") == ("e") + # always true too + assert xpath("e:nth-child(n-10)") == ("e") + # b=2 is the limit... + assert xpath("e:nth-child(n+2)") == ("e[count(preceding-sibling::*) >= 1]") + # always false + assert xpath("e:nth-child(-n)") == ("e[0]") + # equivalent to first child + assert xpath("e:nth-child(-n+1)") == ("e[count(preceding-sibling::*) <= 0]") + + assert xpath("e:nth-child(3n+2)") == ( + "e[(count(preceding-sibling::*) >= 1) and " + "((count(preceding-sibling::*) +2) mod 3 = 0)]" + ) + assert xpath("e:nth-child(3n-2)") == ( + "e[count(preceding-sibling::*) mod 3 = 0]" + ) + assert xpath("e:nth-child(-n+6)") == ("e[count(preceding-sibling::*) <= 5]") + + assert xpath("e:nth-last-child(1)") == ("e[count(following-sibling::*) = 0]") + assert xpath("e:nth-last-child(2n)") == ( + "e[(count(following-sibling::*) +1) mod 2 = 0]" + ) + assert xpath("e:nth-last-child(2n+1)") == ( + "e[count(following-sibling::*) mod 2 = 0]" + ) + assert xpath("e:nth-last-child(2n+2)") == ( + "e[(count(following-sibling::*) >= 1) and " + "((count(following-sibling::*) +1) mod 2 = 0)]" + ) + assert xpath("e:nth-last-child(3n+1)") == ( + "e[count(following-sibling::*) mod 3 = 0]" + ) + # represents the two last e elements + assert xpath("e:nth-last-child(-n+2)") == ( + "e[count(following-sibling::*) <= 1]" + ) + + assert xpath("e:nth-of-type(1)") == ("e[count(preceding-sibling::e) = 0]") + assert xpath("e:nth-last-of-type(1)") == ("e[count(following-sibling::e) = 0]") + assert xpath("div e:nth-last-of-type(1) .aclass") == ( + "div/descendant-or-self::*/e[count(following-sibling::e) = 0]" + 
"/descendant-or-self::*/*[@class and contains(" + "concat(' ', normalize-space(@class), ' '), ' aclass ')]" + ) + + assert xpath("e:first-child") == ("e[count(preceding-sibling::*) = 0]") + assert xpath("e:last-child") == ("e[count(following-sibling::*) = 0]") + assert xpath("e:first-of-type") == ("e[count(preceding-sibling::e) = 0]") + assert xpath("e:last-of-type") == ("e[count(following-sibling::e) = 0]") + assert xpath("e:only-child") == ("e[count(parent::*/child::*) = 1]") + assert xpath("e:only-of-type") == ("e[count(parent::*/child::e) = 1]") + assert xpath("e:empty") == ("e[not(*) and not(string-length())]") + assert xpath("e:EmPTY") == ("e[not(*) and not(string-length())]") + assert xpath("e:root") == ("e[not(parent::*)]") + assert xpath("e:hover") == ("e[0]") # never matches + assert ( + xpath("div:has(bar.foo)") == "div[descendant::bar" + "[@class and contains(concat(' ', normalize-space(@class), ' '), ' foo ')]]" + ) + assert xpath("e:has(> f)") == "e[./f]" + assert xpath("e:has(f)") == "e[descendant::f]" + assert xpath("e:has(~ f)") == "e[following-sibling::f]" + assert ( + xpath("e:has(+ f)") + == "e[following-sibling::*[(name() = 'f') and (position() = 1)]]" + ) + assert xpath('e:contains("foo")') == ("e[contains(., 'foo')]") + assert xpath("e:ConTains(foo)") == ("e[contains(., 'foo')]") + assert xpath("e.warning") == ( + "e[@class and contains(" + "concat(' ', normalize-space(@class), ' '), ' warning ')]" + ) + assert xpath("e#myid") == ("e[@id = 'myid']") + assert xpath("e:not(:nth-child(odd))") == ( + "e[not(count(preceding-sibling::*) mod 2 = 0)]" + ) + assert xpath("e:nOT(*)") == ("e[0]") # never matches + assert xpath("e f") == ("e/descendant-or-self::*/f") + assert xpath("e > f") == ("e/f") + assert xpath("e + f") == ( + "e/following-sibling::*[(name() = 'f') and (position() = 1)]" + ) + assert xpath("e ~ f") == ("e/following-sibling::f") + assert xpath("e ~ f:nth-child(3)") == ( + "e/following-sibling::f[count(preceding-sibling::*) = 2]" + ) 
+ assert xpath("div#container p") == ( + "div[@id = 'container']/descendant-or-self::*/p" + ) + assert xpath("e:where(foo)") == "e[name() = 'foo']" + assert xpath("e:where(foo, bar)") == "e[(name() = 'foo') or (name() = 'bar')]" + + # Invalid characters in XPath element names + assert xpath(r"di\a0 v") == ("*[name() = 'di v']") # di\xa0v + assert xpath(r"di\[v") == ("*[name() = 'di[v']") + assert xpath(r"[h\a0 ref]") == ("*[attribute::*[name() = 'h ref']]") # h\xa0ref + assert xpath(r"[h\]ref]") == ("*[attribute::*[name() = 'h]ref']]") + + with pytest.raises(ExpressionError): + xpath(":fİrst-child") + with pytest.raises(ExpressionError): + xpath(":first-of-type") + with pytest.raises(ExpressionError): + xpath(":only-of-type") + with pytest.raises(ExpressionError): + xpath(":last-of-type") + with pytest.raises(ExpressionError): + xpath(":nth-of-type(1)") + with pytest.raises(ExpressionError): + xpath(":nth-last-of-type(1)") + with pytest.raises(ExpressionError): + xpath(":nth-child(n-)") + with pytest.raises(ExpressionError): + xpath(":after") + with pytest.raises(ExpressionError): + xpath(":lorem-ipsum") + with pytest.raises(ExpressionError): + xpath(":lorem(ipsum)") + with pytest.raises(ExpressionError): + xpath("::lorem-ipsum") + with pytest.raises(TypeError): + GenericTranslator().css_to_xpath(4) # type: ignore[arg-type] + with pytest.raises(TypeError): + GenericTranslator().selector_to_xpath("foo") # type: ignore[arg-type] + + def test_unicode(self) -> None: + css = ".a\xc1b" + xpath = GenericTranslator().css_to_xpath(css) + assert css[1:] in xpath + xpath = xpath.encode("ascii", "xmlcharrefreplace").decode("ASCII") + assert xpath == ( + "descendant-or-self::*[@class and contains(" + "concat(' ', normalize-space(@class), ' '), ' aÁb ')]" + ) + + def test_quoting(self) -> None: + css_to_xpath = GenericTranslator().css_to_xpath + assert css_to_xpath('*[aval="\'"]') == ( + """descendant-or-self::*[@aval = "'"]""" + ) + assert css_to_xpath("*[aval=\"'''\"]") == ( + 
"""descendant-or-self::*[@aval = "'''"]""" + ) + assert css_to_xpath("*[aval='\"']") == ( + """descendant-or-self::*[@aval = '"']""" + ) + assert css_to_xpath('*[aval=\'"""\']') == ( + '''descendant-or-self::*[@aval = '"""']''' + ) + assert css_to_xpath(':scope > div[dataimg=""]') == ( + "descendant-or-self::*[1]/div[@dataimg = '']" + ) + + def test_unicode_escapes(self) -> None: + # \22 == '"' \20 == ' ' + css_to_xpath = GenericTranslator().css_to_xpath + assert css_to_xpath(r'*[aval="\'\22\'"]') == ( + """descendant-or-self::*[@aval = concat("'",'"',"'")]""" + ) + assert css_to_xpath(r'*[aval="\'\22 2\'"]') == ( + """descendant-or-self::*[@aval = concat("'",'"2',"'")]""" + ) + assert css_to_xpath(r'*[aval="\'\20 \'"]') == ( + """descendant-or-self::*[@aval = "' '"]""" + ) + assert css_to_xpath("*[aval=\"'\\20\r\n '\"]") == ( + """descendant-or-self::*[@aval = "' '"]""" + ) + + def test_xpath_pseudo_elements(self) -> None: + class CustomTranslator(GenericTranslator): + def xpath_pseudo_element( + self, xpath: XPathExpr, pseudo_element: PseudoElement + ) -> XPathExpr: + if isinstance(pseudo_element, FunctionalPseudoElement): + method_name = "xpath_{}_functional_pseudo_element".format( + pseudo_element.name.replace("-", "_") + ) + method = getattr(self, method_name, None) + if not method: + raise ExpressionError( + f"The functional pseudo-element ::{pseudo_element.name}() is unknown" + ) + xpath = method(xpath, pseudo_element.arguments) + else: + method_name = "xpath_{}_simple_pseudo_element".format( + pseudo_element.replace("-", "_") + ) + method = getattr(self, method_name, None) + if not method: + raise ExpressionError( + f"The pseudo-element ::{pseudo_element} is unknown" + ) + xpath = method(xpath) + return xpath + + # functional pseudo-class: + # elements that have a certain number of attributes + def xpath_nb_attr_function( + self, xpath: XPathExpr, function: Function + ) -> XPathExpr: + assert function.arguments[0].value + nb_attributes = 
int(function.arguments[0].value) + return xpath.add_condition(f"count(@*)={nb_attributes}") + + # pseudo-class: + # elements that have 5 attributes + def xpath_five_attributes_pseudo(self, xpath: XPathExpr) -> XPathExpr: + return xpath.add_condition("count(@*)=5") + + # functional pseudo-element: + # element's attribute by name + def xpath_attr_functional_pseudo_element( + self, xpath: XPathExpr, arguments: Sequence[Token] + ) -> XPathExpr: + attribute_name = arguments[0].value + other = XPathExpr( + f"@{attribute_name}", + "", + ) + return xpath.join("/", other) + + # pseudo-element: + # element's text() nodes + def xpath_text_node_simple_pseudo_element( + self, xpath: XPathExpr + ) -> XPathExpr: + other = XPathExpr( + "text()", + "", + ) + return xpath.join("/", other) + + # pseudo-element: + # element's href attribute + def xpath_attr_href_simple_pseudo_element( + self, xpath: XPathExpr + ) -> XPathExpr: + other = XPathExpr( + "@href", + "", + ) + return xpath.join("/", other) + + # pseudo-element: + # used to demonstrate operator precedence + def xpath_first_or_second_pseudo(self, xpath: XPathExpr) -> XPathExpr: + return xpath.add_condition("@id = 'first' or @id = 'second'") + + def xpath(css: str) -> str: + return str(CustomTranslator().css_to_xpath(css)) + + assert xpath(":five-attributes") == "descendant-or-self::*[count(@*)=5]" + assert xpath(":nb-attr(3)") == "descendant-or-self::*[count(@*)=3]" + assert xpath("::attr(href)") == "descendant-or-self::*/@href" + assert xpath("::text-node") == "descendant-or-self::*/text()" + assert xpath("::attr-href") == "descendant-or-self::*/@href" + assert xpath("p img::attr(src)") == ( + "descendant-or-self::p/descendant-or-self::*/img/@src" + ) + assert xpath(":scope") == "descendant-or-self::*[1]" + assert xpath(":first-or-second[href]") == ( + "descendant-or-self::*[(@id = 'first' or @id = 'second') and (@href)]" + ) + + assert str(XPathExpr("", "", condition="@href")) == "[@href]" + + document = 
etree.fromstring(OPERATOR_PRECEDENCE_IDS) + sort_key = {el: count for count, el in enumerate(document.iter())}.__getitem__ + + def operator_id(selector: str) -> list[str]: + xpath = CustomTranslator().css_to_xpath(selector) + items = typing.cast("list[etree._Element]", document.xpath(xpath)) + items.sort(key=sort_key) + return [element.get("id", "nil") for element in items] + + assert operator_id(":first-or-second") == ["first", "second"] + assert operator_id(":first-or-second[href]") == ["second"] + assert operator_id("[href]:first-or-second") == ["second"] + + def test_series(self) -> None: + def series(css: str) -> tuple[int, int] | None: + (selector,) = parse(f":nth-child({css})") + args = typing.cast( + "FunctionalPseudoElement", selector.parsed_tree + ).arguments + try: + return parse_series(args) + except ValueError: + return None + + assert series("1n+3") == (1, 3) + assert series("1n +3") == (1, 3) + assert series("1n + 3") == (1, 3) + assert series("1n+ 3") == (1, 3) + assert series("1n-3") == (1, -3) + assert series("1n -3") == (1, -3) + assert series("1n - 3") == (1, -3) + assert series("1n- 3") == (1, -3) + assert series("n-5") == (1, -5) + assert series("odd") == (2, 1) + assert series("even") == (2, 0) + assert series("3n") == (3, 0) + assert series("n") == (1, 0) + assert series("+n") == (1, 0) + assert series("-n") == (-1, 0) + assert series("5") == (0, 5) + assert series("foo") is None + assert series("n+") is None + + def test_lang(self) -> None: + document = etree.fromstring(XMLLANG_IDS) + sort_key = {el: count for count, el in enumerate(document.iter())}.__getitem__ + css_to_xpath = GenericTranslator().css_to_xpath + + def langid(selector: str) -> list[str]: + xpath = css_to_xpath(selector) + items = typing.cast("list[etree._Element]", document.xpath(xpath)) + items.sort(key=sort_key) + return [element.get("id", "nil") for element in items] + + assert langid(':lang("EN")') == ["first", "second", "third", "fourth"] + assert 
langid(':lang("en-us")') == ["second", "fourth"] + assert langid(":lang(en-nz)") == ["third"] + assert langid(":lang(fr)") == ["fifth"] + assert langid(":lang(ru)") == ["sixth"] + assert langid(":lang('ZH')") == ["eighth"] + assert langid(":lang(de) :lang(zh)") == ["eighth"] + assert langid(":lang(en), :lang(zh)") == [ + "first", + "second", + "third", + "fourth", + "eighth", + ] + assert langid(":lang(es)") == [] + + def test_argument_types(self) -> None: + class CustomTranslator(GenericTranslator): + def __init__(self) -> None: + self.argument_types: list[str] = [] + + def xpath_pseudo_element( + self, xpath: XPathExpr, pseudo_element: PseudoElement + ) -> XPathExpr: + self.argument_types += typing.cast( + "FunctionalPseudoElement", pseudo_element + ).argument_types() + return xpath + + def argument_types(css: str) -> list[str]: + translator = CustomTranslator() + translator.css_to_xpath(css) + return translator.argument_types + + mappings: list[tuple[str, list[str]]] = [ + ("", []), + ("ident", ["IDENT"]), + ('"string"', ["STRING"]), + ("1", ["NUMBER"]), + ] + for argument_string, argument_list in mappings: + css = f"::pseudo_element({argument_string})" + assert argument_types(css) == argument_list + + def test_select(self) -> None: + document = etree.fromstring(HTML_IDS) + sort_key = {el: count for count, el in enumerate(document.iter())}.__getitem__ + css_to_xpath = GenericTranslator().css_to_xpath + html_css_to_xpath = HTMLTranslator().css_to_xpath + + def select_ids(selector: str, html_only: bool) -> list[str]: + xpath = css_to_xpath(selector) + items = typing.cast("list[etree._Element]", document.xpath(xpath)) + if html_only: + assert items == [] + xpath = html_css_to_xpath(selector) + items = typing.cast("list[etree._Element]", document.xpath(xpath)) + items.sort(key=sort_key) + return [element.get("id", "nil") for element in items] + + def pcss(main: str, *selectors: str, **kwargs: bool) -> list[str]: + html_only = kwargs.pop("html_only", False) + result 
= select_ids(main, html_only) + for selector in selectors: + assert select_ids(selector, html_only) == result + return result + + all_ids = pcss("*") + assert all_ids[:6] == [ + "html", + "nil", + "link-href", + "link-nohref", + "nil", + "outer-div", + ] + assert all_ids[-1:] == ["foobar-span"] + assert pcss("div") == ["outer-div", "li-div", "foobar-div"] + assert pcss("DIV", html_only=True) == [ + "outer-div", + "li-div", + "foobar-div", + ] # case-insensitive in HTML + assert pcss("div div") == ["li-div"] + assert pcss("div, div div") == ["outer-div", "li-div", "foobar-div"] + assert pcss("a[name]") == ["name-anchor"] + assert pcss("a[NAme]", html_only=True) == [ + "name-anchor" + ] # case-insensitive in HTML: + assert pcss("a[rel]") == ["tag-anchor", "nofollow-anchor"] + assert pcss('a[rel="tag"]') == ["tag-anchor"] + assert pcss('a[href*="localhost"]') == ["tag-anchor"] + assert pcss('a[href*=""]') == [] + assert pcss('a[href^="http"]') == ["tag-anchor", "nofollow-anchor"] + assert pcss('a[href^="http:"]') == ["tag-anchor"] + assert pcss('a[href^=""]') == [] + assert pcss('a[href$="org"]') == ["nofollow-anchor"] + assert pcss('a[href$=""]') == [] + assert pcss('div[foobar~="bc"]', 'div[foobar~="cde"]') == ["foobar-div"] + assert pcss('[foobar~="ab bc"]', '[foobar~=""]', '[foobar~=" \t"]') == [] + assert pcss('div[foobar~="cd"]') == [] + assert pcss('*[lang|="En"]', '[lang|="En-us"]') == ["second-li"] + # Attribute values are case sensitive + assert pcss('*[lang|="en"]', '[lang|="en-US"]') == [] + assert pcss('*[lang|="e"]') == [] + # ... :lang() is not. 
+ assert pcss(':lang("EN")', "*:lang(en-US)", html_only=True) == [ + "second-li", + "li-div", + ] + assert pcss(':lang("e")', html_only=True) == [] + assert pcss(":scope > div") == [] + assert pcss(":scope body") == ["nil"] + assert pcss(":scope body > div") == ["outer-div", "foobar-div"] + assert pcss(":scope head") == ["nil"] + assert pcss(":scope html") == [] + + # --- nth-* and nth-last-* ------------------------------------- + + # select nothing + assert pcss("li:nth-child(-n)") == [] + # select all children + assert pcss("li:nth-child(n)") == [ + "first-li", + "second-li", + "third-li", + "fourth-li", + "fifth-li", + "sixth-li", + "seventh-li", + ] + + assert pcss("li:nth-child(3)", "#first-li ~ :nth-child(3)") == ["third-li"] + assert pcss("li:nth-child(10)") == [] + assert pcss("li:nth-child(2n)", "li:nth-child(even)", "li:nth-child(2n+0)") == [ + "second-li", + "fourth-li", + "sixth-li", + ] + assert pcss("li:nth-child(+2n+1)", "li:nth-child(odd)") == [ + "first-li", + "third-li", + "fifth-li", + "seventh-li", + ] + assert pcss("li:nth-child(2n+4)") == ["fourth-li", "sixth-li"] + assert pcss("li:nth-child(3n+1)") == ["first-li", "fourth-li", "seventh-li"] + assert pcss("li:nth-child(-n+3)") == ["first-li", "second-li", "third-li"] + assert pcss("li:nth-child(-2n+4)") == ["second-li", "fourth-li"] + assert pcss("li:nth-last-child(0)") == [] + assert pcss("li:nth-last-child(1)") == ["seventh-li"] + assert pcss("li:nth-last-child(2n)", "li:nth-last-child(even)") == [ + "second-li", + "fourth-li", + "sixth-li", + ] + assert pcss("li:nth-last-child(2n+1)") == [ + "first-li", + "third-li", + "fifth-li", + "seventh-li", + ] + assert pcss("li:nth-last-child(2n+2)") == ["second-li", "fourth-li", "sixth-li"] + assert pcss("li:nth-last-child(3n+1)") == [ + "first-li", + "fourth-li", + "seventh-li", + ] + assert pcss("ol:first-of-type") == ["first-ol"] + assert pcss("ol:nth-child(1)") == [] + assert pcss("ol:nth-of-type(2)") == ["second-ol"] + assert 
pcss("ol:nth-last-of-type(1)") == ["second-ol"] + + # "+" and "~" tests + assert pcss("ol#first-ol li + li:nth-child(4)") == ["fourth-li"] + assert pcss("li + li:nth-child(1)") == [] + assert pcss("li ~ li:nth-child(2n+1)") == [ + "third-li", + "fifth-li", + "seventh-li", + ] # all but the first + assert pcss("li ~ li:nth-last-child(2n+1)") == [ + "third-li", + "fifth-li", + "seventh-li", + ] # all but the first + + assert pcss("span:only-child") == ["foobar-span"] + assert pcss("li div:only-child") == ["li-div"] + assert pcss("div *:only-child") == ["li-div", "foobar-span"] + with pytest.raises(ExpressionError): + pcss("p *:only-of-type") + assert pcss("p:only-of-type") == ["paragraph"] + assert pcss("a:empty", "a:EMpty") == ["name-anchor"] + assert pcss("li:empty") == ["third-li", "fourth-li", "fifth-li", "sixth-li"] + assert pcss(":root", "html:root") == ["html"] + assert pcss("li:root", "* :root") == [] + assert pcss('*:contains("link")', ':CONtains("link")') == [ + "html", + "nil", + "outer-div", + "tag-anchor", + "nofollow-anchor", + ] + assert pcss('*:contains("LInk")') == [] # case sensitive + assert pcss('*:contains("e")') == [ + "html", + "nil", + "outer-div", + "first-ol", + "first-li", + "paragraph", + "p-em", + ] + assert pcss('*:contains("E")') == [] # case-sensitive + assert pcss(".a", ".b", "*.a", "ol.a") == ["first-ol"] + assert pcss(".c", "*.c") == ["first-ol", "third-li", "fourth-li"] + assert pcss("ol *.c", "ol li.c", "li ~ li.c", "ol > li.c") == [ + "third-li", + "fourth-li", + ] + assert pcss("#first-li", "li#first-li", "*#first-li") == ["first-li"] + assert pcss("li div", "li > div", "div div") == ["li-div"] + assert pcss("div > div") == [] + assert pcss("div>.c", "div > .c") == ["first-ol"] + assert pcss("div + div") == ["foobar-div"] + assert pcss("a ~ a") == ["tag-anchor", "nofollow-anchor"] + assert pcss('a[rel="tag"] ~ a') == ["nofollow-anchor"] + assert pcss("ol#first-ol li:last-child") == ["seventh-li"] + assert pcss("ol#first-ol 
*:last-child") == ["li-div", "seventh-li"] + assert pcss("#outer-div:first-child") == ["outer-div"] + assert pcss("#outer-div :first-child") == [ + "name-anchor", + "first-li", + "li-div", + "p-b", + "checkbox-fieldset-disabled", + "area-href", + ] + assert pcss("a[href]") == ["tag-anchor", "nofollow-anchor"] + assert pcss(":not(*)") == [] + assert pcss("a:not([href])") == ["name-anchor"] + assert pcss("ol :Not(li[class])") == [ + "first-li", + "second-li", + "li-div", + "fifth-li", + "sixth-li", + "seventh-li", + ] + assert pcss("link:has(*)") == [] + assert pcss("ol:has(div)") == ["first-ol"] + assert pcss(":is(#first-li, #second-li)") == ["first-li", "second-li"] + assert pcss("a:is(#name-anchor, #tag-anchor)") == ["name-anchor", "tag-anchor"] + assert pcss(":is(.c)") == ["first-ol", "third-li", "fourth-li"] + assert pcss("ol.a.b.c > li.c:nth-child(3)") == ["third-li"] + + # Invalid characters in XPath element names, should not crash + assert pcss(r"di\a0 v", r"div\[") == [] + assert pcss(r"[h\a0 ref]", r"[h\]ref]") == [] + + # HTML-specific + assert pcss(":link", html_only=True) == [ + "link-href", + "tag-anchor", + "nofollow-anchor", + "area-href", + ] + assert pcss(":visited", html_only=True) == [] + assert pcss(":enabled", html_only=True) == [ + "link-href", + "tag-anchor", + "nofollow-anchor", + "checkbox-unchecked", + "text-checked", + "checkbox-checked", + "area-href", + ] + assert pcss(":disabled", html_only=True) == [ + "checkbox-disabled", + "checkbox-disabled-checked", + "fieldset", + "checkbox-fieldset-disabled", + ] + assert pcss(":checked", html_only=True) == [ + "checkbox-checked", + "checkbox-disabled-checked", + ] + + def test_select_shakespeare(self) -> None: + document = html.document_fromstring(HTML_SHAKESPEARE) + body = typing.cast("list[etree._Element]", document.xpath("//body"))[0] + css_to_xpath = GenericTranslator().css_to_xpath + + basestring_ = (str, bytes) + + def count(selector: str) -> int: + xpath = css_to_xpath(selector) + results 
= typing.cast("list[etree._Element]", body.xpath(xpath)) + assert not isinstance(results, basestring_) + found = set() + for item in results: + assert item not in found + found.add(item) + assert not isinstance(item, basestring_) + return len(results) + + # Data borrowed from http://mootools.net/slickspeed/ + + ## Changed from original; probably because I'm only + ## searching the body. + # assert count('*') == 252 + assert count("*") == 246 + assert count("div:contains(CELIA)") == 26 + assert count("div:only-child") == 22 # ? + assert count("div:nth-child(even)") == 106 + assert count("div:nth-child(2n)") == 106 + assert count("div:nth-child(odd)") == 137 + assert count("div:nth-child(2n+1)") == 137 + assert count("div:nth-child(n)") == 243 + assert count("div:last-child") == 53 + assert count("div:first-child") == 51 + assert count("div > div") == 242 + assert count("div + div") == 190 + assert count("div ~ div") == 190 + assert count("body") == 1 + assert count("body div") == 243 + assert count("div") == 243 + assert count("div div") == 242 + assert count("div div div") == 241 + assert count("div, div, div") == 243 + assert count("div, a, span") == 243 + assert count(".dialog") == 51 + assert count("div.dialog") == 51 + assert count("div .dialog") == 51 + assert count("div.character, div.dialog") == 99 + assert count("div.direction.dialog") == 0 + assert count("div.dialog.direction") == 0 + assert count("div.dialog.scene") == 1 + assert count("div.scene.scene") == 1 + assert count("div.scene .scene") == 0 + assert count("div.direction .dialog ") == 0 + assert count("div .dialog .direction") == 4 + assert count("div.dialog .dialog .direction") == 4 + assert count("#speech5") == 1 + assert count("div#speech5") == 1 + assert count("div #speech5") == 1 + assert count("div.scene div.dialog") == 49 + assert count("div#scene1 div.dialog div") == 142 + assert count("#scene1 #speech1") == 1 + assert count("div[class]") == 103 + assert count("div[class=dialog]") == 50 + 
assert count("div[class^=dia]") == 51 + assert count("div[class$=log]") == 50 + assert count("div[class*=sce]") == 1 + assert count("div[class|=dialog]") == 50 # ? Seems right + assert count("div[class!=madeup]") == 243 # ? Seems right + assert count("div[class~=dialog]") == 51 # ? Seems right + assert count(":scope > div") == 1 + assert count(":scope > div > div[class=dialog]") == 1 + assert count(":scope > div div") == 242 + + +OPERATOR_PRECEDENCE_IDS = """ + + + + + +""" + +XMLLANG_IDS = """ + + a + b + c + d + e + f + + + + +""" + +HTML_IDS = """ + + + + +
+ + + + link +
    +
  1. content
  2. +
  3. +
    +
    +
  4. +
  5. +
  6. +
  7. +
  8. +
  9. +
+

+ hi there + guy + + + + + + + +

+ + +
+

+
    +
+ + + + +
+
+ +""" + + +HTML_SHAKESPEARE = """ + + + + + + +
+
+

As You Like It

+
+ by William Shakespeare +
+
+

ACT I, SCENE III. A room in the palace.

+
+
Enter CELIA and ROSALIND
+
+
CELIA
+
+
Why, cousin! why, Rosalind! Cupid have mercy! not a word?
+
+
ROSALIND
+
+
Not one to throw at a dog.
+
+
CELIA
+
+
No, thy words are too precious to be cast away upon
+
curs; throw some of them at me; come, lame me with reasons.
+
+
ROSALIND
+
CELIA
+
+
But is all this for your father?
+
+
+
Then there were two cousins laid up; when the one
+
should be lamed with reasons and the other mad
+
without any.
+
+
ROSALIND
+
+
No, some of it is for my child's father. O, how
+
full of briers is this working-day world!
+
+
CELIA
+
+
They are but burs, cousin, thrown upon thee in
+
holiday foolery: if we walk not in the trodden
+
paths our very petticoats will catch them.
+
+
ROSALIND
+
+
I could shake them off my coat: these burs are in my heart.
+
+
CELIA
+
+
Hem them away.
+
+
ROSALIND
+
+
I would try, if I could cry 'hem' and have him.
+
+
CELIA
+
+
Come, come, wrestle with thy affections.
+
+
ROSALIND
+
+
O, they take the part of a better wrestler than myself!
+
+
CELIA
+
+
O, a good wish upon you! you will try in time, in
+
despite of a fall. But, turning these jests out of
+
service, let us talk in good earnest: is it
+
possible, on such a sudden, you should fall into so
+
strong a liking with old Sir Rowland's youngest son?
+
+
ROSALIND
+
+
The duke my father loved his father dearly.
+
+
CELIA
+
+
Doth it therefore ensue that you should love his son
+
dearly? By this kind of chase, I should hate him,
+
for my father hated his father dearly; yet I hate
+
not Orlando.
+
+
ROSALIND
+
+
No, faith, hate him not, for my sake.
+
+
CELIA
+
+
Why should I not? doth he not deserve well?
+
+
ROSALIND
+
+
Let me love him for that, and do you love him
+
because I do. Look, here comes the duke.
+
+
CELIA
+
+
With his eyes full of anger.
+
Enter DUKE FREDERICK, with Lords
+
+
DUKE FREDERICK
+
+
Mistress, dispatch you with your safest haste
+
And get you from our court.
+
+
ROSALIND
+
+
Me, uncle?
+
+
DUKE FREDERICK
+
+
You, cousin
+
Within these ten days if that thou be'st found
+
So near our public court as twenty miles,
+
Thou diest for it.
+
+
ROSALIND
+
+
I do beseech your grace,
+
Let me the knowledge of my fault bear with me:
+
If with myself I hold intelligence
+
Or have acquaintance with mine own desires,
+
If that I do not dream or be not frantic,--
+
As I do trust I am not--then, dear uncle,
+
Never so much as in a thought unborn
+
Did I offend your highness.
+
+
DUKE FREDERICK
+
+
Thus do all traitors:
+
If their purgation did consist in words,
+
They are as innocent as grace itself:
+
Let it suffice thee that I trust thee not.
+
+
ROSALIND
+
+
Yet your mistrust cannot make me a traitor:
+
Tell me whereon the likelihood depends.
+
+
DUKE FREDERICK
+
+
Thou art thy father's daughter; there's enough.
+
+
ROSALIND
+
+
So was I when your highness took his dukedom;
+
So was I when your highness banish'd him:
+
Treason is not inherited, my lord;
+
Or, if we did derive it from our friends,
+
What's that to me? my father was no traitor:
+
Then, good my liege, mistake me not so much
+
To think my poverty is treacherous.
+
+
CELIA
+
+
Dear sovereign, hear me speak.
+
+
DUKE FREDERICK
+
+
Ay, Celia; we stay'd her for your sake,
+
Else had she with her father ranged along.
+
+
CELIA
+
+
I did not then entreat to have her stay;
+
It was your pleasure and your own remorse:
+
I was too young that time to value her;
+
But now I know her: if she be a traitor,
+
Why so am I; we still have slept together,
+
Rose at an instant, learn'd, play'd, eat together,
+
And wheresoever we went, like Juno's swans,
+
Still we went coupled and inseparable.
+
+
DUKE FREDERICK
+
+
She is too subtle for thee; and her smoothness,
+
Her very silence and her patience
+
Speak to the people, and they pity her.
+
Thou art a fool: she robs thee of thy name;
+
And thou wilt show more bright and seem more virtuous
+
When she is gone. Then open not thy lips:
+
Firm and irrevocable is my doom
+
Which I have pass'd upon her; she is banish'd.
+
+
CELIA
+
+
Pronounce that sentence then on me, my liege:
+
I cannot live out of her company.
+
+
DUKE FREDERICK
+
+
You are a fool. You, niece, provide yourself:
+
If you outstay the time, upon mine honour,
+
And in the greatness of my word, you die.
+
Exeunt DUKE FREDERICK and Lords
+
+
CELIA
+
+
O my poor Rosalind, whither wilt thou go?
+
Wilt thou change fathers? I will give thee mine.
+
I charge thee, be not thou more grieved than I am.
+
+
ROSALIND
+
+
I have more cause.
+
+
CELIA
+
+
Thou hast not, cousin;
+
Prithee be cheerful: know'st thou not, the duke
+
Hath banish'd me, his daughter?
+
+
ROSALIND
+
+
That he hath not.
+
+
CELIA
+
+
No, hath not? Rosalind lacks then the love
+
Which teacheth thee that thou and I am one:
+
Shall we be sunder'd? shall we part, sweet girl?
+
No: let my father seek another heir.
+
Therefore devise with me how we may fly,
+
Whither to go and what to bear with us;
+
And do not seek to take your change upon you,
+
To bear your griefs yourself and leave me out;
+
For, by this heaven, now at our sorrows pale,
+
Say what thou canst, I'll go along with thee.
+
+
ROSALIND
+
+
Why, whither shall we go?
+
+
CELIA
+
+
To seek my uncle in the forest of Arden.
+
+
ROSALIND
+
+
Alas, what danger will it be to us,
+
Maids as we are, to travel forth so far!
+
Beauty provoketh thieves sooner than gold.
+
+
CELIA
+
+
I'll put myself in poor and mean attire
+
And with a kind of umber smirch my face;
+
The like do you: so shall we pass along
+
And never stir assailants.
+
+
ROSALIND
+
+
Were it not better,
+
Because that I am more than common tall,
+
That I did suit me all points like a man?
+
A gallant curtle-axe upon my thigh,
+
A boar-spear in my hand; and--in my heart
+
Lie there what hidden woman's fear there will--
+
We'll have a swashing and a martial outside,
+
As many other mannish cowards have
+
That do outface it with their semblances.
+
+
CELIA
+
+
What shall I call thee when thou art a man?
+
+
ROSALIND
+
+
I'll have no worse a name than Jove's own page;
+
And therefore look you call me Ganymede.
+
But what will you be call'd?
+
+
CELIA
+
+
Something that hath a reference to my state
+
No longer Celia, but Aliena.
+
+
ROSALIND
+
+
But, cousin, what if we assay'd to steal
+
The clownish fool out of your father's court?
+
Would he not be a comfort to our travel?
+
+
CELIA
+
+
He'll go along o'er the wide world with me;
+
Leave me alone to woo him. Let's away,
+
And get our jewels and our wealth together,
+
Devise the fittest time and safest way
+
To hide us from pursuit that will be made
+
After my flight. Now go we in content
+
To liberty and not to banishment.
+
Exeunt
+
+
+
+
+ + +""" + + +if __name__ == "__main__": + unittest.main() diff --git a/tox.ini b/tox.ini index 9a552c2..9ff54cf 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,49 @@ [tox] -envlist = py24,py25,py26,py27,py31,py32 +envlist = pre-commit,pylint,py,docs,typing [testenv] -deps=lxml -commands = python cssselect/tests.py +deps = + lxml>=4.4 + pytest-cov>=7.0.0 + pytest>=5.4 + sybil +commands = + pytest --cov=cssselect \ + --cov-report=term-missing --cov-report=html --cov-report=xml \ + {posargs: cssselect tests docs} + +[testenv:pylint] +deps = + {[testenv]deps} + pylint==4.0.4 +commands = + pylint {posargs: cssselect tests docs} + +[testenv:docs] +changedir = docs +deps = + -r docs/requirements.txt +commands = + sphinx-build -W -b html . {envtmpdir}/html + +[testenv:typing] +deps = + {[testenv]deps} + mypy==1.19.1 + types-lxml==2026.1.1 +commands = + mypy {posargs: cssselect tests} + +[testenv:pre-commit] +deps = pre-commit +commands = pre-commit run --all-files --show-diff-on-failure +skip_install = true + +[testenv:twinecheck] +basepython = python3 +deps = + twine==6.2.0 + build==1.4.0 +commands = + python -m build --sdist + twine check dist/*