diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..2ee5ff3 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,9 @@ +[run] +branch = True + +[report] +exclude_lines = + pragma: no cover + def __repr__ + if sys.version_info + if __name__ == '__main__': diff --git a/.editorconfig b/.editorconfig deleted file mode 100644 index 38558bf..0000000 --- a/.editorconfig +++ /dev/null @@ -1,11 +0,0 @@ -root = true - -[*] -charset = utf-8 -indent_style = space -indent_size = 4 -insert_final_newline = true -end_of_line = lf - -[*.{yml,yaml}] -indent_size = 2 diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs deleted file mode 100644 index bb4f6e1..0000000 --- a/.git-blame-ignore-revs +++ /dev/null @@ -1,2 +0,0 @@ -# applying pre-commit hooks to the project -e91101b37f82558db84a6b8ee9a6dba1fd2ae0bb diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml deleted file mode 100644 index 41ff7e1..0000000 --- a/.github/workflows/checks.yml +++ /dev/null @@ -1,43 +0,0 @@ -name: Checks -on: [push, pull_request] - -jobs: - checks: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - include: - - python-version: 3.14 - env: - TOXENV: pylint - - python-version: 3.14 # Keep in sync with .readthedocs.yml - env: - TOXENV: docs - - python-version: 3.14 - env: - TOXENV: typing - - python-version: 3.14 - env: - TOXENV: twinecheck - - steps: - - uses: actions/checkout@v6 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v6 - with: - python-version: ${{ matrix.python-version }} - - - name: Run check - env: ${{ matrix.env }} - run: | - pip install -U pip - pip install -U tox - tox - - pre-commit: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v6 - - uses: pre-commit/action@v3.0.1 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml deleted file mode 100644 index 526c458..0000000 --- a/.github/workflows/publish.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: Publish -on: - push: 
- tags: - - 'v[0-9]+.[0-9]+.[0-9]+' - -jobs: - publish: - runs-on: ubuntu-latest - - environment: - name: pypi - url: https://pypi.org/p/cssselect - - permissions: - id-token: write - - steps: - - uses: actions/checkout@v6 - - - name: Set up Python - uses: actions/setup-python@v6 - with: - python-version: 3.14 - - - name: Build - run: | - python -m pip install --upgrade build - python -m build - - - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/tests-macos.yml b/.github/workflows/tests-macos.yml deleted file mode 100644 index 4947937..0000000 --- a/.github/workflows/tests-macos.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: macOS -on: [push, pull_request] - -jobs: - tests: - runs-on: macos-latest - strategy: - fail-fast: false - matrix: - python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] - - steps: - - uses: actions/checkout@v6 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v6 - with: - python-version: ${{ matrix.python-version }} - - - name: Run tests - run: | - pip install -U pip - pip install -U tox - tox -e py - - - name: Upload coverage report - uses: codecov/codecov-action@v5 diff --git a/.github/workflows/tests-ubuntu.yml b/.github/workflows/tests-ubuntu.yml deleted file mode 100644 index 1ef905b..0000000 --- a/.github/workflows/tests-ubuntu.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Ubuntu -on: [push, pull_request] - -jobs: - tests: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: ["3.10", "3.11", "3.12", "3.13", "3.14", "pypy3.11"] - - steps: - - uses: actions/checkout@v6 - - - name: Install system libraries - if: contains(matrix.python-version, 'pypy') - run: | - sudo apt-get update - sudo apt-get install libxml2-dev libxslt-dev - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v6 - with: - python-version: ${{ matrix.python-version }} - - - name: Run tests - run: | - pip install -U pip - pip 
install -U tox - tox -e py - - - name: Upload coverage report - uses: codecov/codecov-action@v5 diff --git a/.github/workflows/tests-windows.yml b/.github/workflows/tests-windows.yml deleted file mode 100644 index 24d7ee8..0000000 --- a/.github/workflows/tests-windows.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: Windows -on: [push, pull_request] - -jobs: - tests: - runs-on: windows-latest - strategy: - fail-fast: false - matrix: - python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] - - steps: - - uses: actions/checkout@v6 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v6 - with: - python-version: ${{ matrix.python-version }} - - - name: Run tests - run: | - pip install -U pip - pip install -U tox - tox -e py - - - name: Upload coverage report - uses: codecov/codecov-action@v5 diff --git a/.gitignore b/.gitignore index c276bd1..627d1c7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,5 @@ +.DS_Store +.pydevproject +.project *.pyc -*.egg-info -/.tox -/MANIFEST -/dist -/docs/_build -/.coverage -.idea -htmlcov/ -coverage.xml +.settings/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index 81ca890..0000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,26 +0,0 @@ -repos: -- repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.4 - hooks: - - id: ruff-check - args: [ --fix ] - - id: ruff-format -- repo: https://github.com/adamchainz/blacken-docs - rev: 1.20.0 - hooks: - - id: blacken-docs - additional_dependencies: - - black==26.1.0 -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v6.0.0 - hooks: - - id: end-of-file-fixer - - id: trailing-whitespace -- repo: https://github.com/sphinx-contrib/sphinx-lint - rev: v1.0.0 - hooks: - - id: sphinx-lint -- repo: https://github.com/rhysd/actionlint - rev: v1.7.10 - hooks: - - id: actionlint diff --git a/.readthedocs.yml b/.readthedocs.yml deleted file mode 100644 index b91642a..0000000 --- a/.readthedocs.yml +++ 
/dev/null @@ -1,15 +0,0 @@ -version: 2 -formats: all -sphinx: - configuration: docs/conf.py - fail_on_warning: true -build: - os: ubuntu-24.04 - tools: - # For available versions, see: - # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python - python: "3.14" # Keep in sync with .github/workflows/checks.yml -python: - install: - - requirements: docs/requirements.txt - - path: . diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..93ad08a --- /dev/null +++ b/.travis.yml @@ -0,0 +1,12 @@ +language: python + +python: + - "2.6" + - "2.7" + - "3.2" + - "3.3" + +install: + - pip install --use-mirrors lxml -e . + +script: py.test diff --git a/AUTHORS b/AUTHORS index 66dcc22..bf826b9 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,13 +1,9 @@ Daniel Graña Ian Bicking -James Salter Laurence Rowe Mikhail Korobov -Nik Nyby Paul Tremberth Simon Potter Simon Sapin Stefan Behnel -Thomas Grainger Varialus -Arthur Darcet diff --git a/CHANGES b/CHANGES index 5ca2959..edbbaca 100644 --- a/CHANGES +++ b/CHANGES @@ -1,133 +1,6 @@ Changelog ========= -Version 1.4.0 -------------- - -Released on 2026-01-29. - -* Dropped support for Python 3.9 and PyPy 3.10. - -* Added support for Python 3.14 and PyPy 3.11. - -* Switched the build system to ``hatchling``. - -* CI fixes and improvements. - -Version 1.3.0 -------------- - -Released on 2025-03-10. - -* Dropped support for Python 3.7-3.8, added support for Python 3.12-3.13 and - PyPy 3.10. - -* Removed ``_unicode_safe_getattr()``, deprecated in 1.2.0. - -* Added ``pre-commit`` and formatted the code with ``ruff``. - -* Many CI additions and improvements. - - -Version 1.2.0 -------------- - -Released on 2022-10-27. - -* Drop support for Python 2.7, 3.4-3.6, add support for Python 3.7-3.11. - -* Add type annotations (PEP 484 and PEP 561). - -* More features from the CSS Selectors Level 4: - - * The ``:is()`` pseudo-class. - - * The ``:where()`` pseudo-class. 
- - * The ``:has()`` pseudo-class, with some limitations. - -* Fix parsing ``:scope`` after a comma. - -* Add parentheses to fix condition precedence in some cases. - -* Private API changes related to the removal of the Python 2 support: - - * Remove ``_unicode`` and ``_unichr`` aliases from ``csselect.parser``. - - * Remove ``_basestring`` and ``_unicode`` aliases from ``csselect.xpath``. - - * Deprecate ``csselect.xpath._unicode_safe_getattr()`` and change it to just - call ``getattr()``. - -* Include tests in the PyPI tarball. - -* Many CI additions and improvements. - -* Improve the test coverage. - - -Version 1.1.0 -------------- - -Released on 2019-08-09. - -* Support for the ``:scope`` selector, which allows to access immediate - children of a selector. - -* Support for the ``|E`` syntax for type selectors without a namespace. - -* A new selector method, ``canonical``, returns the CSS expression of the - selector, as a string. - - -Version 1.0.3 -------------- - -Released on 2017-12-27. - -* Fix artifact uploads to pypi - - -Version 1.0.2 -------------- - -Released on 2017-12-26. - -* Drop support for Python 2.6 and Python 3.3. -* Fix deprecation warning in Python 3.6. -* Minor cleanups. - - -Version 1.0.1 -------------- - -Released on 2017-01-10. - -* Add support for Python 3.6. -* Documentation hosted `on Read the Docs `_ - - -Version 1.0.0 -------------- - -Released on 2016-10-21. - -* Add code coverage reports. -* Fix ``:nth-*(an+b)`` pseudo-classes selectors. - (except ``*:nth-child()`` which looks untranslatable to XPath 1.0.) - - -Version 0.9.2 -------------- - -Released on 2016-06-15. - -* Distribute as universal wheel. -* Add support for Python 3.3, 3.4 and 3.5. -* Drop support for Python 2.5 as testing is getting difficult. -* Improve tests on pseudo-elements. - - Version 0.9.1 ------------- @@ -264,14 +137,14 @@ Version 0.3 Released on 2012-04-17. * Fix many parsing bugs. 
-* Rename the ``Translator`` class to :class:`GenericTranslator` +* Rename the :class:`Translator` class to :class:`GenericTranslator` * There, implement ``:target``, ``:hover``, ``:focus``, ``:active`` ``:checked``, ``:enabled``, ``:disabled``, ``:link`` and ``:visited`` as never matching. * Make a new HTML-specific ``HTMLTranslator`` subclass. There, implement ``:checked``, ``:enabled``, ``:disabled``, ``:link`` and ``:visited`` as appropriate for HTML, with all links "not visited". -* Remove the ``css_to_xpath`` function. The translator classes +* Remove the :func:`css_to_xpath` function. The translator classes are the new API. * Add support for ``:contains()`` back, but case-sensitive. lxml will override it to be case-insensitive for backward-compatibility. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..e98d213 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,3 @@ +include AUTHORS CHANGES LICENSE README.rst tox.ini .coveragerc +recursive-include docs * +prune docs/_build diff --git a/README.rst b/README.rst index c055295..f523c7f 100644 --- a/README.rst +++ b/README.rst @@ -1,40 +1,25 @@ - =================================== cssselect: CSS Selectors for Python =================================== -.. image:: https://img.shields.io/pypi/v/cssselect.svg - :target: https://pypi.python.org/pypi/cssselect - :alt: PyPI Version - -.. image:: https://img.shields.io/pypi/pyversions/cssselect.svg - :target: https://pypi.python.org/pypi/cssselect - :alt: Supported Python Versions - -.. image:: https://github.com/scrapy/cssselect/actions/workflows/tests-ubuntu.yml/badge.svg - :target: https://github.com/scrapy/cssselect/actions/workflows/tests-ubuntu.yml - :alt: Tests +*cssselect* parses `CSS3 Selectors`_ and translates them to `XPath 1.0`_ +expressions. Such expressions can be used in lxml_ or another XPath engine +to find the matching elements in an XML or HTML document. -.. 
image:: https://img.shields.io/codecov/c/github/scrapy/cssselect/master.svg - :target: https://codecov.io/github/scrapy/cssselect?branch=master - :alt: Coverage report +This module used to live inside of lxml as ``lxml.cssselect`` before it was +extracted as a stand-alone project. -**cssselect** is a BSD-licensed Python library to parse `CSS3 selectors`_ and -translate them to `XPath 1.0`_ expressions. +.. _CSS3 Selectors: http://www.w3.org/TR/2011/REC-css3-selectors-20110929/ +.. _XPath 1.0: http://www.w3.org/TR/xpath/ +.. _lxml: http://lxml.de/ -`XPath 1.0`_ expressions can be used in lxml_ or another XPath engine to find -the matching elements in an XML or HTML document. - -Find the cssselect online documentation at https://cssselect.readthedocs.io. Quick facts: -* Source, issues and pull requests `on GitHub - `_ -* Releases `on PyPI `_ +* Free software: BSD licensed +* Compatible with Python 2.5+ and 3.2+ +* Latest documentation `on python.org `_ +* Source, issues and pull requests `on Github + `_ +* Releases `on PyPI `_ * Install with ``pip install cssselect`` - - -.. _CSS3 selectors: https://www.w3.org/TR/selectors-3/ -.. _XPath 1.0: https://www.w3.org/TR/xpath/all/ -.. _lxml: https://lxml.de/ diff --git a/cssselect/__init__.py b/cssselect/__init__.py index 59d62df..871f1b2 100644 --- a/cssselect/__init__.py +++ b/cssselect/__init__.py @@ -1,36 +1,22 @@ +# coding: utf8 """ -CSS Selectors based on XPath -============================ + CSS Selectors based on XPath + ============================ -This module supports selecting XML/HTML elements based on CSS selectors. -See the `CSSSelector` class for details. + This module supports selecting XML/HTML elements based on CSS selectors. + See the `CSSSelector` class for details. -:copyright: (c) 2007-2012 Ian Bicking and contributors. -See AUTHORS for more details. -:license: BSD, see LICENSE for more details. + :copyright: (c) 2007-2012 Ian Bicking and contributors. + See AUTHORS for more details. 
+ :license: BSD, see LICENSE for more details. """ -from cssselect.parser import ( - FunctionalPseudoElement, - Selector, - SelectorError, - SelectorSyntaxError, - parse, -) -from cssselect.xpath import ExpressionError, GenericTranslator, HTMLTranslator +from cssselect.parser import (parse, Selector, FunctionalPseudoElement, + SelectorError, SelectorSyntaxError) +from cssselect.xpath import GenericTranslator, HTMLTranslator, ExpressionError -__all__ = ( - "ExpressionError", - "FunctionalPseudoElement", - "GenericTranslator", - "HTMLTranslator", - "Selector", - "SelectorError", - "SelectorSyntaxError", - "parse", -) -VERSION = "1.4.0" +VERSION = '0.9.1' __version__ = VERSION diff --git a/cssselect/parser.py b/cssselect/parser.py index f969769..d71fdda 100644 --- a/cssselect/parser.py +++ b/cssselect/parser.py @@ -1,33 +1,33 @@ +# coding: utf8 """ -cssselect.parser -================ + cssselect.parser + ================ -Tokenizer, parser and parsed objects for CSS selectors. + Tokenizer, parser and parsed objects for CSS selectors. -:copyright: (c) 2007-2012 Ian Bicking and contributors. -See AUTHORS for more details. -:license: BSD, see LICENSE for more details. + :copyright: (c) 2007-2012 Ian Bicking and contributors. + See AUTHORS for more details. + :license: BSD, see LICENSE for more details. 
""" -from __future__ import annotations - -import operator -import re import sys -from typing import TYPE_CHECKING, Literal, Protocol, TypeAlias, Union, cast, overload +import re +import operator -if TYPE_CHECKING: - from collections.abc import Iterable, Iterator, Sequence - # typing.Self requires Python 3.11 - from typing_extensions import Self +if sys.version_info[0] < 3: + _unicode = unicode + _unichr = unichr +else: + _unicode = str + _unichr = chr -def ascii_lower(string: str) -> str: +def ascii_lower(string): """Lower-case, but only in the ASCII range.""" - return string.encode("utf8").lower().decode("utf8") + return string.encode('utf8').lower().decode('utf8') class SelectorError(Exception): @@ -39,30 +39,13 @@ class SelectorError(Exception): """ - class SelectorSyntaxError(SelectorError, SyntaxError): """Parsing a selector that does not match the grammar.""" #### Parsed objects -Tree: TypeAlias = Union[ - "Element", - "Hash", - "Class", - "Function", - "Pseudo", - "Attrib", - "Negation", - "Relation", - "Matching", - "SpecificityAdjustment", - "CombinedSelector", -] -PseudoElement: TypeAlias = Union["FunctionalPseudoElement", str] - - -class Selector: +class Selector(object): """ Represents a parsed selector. @@ -72,12 +55,10 @@ class Selector: or unsupported pseudo-elements. 
""" - - def __init__(self, tree: Tree, pseudo_element: PseudoElement | None = None) -> None: + def __init__(self, tree, pseudo_element=None): self.parsed_tree = tree if pseudo_element is not None and not isinstance( - pseudo_element, FunctionalPseudoElement - ): + pseudo_element, FunctionalPseudoElement): pseudo_element = ascii_lower(pseudo_element) #: A :class:`FunctionalPseudoElement`, #: or the identifier for the pseudo-element as a string, @@ -95,35 +76,23 @@ def __init__(self, tree: Tree, pseudo_element: PseudoElement | None = None) -> N #: +-------------------------+----------------+--------------------------------+ #: | Invalid pseudo-class | ``li:marker`` | ``None`` | #: +-------------------------+----------------+--------------------------------+ - #: | Functional | ``a::foo(2)`` | ``FunctionalPseudoElement(…)`` | + #: | Functinal | ``a::foo(2)`` | ``FunctionalPseudoElement(…)`` | #: +-------------------------+----------------+--------------------------------+ #: #: .. _Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement self.pseudo_element = pseudo_element - def __repr__(self) -> str: + def __repr__(self): if isinstance(self.pseudo_element, FunctionalPseudoElement): pseudo_element = repr(self.pseudo_element) elif self.pseudo_element: - pseudo_element = f"::{self.pseudo_element}" + pseudo_element = '::%s' % self.pseudo_element else: - pseudo_element = "" - return f"{self.__class__.__name__}[{self.parsed_tree!r}{pseudo_element}]" + pseudo_element = '' + return '%s[%r%s]' % ( + self.__class__.__name__, self.parsed_tree, pseudo_element) - def canonical(self) -> str: - """Return a CSS representation for this selector (a string)""" - if isinstance(self.pseudo_element, FunctionalPseudoElement): - pseudo_element = f"::{self.pseudo_element.canonical()}" - elif self.pseudo_element: - pseudo_element = f"::{self.pseudo_element}" - else: - pseudo_element = "" - res = f"{self.parsed_tree.canonical()}{pseudo_element}" - if len(res) > 1: - res 
= res.lstrip("*") - return res - - def specificity(self) -> tuple[int, int, int]: + def specificity(self): """Return the specificity_ of this selector as a tuple of 3 integers. .. _specificity: http://www.w3.org/TR/selectors/#specificity @@ -135,28 +104,25 @@ def specificity(self) -> tuple[int, int, int]: return a, b, c -class Class: +class Class(object): """ Represents selector.class_name """ - - def __init__(self, selector: Tree, class_name: str) -> None: + def __init__(self, selector, class_name): self.selector = selector self.class_name = class_name - def __repr__(self) -> str: - return f"{self.__class__.__name__}[{self.selector!r}.{self.class_name}]" + def __repr__(self): + return '%s[%r.%s]' % ( + self.__class__.__name__, self.selector, self.class_name) - def canonical(self) -> str: - return f"{self.selector.canonical()}.{self.class_name}" - - def specificity(self) -> tuple[int, int, int]: + def specificity(self): a, b, c = self.selector.specificity() b += 1 return a, b, c -class FunctionalPseudoElement: +class FunctionalPseudoElement(object): """ Represents selector::name(arguments) @@ -173,310 +139,171 @@ class FunctionalPseudoElement: Use at your own risks. 
""" - - def __init__(self, name: str, arguments: Sequence[Token]): + def __init__(self, name, arguments): self.name = ascii_lower(name) self.arguments = arguments - def __repr__(self) -> str: - token_values = [token.value for token in self.arguments] - return f"{self.__class__.__name__}[::{self.name}({token_values!r})]" + def __repr__(self): + return '%s[::%s(%r)]' % ( + self.__class__.__name__, self.name, + [token.value for token in self.arguments]) - def argument_types(self) -> list[str]: + def argument_types(self): return [token.type for token in self.arguments] - def canonical(self) -> str: - args = "".join(token.css() for token in self.arguments) - return f"{self.name}({args})" + def specificity(self): + a, b, c = self.selector.specificity() + b += 1 + return a, b, c -class Function: +class Function(object): """ Represents selector:name(expr) """ - - def __init__(self, selector: Tree, name: str, arguments: Sequence[Token]) -> None: + def __init__(self, selector, name, arguments): self.selector = selector self.name = ascii_lower(name) self.arguments = arguments - def __repr__(self) -> str: - token_values = [token.value for token in self.arguments] - return f"{self.__class__.__name__}[{self.selector!r}:{self.name}({token_values!r})]" + def __repr__(self): + return '%s[%r:%s(%r)]' % ( + self.__class__.__name__, self.selector, self.name, + [token.value for token in self.arguments]) - def argument_types(self) -> list[str]: + def argument_types(self): return [token.type for token in self.arguments] - def canonical(self) -> str: - args = "".join(token.css() for token in self.arguments) - return f"{self.selector.canonical()}:{self.name}({args})" - - def specificity(self) -> tuple[int, int, int]: + def specificity(self): a, b, c = self.selector.specificity() b += 1 return a, b, c -class Pseudo: +class Pseudo(object): """ Represents selector:ident """ - - def __init__(self, selector: Tree, ident: str) -> None: + def __init__(self, selector, ident): self.selector = 
selector self.ident = ascii_lower(ident) - def __repr__(self) -> str: - return f"{self.__class__.__name__}[{self.selector!r}:{self.ident}]" - - def canonical(self) -> str: - return f"{self.selector.canonical()}:{self.ident}" + def __repr__(self): + return '%s[%r:%s]' % ( + self.__class__.__name__, self.selector, self.ident) - def specificity(self) -> tuple[int, int, int]: + def specificity(self): a, b, c = self.selector.specificity() b += 1 return a, b, c -class Negation: +class Negation(object): """ Represents selector:not(subselector) """ - - def __init__(self, selector: Tree, subselector: Tree) -> None: + def __init__(self, selector, subselector): self.selector = selector self.subselector = subselector - def __repr__(self) -> str: - return f"{self.__class__.__name__}[{self.selector!r}:not({self.subselector!r})]" + def __repr__(self): + return '%s[%r:not(%r)]' % ( + self.__class__.__name__, self.selector, self.subselector) - def canonical(self) -> str: - subsel = self.subselector.canonical() - if len(subsel) > 1: - subsel = subsel.lstrip("*") - return f"{self.selector.canonical()}:not({subsel})" - - def specificity(self) -> tuple[int, int, int]: + def specificity(self): a1, b1, c1 = self.selector.specificity() a2, b2, c2 = self.subselector.specificity() return a1 + a2, b1 + b2, c1 + c2 -class Relation: - """ - Represents selector:has(subselector) - """ - - def __init__(self, selector: Tree, combinator: Token, subselector: Selector): - self.selector = selector - self.combinator = combinator - self.subselector = subselector - - def __repr__(self) -> str: - return f"{self.__class__.__name__}[{self.selector!r}:has({self.subselector!r})]" - - def canonical(self) -> str: - try: - subsel = self.subselector[0].canonical() # type: ignore[index] - except TypeError: - subsel = self.subselector.canonical() - if len(subsel) > 1: - subsel = subsel.lstrip("*") - return f"{self.selector.canonical()}:has({subsel})" - - def specificity(self) -> tuple[int, int, int]: - a1, b1, c1 = 
self.selector.specificity() - try: - a2, b2, c2 = self.subselector[-1].specificity() # type: ignore[index] - except TypeError: - a2, b2, c2 = self.subselector.specificity() - return a1 + a2, b1 + b2, c1 + c2 - - -class Matching: - """ - Represents selector:is(selector_list) - """ - - def __init__(self, selector: Tree, selector_list: Iterable[Tree]): - self.selector = selector - self.selector_list = selector_list - - def __repr__(self) -> str: - args_str = ", ".join(repr(s) for s in self.selector_list) - return f"{self.__class__.__name__}[{self.selector!r}:is({args_str})]" - - def canonical(self) -> str: - selector_arguments = [] - for s in self.selector_list: - selarg = s.canonical() - selector_arguments.append(selarg.lstrip("*")) - args_str = ", ".join(str(s) for s in selector_arguments) - return f"{self.selector.canonical()}:is({args_str})" - - def specificity(self) -> tuple[int, int, int]: - return max(x.specificity() for x in self.selector_list) - - -class SpecificityAdjustment: - """ - Represents selector:where(selector_list) - Same as selector:is(selector_list), but its specificity is always 0 - """ - - def __init__(self, selector: Tree, selector_list: list[Tree]): - self.selector = selector - self.selector_list = selector_list - - def __repr__(self) -> str: - args_str = ", ".join(repr(s) for s in self.selector_list) - return f"{self.__class__.__name__}[{self.selector!r}:where({args_str})]" - - def canonical(self) -> str: - selector_arguments = [] - for s in self.selector_list: - selarg = s.canonical() - selector_arguments.append(selarg.lstrip("*")) - args_str = ", ".join(str(s) for s in selector_arguments) - return f"{self.selector.canonical()}:where({args_str})" - - def specificity(self) -> tuple[int, int, int]: - return 0, 0, 0 - - -class Attrib: +class Attrib(object): """ Represents selector[namespace|attrib operator value] """ - - @overload - def __init__( - self, - selector: Tree, - namespace: str | None, - attrib: str, - operator: Literal["exists"], - 
value: None, - ) -> None: ... - - @overload - def __init__( - self, - selector: Tree, - namespace: str | None, - attrib: str, - operator: str, - value: Token, - ) -> None: ... - - def __init__( - self, - selector: Tree, - namespace: str | None, - attrib: str, - operator: str, - value: Token | None, - ) -> None: + def __init__(self, selector, namespace, attrib, operator, value): self.selector = selector self.namespace = namespace self.attrib = attrib self.operator = operator self.value = value - def __repr__(self) -> str: - attrib = f"{self.namespace}|{self.attrib}" if self.namespace else self.attrib - if self.operator == "exists": - return f"{self.__class__.__name__}[{self.selector!r}[{attrib}]]" - assert self.value is not None - return f"{self.__class__.__name__}[{self.selector!r}[{attrib} {self.operator} {self.value.value!r}]]" - - def canonical(self) -> str: - attrib = f"{self.namespace}|{self.attrib}" if self.namespace else self.attrib - - if self.operator == "exists": - op = attrib + def __repr__(self): + if self.namespace: + attrib = '%s|%s' % (self.namespace, self.attrib) else: - assert self.value is not None - op = f"{attrib}{self.operator}{self.value.css()}" - - return f"{self.selector.canonical()}[{op}]" + attrib = self.attrib + if self.operator == 'exists': + return '%s[%r[%s]]' % ( + self.__class__.__name__, self.selector, attrib) + else: + return '%s[%r[%s %s %r]]' % ( + self.__class__.__name__, self.selector, attrib, + self.operator, self.value) - def specificity(self) -> tuple[int, int, int]: + def specificity(self): a, b, c = self.selector.specificity() b += 1 return a, b, c -class Element: +class Element(object): """ Represents namespace|element `None` is for the universal selector '*' """ - - def __init__( - self, namespace: str | None = None, element: str | None = None - ) -> None: + def __init__(self, namespace=None, element=None): self.namespace = namespace self.element = element - def __repr__(self) -> str: - return 
f"{self.__class__.__name__}[{self.canonical()}]" - - def canonical(self) -> str: - element = self.element or "*" + def __repr__(self): + element = self.element or '*' if self.namespace: - element = f"{self.namespace}|{element}" - return element + element = '%s|%s' % (self.namespace, element) + return '%s[%s]' % (self.__class__.__name__, element) - def specificity(self) -> tuple[int, int, int]: + def specificity(self): if self.element: return 0, 0, 1 - return 0, 0, 0 + else: + return 0, 0, 0 -class Hash: +class Hash(object): """ Represents selector#id """ - - def __init__(self, selector: Tree, id: str) -> None: # noqa: A002 + def __init__(self, selector, id): self.selector = selector self.id = id - def __repr__(self) -> str: - return f"{self.__class__.__name__}[{self.selector!r}#{self.id}]" - - def canonical(self) -> str: - return f"{self.selector.canonical()}#{self.id}" + def __repr__(self): + return '%s[%r#%s]' % ( + self.__class__.__name__, self.selector, self.id) - def specificity(self) -> tuple[int, int, int]: + def specificity(self): a, b, c = self.selector.specificity() a += 1 return a, b, c -class CombinedSelector: - def __init__(self, selector: Tree, combinator: str, subselector: Tree) -> None: +class CombinedSelector(object): + def __init__(self, selector, combinator, subselector): assert selector is not None self.selector = selector self.combinator = combinator self.subselector = subselector - def __repr__(self) -> str: - comb = "" if self.combinator == " " else self.combinator - return ( - f"{self.__class__.__name__}[{self.selector!r} {comb} {self.subselector!r}]" - ) - - def canonical(self) -> str: - subsel = self.subselector.canonical() - if len(subsel) > 1: - subsel = subsel.lstrip("*") - return f"{self.selector.canonical()} {self.combinator} {subsel}" + def __repr__(self): + if self.combinator == ' ': + comb = '' + else: + comb = self.combinator + return '%s[%r %s %r]' % ( + self.__class__.__name__, self.selector, comb, self.subselector) - def 
specificity(self) -> tuple[int, int, int]: + def specificity(self): a1, b1, c1 = self.selector.specificity() a2, b2, c2 = self.subselector.specificity() return a1 + a2, b1 + b2, c1 + c2 @@ -485,25 +312,24 @@ def specificity(self) -> tuple[int, int, int]: #### Parser # foo -_el_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$") +_el_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$') # foo#bar or #bar -_id_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$") +_id_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$') # foo.bar or .bar _class_re = re.compile( - r"^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$" -) + r'^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$') -def parse(css: str) -> list[Selector]: +def parse(css): """Parse a CSS *group of selectors*. If you don't care about pseudo-elements or selector specificity, you can skip this and use :meth:`~GenericTranslator.css_to_xpath`. :param css: - A *group of selectors* as a string. + A *group of selectors* as a Unicode string. :raises: :class:`SelectorSyntaxError` on invalid selectors. 
:returns: @@ -517,75 +343,72 @@ def parse(css: str) -> list[Selector]: return [Selector(Element(element=match.group(1)))] match = _id_re.match(css) if match is not None: - return [Selector(Hash(Element(element=match.group(1) or None), match.group(2)))] + return [Selector(Hash(Element(element=match.group(1) or None), + match.group(2)))] match = _class_re.match(css) if match is not None: - return [ - Selector(Class(Element(element=match.group(1) or None), match.group(2))) - ] + return [Selector(Class(Element(element=match.group(1) or None), + match.group(2)))] stream = TokenStream(tokenize(css)) stream.source = css return list(parse_selector_group(stream)) - - # except SelectorSyntaxError: # e = sys.exc_info()[1] # message = "%s at %s -> %r" % ( # e, stream.used, stream.peek()) # e.msg = message +# if sys.version_info < (2,6): +# e.message = message # e.args = tuple([message]) # raise -def parse_selector_group(stream: TokenStream) -> Iterator[Selector]: +def parse_selector_group(stream): stream.skip_whitespace() while 1: yield Selector(*parse_selector(stream)) - if stream.peek() == ("DELIM", ","): + if stream.peek() == ('DELIM', ','): stream.next() stream.skip_whitespace() else: break - -def parse_selector(stream: TokenStream) -> tuple[Tree, PseudoElement | None]: +def parse_selector(stream): result, pseudo_element = parse_simple_selector(stream) while 1: stream.skip_whitespace() peek = stream.peek() - if peek in (("EOF", None), ("DELIM", ",")): + if peek in (('EOF', None), ('DELIM', ',')): break if pseudo_element: raise SelectorSyntaxError( - f"Got pseudo-element ::{pseudo_element} not at the end of a selector" - ) - if peek.is_delim("+", ">", "~"): + 'Got pseudo-element ::%s not at the end of a selector' + % pseudo_element) + if peek.is_delim('+', '>', '~'): # A combinator - combinator = cast("str", stream.next().value) + combinator = stream.next().value stream.skip_whitespace() else: # By exclusion, the last parse_simple_selector() ended # at peek == ' ' - 
combinator = " " + combinator = ' ' next_selector, pseudo_element = parse_simple_selector(stream) result = CombinedSelector(result, combinator, next_selector) return result, pseudo_element -def parse_simple_selector( - stream: TokenStream, inside_negation: bool = False -) -> tuple[Tree, PseudoElement | None]: +def parse_simple_selector(stream, inside_negation=False): stream.skip_whitespace() selector_start = len(stream.used) peek = stream.peek() - if peek.type == "IDENT" or peek == ("DELIM", "*"): - if peek.type == "IDENT": + if peek.type == 'IDENT' or peek == ('DELIM', '*'): + if peek.type == 'IDENT': namespace = stream.next().value else: stream.next() namespace = None - if stream.peek() == ("DELIM", "|"): + if stream.peek() == ('DELIM', '|'): stream.next() element = stream.next_ident_or_star() else: @@ -593,177 +416,98 @@ def parse_simple_selector( namespace = None else: element = namespace = None - result: Tree = Element(namespace, element) - pseudo_element: PseudoElement | None = None + result = Element(namespace, element) + pseudo_element = None while 1: peek = stream.peek() - if ( - peek.type in ("S", "EOF") - or peek.is_delim(",", "+", ">", "~") - or (inside_negation and peek == ("DELIM", ")")) - ): + if peek.type in ('S', 'EOF') or peek.is_delim(',', '+', '>', '~') or ( + inside_negation and peek == ('DELIM', ')')): break if pseudo_element: raise SelectorSyntaxError( - f"Got pseudo-element ::{pseudo_element} not at the end of a selector" - ) - if peek.type == "HASH": - result = Hash(result, cast("str", stream.next().value)) - elif peek == ("DELIM", "."): + 'Got pseudo-element ::%s not at the end of a selector' + % pseudo_element) + if peek.type == 'HASH': + result = Hash(result, stream.next().value) + elif peek == ('DELIM', '.'): stream.next() result = Class(result, stream.next_ident()) - elif peek == ("DELIM", "|"): - stream.next() - result = Element(None, stream.next_ident()) - elif peek == ("DELIM", "["): + elif peek == ('DELIM', '['): stream.next() 
result = parse_attrib(result, stream) - elif peek == ("DELIM", ":"): + elif peek == ('DELIM', ':'): stream.next() - if stream.peek() == ("DELIM", ":"): + if stream.peek() == ('DELIM', ':'): stream.next() pseudo_element = stream.next_ident() - if stream.peek() == ("DELIM", "("): + if stream.peek() == ('DELIM', '('): stream.next() pseudo_element = FunctionalPseudoElement( - pseudo_element, parse_arguments(stream) - ) + pseudo_element, parse_arguments(stream)) continue ident = stream.next_ident() - if ident.lower() in ("first-line", "first-letter", "before", "after"): + if ident.lower() in ('first-line', 'first-letter', + 'before', 'after'): # Special case: CSS 2.1 pseudo-elements can have a single ':' # Any new pseudo-element must have two. - pseudo_element = str(ident) + pseudo_element = _unicode(ident) continue - if stream.peek() != ("DELIM", "("): + if stream.peek() != ('DELIM', '('): result = Pseudo(result, ident) - if repr(result) == "Pseudo[Element[*]:scope]" and not ( - len(stream.used) == 2 - or (len(stream.used) == 3 and stream.used[0].type == "S") - or (len(stream.used) >= 3 and stream.used[-3].is_delim(",")) - or ( - len(stream.used) >= 4 - and stream.used[-3].type == "S" - and stream.used[-4].is_delim(",") - ) - ): - raise SelectorSyntaxError( - 'Got immediate child pseudo-element ":scope" ' - "not at the start of a selector" - ) continue stream.next() stream.skip_whitespace() - if ident.lower() == "not": + if ident.lower() == 'not': if inside_negation: - raise SelectorSyntaxError("Got nested :not()") + raise SelectorSyntaxError('Got nested :not()') argument, argument_pseudo_element = parse_simple_selector( - stream, inside_negation=True - ) - next_ = stream.next() + stream, inside_negation=True) + next = stream.next() if argument_pseudo_element: raise SelectorSyntaxError( - f"Got pseudo-element ::{argument_pseudo_element} inside :not() at {next_.pos}" - ) - if next_ != ("DELIM", ")"): - raise SelectorSyntaxError(f"Expected ')', got {next_}") + 'Got 
pseudo-element ::%s inside :not() at %s' + % (argument_pseudo_element, next.pos)) + if next != ('DELIM', ')'): + raise SelectorSyntaxError("Expected ')', got %s" % (next,)) result = Negation(result, argument) - elif ident.lower() == "has": - combinator, arguments = parse_relative_selector(stream) - result = Relation(result, combinator, arguments) - - elif ident.lower() in ("matches", "is"): - selectors = parse_simple_selector_arguments(stream) - result = Matching(result, selectors) - elif ident.lower() == "where": - selectors = parse_simple_selector_arguments(stream) - result = SpecificityAdjustment(result, selectors) else: result = Function(result, ident, parse_arguments(stream)) else: - raise SelectorSyntaxError(f"Expected selector, got {peek}") + raise SelectorSyntaxError( + "Expected selector, got %s" % (peek,)) if len(stream.used) == selector_start: - raise SelectorSyntaxError(f"Expected selector, got {stream.peek()}") + raise SelectorSyntaxError( + "Expected selector, got %s" % (stream.peek(),)) return result, pseudo_element -def parse_arguments(stream: TokenStream) -> list[Token]: # noqa: RET503 - arguments: list[Token] = [] +def parse_arguments(stream): + arguments = [] while 1: stream.skip_whitespace() - next_ = stream.next() - if next_.type in ("IDENT", "STRING", "NUMBER") or next_ in [ - ("DELIM", "+"), - ("DELIM", "-"), - ]: - arguments.append(next_) - elif next_ == ("DELIM", ")"): + next = stream.next() + if next.type in ('IDENT', 'STRING', 'NUMBER') or next in [ + ('DELIM', '+'), ('DELIM', '-')]: + arguments.append(next) + elif next == ('DELIM', ')'): return arguments else: - raise SelectorSyntaxError(f"Expected an argument, got {next_}") - - -def parse_relative_selector(stream: TokenStream) -> tuple[Token, Selector]: # noqa: RET503 - stream.skip_whitespace() - subselector = "" - next_ = stream.next() - - if next_ in [("DELIM", "+"), ("DELIM", "-"), ("DELIM", ">"), ("DELIM", "~")]: - combinator = next_ - stream.skip_whitespace() - next_ = 
stream.next() - else: - combinator = Token("DELIM", " ", pos=0) - - while 1: - if next_.type in ("IDENT", "STRING", "NUMBER") or next_ in [ - ("DELIM", "."), - ("DELIM", "*"), - ]: - subselector += cast("str", next_.value) - elif next_ == ("DELIM", ")"): - result = parse(subselector) - return combinator, result[0] - else: - raise SelectorSyntaxError(f"Expected an argument, got {next_}") - next_ = stream.next() - - -def parse_simple_selector_arguments(stream: TokenStream) -> list[Tree]: - arguments = [] - while 1: - result, pseudo_element = parse_simple_selector(stream, True) - if pseudo_element: raise SelectorSyntaxError( - f"Got pseudo-element ::{pseudo_element} inside function" - ) - stream.skip_whitespace() - next_ = stream.next() - if next_ in (("EOF", None), ("DELIM", ",")): - stream.next() - stream.skip_whitespace() - arguments.append(result) - elif next_ == ("DELIM", ")"): - arguments.append(result) - break - else: - raise SelectorSyntaxError(f"Expected an argument, got {next_}") - return arguments + "Expected an argument, got %s" % (next,)) -def parse_attrib(selector: Tree, stream: TokenStream) -> Attrib: +def parse_attrib(selector, stream): stream.skip_whitespace() attrib = stream.next_ident_or_star() - if attrib is None and stream.peek() != ("DELIM", "|"): - raise SelectorSyntaxError(f"Expected '|', got {stream.peek()}") - namespace: str | None - op: str | None - if stream.peek() == ("DELIM", "|"): + if attrib is None and stream.peek() != ('DELIM', '|'): + raise SelectorSyntaxError( + "Expected '|', got %s" % (stream.peek(),)) + if stream.peek() == ('DELIM', '|'): stream.next() - if stream.peek() == ("DELIM", "="): + if stream.peek() == ('DELIM', '='): namespace = None stream.next() - op = "|=" + op = '|=' else: namespace = attrib attrib = stream.next_ident() @@ -772,30 +516,32 @@ def parse_attrib(selector: Tree, stream: TokenStream) -> Attrib: namespace = op = None if op is None: stream.skip_whitespace() - next_ = stream.next() - if next_ == ("DELIM", 
"]"): - return Attrib(selector, namespace, cast("str", attrib), "exists", None) - if next_ == ("DELIM", "="): - op = "=" - elif next_.is_delim("^", "$", "*", "~", "|", "!") and ( - stream.peek() == ("DELIM", "=") - ): - op = cast("str", next_.value) + "=" + next = stream.next() + if next == ('DELIM', ']'): + return Attrib(selector, namespace, attrib, 'exists', None) + elif next == ('DELIM', '='): + op = '=' + elif next.is_delim('^', '$', '*', '~', '|', '!') and ( + stream.peek() == ('DELIM', '=')): + op = next.value + '=' stream.next() else: - raise SelectorSyntaxError(f"Operator expected, got {next_}") + raise SelectorSyntaxError( + "Operator expected, got %s" % (next,)) stream.skip_whitespace() value = stream.next() - if value.type not in ("IDENT", "STRING"): - raise SelectorSyntaxError(f"Expected string or ident, got {value}") + if value.type not in ('IDENT', 'STRING'): + raise SelectorSyntaxError( + "Expected string or ident, got %s" % (value,)) stream.skip_whitespace() - next_ = stream.next() - if next_ != ("DELIM", "]"): - raise SelectorSyntaxError(f"Expected ']', got {next_}") - return Attrib(selector, namespace, cast("str", attrib), op, value) + next = stream.next() + if next != ('DELIM', ']'): + raise SelectorSyntaxError( + "Expected ']', got %s" % (next,)) + return Attrib(selector, namespace, attrib, op, value.value) -def parse_series(tokens: Iterable[Token]) -> tuple[int, int]: +def parse_series(tokens): """ Parses the arguments for :nth-child() and friends. 
@@ -804,243 +550,217 @@ def parse_series(tokens: Iterable[Token]) -> tuple[int, int]: """ for token in tokens: - if token.type == "STRING": - raise ValueError("String tokens not allowed in series.") - s = "".join(cast("str", token.value) for token in tokens).strip() - if s == "odd": - return 2, 1 - if s == "even": - return 2, 0 - if s == "n": - return 1, 0 - if "n" not in s: + if token.type == 'STRING': + raise ValueError('String tokens not allowed in series.') + s = ''.join(token.value for token in tokens).strip() + if s == 'odd': + return (2, 1) + elif s == 'even': + return (2, 0) + elif s == 'n': + return (1, 0) + if 'n' not in s: # Just b - return 0, int(s) - a, b = s.split("n", 1) - a_as_int: int + return (0, int(s)) + a, b = s.split('n', 1) if not a: - a_as_int = 1 - elif a in {"-", "+"}: - a_as_int = int(a + "1") + a = 1 + elif a == '-' or a == '+': + a = int(a+'1') + else: + a = int(a) + if not b: + b = 0 else: - a_as_int = int(a) - b_as_int = int(b) if b else 0 - return a_as_int, b_as_int + b = int(b) + return (a, b) #### Token objects - -class Token(tuple[str, str | None]): # noqa: SLOT001 - @overload - def __new__( - cls, - type_: Literal["IDENT", "HASH", "STRING", "S", "DELIM", "NUMBER"], - value: str, - pos: int, - ) -> Self: ... - - @overload - def __new__(cls, type_: Literal["EOF"], value: None, pos: int) -> Self: ... 
- - def __new__(cls, type_: str, value: str | None, pos: int) -> Self: +class Token(tuple): + def __new__(cls, type_, value, pos): obj = tuple.__new__(cls, (type_, value)) obj.pos = pos return obj - def __repr__(self) -> str: - return f"<{self.type} '{self.value}' at {self.pos}>" + def __repr__(self): + return "<%s '%s' at %i>" % (self.type, self.value, self.pos) - def is_delim(self, *values: str) -> bool: - return self.type == "DELIM" and self.value in values + def is_delim(self, *values): + return self.type == 'DELIM' and self.value in values - pos: int - - @property - def type(self) -> str: - return self[0] - - @property - def value(self) -> str | None: - return self[1] - - def css(self) -> str: - if self.type == "STRING": - return repr(self.value) - return cast("str", self.value) + type = property(operator.itemgetter(0)) + value = property(operator.itemgetter(1)) class EOFToken(Token): - def __new__(cls, pos: int) -> Self: - return Token.__new__(cls, "EOF", None, pos) + def __new__(cls, pos): + return Token.__new__(cls, 'EOF', None, pos) - def __repr__(self) -> str: - return f"<{self.type} at {self.pos}>" + def __repr__(self): + return '<%s at %i>' % (self.type, self.pos) #### Tokenizer class TokenMacros: - unicode_escape = r"\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?" - escape = unicode_escape + r"|\\[^\n\r\f0-9a-f]" - string_escape = r"\\(?:\n|\r\n|\r|\f)|" + escape - nonascii = r"[^\0-\177]" - nmchar = f"[_a-z0-9-]|{escape}|{nonascii}" - nmstart = f"[_a-z]|{escape}|{nonascii}" - - -class MatchFunc(Protocol): - def __call__( - self, string: str, pos: int = ..., endpos: int = ... - ) -> re.Match[str] | None: ... - - -def _compile(pattern: str) -> MatchFunc: + unicode_escape = r'\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?' 
+ escape = unicode_escape + r'|\\[^\n\r\f0-9a-f]' + string_escape = r'\\(?:\n|\r\n|\r|\f)|' + escape + nonascii = r'[^\0-\177]' + nmchar = '[_a-z0-9-]|%s|%s' % (escape, nonascii) + nmstart = '[_a-z]|%s|%s' % (escape, nonascii) + +def _compile(pattern): return re.compile(pattern % vars(TokenMacros), re.IGNORECASE).match - -_match_whitespace = _compile(r"[ \t\r\n\f]+") -_match_number = _compile(r"[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)") -_match_hash = _compile("#(?:%(nmchar)s)+") -_match_ident = _compile("-?(?:%(nmstart)s)(?:%(nmchar)s)*") +_match_whitespace = _compile(r'[ \t\r\n\f]+') +_match_number = _compile('[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)') +_match_hash = _compile('#(?:%(nmchar)s)+') +_match_ident = _compile('-?(?:%(nmstart)s)(?:%(nmchar)s)*') _match_string_by_quote = { "'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"), '"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'), } -_sub_simple_escape = re.compile(r"\\(.)").sub -_sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.IGNORECASE).sub -_sub_newline_escape = re.compile(r"\\(?:\n|\r\n|\r|\f)").sub +_sub_simple_escape = re.compile(r'\\(.)').sub +_sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.I).sub +_sub_newline_escape =re.compile(r'\\(?:\n|\r\n|\r|\f)').sub # Same as r'\1', but faster on CPython -_replace_simple = operator.methodcaller("group", 1) - - -def _replace_unicode(match: re.Match[str]) -> str: +if hasattr(operator, 'methodcaller'): + # Python 2.6+ + _replace_simple = operator.methodcaller('group', 1) +else: + def _replace_simple(match): + return match.group(1) + +def _replace_unicode(match): codepoint = int(match.group(1), 16) if codepoint > sys.maxunicode: codepoint = 0xFFFD - return chr(codepoint) + return _unichr(codepoint) -def unescape_ident(value: str) -> str: +def unescape_ident(value): value = _sub_unicode_escape(_replace_unicode, value) - return _sub_simple_escape(_replace_simple, value) + value = _sub_simple_escape(_replace_simple, value) + return value -def tokenize(s: 
str) -> Iterator[Token]: +def tokenize(s): pos = 0 len_s = len(s) while pos < len_s: match = _match_whitespace(s, pos=pos) if match: - yield Token("S", " ", pos) + yield Token('S', ' ', pos) pos = match.end() continue match = _match_ident(s, pos=pos) if match: - value = _sub_simple_escape( - _replace_simple, _sub_unicode_escape(_replace_unicode, match.group()) - ) - yield Token("IDENT", value, pos) + value = _sub_simple_escape(_replace_simple, + _sub_unicode_escape(_replace_unicode, match.group())) + yield Token('IDENT', value, pos) pos = match.end() continue match = _match_hash(s, pos=pos) if match: - value = _sub_simple_escape( - _replace_simple, - _sub_unicode_escape(_replace_unicode, match.group()[1:]), - ) - yield Token("HASH", value, pos) + value = _sub_simple_escape(_replace_simple, + _sub_unicode_escape(_replace_unicode, match.group()[1:])) + yield Token('HASH', value, pos) pos = match.end() continue quote = s[pos] if quote in _match_string_by_quote: match = _match_string_by_quote[quote](s, pos=pos + 1) - assert match, "Should have found at least an empty match" + assert match, 'Should have found at least an empty match' end_pos = match.end() if end_pos == len_s: - raise SelectorSyntaxError(f"Unclosed string at {pos}") + raise SelectorSyntaxError('Unclosed string at %s' % pos) if s[end_pos] != quote: - raise SelectorSyntaxError(f"Invalid string at {pos}") - value = _sub_simple_escape( - _replace_simple, - _sub_unicode_escape( - _replace_unicode, _sub_newline_escape("", match.group()) - ), - ) - yield Token("STRING", value, pos) + raise SelectorSyntaxError('Invalid string at %s' % pos) + value = _sub_simple_escape(_replace_simple, + _sub_unicode_escape(_replace_unicode, + _sub_newline_escape('', match.group()))) + yield Token('STRING', value, pos) pos = end_pos + 1 continue match = _match_number(s, pos=pos) if match: value = match.group() - yield Token("NUMBER", value, pos) + yield Token('NUMBER', value, pos) pos = match.end() continue pos2 = pos + 2 - if 
s[pos:pos2] == "/*": - pos = s.find("*/", pos2) + if s[pos:pos2] == '/*': + pos = s.find('*/', pos2) if pos == -1: pos = len_s else: pos += 2 continue - yield Token("DELIM", s[pos], pos) + yield Token('DELIM', s[pos], pos) pos += 1 assert pos == len_s yield EOFToken(pos) -class TokenStream: - def __init__(self, tokens: Iterable[Token], source: str | None = None) -> None: - self.used: list[Token] = [] +class TokenStream(object): + def __init__(self, tokens, source=None): + self.used = [] self.tokens = iter(tokens) self.source = source - self.peeked: Token | None = None + self.peeked = None self._peeking = False - self.next_token = self.tokens.__next__ + try: + self.next_token = self.tokens.next + except AttributeError: + # Python 3 + self.next_token = self.tokens.__next__ - def next(self) -> Token: + def next(self): if self._peeking: self._peeking = False - assert self.peeked is not None self.used.append(self.peeked) return self.peeked - next_ = self.next_token() - self.used.append(next_) - return next_ + else: + next = self.next_token() + self.used.append(next) + return next - def peek(self) -> Token: + def peek(self): if not self._peeking: self.peeked = self.next_token() self._peeking = True - assert self.peeked is not None return self.peeked - def next_ident(self) -> str: - next_ = self.next() - if next_.type != "IDENT": - raise SelectorSyntaxError(f"Expected ident, got {next_}") - return cast("str", next_.value) - - def next_ident_or_star(self) -> str | None: - next_ = self.next() - if next_.type == "IDENT": - return next_.value - if next_ == ("DELIM", "*"): + def next_ident(self): + next = self.next() + if next.type != 'IDENT': + raise SelectorSyntaxError('Expected ident, got %s' % (next,)) + return next.value + + def next_ident_or_star(self): + next = self.next() + if next.type == 'IDENT': + return next.value + elif next == ('DELIM', '*'): return None - raise SelectorSyntaxError(f"Expected ident or '*', got {next_}") + else: + raise SelectorSyntaxError( + 
"Expected ident or '*', got %s" % (next,)) - def skip_whitespace(self) -> None: + def skip_whitespace(self): peek = self.peek() - if peek.type == "S": + if peek.type == 'S': self.next() diff --git a/cssselect/py.typed b/cssselect/py.typed deleted file mode 100644 index e69de29..0000000 diff --git a/cssselect/tests.py b/cssselect/tests.py new file mode 100755 index 0000000..a1fdc9e --- /dev/null +++ b/cssselect/tests.py @@ -0,0 +1,1166 @@ +#!/usr/bin/env python +# coding: utf8 +""" + Tests for cssselect + =================== + + These tests can be run either by py.test or by the standard library's + unittest. They use plain ``assert`` statements and do little reporting + themselves in case of failure. + + Use py.test to get fancy error reporting and assert introspection. + + + :copyright: (c) 2007-2012 Ian Bicking and contributors. + See AUTHORS for more details. + :license: BSD, see LICENSE for more details. + +""" + +import sys +import unittest + +from lxml import etree, html +from cssselect import (parse, GenericTranslator, HTMLTranslator, + SelectorSyntaxError, ExpressionError) +from cssselect.parser import (tokenize, parse_series, _unicode, + FunctionalPseudoElement) +from cssselect.xpath import _unicode_safe_getattr, XPathExpr + + +if sys.version_info[0] < 3: + # Python 2 + def u(text): + return text.decode('utf8') +else: + # Python 3 + def u(text): + return text + + +class TestCssselect(unittest.TestCase): + def test_tokenizer(self): + tokens = [ + _unicode(item) for item in tokenize( + u(r'E\ é > f [a~="y\"x"]:nth(/* fu /]* */-3.7)'))] + assert tokens == [ + u(""), + "", + "' at 5>", + "", + # the no-break space is not whitespace in CSS + u(""), # f\xa0 + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ] + + def test_parser(self): + def repr_parse(css): + selectors = parse(css) + for selector in selectors: + assert selector.pseudo_element is None + return [repr(selector.parsed_tree).replace("(u'", "('") + for selector in selectors] + + def 
parse_many(first, *others): + result = repr_parse(first) + for other in others: + assert repr_parse(other) == result + return result + + assert parse_many('*') == ['Element[*]'] + assert parse_many('*|*') == ['Element[*]'] + assert parse_many('*|foo') == ['Element[foo]'] + assert parse_many('foo|*') == ['Element[foo|*]'] + assert parse_many('foo|bar') == ['Element[foo|bar]'] + # This will never match, but it is valid: + assert parse_many('#foo#bar') == ['Hash[Hash[Element[*]#foo]#bar]'] + assert parse_many( + 'div>.foo', + 'div> .foo', + 'div >.foo', + 'div > .foo', + 'div \n> \t \t .foo', 'div\r>\n\n\n.foo', 'div\f>\f.foo' + ) == ['CombinedSelector[Element[div] > Class[Element[*].foo]]'] + assert parse_many('td.foo,.bar', + 'td.foo, .bar', + 'td.foo\t\r\n\f ,\t\r\n\f .bar' + ) == [ + 'Class[Element[td].foo]', + 'Class[Element[*].bar]' + ] + assert parse_many('div, td.foo, div.bar span') == [ + 'Element[div]', + 'Class[Element[td].foo]', + 'CombinedSelector[Class[Element[div].bar] ' + ' Element[span]]'] + assert parse_many('div > p') == [ + 'CombinedSelector[Element[div] > Element[p]]'] + assert parse_many('td:first') == [ + 'Pseudo[Element[td]:first]'] + assert parse_many('td:first') == [ + 'Pseudo[Element[td]:first]'] + assert parse_many('td :first') == [ + 'CombinedSelector[Element[td] ' + ' Pseudo[Element[*]:first]]'] + assert parse_many('td :first') == [ + 'CombinedSelector[Element[td] ' + ' Pseudo[Element[*]:first]]'] + assert parse_many('a[name]', 'a[ name\t]') == [ + 'Attrib[Element[a][name]]'] + assert parse_many('a [name]') == [ + 'CombinedSelector[Element[a] Attrib[Element[*][name]]]'] + assert parse_many('a[rel="include"]', 'a[rel = include]') == [ + "Attrib[Element[a][rel = 'include']]"] + assert parse_many("a[hreflang |= 'en']", "a[hreflang|=en]") == [ + "Attrib[Element[a][hreflang |= 'en']]"] + assert parse_many('div:nth-child(10)') == [ + "Function[Element[div]:nth-child(['10'])]"] + assert parse_many(':nth-child(2n+2)') == [ + 
"Function[Element[*]:nth-child(['2', 'n', '+2'])]"] + assert parse_many('div:nth-of-type(10)') == [ + "Function[Element[div]:nth-of-type(['10'])]"] + assert parse_many('div div:nth-of-type(10) .aclass') == [ + 'CombinedSelector[CombinedSelector[Element[div] ' + "Function[Element[div]:nth-of-type(['10'])]] " + ' Class[Element[*].aclass]]'] + assert parse_many('label:only') == [ + 'Pseudo[Element[label]:only]'] + assert parse_many('a:lang(fr)') == [ + "Function[Element[a]:lang(['fr'])]"] + assert parse_many('div:contains("foo")') == [ + "Function[Element[div]:contains(['foo'])]"] + assert parse_many('div#foobar') == [ + 'Hash[Element[div]#foobar]'] + assert parse_many('div:not(div.foo)') == [ + 'Negation[Element[div]:not(Class[Element[div].foo])]'] + assert parse_many('td ~ th') == [ + 'CombinedSelector[Element[td] ~ Element[th]]'] + + def test_pseudo_elements(self): + def parse_pseudo(css): + result = [] + for selector in parse(css): + pseudo = selector.pseudo_element + pseudo = _unicode(pseudo) if pseudo else pseudo + # No Symbol here + assert pseudo is None or type(pseudo) is _unicode + selector = repr(selector.parsed_tree).replace("(u'", "('") + result.append((selector, pseudo)) + return result + + def parse_one(css): + result = parse_pseudo(css) + assert len(result) == 1 + return result[0] + + assert parse_one('foo') == ('Element[foo]', None) + assert parse_one('*') == ('Element[*]', None) + assert parse_one(':empty') == ('Pseudo[Element[*]:empty]', None) + + # Special cases for CSS 2.1 pseudo-elements + assert parse_one(':BEfore') == ('Element[*]', 'before') + assert parse_one(':aftER') == ('Element[*]', 'after') + assert parse_one(':First-Line') == ('Element[*]', 'first-line') + assert parse_one(':First-Letter') == ('Element[*]', 'first-letter') + + assert parse_one('::befoRE') == ('Element[*]', 'before') + assert parse_one('::AFter') == ('Element[*]', 'after') + assert parse_one('::firsT-linE') == ('Element[*]', 'first-line') + assert 
parse_one('::firsT-letteR') == ('Element[*]', 'first-letter') + + assert parse_one('::text-content') == ('Element[*]', 'text-content') + assert parse_one('::attr(name)') == ( + "Element[*]", "FunctionalPseudoElement[::attr(['name'])]") + + assert parse_one('::Selection') == ('Element[*]', 'selection') + assert parse_one('foo:after') == ('Element[foo]', 'after') + assert parse_one('foo::selection') == ('Element[foo]', 'selection') + assert parse_one('lorem#ipsum ~ a#b.c[href]:empty::selection') == ( + 'CombinedSelector[Hash[Element[lorem]#ipsum] ~ ' + 'Pseudo[Attrib[Class[Hash[Element[a]#b].c][href]]:empty]]', + 'selection') + + parse_pseudo('foo:before, bar, baz:after') == [ + ('Element[foo]', 'before'), + ('Element[bar]', None), + ('Element[baz]', 'after')] + + # Special cases for CSS 2.1 pseudo-elements are ignored by default + for pseudo in ('after', 'before', 'first-line', 'first-letter'): + selector, = parse('e:%s' % pseudo) + assert selector.pseudo_element == pseudo + assert GenericTranslator().selector_to_xpath(selector, prefix='') == "e" + + # Pseudo Elements are ignored by default, but if allowed they are not + # supported by GenericTranslator + tr = GenericTranslator() + selector, = parse('e::foo') + assert selector.pseudo_element == 'foo' + assert tr.selector_to_xpath(selector, prefix='') == "e" + self.assertRaises(ExpressionError, tr.selector_to_xpath, selector, + translate_pseudo_elements=True) + + def test_specificity(self): + def specificity(css): + selectors = parse(css) + assert len(selectors) == 1 + return selectors[0].specificity() + + assert specificity('*') == (0, 0, 0) + assert specificity(' foo') == (0, 0, 1) + assert specificity(':empty ') == (0, 1, 0) + assert specificity(':before') == (0, 0, 1) + assert specificity('*:before') == (0, 0, 1) + assert specificity(':nth-child(2)') == (0, 1, 0) + assert specificity('.bar') == (0, 1, 0) + assert specificity('[baz]') == (0, 1, 0) + assert specificity('[baz="4"]') == (0, 1, 0) + assert 
specificity('[baz^="4"]') == (0, 1, 0) + assert specificity('#lipsum') == (1, 0, 0) + + assert specificity(':not(*)') == (0, 0, 0) + assert specificity(':not(foo)') == (0, 0, 1) + assert specificity(':not(.foo)') == (0, 1, 0) + assert specificity(':not([foo])') == (0, 1, 0) + assert specificity(':not(:empty)') == (0, 1, 0) + assert specificity(':not(#foo)') == (1, 0, 0) + + assert specificity('foo:empty') == (0, 1, 1) + assert specificity('foo:before') == (0, 0, 2) + assert specificity('foo::before') == (0, 0, 2) + assert specificity('foo:empty::before') == (0, 1, 2) + + assert specificity('#lorem + foo#ipsum:first-child > bar:first-line' + ) == (2, 1, 3) + + def test_parse_errors(self): + def get_error(css): + try: + parse(css) + except SelectorSyntaxError: + # Py2, Py3, ... + return str(sys.exc_info()[1]).replace("(u'", "('") + + assert get_error('attributes(href)/html/body/a') == ( + "Expected selector, got ") + assert get_error('attributes(href)') == ( + "Expected selector, got ") + assert get_error('html/body/a') == ( + "Expected selector, got ") + assert get_error(' ') == ( + "Expected selector, got ") + assert get_error('div, ') == ( + "Expected selector, got ") + assert get_error(' , div') == ( + "Expected selector, got ") + assert get_error('p, , div') == ( + "Expected selector, got ") + assert get_error('div > ') == ( + "Expected selector, got ") + assert get_error(' > div') == ( + "Expected selector, got ' at 2>") + assert get_error('foo|#bar') == ( + "Expected ident or '*', got ") + assert get_error('#.foo') == ( + "Expected selector, got ") + assert get_error('.#foo') == ( + "Expected ident, got ") + assert get_error(':#foo') == ( + "Expected ident, got ") + assert get_error('[*]') == ( + "Expected '|', got ") + assert get_error('[foo|]') == ( + "Expected ident, got ") + assert get_error('[#]') == ( + "Expected ident or '*', got ") + assert get_error('[foo=#]') == ( + "Expected string or ident, got ") + assert get_error('[href]a') == ( + "Expected 
selector, got ") + assert get_error('[rel=stylesheet]') == None + assert get_error('[rel:stylesheet]') == ( + "Operator expected, got ") + assert get_error('[rel=stylesheet') == ( + "Expected ']', got ") + assert get_error(':lang(fr)') == None + assert get_error(':lang(fr') == ( + "Expected an argument, got ") + assert get_error(':contains("foo') == ( + "Unclosed string at 10") + assert get_error('foo!') == ( + "Expected selector, got ") + + # Mis-placed pseudo-elements + assert get_error('a:before:empty') == ( + "Got pseudo-element ::before not at the end of a selector") + assert get_error('li:before a') == ( + "Got pseudo-element ::before not at the end of a selector") + assert get_error(':not(:before)') == ( + "Got pseudo-element ::before inside :not() at 12") + assert get_error(':not(:not(a))') == ( + "Got nested :not()") + + def test_translation(self): + def xpath(css): + return _unicode(GenericTranslator().css_to_xpath(css, prefix='')) + + assert xpath('*') == "*" + assert xpath('e') == "e" + assert xpath('*|e') == "e" + assert xpath('e|f') == "e:f" + assert xpath('e[foo]') == "e[@foo]" + assert xpath('e[foo|bar]') == "e[@foo:bar]" + assert xpath('e[foo="bar"]') == "e[@foo = 'bar']" + assert xpath('e[foo~="bar"]') == ( + "e[@foo and contains(" + "concat(' ', normalize-space(@foo), ' '), ' bar ')]") + assert xpath('e[foo^="bar"]') == ( + "e[@foo and starts-with(@foo, 'bar')]") + assert xpath('e[foo$="bar"]') == ( + "e[@foo and substring(@foo, string-length(@foo)-2) = 'bar']") + assert xpath('e[foo*="bar"]') == ( + "e[@foo and contains(@foo, 'bar')]") + assert xpath('e[hreflang|="en"]') == ( + "e[@hreflang and (" + "@hreflang = 'en' or starts-with(@hreflang, 'en-'))]") + assert xpath('e:nth-child(1)') == ( + "*/*[name() = 'e' and (position() = 1)]") + assert xpath('e:nth-last-child(1)') == ( + "*/*[name() = 'e' and (position() = last() - 1)]") + assert xpath('e:nth-last-child(2n+2)') == ( + "*/*[name() = 'e' and (" + "(position() +2) mod -2 = 0 and position() < 
(last() -2))]") + assert xpath('e:nth-of-type(1)') == ( + "*/e[position() = 1]") + assert xpath('e:nth-last-of-type(1)') == ( + "*/e[position() = last() - 1]") + assert xpath('e:nth-last-of-type(1)') == ( + "*/e[position() = last() - 1]") + assert xpath('div e:nth-last-of-type(1) .aclass') == ( + "div/descendant-or-self::*/e[position() = last() - 1]" + "/descendant-or-self::*/*[@class and contains(" + "concat(' ', normalize-space(@class), ' '), ' aclass ')]") + assert xpath('e:first-child') == ( + "*/*[name() = 'e' and (position() = 1)]") + assert xpath('e:last-child') == ( + "*/*[name() = 'e' and (position() = last())]") + assert xpath('e:first-of-type') == ( + "*/e[position() = 1]") + assert xpath('e:last-of-type') == ( + "*/e[position() = last()]") + assert xpath('e:only-child') == ( + "*/*[name() = 'e' and (last() = 1)]") + assert xpath('e:only-of-type') == ( + "e[last() = 1]") + assert xpath('e:empty') == ( + "e[not(*) and not(string-length())]") + assert xpath('e:EmPTY') == ( + "e[not(*) and not(string-length())]") + assert xpath('e:root') == ( + "e[not(parent::*)]") + assert xpath('e:hover') == ( + "e[0]") # never matches + assert xpath('e:contains("foo")') == ( + "e[contains(., 'foo')]") + assert xpath('e:ConTains(foo)') == ( + "e[contains(., 'foo')]") + assert xpath('e.warning') == ( + "e[@class and contains(" + "concat(' ', normalize-space(@class), ' '), ' warning ')]") + assert xpath('e#myid') == ( + "e[@id = 'myid']") + assert xpath('e:not(:nth-child(odd))') == ( + "e[not((position() -1) mod 2 = 0 and position() >= 1)]") + assert xpath('e:nOT(*)') == ( + "e[0]") # never matches + assert xpath('e f') == ( + "e/descendant-or-self::*/f") + assert xpath('e > f') == ( + "e/f") + assert xpath('e + f') == ( + "e/following-sibling::*[name() = 'f' and (position() = 1)]") + assert xpath('e ~ f') == ( + "e/following-sibling::f") + assert xpath('div#container p') == ( + "div[@id = 'container']/descendant-or-self::*/p") + + # Invalid characters in XPath element 
names + assert xpath(r'di\a0 v') == ( + u("*[name() = 'di v']")) # di\xa0v + assert xpath(r'di\[v') == ( + "*[name() = 'di[v']") + assert xpath(r'[h\a0 ref]') == ( + u("*[attribute::*[name() = 'h ref']]")) # h\xa0ref + assert xpath(r'[h\]ref]') == ( + "*[attribute::*[name() = 'h]ref']]") + + self.assertRaises(ExpressionError, xpath, u(':fİrst-child')) + self.assertRaises(ExpressionError, xpath, ':first-of-type') + self.assertRaises(ExpressionError, xpath, ':only-of-type') + self.assertRaises(ExpressionError, xpath, ':last-of-type') + self.assertRaises(ExpressionError, xpath, ':nth-of-type(1)') + self.assertRaises(ExpressionError, xpath, ':nth-last-of-type(1)') + self.assertRaises(ExpressionError, xpath, ':nth-child(n-)') + self.assertRaises(ExpressionError, xpath, ':after') + self.assertRaises(ExpressionError, xpath, ':lorem-ipsum') + self.assertRaises(ExpressionError, xpath, ':lorem(ipsum)') + self.assertRaises(ExpressionError, xpath, '::lorem-ipsum') + self.assertRaises(TypeError, GenericTranslator().css_to_xpath, 4) + self.assertRaises(TypeError, GenericTranslator().selector_to_xpath, + 'foo') + + def test_unicode(self): + if sys.version_info[0] < 3: + css = '.a\xc1b'.decode('ISO-8859-1') + else: + css = '.a\xc1b' + + xpath = GenericTranslator().css_to_xpath(css) + assert css[1:] in xpath + xpath = xpath.encode('ascii', 'xmlcharrefreplace').decode('ASCII') + assert xpath == ( + "descendant-or-self::*[@class and contains(" + "concat(' ', normalize-space(@class), ' '), ' aÁb ')]") + + def test_quoting(self): + css_to_xpath = GenericTranslator().css_to_xpath + assert css_to_xpath('*[aval="\'"]') == ( + '''descendant-or-self::*[@aval = "'"]''') + assert css_to_xpath('*[aval="\'\'\'"]') == ( + """descendant-or-self::*[@aval = "'''"]""") + assert css_to_xpath('*[aval=\'"\']') == ( + '''descendant-or-self::*[@aval = '"']''') + assert css_to_xpath('*[aval=\'"""\']') == ( + '''descendant-or-self::*[@aval = '"""']''') + + def test_unicode_escapes(self): + # \22 == '"' \20 
== ' ' + css_to_xpath = GenericTranslator().css_to_xpath + assert css_to_xpath(r'*[aval="\'\22\'"]') == ( + '''descendant-or-self::*[@aval = concat("'",'"',"'")]''') + assert css_to_xpath(r'*[aval="\'\22 2\'"]') == ( + '''descendant-or-self::*[@aval = concat("'",'"2',"'")]''') + assert css_to_xpath(r'*[aval="\'\20 \'"]') == ( + '''descendant-or-self::*[@aval = "' '"]''') + assert css_to_xpath('*[aval="\'\\20\r\n \'"]') == ( + '''descendant-or-self::*[@aval = "' '"]''') + + def test_xpath_pseudo_elements(self): + class CustomTranslator(GenericTranslator): + def xpath_pseudo_element(self, xpath, pseudo_element): + if isinstance(pseudo_element, FunctionalPseudoElement): + method = 'xpath_%s_functional_pseudo_element' % ( + pseudo_element.name.replace('-', '_')) + method = _unicode_safe_getattr(self, method, None) + if not method: + raise ExpressionError( + "The functional pseudo-element ::%s() is unknown" + % pseudo_element.name) + xpath = method(xpath, pseudo_element.arguments) + else: + method = 'xpath_%s_simple_pseudo_element' % ( + pseudo_element.replace('-', '_')) + method = _unicode_safe_getattr(self, method, None) + if not method: + raise ExpressionError( + "The pseudo-element ::%s is unknown" + % pseudo_element) + xpath = method(xpath) + return xpath + + # functional pseudo-class: + # elements that have a certain number of attributes + def xpath_nb_attr_function(self, xpath, function): + nb_attributes = int(function.arguments[0].value) + return xpath.add_condition( + "count(@*)=%d" % nb_attributes) + + # pseudo-class: + # elements that have 5 attributes + def xpath_five_attributes_pseudo(self, xpath): + return xpath.add_condition("count(@*)=5") + + # functional pseudo-element: + # element's attribute by name + def xpath_attr_functional_pseudo_element(self, xpath, arguments): + attribute_name = arguments[0].value + other = XPathExpr('@%s' % attribute_name, '', ) + return xpath.join('/', other) + + # pseudo-element: + # element's text() nodes + def 
xpath_text_node_simple_pseudo_element(self, xpath): + other = XPathExpr('text()', '', ) + return xpath.join('/', other) + + # pseudo-element: + # element's href attribute + def xpath_attr_href_simple_pseudo_element(self, xpath): + other = XPathExpr('@href', '', ) + return xpath.join('/', other) + + def xpath(css): + return _unicode(CustomTranslator().css_to_xpath(css)) + + assert xpath(':five-attributes') == "descendant-or-self::*[count(@*)=5]" + assert xpath(':nb-attr(3)') == "descendant-or-self::*[count(@*)=3]" + assert xpath('::attr(href)') == "descendant-or-self::*/@href" + assert xpath('::text-node') == "descendant-or-self::*/text()" + assert xpath('::attr-href') == "descendant-or-self::*/@href" + assert xpath('p img::attr(src)') == ( + "descendant-or-self::p/descendant-or-self::*/img/@src") + + def test_series(self): + def series(css): + selector, = parse(':nth-child(%s)' % css) + args = selector.parsed_tree.arguments + try: + return parse_series(args) + except ValueError: + return None + + assert series('1n+3') == (1, 3) + assert series('1n +3') == (1, 3) + assert series('1n + 3') == (1, 3) + assert series('1n+ 3') == (1, 3) + assert series('1n-3') == (1, -3) + assert series('1n -3') == (1, -3) + assert series('1n - 3') == (1, -3) + assert series('1n- 3') == (1, -3) + assert series('n-5') == (1, -5) + assert series('odd') == (2, 1) + assert series('even') == (2, 0) + assert series('3n') == (3, 0) + assert series('n') == (1, 0) + assert series('+n') == (1, 0) + assert series('-n') == (-1, 0) + assert series('5') == (0, 5) + assert series('foo') == None + assert series('n+') == None + + def test_lang(self): + document = etree.fromstring(XMLLANG_IDS) + sort_key = dict( + (el, count) for count, el in enumerate(document.getiterator()) + ).__getitem__ + css_to_xpath = GenericTranslator().css_to_xpath + + def langid(selector): + xpath = css_to_xpath(selector) + items = document.xpath(xpath) + items.sort(key=sort_key) + return [element.get('id', 'nil') for element 
in items] + + assert langid(':lang("EN")') == ['first', 'second', 'third', 'fourth'] + assert langid(':lang("en-us")') == ['second', 'fourth'] + assert langid(':lang(en-nz)') == ['third'] + assert langid(':lang(fr)') == ['fifth'] + assert langid(':lang(ru)') == ['sixth'] + assert langid(":lang('ZH')") == ['eighth'] + assert langid(':lang(de) :lang(zh)') == ['eighth'] + assert langid(':lang(en), :lang(zh)') == [ + 'first', 'second', 'third', 'fourth', 'eighth'] + assert langid(':lang(es)') == [] + + def test_select(self): + document = etree.fromstring(HTML_IDS) + sort_key = dict( + (el, count) for count, el in enumerate(document.getiterator()) + ).__getitem__ + css_to_xpath = GenericTranslator().css_to_xpath + html_css_to_xpath = HTMLTranslator().css_to_xpath + + def select_ids(selector, html_only): + xpath = css_to_xpath(selector) + items = document.xpath(xpath) + if html_only: + assert items == [] + xpath = html_css_to_xpath(selector) + items = document.xpath(xpath) + items.sort(key=sort_key) + return [element.get('id', 'nil') for element in items] + + def pcss(main, *selectors, **kwargs): + html_only = kwargs.pop('html_only', False) + result = select_ids(main, html_only) + for selector in selectors: + assert select_ids(selector, html_only) == result + return result + + all_ids = pcss('*') + assert all_ids[:6] == [ + 'html', 'nil', 'link-href', 'link-nohref', 'nil', 'outer-div'] + assert all_ids[-1:] == ['foobar-span'] + assert pcss('div') == ['outer-div', 'li-div', 'foobar-div'] + assert pcss('DIV', html_only=True) == [ + 'outer-div', 'li-div', 'foobar-div'] # case-insensitive in HTML + assert pcss('div div') == ['li-div'] + assert pcss('div, div div') == ['outer-div', 'li-div', 'foobar-div'] + assert pcss('a[name]') == ['name-anchor'] + assert pcss('a[NAme]', html_only=True) == [ + 'name-anchor'] # case-insensitive in HTML: + assert pcss('a[rel]') == ['tag-anchor', 'nofollow-anchor'] + assert pcss('a[rel="tag"]') == ['tag-anchor'] + assert 
pcss('a[href*="localhost"]') == ['tag-anchor'] + assert pcss('a[href*=""]') == [] + assert pcss('a[href^="http"]') == ['tag-anchor', 'nofollow-anchor'] + assert pcss('a[href^="http:"]') == ['tag-anchor'] + assert pcss('a[href^=""]') == [] + assert pcss('a[href$="org"]') == ['nofollow-anchor'] + assert pcss('a[href$=""]') == [] + assert pcss('div[foobar~="bc"]', 'div[foobar~="cde"]') == [ + 'foobar-div'] + assert pcss('[foobar~="ab bc"]', + '[foobar~=""]', '[foobar~=" \t"]') == [] + assert pcss('div[foobar~="cd"]') == [] + assert pcss('*[lang|="En"]', '[lang|="En-us"]') == ['second-li'] + # Attribute values are case sensitive + assert pcss('*[lang|="en"]', '[lang|="en-US"]') == [] + assert pcss('*[lang|="e"]') == [] + # ... :lang() is not. + assert pcss(':lang("EN")', '*:lang(en-US)', html_only=True) == [ + 'second-li', 'li-div'] + assert pcss(':lang("e")', html_only=True) == [] + assert pcss('li:nth-child(3)') == ['third-li'] + assert pcss('li:nth-child(10)') == [] + assert pcss('li:nth-child(2n)', 'li:nth-child(even)', + 'li:nth-child(2n+0)') == [ + 'second-li', 'fourth-li', 'sixth-li'] + assert pcss('li:nth-child(+2n+1)', 'li:nth-child(odd)') == [ + 'first-li', 'third-li', 'fifth-li', 'seventh-li'] + assert pcss('li:nth-child(2n+4)') == ['fourth-li', 'sixth-li'] + # FIXME: I'm not 100% sure this is right: + assert pcss('li:nth-child(3n+1)') == [ + 'first-li', 'fourth-li', 'seventh-li'] + assert pcss('li:nth-last-child(0)') == [ + 'seventh-li'] + assert pcss('li:nth-last-child(2n)', 'li:nth-last-child(even)') == [ + 'second-li', 'fourth-li', 'sixth-li'] + assert pcss('li:nth-last-child(2n+2)') == ['second-li', 'fourth-li'] + assert pcss('ol:first-of-type') == ['first-ol'] + assert pcss('ol:nth-child(1)') == [] + assert pcss('ol:nth-of-type(2)') == ['second-ol'] + # FIXME: like above', '(1) or (2)? 
+ assert pcss('ol:nth-last-of-type(1)') == ['first-ol'] + assert pcss('span:only-child') == ['foobar-span'] + assert pcss('li div:only-child') == ['li-div'] + assert pcss('div *:only-child') == ['li-div', 'foobar-span'] + self.assertRaises(ExpressionError, pcss, 'p *:only-of-type') + assert pcss('p:only-of-type') == ['paragraph'] + assert pcss('a:empty', 'a:EMpty') == ['name-anchor'] + assert pcss('li:empty') == [ + 'third-li', 'fourth-li', 'fifth-li', 'sixth-li'] + assert pcss(':root', 'html:root') == ['html'] + assert pcss('li:root', '* :root') == [] + assert pcss('*:contains("link")', ':CONtains("link")') == [ + 'html', 'nil', 'outer-div', 'tag-anchor', 'nofollow-anchor'] + assert pcss('*:contains("LInk")') == [] # case sensitive + assert pcss('*:contains("e")') == [ + 'html', 'nil', 'outer-div', 'first-ol', 'first-li', + 'paragraph', 'p-em'] + assert pcss('*:contains("E")') == [] # case-sensitive + assert pcss('.a', '.b', '*.a', 'ol.a') == ['first-ol'] + assert pcss('.c', '*.c') == ['first-ol', 'third-li', 'fourth-li'] + assert pcss('ol *.c', 'ol li.c', 'li ~ li.c', 'ol > li.c') == [ + 'third-li', 'fourth-li'] + assert pcss('#first-li', 'li#first-li', '*#first-li') == ['first-li'] + assert pcss('li div', 'li > div', 'div div') == ['li-div'] + assert pcss('div > div') == [] + assert pcss('div>.c', 'div > .c') == ['first-ol'] + assert pcss('div + div') == ['foobar-div'] + assert pcss('a ~ a') == ['tag-anchor', 'nofollow-anchor'] + assert pcss('a[rel="tag"] ~ a') == ['nofollow-anchor'] + assert pcss('ol#first-ol li:last-child') == ['seventh-li'] + assert pcss('ol#first-ol *:last-child') == ['li-div', 'seventh-li'] + assert pcss('#outer-div:first-child') == ['outer-div'] + assert pcss('#outer-div :first-child') == [ + 'name-anchor', 'first-li', 'li-div', 'p-b', + 'checkbox-fieldset-disabled', 'area-href'] + assert pcss('a[href]') == ['tag-anchor', 'nofollow-anchor'] + assert pcss(':not(*)') == [] + assert pcss('a:not([href])') == ['name-anchor'] + assert pcss('ol 
:Not(li[class])') == [ + 'first-li', 'second-li', 'li-div', + 'fifth-li', 'sixth-li', 'seventh-li'] + # Invalid characters in XPath element names, should not crash + assert pcss(r'di\a0 v', r'div\[') == [] + assert pcss(r'[h\a0 ref]', r'[h\]ref]') == [] + + # HTML-specific + assert pcss(':link', html_only=True) == [ + 'link-href', 'tag-anchor', 'nofollow-anchor', 'area-href'] + assert pcss(':visited', html_only=True) == [] + assert pcss(':enabled', html_only=True) == [ + 'link-href', 'tag-anchor', 'nofollow-anchor', + 'checkbox-unchecked', 'text-checked', 'checkbox-checked', + 'area-href'] + assert pcss(':disabled', html_only=True) == [ + 'checkbox-disabled', 'checkbox-disabled-checked', 'fieldset', + 'checkbox-fieldset-disabled'] + assert pcss(':checked', html_only=True) == [ + 'checkbox-checked', 'checkbox-disabled-checked'] + + def test_select_shakespeare(self): + document = html.document_fromstring(HTML_SHAKESPEARE) + body = document.xpath('//body')[0] + css_to_xpath = GenericTranslator().css_to_xpath + + try: + basestring_ = basestring + except NameError: + basestring_ = (str, bytes) + + def count(selector): + xpath = css_to_xpath(selector) + results = body.xpath(xpath) + assert not isinstance(results, basestring_) + found = set() + for item in results: + assert item not in found + found.add(item) + assert not isinstance(item, basestring_) + return len(results) + + # Data borrowed from http://mootools.net/slickspeed/ + + ## Changed from original; probably because I'm only + ## searching the body. + #assert count('*') == 252 + assert count('*') == 246 + assert count('div:contains(CELIA)') == 26 + assert count('div:only-child') == 22 # ? 
+ assert count('div:nth-child(even)') == 106 + assert count('div:nth-child(2n)') == 106 + assert count('div:nth-child(odd)') == 137 + assert count('div:nth-child(2n+1)') == 137 + assert count('div:nth-child(n)') == 243 + assert count('div:last-child') == 53 + assert count('div:first-child') == 51 + assert count('div > div') == 242 + assert count('div + div') == 190 + assert count('div ~ div') == 190 + assert count('body') == 1 + assert count('body div') == 243 + assert count('div') == 243 + assert count('div div') == 242 + assert count('div div div') == 241 + assert count('div, div, div') == 243 + assert count('div, a, span') == 243 + assert count('.dialog') == 51 + assert count('div.dialog') == 51 + assert count('div .dialog') == 51 + assert count('div.character, div.dialog') == 99 + assert count('div.direction.dialog') == 0 + assert count('div.dialog.direction') == 0 + assert count('div.dialog.scene') == 1 + assert count('div.scene.scene') == 1 + assert count('div.scene .scene') == 0 + assert count('div.direction .dialog ') == 0 + assert count('div .dialog .direction') == 4 + assert count('div.dialog .dialog .direction') == 4 + assert count('#speech5') == 1 + assert count('div#speech5') == 1 + assert count('div #speech5') == 1 + assert count('div.scene div.dialog') == 49 + assert count('div#scene1 div.dialog div') == 142 + assert count('#scene1 #speech1') == 1 + assert count('div[class]') == 103 + assert count('div[class=dialog]') == 50 + assert count('div[class^=dia]') == 51 + assert count('div[class$=log]') == 50 + assert count('div[class*=sce]') == 1 + assert count('div[class|=dialog]') == 50 # ? Seems right + assert count('div[class!=madeup]') == 243 # ? Seems right + assert count('div[class~=dialog]') == 51 # ? Seems right + +XMLLANG_IDS = ''' + + a + b + c + d + e + f + + + + +''' + +HTML_IDS = ''' + + + + +
+ + + + link +
    +
  1. content
  2. +
  3. +
    +
    +
  4. +
  5. +
  6. +
  7. +
  8. +
  9. +
+

+ hi there + guy + + + + + + + +

+ + +
+

+
    +
+ + + + +
+
+ +''' + + +HTML_SHAKESPEARE = ''' + + + + + + +
+
+

As You Like It

+
+ by William Shakespeare +
+
+

ACT I, SCENE III. A room in the palace.

+
+
Enter CELIA and ROSALIND
+
+
CELIA
+
+
Why, cousin! why, Rosalind! Cupid have mercy! not a word?
+
+
ROSALIND
+
+
Not one to throw at a dog.
+
+
CELIA
+
+
No, thy words are too precious to be cast away upon
+
curs; throw some of them at me; come, lame me with reasons.
+
+
ROSALIND
+
CELIA
+
+
But is all this for your father?
+
+
+
Then there were two cousins laid up; when the one
+
should be lamed with reasons and the other mad
+
without any.
+
+
ROSALIND
+
+
No, some of it is for my child's father. O, how
+
full of briers is this working-day world!
+
+
CELIA
+
+
They are but burs, cousin, thrown upon thee in
+
holiday foolery: if we walk not in the trodden
+
paths our very petticoats will catch them.
+
+
ROSALIND
+
+
I could shake them off my coat: these burs are in my heart.
+
+
CELIA
+
+
Hem them away.
+
+
ROSALIND
+
+
I would try, if I could cry 'hem' and have him.
+
+
CELIA
+
+
Come, come, wrestle with thy affections.
+
+
ROSALIND
+
+
O, they take the part of a better wrestler than myself!
+
+
CELIA
+
+
O, a good wish upon you! you will try in time, in
+
despite of a fall. But, turning these jests out of
+
service, let us talk in good earnest: is it
+
possible, on such a sudden, you should fall into so
+
strong a liking with old Sir Rowland's youngest son?
+
+
ROSALIND
+
+
The duke my father loved his father dearly.
+
+
CELIA
+
+
Doth it therefore ensue that you should love his son
+
dearly? By this kind of chase, I should hate him,
+
for my father hated his father dearly; yet I hate
+
not Orlando.
+
+
ROSALIND
+
+
No, faith, hate him not, for my sake.
+
+
CELIA
+
+
Why should I not? doth he not deserve well?
+
+
ROSALIND
+
+
Let me love him for that, and do you love him
+
because I do. Look, here comes the duke.
+
+
CELIA
+
+
With his eyes full of anger.
+
Enter DUKE FREDERICK, with Lords
+
+
DUKE FREDERICK
+
+
Mistress, dispatch you with your safest haste
+
And get you from our court.
+
+
ROSALIND
+
+
Me, uncle?
+
+
DUKE FREDERICK
+
+
You, cousin
+
Within these ten days if that thou be'st found
+
So near our public court as twenty miles,
+
Thou diest for it.
+
+
ROSALIND
+
+
I do beseech your grace,
+
Let me the knowledge of my fault bear with me:
+
If with myself I hold intelligence
+
Or have acquaintance with mine own desires,
+
If that I do not dream or be not frantic,--
+
As I do trust I am not--then, dear uncle,
+
Never so much as in a thought unborn
+
Did I offend your highness.
+
+
DUKE FREDERICK
+
+
Thus do all traitors:
+
If their purgation did consist in words,
+
They are as innocent as grace itself:
+
Let it suffice thee that I trust thee not.
+
+
ROSALIND
+
+
Yet your mistrust cannot make me a traitor:
+
Tell me whereon the likelihood depends.
+
+
DUKE FREDERICK
+
+
Thou art thy father's daughter; there's enough.
+
+
ROSALIND
+
+
So was I when your highness took his dukedom;
+
So was I when your highness banish'd him:
+
Treason is not inherited, my lord;
+
Or, if we did derive it from our friends,
+
What's that to me? my father was no traitor:
+
Then, good my liege, mistake me not so much
+
To think my poverty is treacherous.
+
+
CELIA
+
+
Dear sovereign, hear me speak.
+
+
DUKE FREDERICK
+
+
Ay, Celia; we stay'd her for your sake,
+
Else had she with her father ranged along.
+
+
CELIA
+
+
I did not then entreat to have her stay;
+
It was your pleasure and your own remorse:
+
I was too young that time to value her;
+
But now I know her: if she be a traitor,
+
Why so am I; we still have slept together,
+
Rose at an instant, learn'd, play'd, eat together,
+
And wheresoever we went, like Juno's swans,
+
Still we went coupled and inseparable.
+
+
DUKE FREDERICK
+
+
She is too subtle for thee; and her smoothness,
+
Her very silence and her patience
+
Speak to the people, and they pity her.
+
Thou art a fool: she robs thee of thy name;
+
And thou wilt show more bright and seem more virtuous
+
When she is gone. Then open not thy lips:
+
Firm and irrevocable is my doom
+
Which I have pass'd upon her; she is banish'd.
+
+
CELIA
+
+
Pronounce that sentence then on me, my liege:
+
I cannot live out of her company.
+
+
DUKE FREDERICK
+
+
You are a fool. You, niece, provide yourself:
+
If you outstay the time, upon mine honour,
+
And in the greatness of my word, you die.
+
Exeunt DUKE FREDERICK and Lords
+
+
CELIA
+
+
O my poor Rosalind, whither wilt thou go?
+
Wilt thou change fathers? I will give thee mine.
+
I charge thee, be not thou more grieved than I am.
+
+
ROSALIND
+
+
I have more cause.
+
+
CELIA
+
+
Thou hast not, cousin;
+
Prithee be cheerful: know'st thou not, the duke
+
Hath banish'd me, his daughter?
+
+
ROSALIND
+
+
That he hath not.
+
+
CELIA
+
+
No, hath not? Rosalind lacks then the love
+
Which teacheth thee that thou and I am one:
+
Shall we be sunder'd? shall we part, sweet girl?
+
No: let my father seek another heir.
+
Therefore devise with me how we may fly,
+
Whither to go and what to bear with us;
+
And do not seek to take your change upon you,
+
To bear your griefs yourself and leave me out;
+
For, by this heaven, now at our sorrows pale,
+
Say what thou canst, I'll go along with thee.
+
+
ROSALIND
+
+
Why, whither shall we go?
+
+
CELIA
+
+
To seek my uncle in the forest of Arden.
+
+
ROSALIND
+
+
Alas, what danger will it be to us,
+
Maids as we are, to travel forth so far!
+
Beauty provoketh thieves sooner than gold.
+
+
CELIA
+
+
I'll put myself in poor and mean attire
+
And with a kind of umber smirch my face;
+
The like do you: so shall we pass along
+
And never stir assailants.
+
+
ROSALIND
+
+
Were it not better,
+
Because that I am more than common tall,
+
That I did suit me all points like a man?
+
A gallant curtle-axe upon my thigh,
+
A boar-spear in my hand; and--in my heart
+
Lie there what hidden woman's fear there will--
+
We'll have a swashing and a martial outside,
+
As many other mannish cowards have
+
That do outface it with their semblances.
+
+
CELIA
+
+
What shall I call thee when thou art a man?
+
+
ROSALIND
+
+
I'll have no worse a name than Jove's own page;
+
And therefore look you call me Ganymede.
+
But what will you be call'd?
+
+
CELIA
+
+
Something that hath a reference to my state
+
No longer Celia, but Aliena.
+
+
ROSALIND
+
+
But, cousin, what if we assay'd to steal
+
The clownish fool out of your father's court?
+
Would he not be a comfort to our travel?
+
+
CELIA
+
+
He'll go along o'er the wide world with me;
+
Leave me alone to woo him. Let's away,
+
And get our jewels and our wealth together,
+
Devise the fittest time and safest way
+
To hide us from pursuit that will be made
+
After my flight. Now go we in content
+
To liberty and not to banishment.
+
Exeunt
+
+
+
+
+ + +''' + + +if __name__ == '__main__': + unittest.main() diff --git a/cssselect/xpath.py b/cssselect/xpath.py index 96eac3f..e5e74b9 100644 --- a/cssselect/xpath.py +++ b/cssselect/xpath.py @@ -1,46 +1,35 @@ +# coding: utf8 """ -cssselect.xpath -=============== + cssselect.xpath + =============== -Translation of parsed CSS selectors to XPath expressions. + Translation of parsed CSS selectors to XPath expressions. -:copyright: (c) 2007-2012 Ian Bicking and contributors. -See AUTHORS for more details. -:license: BSD, see LICENSE for more details. + :copyright: (c) 2007-2012 Ian Bicking and contributors. + See AUTHORS for more details. + :license: BSD, see LICENSE for more details. """ -from __future__ import annotations - +import sys import re -from typing import TYPE_CHECKING, cast - -from cssselect.parser import ( - Attrib, - Class, - CombinedSelector, - Element, - Function, - Hash, - Matching, - Negation, - Pseudo, - PseudoElement, - Relation, - Selector, - SelectorError, - SpecificityAdjustment, - Tree, - parse, - parse_series, -) - -if TYPE_CHECKING: - from collections.abc import Callable - - # typing.Self requires Python 3.11 - from typing_extensions import Self + +from cssselect.parser import parse, parse_series, SelectorError + + +if sys.version_info[0] < 3: + _basestring = basestring + _unicode = unicode +else: + _basestring = str + _unicode = str + + +def _unicode_safe_getattr(obj, name, default=None): + # getattr() with a non-ASCII name fails on Python 2.x + name = name.encode('ascii', 'replace').decode('ascii') + return getattr(obj, name, default) class ExpressionError(SelectorError, RuntimeError): @@ -49,72 +38,52 @@ class ExpressionError(SelectorError, RuntimeError): #### XPath Helpers +class XPathExpr(object): -class XPathExpr: - def __init__( - self, - path: str = "", - element: str = "*", - condition: str = "", - star_prefix: bool = False, - ) -> None: + def __init__(self, path='', element='*', condition='', star_prefix=False): self.path = path 
self.element = element self.condition = condition - def __str__(self) -> str: - path = str(self.path) + str(self.element) + def __str__(self): + path = _unicode(self.path) + _unicode(self.element) if self.condition: - path += f"[{self.condition}]" + path += '[%s]' % self.condition return path - def __repr__(self) -> str: - return f"{self.__class__.__name__}[{self}]" + def __repr__(self): + return '%s[%s]' % (self.__class__.__name__, self) - def add_condition(self, condition: str, conjuction: str = "and") -> Self: + def add_condition(self, condition): if self.condition: - self.condition = f"({self.condition}) {conjuction} ({condition})" + self.condition = '%s and (%s)' % (self.condition, condition) else: self.condition = condition return self - def add_name_test(self) -> None: - if self.element == "*": + def add_name_test(self): + if self.element == '*': # We weren't doing a test anyway return - self.add_condition(f"name() = {GenericTranslator.xpath_literal(self.element)}") - self.element = "*" + self.add_condition( + "name() = %s" % GenericTranslator.xpath_literal(self.element)) + self.element = '*' - def add_star_prefix(self) -> None: + def add_star_prefix(self): """ Append '*/' to the path to keep the context constrained to a single parent. """ - self.path += "*/" - - def join( - self, - combiner: str, - other: XPathExpr, - closing_combiner: str | None = None, - has_inner_condition: bool = False, - ) -> Self: - path = str(self) + combiner + self.path += '*/' + + def join(self, combiner, other): + path = _unicode(self) + combiner # Any "star prefix" is redundant when joining. 
- if other.path != "*/": + if other.path != '*/': path += other.path self.path = path - if not has_inner_condition: - self.element = ( - other.element + closing_combiner if closing_combiner else other.element - ) - self.condition = other.condition - else: - self.element = other.element - if other.condition: - self.element += "[" + other.condition + "]" - if closing_combiner: - self.element += closing_combiner + self.element = other.element + self.condition = other.condition return self @@ -123,16 +92,15 @@ def join( # The spec is actually more permissive than that, but don’t bother. # This is just for the fast path. # http://www.w3.org/TR/REC-xml/#NT-NameStartChar -is_safe_name = re.compile("^[a-zA-Z_][a-zA-Z0-9_.-]*$").match +is_safe_name = re.compile('^[a-zA-Z_][a-zA-Z0-9_.-]*$').match # Test that the string is not empty and does not contain whitespace -is_non_whitespace = re.compile(r"^[^ \t\r\n\f]+$").match +is_non_whitespace = re.compile(r'^[^ \t\r\n\f]+$').match #### Translation - -class GenericTranslator: +class GenericTranslator(object): """ Translator for "generic" XML documents. @@ -140,44 +108,44 @@ class GenericTranslator: of element names and attribute names. """ - + #### #### HERE BE DRAGONS #### #### You are welcome to hook into this to change some behavior, #### but do so at your own risks. - #### Until it has received a lot more work and review, + #### Until is has recieved a lot more work and review, #### I reserve the right to change this API in backward-incompatible ways #### with any minor version of cssselect. - #### See https://github.com/scrapy/cssselect/pull/22 + #### See https://github.com/SimonSapin/cssselect/pull/22 #### -- Simon Sapin. 
#### combinator_mapping = { - " ": "descendant", - ">": "child", - "+": "direct_adjacent", - "~": "indirect_adjacent", + ' ': 'descendant', + '>': 'child', + '+': 'direct_adjacent', + '~': 'indirect_adjacent', } attribute_operator_mapping = { - "exists": "exists", - "=": "equals", - "~=": "includes", - "|=": "dashmatch", - "^=": "prefixmatch", - "$=": "suffixmatch", - "*=": "substringmatch", - "!=": "different", # XXX Not in Level 3 but meh + 'exists': 'exists', + '=': 'equals', + '~=': 'includes', + '|=': 'dashmatch', + '^=': 'prefixmatch', + '$=': 'suffixmatch', + '*=': 'substringmatch', + '!=': 'different', # XXX Not in Level 3 but meh } #: The attribute used for ID selectors depends on the document language: #: http://www.w3.org/TR/selectors/#id-selectors - id_attribute = "id" + id_attribute = 'id' #: The attribute used for ``:lang()`` depends on the document language: #: http://www.w3.org/TR/selectors/#lang-pseudo - lang_attribute = "xml:lang" + lang_attribute = 'xml:lang' #: The case sensitivity of document language element names, #: attribute names, and attribute values in selectors depends @@ -200,36 +168,31 @@ class GenericTranslator: # class used to represent and xpath expression xpathexpr_cls = XPathExpr - def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str: + def css_to_xpath(self, css, prefix='descendant-or-self::'): """Translate a *group of selectors* to XPath. Pseudo-elements are not supported here since XPath only knows about "real" elements. :param css: - A *group of selectors* as a string. + A *group of selectors* as an Unicode string. :param prefix: This string is prepended to the XPath expression for each selector. The default makes selectors scoped to the context node’s subtree. :raises: - :class:`~cssselect.SelectorSyntaxError` on invalid selectors, + :class:`SelectorSyntaxError` on invalid selectors, :class:`ExpressionError` on unknown/unsupported selectors, including pseudo-elements. 
:returns: - The equivalent XPath 1.0 expression as a string. + The equivalent XPath 1.0 expression as an Unicode string. """ - return " | ".join( - self.selector_to_xpath(selector, prefix, translate_pseudo_elements=True) - for selector in parse(css) - ) + return ' | '.join(self.selector_to_xpath(selector, prefix, + translate_pseudo_elements=True) + for selector in parse(css)) - def selector_to_xpath( - self, - selector: Selector, - prefix: str = "descendant-or-self::", - translate_pseudo_elements: bool = False, - ) -> str: + def selector_to_xpath(self, selector, prefix='descendant-or-self::', + translate_pseudo_elements=False): """Translate a parsed selector to XPath. @@ -247,458 +210,289 @@ def selector_to_xpath( :raises: :class:`ExpressionError` on unknown/unsupported selectors. :returns: - The equivalent XPath 1.0 expression as a string. + The equivalent XPath 1.0 expression as an Unicode string. """ - tree = getattr(selector, "parsed_tree", None) + tree = getattr(selector, 'parsed_tree', None) if not tree: - raise TypeError(f"Expected a parsed selector, got {selector!r}") + raise TypeError('Expected a parsed selector, got %r' % (selector,)) xpath = self.xpath(tree) assert isinstance(xpath, self.xpathexpr_cls) # help debug a missing 'return' if translate_pseudo_elements and selector.pseudo_element: xpath = self.xpath_pseudo_element(xpath, selector.pseudo_element) - return (prefix or "") + str(xpath) + return (prefix or '') + _unicode(xpath) - def xpath_pseudo_element( - self, xpath: XPathExpr, pseudo_element: PseudoElement - ) -> XPathExpr: + def xpath_pseudo_element(self, xpath, pseudo_element): """Translate a pseudo-element. Defaults to not supporting pseudo-elements at all, but can be overridden by sub-classes. 
""" - raise ExpressionError("Pseudo-elements are not supported.") + raise ExpressionError('Pseudo-elements are not supported.') @staticmethod - def xpath_literal(s: str) -> str: - s = str(s) + def xpath_literal(s): + s = _unicode(s) if "'" not in s: - s = f"'{s}'" + s = "'%s'" % s elif '"' not in s: - s = f'"{s}"' + s = '"%s"' % s else: - parts_quoted = [ - f'"{part}"' if "'" in part else f"'{part}'" - for part in split_at_single_quotes(s) - if part - ] - s = "concat({})".format(",".join(parts_quoted)) + s = "concat(%s)" % ','.join([ + (("'" in part) and '"%s"' or "'%s'") % part + for part in split_at_single_quotes(s) if part + ]) return s - def xpath(self, parsed_selector: Tree) -> XPathExpr: + def xpath(self, parsed_selector): """Translate any parsed selector object.""" type_name = type(parsed_selector).__name__ - method = cast( - "Callable[[Tree], XPathExpr] | None", - getattr(self, f"xpath_{type_name.lower()}", None), - ) + method = getattr(self, 'xpath_%s' % type_name.lower(), None) if method is None: - raise ExpressionError(f"{type_name} is not supported.") + raise ExpressionError('%s is not supported.' 
% type_name) return method(parsed_selector) + # Dispatched by parsed object type - def xpath_combinedselector(self, combined: CombinedSelector) -> XPathExpr: + def xpath_combinedselector(self, combined): """Translate a combined selector.""" combinator = self.combinator_mapping[combined.combinator] - method = cast( - "Callable[[XPathExpr, XPathExpr], XPathExpr]", - getattr(self, f"xpath_{combinator}_combinator"), - ) - return method(self.xpath(combined.selector), self.xpath(combined.subselector)) + method = getattr(self, 'xpath_%s_combinator' % combinator) + return method(self.xpath(combined.selector), + self.xpath(combined.subselector)) - def xpath_negation(self, negation: Negation) -> XPathExpr: + def xpath_negation(self, negation): xpath = self.xpath(negation.selector) sub_xpath = self.xpath(negation.subselector) sub_xpath.add_name_test() if sub_xpath.condition: - return xpath.add_condition(f"not({sub_xpath.condition})") - return xpath.add_condition("0") - - def xpath_relation(self, relation: Relation) -> XPathExpr: - xpath = self.xpath(relation.selector) - combinator = relation.combinator - subselector = relation.subselector - right = self.xpath(subselector.parsed_tree) - method = cast( - "Callable[[XPathExpr, XPathExpr], XPathExpr]", - getattr( - self, - f"xpath_relation_{self.combinator_mapping[cast('str', combinator.value)]}_combinator", - ), - ) - return method(xpath, right) - - def xpath_matching(self, matching: Matching) -> XPathExpr: - xpath = self.xpath(matching.selector) - exprs = [self.xpath(selector) for selector in matching.selector_list] - for e in exprs: - e.add_name_test() - if e.condition: - xpath.add_condition(e.condition, "or") - return xpath - - def xpath_specificityadjustment(self, matching: SpecificityAdjustment) -> XPathExpr: - xpath = self.xpath(matching.selector) - exprs = [self.xpath(selector) for selector in matching.selector_list] - for e in exprs: - e.add_name_test() - if e.condition: - xpath.add_condition(e.condition, "or") - return 
xpath + return xpath.add_condition('not(%s)' % sub_xpath.condition) + else: + return xpath.add_condition('0') - def xpath_function(self, function: Function) -> XPathExpr: + def xpath_function(self, function): """Translate a functional pseudo-class.""" - method_name = "xpath_{}_function".format(function.name.replace("-", "_")) - method = cast( - "Callable[[XPathExpr, Function], XPathExpr] | None", - getattr(self, method_name, None), - ) + method = 'xpath_%s_function' % function.name.replace('-', '_') + method = _unicode_safe_getattr(self, method, None) if not method: - raise ExpressionError(f"The pseudo-class :{function.name}() is unknown") + raise ExpressionError( + "The pseudo-class :%s() is unknown" % function.name) return method(self.xpath(function.selector), function) - def xpath_pseudo(self, pseudo: Pseudo) -> XPathExpr: + def xpath_pseudo(self, pseudo): """Translate a pseudo-class.""" - method_name = "xpath_{}_pseudo".format(pseudo.ident.replace("-", "_")) - method = cast( - "Callable[[XPathExpr], XPathExpr] | None", - getattr(self, method_name, None), - ) + method = 'xpath_%s_pseudo' % pseudo.ident.replace('-', '_') + method = _unicode_safe_getattr(self, method, None) if not method: # TODO: better error message for pseudo-elements? 
- raise ExpressionError(f"The pseudo-class :{pseudo.ident} is unknown") + raise ExpressionError( + "The pseudo-class :%s is unknown" % pseudo.ident) return method(self.xpath(pseudo.selector)) - def xpath_attrib(self, selector: Attrib) -> XPathExpr: + + def xpath_attrib(self, selector): """Translate an attribute selector.""" operator = self.attribute_operator_mapping[selector.operator] - method = cast( - "Callable[[XPathExpr, str, str | None], XPathExpr]", - getattr(self, f"xpath_attrib_{operator}"), - ) + method = getattr(self, 'xpath_attrib_%s' % operator) if self.lower_case_attribute_names: name = selector.attrib.lower() else: name = selector.attrib safe = is_safe_name(name) if selector.namespace: - name = f"{selector.namespace}:{name}" + name = '%s:%s' % (selector.namespace, name) safe = safe and is_safe_name(selector.namespace) if safe: - attrib = "@" + name + attrib = '@' + name else: - attrib = f"attribute::*[name() = {self.xpath_literal(name)}]" - if selector.value is None: - value = None - elif self.lower_case_attribute_values: - value = cast("str", selector.value.value).lower() + attrib = 'attribute::*[name() = %s]' % self.xpath_literal(name) + if self.lower_case_attribute_values: + value = selector.value.lower() else: - value = selector.value.value + value = selector.value return method(self.xpath(selector.selector), attrib, value) - def xpath_class(self, class_selector: Class) -> XPathExpr: + def xpath_class(self, class_selector): """Translate a class selector.""" # .foo is defined as [class~=foo] in the spec. 
xpath = self.xpath(class_selector.selector) - return self.xpath_attrib_includes(xpath, "@class", class_selector.class_name) + return self.xpath_attrib_includes( + xpath, '@class', class_selector.class_name) - def xpath_hash(self, id_selector: Hash) -> XPathExpr: + def xpath_hash(self, id_selector): """Translate an ID selector.""" xpath = self.xpath(id_selector.selector) - return self.xpath_attrib_equals(xpath, "@id", id_selector.id) + return self.xpath_attrib_equals(xpath, '@id', id_selector.id) - def xpath_element(self, selector: Element) -> XPathExpr: + def xpath_element(self, selector): """Translate a type or universal selector.""" element = selector.element if not element: - element = "*" + element = '*' safe = True else: - safe = bool(is_safe_name(element)) + safe = is_safe_name(element) if self.lower_case_element_names: element = element.lower() if selector.namespace: # Namespace prefixes are case-sensitive. # http://www.w3.org/TR/css3-namespace/#prefixes - element = f"{selector.namespace}:{element}" - safe = safe and bool(is_safe_name(selector.namespace)) + element = '%s:%s' % (selector.namespace, element) + safe = safe and is_safe_name(selector.namespace) xpath = self.xpathexpr_cls(element=element) if not safe: xpath.add_name_test() return xpath + # CombinedSelector: dispatch by combinator - def xpath_descendant_combinator( - self, left: XPathExpr, right: XPathExpr - ) -> XPathExpr: + def xpath_descendant_combinator(self, left, right): """right is a child, grand-child or further descendant of left""" - return left.join("/descendant-or-self::*/", right) + return left.join('/descendant-or-self::*/', right) - def xpath_child_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr: + def xpath_child_combinator(self, left, right): """right is an immediate child of left""" - return left.join("/", right) + return left.join('/', right) - def xpath_direct_adjacent_combinator( - self, left: XPathExpr, right: XPathExpr - ) -> XPathExpr: + def 
xpath_direct_adjacent_combinator(self, left, right): """right is a sibling immediately after left""" - xpath = left.join("/following-sibling::", right) + xpath = left.join('/following-sibling::', right) xpath.add_name_test() - return xpath.add_condition("position() = 1") + return xpath.add_condition('position() = 1') - def xpath_indirect_adjacent_combinator( - self, left: XPathExpr, right: XPathExpr - ) -> XPathExpr: + def xpath_indirect_adjacent_combinator(self, left, right): """right is a sibling after left, immediately or not""" - return left.join("/following-sibling::", right) - - def xpath_relation_descendant_combinator( - self, left: XPathExpr, right: XPathExpr - ) -> XPathExpr: - """right is a child, grand-child or further descendant of left; select left""" - return left.join( - "[descendant::", right, closing_combiner="]", has_inner_condition=True - ) - - def xpath_relation_child_combinator( - self, left: XPathExpr, right: XPathExpr - ) -> XPathExpr: - """right is an immediate child of left; select left""" - return left.join("[./", right, closing_combiner="]") - - def xpath_relation_direct_adjacent_combinator( - self, left: XPathExpr, right: XPathExpr - ) -> XPathExpr: - """right is a sibling immediately after left; select left""" - return left.add_condition( - f"following-sibling::*[(name() = '{right.element}') and (position() = 1)]" - ) + return left.join('/following-sibling::', right) - def xpath_relation_indirect_adjacent_combinator( - self, left: XPathExpr, right: XPathExpr - ) -> XPathExpr: - """right is a sibling after left, immediately or not; select left""" - return left.join("[following-sibling::", right, closing_combiner="]") # Function: dispatch by function/pseudo-class name - def xpath_nth_child_function( - self, - xpath: XPathExpr, - function: Function, - last: bool = False, - add_name_test: bool = True, - ) -> XPathExpr: + def xpath_nth_child_function(self, xpath, function, last=False, + add_name_test=True): try: a, b = 
parse_series(function.arguments) - except ValueError as ex: - raise ExpressionError(f"Invalid series: '{function.arguments!r}'") from ex - - # From https://www.w3.org/TR/css3-selectors/#structural-pseudos: - # - # :nth-child(an+b) - # an+b-1 siblings before - # - # :nth-last-child(an+b) - # an+b-1 siblings after - # - # :nth-of-type(an+b) - # an+b-1 siblings with the same expanded element name before - # - # :nth-last-of-type(an+b) - # an+b-1 siblings with the same expanded element name after - # - # So, - # for :nth-child and :nth-of-type - # - # count(preceding-sibling::) = an+b-1 - # - # for :nth-last-child and :nth-last-of-type - # - # count(following-sibling::) = an+b-1 - # - # therefore, - # count(...) - (b-1) ≡ 0 (mod a) - # - # if a == 0: - # ~~~~~~~~~~ - # count(...) = b-1 - # - # if a < 0: - # ~~~~~~~~~ - # count(...) - b +1 <= 0 - # -> count(...) <= b-1 - # - # if a > 0: - # ~~~~~~~~~ - # count(...) - b +1 >= 0 - # -> count(...) >= b-1 - - # work with b-1 instead - b_min_1 = b - 1 - - # early-exit condition 1: - # ~~~~~~~~~~~~~~~~~~~~~~~ - # for a == 1, nth-*(an+b) means n+b-1 siblings before/after, - # and since n ∈ {0, 1, 2, ...}, if b-1<=0, - # there is always an "n" matching any number of siblings (maybe none) - if a == 1 and b_min_1 <= 0: - return xpath - - # early-exit condition 2: - # ~~~~~~~~~~~~~~~~~~~~~~~ - # an+b-1 siblings with a<0 and (b-1)<0 is not possible - if a < 0 and b_min_1 < 0: - return xpath.add_condition("0") - - # `add_name_test` boolean is inverted and somewhat counter-intuitive: - # - # nth_of_type() calls nth_child(add_name_test=False) - nodetest = "*" if add_name_test else f"{xpath.element}" - - # count siblings before or after the element - if not last: - siblings_count = f"count(preceding-sibling::{nodetest})" - else: - siblings_count = f"count(following-sibling::{nodetest})" - - # special case of fixed position: nth-*(0n+b) - # if a == 0: - # ~~~~~~~~~~ - # count(***-sibling::***) = b-1 + except ValueError: + raise 
ExpressionError("Invalid series: '%r'" % function.arguments) + if add_name_test: + xpath.add_name_test() + xpath.add_star_prefix() if a == 0: - return xpath.add_condition(f"{siblings_count} = {b_min_1}") - - expressions = [] - - if a > 0: - # siblings count, an+b-1, is always >= 0, - # so if a>0, and (b-1)<=0, an "n" exists to satisfy this, - # therefore, the predicate is only interesting if (b-1)>0 - if b_min_1 > 0: - expressions.append(f"{siblings_count} >= {b_min_1}") + if last: + b = 'last() - %s' % b + return xpath.add_condition('position() = %s' % b) + if last: + # FIXME: I'm not sure if this is right + a = -a + b = -b + if b > 0: + b_neg = str(-b) else: - # if a<0, and (b-1)<0, no "n" satisfies this, - # this is tested above as an early exist condition - # otherwise, - expressions.append(f"{siblings_count} <= {b_min_1}") - - # operations modulo 1 or -1 are simpler, one only needs to verify: - # - # - either: - # count(***-sibling::***) - (b-1) = n = 0, 1, 2, 3, etc., - # i.e. count(***-sibling::***) >= (b-1) - # - # - or: - # count(***-sibling::***) - (b-1) = -n = 0, -1, -2, -3, etc., - # i.e. count(***-sibling::***) <= (b-1) - # we we just did above. - # - if abs(a) != 1: - # count(***-sibling::***) - (b-1) ≡ 0 (mod a) - left = siblings_count - - # apply "modulo a" on 2nd term, -(b-1), - # to simplify things like "(... 
+6) % -3", - # and also make it positive with |a| - b_neg = (-b_min_1) % abs(a) - - if b_neg != 0: - left = f"({left} +{b_neg})" - - expressions.append(f"{left} mod {a} = 0") - - template = "(%s)" if len(expressions) > 1 else "%s" - xpath.add_condition( - " and ".join(template % expression for expression in expressions) - ) + b_neg = '+%s' % (-b) + if a != 1: + expr = ['(position() %s) mod %s = 0' % (b_neg, a)] + else: + expr = [] + if b >= 0: + expr.append('position() >= %s' % b) + elif b < 0 and last: + expr.append('position() < (last() %s)' % b) + expr = ' and '.join(expr) + if expr: + xpath.add_condition(expr) return xpath - - def xpath_nth_last_child_function( - self, xpath: XPathExpr, function: Function - ) -> XPathExpr: + # FIXME: handle an+b, odd, even + # an+b means every-a, plus b, e.g., 2n+1 means odd + # 0n+b means b + # n+0 means a=1, i.e., all elements + # an means every a elements, i.e., 2n means even + # -n means -1n + # -1n+6 means elements 6 and previous + + def xpath_nth_last_child_function(self, xpath, function): return self.xpath_nth_child_function(xpath, function, last=True) - def xpath_nth_of_type_function( - self, xpath: XPathExpr, function: Function - ) -> XPathExpr: - if xpath.element == "*": - raise ExpressionError("*:nth-of-type() is not implemented") - return self.xpath_nth_child_function(xpath, function, add_name_test=False) - - def xpath_nth_last_of_type_function( - self, xpath: XPathExpr, function: Function - ) -> XPathExpr: - if xpath.element == "*": - raise ExpressionError("*:nth-of-type() is not implemented") - return self.xpath_nth_child_function( - xpath, function, last=True, add_name_test=False - ) + def xpath_nth_of_type_function(self, xpath, function): + if xpath.element == '*': + raise ExpressionError( + "*:nth-of-type() is not implemented") + return self.xpath_nth_child_function(xpath, function, + add_name_test=False) + + def xpath_nth_last_of_type_function(self, xpath, function): + if xpath.element == '*': + raise 
ExpressionError( + "*:nth-of-type() is not implemented") + return self.xpath_nth_child_function(xpath, function, last=True, + add_name_test=False) - def xpath_contains_function( - self, xpath: XPathExpr, function: Function - ) -> XPathExpr: + def xpath_contains_function(self, xpath, function): # Defined there, removed in later drafts: # http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors - if function.argument_types() not in (["STRING"], ["IDENT"]): + if function.argument_types() not in (['STRING'], ['IDENT']): raise ExpressionError( - f"Expected a single string or ident for :contains(), got {function.arguments!r}" - ) - value = cast("str", function.arguments[0].value) - return xpath.add_condition(f"contains(., {self.xpath_literal(value)})") + "Expected a single string or ident for :contains(), got %r" + % function.arguments) + value = function.arguments[0].value + return xpath.add_condition( + 'contains(., %s)' % self.xpath_literal(value)) - def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr: - if function.argument_types() not in (["STRING"], ["IDENT"]): + def xpath_lang_function(self, xpath, function): + if function.argument_types() not in (['STRING'], ['IDENT']): raise ExpressionError( - f"Expected a single string or ident for :lang(), got {function.arguments!r}" - ) - value = cast("str", function.arguments[0].value) - return xpath.add_condition(f"lang({self.xpath_literal(value)})") + "Expected a single string or ident for :lang(), got %r" + % function.arguments) + value = function.arguments[0].value + return xpath.add_condition( + "lang(%s)" % (self.xpath_literal(value))) + # Pseudo: dispatch by pseudo-class name - def xpath_root_pseudo(self, xpath: XPathExpr) -> XPathExpr: + def xpath_root_pseudo(self, xpath): return xpath.add_condition("not(parent::*)") - # CSS immediate children (CSS ":scope > div" to XPath "child::div" or "./div") - # Works only at the start of a selector - # Needed to get immediate children 
of a processed selector in Scrapy - # for product in response.css('.product'): - # description = product.css(':scope > div::text').get() - def xpath_scope_pseudo(self, xpath: XPathExpr) -> XPathExpr: - return xpath.add_condition("1") - - def xpath_first_child_pseudo(self, xpath: XPathExpr) -> XPathExpr: - return xpath.add_condition("count(preceding-sibling::*) = 0") + def xpath_first_child_pseudo(self, xpath): + xpath.add_star_prefix() + xpath.add_name_test() + return xpath.add_condition('position() = 1') - def xpath_last_child_pseudo(self, xpath: XPathExpr) -> XPathExpr: - return xpath.add_condition("count(following-sibling::*) = 0") + def xpath_last_child_pseudo(self, xpath): + xpath.add_star_prefix() + xpath.add_name_test() + return xpath.add_condition('position() = last()') - def xpath_first_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr: - if xpath.element == "*": - raise ExpressionError("*:first-of-type is not implemented") - return xpath.add_condition(f"count(preceding-sibling::{xpath.element}) = 0") + def xpath_first_of_type_pseudo(self, xpath): + if xpath.element == '*': + raise ExpressionError( + "*:first-of-type is not implemented") + xpath.add_star_prefix() + return xpath.add_condition('position() = 1') - def xpath_last_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr: - if xpath.element == "*": - raise ExpressionError("*:last-of-type is not implemented") - return xpath.add_condition(f"count(following-sibling::{xpath.element}) = 0") + def xpath_last_of_type_pseudo(self, xpath): + if xpath.element == '*': + raise ExpressionError( + "*:last-of-type is not implemented") + xpath.add_star_prefix() + return xpath.add_condition('position() = last()') - def xpath_only_child_pseudo(self, xpath: XPathExpr) -> XPathExpr: - return xpath.add_condition("count(parent::*/child::*) = 1") + def xpath_only_child_pseudo(self, xpath): + xpath.add_name_test() + xpath.add_star_prefix() + return xpath.add_condition('last() = 1') - def xpath_only_of_type_pseudo(self, 
xpath: XPathExpr) -> XPathExpr: - if xpath.element == "*": - raise ExpressionError("*:only-of-type is not implemented") - return xpath.add_condition(f"count(parent::*/child::{xpath.element}) = 1") + def xpath_only_of_type_pseudo(self, xpath): + if xpath.element == '*': + raise ExpressionError( + "*:only-of-type is not implemented") + return xpath.add_condition('last() = 1') - def xpath_empty_pseudo(self, xpath: XPathExpr) -> XPathExpr: + def xpath_empty_pseudo(self, xpath): return xpath.add_condition("not(*) and not(string-length())") - def pseudo_never_matches(self, xpath: XPathExpr) -> XPathExpr: + def pseudo_never_matches(self, xpath): """Common implementation for pseudo-classes that never match.""" return xpath.add_condition("0") @@ -714,88 +508,67 @@ def pseudo_never_matches(self, xpath: XPathExpr) -> XPathExpr: # Attrib: dispatch by attribute operator - def xpath_attrib_exists( - self, xpath: XPathExpr, name: str, value: str | None - ) -> XPathExpr: + def xpath_attrib_exists(self, xpath, name, value): assert not value xpath.add_condition(name) return xpath - def xpath_attrib_equals( - self, xpath: XPathExpr, name: str, value: str | None - ) -> XPathExpr: - assert value is not None - xpath.add_condition(f"{name} = {self.xpath_literal(value)}") + def xpath_attrib_equals(self, xpath, name, value): + xpath.add_condition('%s = %s' % (name, self.xpath_literal(value))) return xpath - def xpath_attrib_different( - self, xpath: XPathExpr, name: str, value: str | None - ) -> XPathExpr: - assert value is not None + def xpath_attrib_different(self, xpath, name, value): # FIXME: this seems like a weird hack... 
if value: - xpath.add_condition(f"not({name}) or {name} != {self.xpath_literal(value)}") + xpath.add_condition('not(%s) or %s != %s' + % (name, name, self.xpath_literal(value))) else: - xpath.add_condition(f"{name} != {self.xpath_literal(value)}") + xpath.add_condition('%s != %s' + % (name, self.xpath_literal(value))) return xpath - def xpath_attrib_includes( - self, xpath: XPathExpr, name: str, value: str | None - ) -> XPathExpr: - if value and is_non_whitespace(value): - arg = self.xpath_literal(" " + value + " ") + def xpath_attrib_includes(self, xpath, name, value): + if is_non_whitespace(value): xpath.add_condition( - f"{name} and contains(concat(' ', normalize-space({name}), ' '), {arg})" - ) + "%s and contains(concat(' ', normalize-space(%s), ' '), %s)" + % (name, name, self.xpath_literal(' '+value+' '))) else: - xpath.add_condition("0") + xpath.add_condition('0') return xpath - def xpath_attrib_dashmatch( - self, xpath: XPathExpr, name: str, value: str | None - ) -> XPathExpr: - assert value is not None - arg = self.xpath_literal(value) - arg_dash = self.xpath_literal(value + "-") + def xpath_attrib_dashmatch(self, xpath, name, value): # Weird, but true... 
- xpath.add_condition( - f"{name} and ({name} = {arg} or starts-with({name}, {arg_dash}))" - ) + xpath.add_condition('%s and (%s = %s or starts-with(%s, %s))' % ( + name, + name, self.xpath_literal(value), + name, self.xpath_literal(value + '-'))) return xpath - def xpath_attrib_prefixmatch( - self, xpath: XPathExpr, name: str, value: str | None - ) -> XPathExpr: + def xpath_attrib_prefixmatch(self, xpath, name, value): if value: - xpath.add_condition( - f"{name} and starts-with({name}, {self.xpath_literal(value)})" - ) + xpath.add_condition('%s and starts-with(%s, %s)' % ( + name, name, self.xpath_literal(value))) else: - xpath.add_condition("0") + xpath.add_condition('0') return xpath - def xpath_attrib_suffixmatch( - self, xpath: XPathExpr, name: str, value: str | None - ) -> XPathExpr: + def xpath_attrib_suffixmatch(self, xpath, name, value): if value: # Oddly there is a starts-with in XPath 1.0, but not ends-with xpath.add_condition( - f"{name} and substring({name}, string-length({name})-{len(value) - 1}) = {self.xpath_literal(value)}" - ) + '%s and substring(%s, string-length(%s)-%s) = %s' + % (name, name, name, len(value)-1, self.xpath_literal(value))) else: - xpath.add_condition("0") + xpath.add_condition('0') return xpath - def xpath_attrib_substringmatch( - self, xpath: XPathExpr, name: str, value: str | None - ) -> XPathExpr: + def xpath_attrib_substringmatch(self, xpath, name, value): if value: # Attribute selectors are case sensitive - xpath.add_condition( - f"{name} and contains({name}, {self.xpath_literal(value)})" - ) + xpath.add_condition('%s and contains(%s, %s)' % ( + name, name, self.xpath_literal(value))) else: - xpath.add_condition("0") + xpath.add_condition('0') return xpath @@ -816,52 +589,47 @@ class HTMLTranslator(GenericTranslator): """ - lang_attribute = "lang" + lang_attribute = 'lang' - def __init__(self, xhtml: bool = False) -> None: + def __init__(self, xhtml=False): self.xhtml = xhtml # Might be useful for sub-classes? 
if not xhtml: # See their definition in GenericTranslator. self.lower_case_element_names = True self.lower_case_attribute_names = True - def xpath_checked_pseudo(self, xpath: XPathExpr) -> XPathExpr: + def xpath_checked_pseudo(self, xpath): # FIXME: is this really all the elements? return xpath.add_condition( "(@selected and name(.) = 'option') or " "(@checked " - "and (name(.) = 'input' or name(.) = 'command')" - "and (@type = 'checkbox' or @type = 'radio'))" - ) + "and (name(.) = 'input' or name(.) = 'command')" + "and (@type = 'checkbox' or @type = 'radio'))") - def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr: - if function.argument_types() not in (["STRING"], ["IDENT"]): + def xpath_lang_function(self, xpath, function): + if function.argument_types() not in (['STRING'], ['IDENT']): raise ExpressionError( - f"Expected a single string or ident for :lang(), got {function.arguments!r}" - ) + "Expected a single string or ident for :lang(), got %r" + % function.arguments) value = function.arguments[0].value - assert value - arg = self.xpath_literal(value.lower() + "-") return xpath.add_condition( "ancestor-or-self::*[@lang][1][starts-with(concat(" - # XPath 1.0 has no lower-case function... - f"translate(@{self.lang_attribute}, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', " - "'abcdefghijklmnopqrstuvwxyz'), " - f"'-'), {arg})]" - ) + # XPath 1.0 has no lower-case function... + "translate(@%s, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', " + "'abcdefghijklmnopqrstuvwxyz'), " + "'-'), %s)]" + % (self.lang_attribute, self.xpath_literal(value.lower() + '-'))) - def xpath_link_pseudo(self, xpath: XPathExpr) -> XPathExpr: - return xpath.add_condition( - "@href and (name(.) = 'a' or name(.) = 'link' or name(.) = 'area')" - ) + def xpath_link_pseudo(self, xpath): + return xpath.add_condition("@href and " + "(name(.) = 'a' or name(.) = 'link' or name(.) 
= 'area')") # Links are never visited, the implementation for :visited is the same # as in GenericTranslator - def xpath_disabled_pseudo(self, xpath: XPathExpr) -> XPathExpr: + def xpath_disabled_pseudo(self, xpath): # http://www.w3.org/TR/html5/section-index.html#attributes-1 - return xpath.add_condition( - """ + return xpath.add_condition(''' ( @disabled and ( @@ -883,15 +651,13 @@ def xpath_disabled_pseudo(self, xpath: XPathExpr) -> XPathExpr: ) and ancestor::fieldset[@disabled] ) - """ - ) + ''') # FIXME: in the second half, add "and is not a descendant of that # fieldset element's first legend element child, if any." - def xpath_enabled_pseudo(self, xpath: XPathExpr) -> XPathExpr: + def xpath_enabled_pseudo(self, xpath): # http://www.w3.org/TR/html5/section-index.html#attributes-1 - return xpath.add_condition( - """ + return xpath.add_condition(''' ( @href and ( name(.) = 'a' or @@ -919,8 +685,7 @@ def xpath_enabled_pseudo(self, xpath: XPathExpr) -> XPathExpr: @disabled or ancestor::optgroup[@disabled] ) ) - """ - ) + ''') # FIXME: ... or "li elements that are children of menu elements, # and that have a child element that defines a command, if the first # such element's Disabled State facet is false (not disabled)". diff --git a/docs/conf.py b/docs/conf.py index da3f023..22e6032 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +# -*- coding: utf-8 -*- # # cssselect documentation build configuration file, created by # sphinx-quickstart on Tue Mar 27 14:20:34 2012. @@ -11,210 +12,217 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import re -from pathlib import Path +import sys, os, re # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
-# sys.path.insert(0, os.path.abspath('.')) +#sys.path.insert(0, os.path.abspath('.')) # -- General configuration ----------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. -# needs_sphinx = '1.0' +#needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ["sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.doctest"] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', + 'sphinx.ext.doctest'] # Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] +templates_path = ['_templates'] # The suffix of source filenames. -source_suffix = {".rst": "restructuredtext"} +source_suffix = '.rst' # The encoding of source files. -# source_encoding = 'utf-8-sig' +#source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = "index" +master_doc = 'index' # General information about the project. -project = "cssselect" -project_copyright = "2012-2017, Simon Sapin, Scrapy developers" +project = 'cssselect' +copyright = '2012, Simon Sapin' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The full version, including alpha/beta/rc tags. -init_py = (Path(__file__).parent.parent / "cssselect" / "__init__.py").read_text() -release = re.search('VERSION = "([^"]+)"', init_py).group(1) +init_py = open(os.path.join(os.path.dirname(__file__), + '..', 'cssselect', '__init__.py')).read() +release = re.search("VERSION = '([^']+)'", init_py).group(1) # The short X.Y version. -version = release.rstrip("dev") +version = release.rstrip('dev') # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
-# language = None +#language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -# today = '' +#today = '' # Else, today_fmt is used as the format for a strftime call. -# today_fmt = '%B %d, %Y' +#today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ["_build"] +exclude_patterns = ['_build'] # The reST default role (used for this markup: `text`) to use for all documents. -# default_role = None +#default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -# add_function_parentheses = True +#add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -# add_module_names = True +#add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -# show_authors = False +#show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = "sphinx" +pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. -# modindex_common_prefix = [] +#modindex_common_prefix = [] # -- Options for HTML output --------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = "sphinx_rtd_theme" +#html_theme = 'agogo' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -# html_theme_options = {} +#html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -# html_theme_path = [] +#html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". 
-# html_title = None +#html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -# html_short_title = None +#html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -# html_logo = None +#html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -# html_favicon = None +#html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -# html_static_path = ['_static'] +#html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -# html_last_updated_fmt = '%b %d, %Y' +#html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -# html_use_smartypants = True +#html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -# html_sidebars = {} +#html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -# html_additional_pages = {} +#html_additional_pages = {} # If false, no module index is generated. -# html_domain_indices = True +#html_domain_indices = True # If false, no index is generated. -# html_use_index = True +#html_use_index = True # If true, the index is split into individual pages for each letter. -# html_split_index = False +#html_split_index = False # If true, links to the reST sources are added to the pages. -# html_show_sourcelink = True +#html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 
-# html_show_sphinx = True +#html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -# html_show_copyright = True +#html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -# html_use_opensearch = '' +#html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -# html_file_suffix = None +#html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = "cssselectdoc" +htmlhelp_basename = 'cssselectdoc' # -- Options for LaTeX output -------------------------------------------------- latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - #'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). - #'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. - #'preamble': '', +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ("index", "cssselect.tex", "cssselect Documentation", "Simon Sapin", "manual"), + ('index', 'cssselect.tex', 'cssselect Documentation', + 'Simon Sapin', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -# latex_logo = None +#latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -# latex_use_parts = False +#latex_use_parts = False # If true, show page references after internal links. -# latex_show_pagerefs = False +#latex_show_pagerefs = False # If true, show URL addresses after external links. 
-# latex_show_urls = False +#latex_show_urls = False # Documents to append as an appendix to all manuals. -# latex_appendices = [] +#latex_appendices = [] # If false, no module index is generated. -# latex_domain_indices = True +#latex_domain_indices = True # -- Options for manual page output -------------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [("index", "cssselect", "cssselect Documentation", ["Simon Sapin"], 1)] +man_pages = [ + ('index', 'cssselect', 'cssselect Documentation', + ['Simon Sapin'], 1) +] # If true, show URL addresses after external links. -# man_show_urls = False +#man_show_urls = False # -- Options for Texinfo output ------------------------------------------------ @@ -223,35 +231,20 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ( - "index", - "cssselect", - "cssselect Documentation", - "Simon Sapin", - "cssselect", - "One line description of project.", - "Miscellaneous", - ), + ('index', 'cssselect', 'cssselect Documentation', + 'Simon Sapin', 'cssselect', 'One line description of project.', + 'Miscellaneous'), ] # Documents to append as an appendix to all manuals. -# texinfo_appendices = [] +#texinfo_appendices = [] # If false, no module index is generated. -# texinfo_domain_indices = True +#texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -# texinfo_show_urls = 'footnote' +#texinfo_show_urls = 'footnote' # Example configuration for intersphinx: refer to the Python standard library. 
-intersphinx_mapping = {"python": ("https://docs.python.org/3", None)} - - -# --- Nitpicking options ------------------------------------------------------ - -nitpicky = True -nitpick_ignore = [ - # explicitly not a part of the public API - ("py:class", "Token"), -] +intersphinx_mapping = {'http://docs.python.org/': None} diff --git a/docs/conftest.py b/docs/conftest.py deleted file mode 100644 index a71d108..0000000 --- a/docs/conftest.py +++ /dev/null @@ -1,21 +0,0 @@ -from doctest import ELLIPSIS, NORMALIZE_WHITESPACE - -from sybil import Sybil -from sybil.parsers.doctest import DocTestParser -from sybil.parsers.skip import skip - -try: - # sybil 3.0.0+ - from sybil.parsers.codeblock import PythonCodeBlockParser -except ImportError: - from sybil.parsers.codeblock import CodeBlockParser as PythonCodeBlockParser - - -pytest_collect_file = Sybil( - parsers=[ - DocTestParser(optionflags=ELLIPSIS | NORMALIZE_WHITESPACE), - PythonCodeBlockParser(future_imports=["print_function"]), - skip, - ], - pattern="*.rst", -).pytest() diff --git a/docs/index.rst b/docs/index.rst index a024f20..4ac7401 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -99,24 +99,12 @@ These applicable pseudo-classes are not yet implemented: you specify an element type, but not with ``*`` On the other hand, *cssselect* supports some selectors that are not -in the Level 3 specification. - -These parts of the Level 4 specification are supported (note that a large part -of the Level 4 additions is not applicable to cssselect similarly to ``:hover`` -or not representable in XPath 1.0 so the complete specification is unlikely to -be implemented): - -* The ``:scope`` pseudo-class. Limitation: it can only be used at a start of a - selector. -* The ``:is()``, ``:where()`` and ``:has()`` pseudo-classes. Limitation: - ``:has()`` cannot contain nested ``:has()`` or ``:not()``. 
- -These are non-standard extensions: +in the Level 3 specification: * The ``:contains(text)`` pseudo-class that existed in `an early draft`_ but was then removed. * The ``!=`` attribute operator. ``[foo!=bar]`` is the same as - ``:not([foo=bar])``. + ``:not([foo=bar])`` * ``:not()`` accepts a *sequence of simple selectors*, not just single *simple selector*. For example, ``:not(a.important[rel])`` is allowed, even though the negation contains 3 *simple selectors*. @@ -151,7 +139,7 @@ and their signature. You can look at the `source code`_ to see how it works. However, be aware that this API is not very stable yet. It might change and break your sub-class. -.. _source code: https://github.com/scrapy/cssselect/blob/master/cssselect/xpath.py +.. _source code: https://github.com/SimonSapin/cssselect/blob/master/cssselect/xpath.py Namespaces diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 21cb2eb..0000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -sphinx==8.2.3 -sphinx-rtd-theme==3.0.2 diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index c7c54a0..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,239 +0,0 @@ -[build-system] -build-backend = "hatchling.build" -requires = ["hatchling>=1.27.0"] - -[project] -name = "cssselect" -license = "BSD-3-Clause" -license-files = ["LICENSE", "AUTHORS"] -description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0" -readme = "README.rst" -authors = [{ name = "Ian Bicking", email = "ianb@colorstudy.com" }] -maintainers = [{ name = "Paul Tremberth", email = "paul.tremberth@gmail.com" }] -requires-python = ">=3.10" -classifiers = [ - "Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - "Programming Language 
:: Python :: 3.14", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", -] -dynamic = ["version"] - -[project.urls] -"Homepage" = "https://github.com/scrapy/cssselect" - -[tool.hatch.version] -path = "cssselect/__init__.py" - -[tool.hatch.build.targets.sdist] -include = [ - "/cssselect", - "/docs", - "/tests", - "/CHANGES", - "/README.rst", - "/tox.ini", -] -exclude = [ - "/docs/_build", -] - -[tool.hatch.build.targets.wheel] -packages = ["cssselect"] - -[tool.bumpversion] -current_version = "1.4.0" -commit = true -tag = true - -[[tool.bumpversion.files]] -filename = "cssselect/__init__.py" - -[[tool.bumpversion.files]] -filename = "CHANGES" -search = "^Unreleased\\.$" -replace = "Released on {now:%Y-%m-%d}." -regex = true - -[tool.coverage.run] -branch = true -source = ["cssselect"] - -[tool.coverage.report] -exclude_also = [ - "def __repr__", - "if sys.version_info", - "if __name__ == '__main__':", -] - -[tool.mypy] -strict = true - -[tool.pylint.MASTER] -persistent = "no" -extension-pkg-allow-list = ["lxml"] - -[tool.pylint."MESSAGES CONTROL"] -enable = [ - "useless-suppression", -] -disable = [ - "consider-using-f-string", - "fixme", - "invalid-name", - "line-too-long", - "missing-class-docstring", - "missing-function-docstring", - "missing-module-docstring", - "no-member", - "not-callable", - "redefined-builtin", - "redefined-outer-name", - "too-few-public-methods", - "too-many-arguments", - "too-many-branches", - "too-many-function-args", - "too-many-lines", - "too-many-locals", - "too-many-positional-arguments", - "too-many-public-methods", - "too-many-statements", - "unused-argument", -] - -[tool.pytest.ini_options] -testpaths = ["tests"] - -[tool.ruff.lint] -extend-select = [ - # flake8-builtins - "A", - # flake8-async - "ASYNC", - # flake8-bugbear - "B", - # flake8-comprehensions - "C4", - # flake8-commas - "COM", - # pydocstyle - "D", - # flake8-future-annotations - "FA", - # 
flynt - "FLY", - # refurb - "FURB", - # isort - "I", - # flake8-implicit-str-concat - "ISC", - # flake8-logging - "LOG", - # Perflint - "PERF", - # pygrep-hooks - "PGH", - # flake8-pie - "PIE", - # pylint - "PL", - # flake8-pytest-style - "PT", - # flake8-use-pathlib - "PTH", - # flake8-pyi - "PYI", - # flake8-quotes - "Q", - # flake8-return - "RET", - # flake8-raise - "RSE", - # Ruff-specific rules - "RUF", - # flake8-bandit - "S", - # flake8-simplify - "SIM", - # flake8-slots - "SLOT", - # flake8-debugger - "T10", - # flake8-type-checking - "TC", - # pyupgrade - "UP", - # pycodestyle warnings - "W", - # flake8-2020 - "YTT", -] -ignore = [ - # Trailing comma missing - "COM812", - # Missing docstring in public module - "D100", - # Missing docstring in public class - "D101", - # Missing docstring in public method - "D102", - # Missing docstring in public function - "D103", - # Missing docstring in public package - "D104", - # Missing docstring in magic method - "D105", - # Missing docstring in public nested class - "D106", - # Missing docstring in __init__ - "D107", - # One-line docstring should fit on one line with quotes - "D200", - # No blank lines allowed after function docstring - "D202", - # 1 blank line required between summary line and description - "D205", - # Multi-line docstring closing quotes should be on a separate line - "D209", - # First line should end with a period - "D400", - # First line should be in imperative mood; try rephrasing - "D401", - # First line should not be the function's "signature" - "D402", - # First word of the first line should be properly capitalized - "D403", - # Too many return statements - "PLR0911", - # Too many branches - "PLR0912", - # Too many arguments in function definition - "PLR0913", - # Too many statements - "PLR0915", - # Magic value used in comparison - "PLR2004", - # String contains ambiguous {}. - "RUF001", - # Docstring contains ambiguous {}. - "RUF002", - # Comment contains ambiguous {}. 
- "RUF003", - # Mutable class attributes should be annotated with `typing.ClassVar` - "RUF012", - # Use of `assert` detected - "S101", -] - -[tool.ruff.lint.isort] -split-on-trailing-comma = false - -[tool.ruff.lint.pydocstyle] -convention = "pep257" diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..ccddf11 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,10 @@ +[build_sphinx] +source-dir = docs +build-dir = docs/_build +#all_files = 1 + +[upload_sphinx] # Sphinx-PyPI-upload +upload-dir = docs/_build/html + +[pytest] +python_files=tests.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..bd1e385 --- /dev/null +++ b/setup.py @@ -0,0 +1,44 @@ +# coding: utf8 + +import re +import os.path +try: + from setuptools import setup + extra_kwargs = {'test_suite': 'cssselect.tests'} +except ImportError: + from distutils.core import setup + extra_kwargs = {} + + +ROOT = os.path.dirname(__file__) +README = open(os.path.join(ROOT, 'README.rst')).read() +INIT_PY = open(os.path.join(ROOT, 'cssselect', '__init__.py')).read() +VERSION = re.search("VERSION = '([^']+)'", INIT_PY).group(1) + + +setup( + name='cssselect', + version=VERSION, + author='Ian Bicking', + author_email='ianb@colorstudy.com', + maintainer='Simon Sapin', + maintainer_email='simon.sapin@exyr.org', + description= + 'cssselect parses CSS3 Selectors and translates them to XPath 1.0', + long_description=README, + url='http://packages.python.org/cssselect/', + license='BSD', + packages=['cssselect'], + classifiers=[ + 'Development Status :: 4 - Beta', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: BSD License', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.5', + 'Programming Language :: Python :: 2.6', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.2', + ], + **extra_kwargs +) diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index 
e69de29..0000000 diff --git a/tests/test_cssselect.py b/tests/test_cssselect.py deleted file mode 100644 index dc67bb7..0000000 --- a/tests/test_cssselect.py +++ /dev/null @@ -1,1540 +0,0 @@ -#!/usr/bin/env python -""" -Tests for cssselect -=================== - -These tests can be run either by py.test or by the standard library's -unittest. They use plain ``assert`` statements and do little reporting -themselves in case of failure. - -Use py.test to get fancy error reporting and assert introspection. - - -:copyright: (c) 2007-2012 Ian Bicking and contributors. -See AUTHORS for more details. -:license: BSD, see LICENSE for more details. - -""" - -from __future__ import annotations - -import sys -import typing -import unittest -from typing import TYPE_CHECKING - -import pytest -from lxml import etree, html - -from cssselect import ( - ExpressionError, - GenericTranslator, - HTMLTranslator, - SelectorSyntaxError, - parse, -) -from cssselect.parser import ( - Function, - FunctionalPseudoElement, - PseudoElement, - Token, - parse_series, - tokenize, -) -from cssselect.xpath import XPathExpr - -if TYPE_CHECKING: - from collections.abc import Sequence - - -class TestCssselect(unittest.TestCase): - def test_tokenizer(self) -> None: - tokens = [ - str(item) - for item in tokenize(r'E\ é > f [a~="y\"x"]:nth(/* fu /]* */-3.7)') - ] - assert tokens == [ - "", - "", - "' at 5>", - "", - # the no-break space is not whitespace in CSS - "", # f\xa0 - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - ] - - def test_parser(self) -> None: - def repr_parse(css: str) -> list[str]: - selectors = parse(css) - for selector in selectors: - assert selector.pseudo_element is None - return [repr(selector.parsed_tree) for selector in selectors] - - def parse_many(first: str, *others: str) -> list[str]: - result = repr_parse(first) - for other in others: - assert repr_parse(other) == result - return result - - assert parse_many("*") == ["Element[*]"] - assert parse_many("*|*") 
== ["Element[*]"] - assert parse_many("*|foo") == ["Element[foo]"] - assert parse_many("|foo") == ["Element[foo]"] - assert parse_many("foo|*") == ["Element[foo|*]"] - assert parse_many("foo|bar") == ["Element[foo|bar]"] - # This will never match, but it is valid: - assert parse_many("#foo#bar") == ["Hash[Hash[Element[*]#foo]#bar]"] - assert parse_many( - "div>.foo", - "div> .foo", - "div >.foo", - "div > .foo", - "div \n> \t \t .foo", - "div\r>\n\n\n.foo", - "div\f>\f.foo", - ) == ["CombinedSelector[Element[div] > Class[Element[*].foo]]"] - assert parse_many( - "td.foo,.bar", "td.foo, .bar", "td.foo\t\r\n\f ,\t\r\n\f .bar" - ) == [ - "Class[Element[td].foo]", - "Class[Element[*].bar]", - ] - assert parse_many("div, td.foo, div.bar span") == [ - "Element[div]", - "Class[Element[td].foo]", - "CombinedSelector[Class[Element[div].bar] Element[span]]", - ] - assert parse_many("div > p") == ["CombinedSelector[Element[div] > Element[p]]"] - assert parse_many("td:first") == ["Pseudo[Element[td]:first]"] - assert parse_many("td:first") == ["Pseudo[Element[td]:first]"] - assert parse_many("td :first") == [ - "CombinedSelector[Element[td] Pseudo[Element[*]:first]]" - ] - assert parse_many("td :first") == [ - "CombinedSelector[Element[td] Pseudo[Element[*]:first]]" - ] - assert parse_many("a[name]", "a[ name\t]") == ["Attrib[Element[a][name]]"] - assert parse_many("a [name]") == [ - "CombinedSelector[Element[a] Attrib[Element[*][name]]]" - ] - assert parse_many('a[rel="include"]', "a[rel = include]") == [ - "Attrib[Element[a][rel = 'include']]" - ] - assert parse_many("a[hreflang |= 'en']", "a[hreflang|=en]") == [ - "Attrib[Element[a][hreflang |= 'en']]" - ] - assert parse_many("div:nth-child(10)") == [ - "Function[Element[div]:nth-child(['10'])]" - ] - assert parse_many(":nth-child(2n+2)") == [ - "Function[Element[*]:nth-child(['2', 'n', '+2'])]" - ] - assert parse_many("div:nth-of-type(10)") == [ - "Function[Element[div]:nth-of-type(['10'])]" - ] - assert parse_many("div 
div:nth-of-type(10) .aclass") == [ - "CombinedSelector[CombinedSelector[Element[div] " - "Function[Element[div]:nth-of-type(['10'])]] " - " Class[Element[*].aclass]]" - ] - assert parse_many("label:only") == ["Pseudo[Element[label]:only]"] - assert parse_many("a:lang(fr)") == ["Function[Element[a]:lang(['fr'])]"] - assert parse_many('div:contains("foo")') == [ - "Function[Element[div]:contains(['foo'])]" - ] - assert parse_many("div#foobar") == ["Hash[Element[div]#foobar]"] - assert parse_many("div:not(div.foo)") == [ - "Negation[Element[div]:not(Class[Element[div].foo])]" - ] - assert parse_many("div:has(div.foo)") == [ - "Relation[Element[div]:has(Selector[Class[Element[div].foo]])]" - ] - assert parse_many("div:is(.foo, #bar)") == [ - "Matching[Element[div]:is(Class[Element[*].foo], Hash[Element[*]#bar])]" - ] - assert parse_many(":is(:hover, :visited)") == [ - "Matching[Element[*]:is(Pseudo[Element[*]:hover], Pseudo[Element[*]:visited])]" - ] - assert parse_many(":where(:hover, :visited)") == [ - "SpecificityAdjustment[Element[*]:where(Pseudo[Element[*]:hover]," - " Pseudo[Element[*]:visited])]" - ] - assert parse_many("td ~ th") == ["CombinedSelector[Element[td] ~ Element[th]]"] - assert parse_many(":scope > foo") == [ - "CombinedSelector[Pseudo[Element[*]:scope] > Element[foo]]" - ] - assert parse_many(" :scope > foo") == [ - "CombinedSelector[Pseudo[Element[*]:scope] > Element[foo]]" - ] - assert parse_many(":scope > foo bar > div") == [ - "CombinedSelector[CombinedSelector[CombinedSelector[Pseudo[Element[*]:scope] > " - "Element[foo]] Element[bar]] > Element[div]]" - ] - assert parse_many(":scope > #foo #bar") == [ - "CombinedSelector[CombinedSelector[Pseudo[Element[*]:scope] > " - "Hash[Element[*]#foo]] Hash[Element[*]#bar]]" - ] - - def test_pseudo_elements(self) -> None: - def parse_pseudo(css: str) -> list[tuple[str, str | None]]: - result: list[tuple[str, str | None]] = [] - for selector in parse(css): - pseudo = selector.pseudo_element - pseudo = 
str(pseudo) if pseudo else pseudo - # No Symbol here - assert pseudo is None or isinstance(pseudo, str) - selector_as_str = repr(selector.parsed_tree) - result.append((selector_as_str, pseudo)) - return result - - def parse_one(css: str) -> tuple[str, str | None]: - result = parse_pseudo(css) - assert len(result) == 1 - return result[0] - - def test_pseudo_repr(css: str) -> str: - result = parse(css) - assert len(result) == 1 - selector = result[0] - return repr(selector.parsed_tree) - - assert parse_one("foo") == ("Element[foo]", None) - assert parse_one("*") == ("Element[*]", None) - assert parse_one(":empty") == ("Pseudo[Element[*]:empty]", None) - assert parse_one(":scope") == ("Pseudo[Element[*]:scope]", None) - - # Special cases for CSS 2.1 pseudo-elements - assert parse_one(":BEfore") == ("Element[*]", "before") - assert parse_one(":aftER") == ("Element[*]", "after") - assert parse_one(":First-Line") == ("Element[*]", "first-line") - assert parse_one(":First-Letter") == ("Element[*]", "first-letter") - - assert parse_one("::befoRE") == ("Element[*]", "before") - assert parse_one("::AFter") == ("Element[*]", "after") - assert parse_one("::firsT-linE") == ("Element[*]", "first-line") - assert parse_one("::firsT-letteR") == ("Element[*]", "first-letter") - - assert parse_one("::text-content") == ("Element[*]", "text-content") - assert parse_one("::attr(name)") == ( - "Element[*]", - "FunctionalPseudoElement[::attr(['name'])]", - ) - - assert parse_one("::Selection") == ("Element[*]", "selection") - assert parse_one("foo:after") == ("Element[foo]", "after") - assert parse_one("foo::selection") == ("Element[foo]", "selection") - assert parse_one("lorem#ipsum ~ a#b.c[href]:empty::selection") == ( - "CombinedSelector[Hash[Element[lorem]#ipsum] ~ " - "Pseudo[Attrib[Class[Hash[Element[a]#b].c][href]]:empty]]", - "selection", - ) - assert parse_pseudo(":scope > div, foo bar") == [ - ("CombinedSelector[Pseudo[Element[*]:scope] > Element[div]]", None), - 
("CombinedSelector[Element[foo] Element[bar]]", None), - ] - assert parse_pseudo("foo bar, :scope > div") == [ - ("CombinedSelector[Element[foo] Element[bar]]", None), - ("CombinedSelector[Pseudo[Element[*]:scope] > Element[div]]", None), - ] - assert parse_pseudo("foo bar,:scope > div") == [ - ("CombinedSelector[Element[foo] Element[bar]]", None), - ("CombinedSelector[Pseudo[Element[*]:scope] > Element[div]]", None), - ] - assert parse_pseudo("foo:before, bar, baz:after") == [ - ("Element[foo]", "before"), - ("Element[bar]", None), - ("Element[baz]", "after"), - ] - - # Special cases for CSS 2.1 pseudo-elements are ignored by default - for pseudo in ("after", "before", "first-line", "first-letter"): - (selector,) = parse(f"e:{pseudo}") - assert selector.pseudo_element == pseudo - assert GenericTranslator().selector_to_xpath(selector, prefix="") == "e" - - # Pseudo Elements are ignored by default, but if allowed they are not - # supported by GenericTranslator - tr = GenericTranslator() - (selector,) = parse("e::foo") - assert selector.pseudo_element == "foo" - assert tr.selector_to_xpath(selector, prefix="") == "e" - with pytest.raises(ExpressionError): - tr.selector_to_xpath(selector, translate_pseudo_elements=True) - - # Special test for the unicode symbols and ':scope' element if check - # Errors if use repr() instead of __repr__() - assert test_pseudo_repr(":fİrst-child") == "Pseudo[Element[*]:fİrst-child]" - assert test_pseudo_repr(":scope") == "Pseudo[Element[*]:scope]" - - def test_specificity(self) -> None: - def specificity(css: str) -> tuple[int, int, int]: - selectors = parse(css) - assert len(selectors) == 1 - return selectors[0].specificity() - - assert specificity("*") == (0, 0, 0) - assert specificity(" foo") == (0, 0, 1) - assert specificity(":empty ") == (0, 1, 0) - assert specificity(":before") == (0, 0, 1) - assert specificity("*:before") == (0, 0, 1) - assert specificity(":nth-child(2)") == (0, 1, 0) - assert specificity(".bar") == (0, 1, 0) - 
assert specificity("[baz]") == (0, 1, 0) - assert specificity('[baz="4"]') == (0, 1, 0) - assert specificity('[baz^="4"]') == (0, 1, 0) - assert specificity("#lipsum") == (1, 0, 0) - assert specificity("::attr(name)") == (0, 0, 1) - - assert specificity(":not(*)") == (0, 0, 0) - assert specificity(":not(foo)") == (0, 0, 1) - assert specificity(":not(.foo)") == (0, 1, 0) - assert specificity(":not([foo])") == (0, 1, 0) - assert specificity(":not(:empty)") == (0, 1, 0) - assert specificity(":not(#foo)") == (1, 0, 0) - - assert specificity(":has(*)") == (0, 0, 0) - assert specificity(":has(foo)") == (0, 0, 1) - assert specificity(":has(.foo)") == (0, 1, 0) - assert specificity(":has(> foo)") == (0, 0, 1) - - assert specificity(":is(.foo, #bar)") == (1, 0, 0) - assert specificity(":is(:hover, :visited)") == (0, 1, 0) - assert specificity(":where(:hover, :visited)") == (0, 0, 0) - - assert specificity("foo:empty") == (0, 1, 1) - assert specificity("foo:before") == (0, 0, 2) - assert specificity("foo::before") == (0, 0, 2) - assert specificity("foo:empty::before") == (0, 1, 2) - - assert specificity("#lorem + foo#ipsum:first-child > bar:first-line") == ( - 2, - 1, - 3, - ) - - def test_css_export(self) -> None: - def css2css(css: str, res: str | None = None) -> None: - selectors = parse(css) - assert len(selectors) == 1 - assert selectors[0].canonical() == (res or css) - - css2css("*") - css2css(" foo", "foo") - css2css("Foo", "Foo") - css2css(":empty ", ":empty") - css2css(":before", "::before") - css2css(":beFOre", "::before") - css2css("*:before", "::before") - css2css(":nth-child(2)") - css2css(".bar") - css2css("[baz]") - css2css('[baz="4"]', "[baz='4']") - css2css('[baz^="4"]', "[baz^='4']") - css2css("[ns|attr='4']") - css2css("#lipsum") - css2css(":not(*)") - css2css(":not(foo)") - css2css(":not(*.foo)", ":not(.foo)") - css2css(":not(*[foo])", ":not([foo])") - css2css(":not(:empty)") - css2css(":not(#foo)") - css2css(":has(*)") - css2css(":has(foo)") - 
css2css(":has(*.foo)", ":has(.foo)") - css2css(":is(#bar, .foo)") - css2css(":is(:focused, :visited)") - css2css(":where(:focused, :visited)") - css2css("foo:empty") - css2css("foo::before") - css2css("foo:empty::before") - css2css('::name(arg + "val" - 3)', "::name(arg+'val'-3)") - css2css("#lorem + foo#ipsum:first-child > bar::first-line") - css2css("foo > *") - - def test_parse_errors(self) -> None: - def get_error(css: str) -> str | None: - try: - parse(css) - except SelectorSyntaxError: - return str(sys.exc_info()[1]) - return None - - assert get_error("attributes(href)/html/body/a") == ( - "Expected selector, got " - ) - assert get_error("attributes(href)") == ( - "Expected selector, got " - ) - assert get_error("html/body/a") == ("Expected selector, got ") - assert get_error(" ") == ("Expected selector, got ") - assert get_error("div, ") == ("Expected selector, got ") - assert get_error(" , div") == ("Expected selector, got ") - assert get_error("p, , div") == ("Expected selector, got ") - assert get_error("div > ") == ("Expected selector, got ") - assert get_error(" > div") == ("Expected selector, got ' at 2>") - assert get_error("foo|#bar") == ("Expected ident or '*', got ") - assert get_error("#.foo") == ("Expected selector, got ") - assert get_error(".#foo") == ("Expected ident, got ") - assert get_error(":#foo") == ("Expected ident, got ") - assert get_error("[*]") == ("Expected '|', got ") - assert get_error("[foo|]") == ("Expected ident, got ") - assert get_error("[#]") == ("Expected ident or '*', got ") - assert get_error("[foo=#]") == ( - "Expected string or ident, got " - ) - assert get_error("[href]a") == ("Expected selector, got ") - assert get_error("[rel=stylesheet]") is None - assert get_error("[rel:stylesheet]") == ( - "Operator expected, got " - ) - assert get_error("[rel=stylesheet") == ("Expected ']', got ") - assert get_error(":lang(fr)") is None - assert get_error(":lang(fr") == ("Expected an argument, got ") - assert 
get_error(':contains("foo') == ("Unclosed string at 10") - assert get_error("foo!") == ("Expected selector, got ") - - # Mis-placed pseudo-elements - assert get_error("a:before:empty") == ( - "Got pseudo-element ::before not at the end of a selector" - ) - assert get_error("li:before a") == ( - "Got pseudo-element ::before not at the end of a selector" - ) - assert get_error(":not(:before)") == ( - "Got pseudo-element ::before inside :not() at 12" - ) - assert get_error(":not(:not(a))") == ("Got nested :not()") - assert get_error(":is(:before)") == ( - "Got pseudo-element ::before inside function" - ) - assert get_error(":is(a b)") == ("Expected an argument, got ") - assert get_error(":where(:before)") == ( - "Got pseudo-element ::before inside function" - ) - assert get_error(":where(a b)") == ( - "Expected an argument, got " - ) - assert get_error(":scope > div :scope header") == ( - 'Got immediate child pseudo-element ":scope" not at the start of a selector' - ) - assert get_error("div :scope header") == ( - 'Got immediate child pseudo-element ":scope" not at the start of a selector' - ) - assert get_error("> div p") == ("Expected selector, got ' at 0>") - - # Unsupported :has() with several arguments - assert get_error(":has(a, b)") == ("Expected an argument, got ") - assert get_error(":has()") == ("Expected selector, got ") - - def test_translation(self) -> None: - def xpath(css: str) -> str: - return str(GenericTranslator().css_to_xpath(css, prefix="")) - - assert xpath("*") == "*" - assert xpath("e") == "e" - assert xpath("*|e") == "e" - assert xpath("e|f") == "e:f" - assert xpath("e[foo]") == "e[@foo]" - assert xpath("e[foo|bar]") == "e[@foo:bar]" - assert xpath('e[foo="bar"]') == "e[@foo = 'bar']" - assert xpath('e[foo~="bar"]') == ( - "e[@foo and contains(concat(' ', normalize-space(@foo), ' '), ' bar ')]" - ) - assert xpath('e[foo^="bar"]') == ("e[@foo and starts-with(@foo, 'bar')]") - assert xpath('e[foo$="bar"]') == ( - "e[@foo and substring(@foo, 
string-length(@foo)-2) = 'bar']" - ) - assert xpath('e[foo*="bar"]') == ("e[@foo and contains(@foo, 'bar')]") - assert xpath('e[hreflang|="en"]') == ( - "e[@hreflang and (@hreflang = 'en' or starts-with(@hreflang, 'en-'))]" - ) - - # --- nth-* and nth-last-* ------------------------------------- - assert xpath("e:nth-child(1)") == ("e[count(preceding-sibling::*) = 0]") - - # always true - assert xpath("e:nth-child(n)") == ("e") - assert xpath("e:nth-child(n+1)") == ("e") - # always true too - assert xpath("e:nth-child(n-10)") == ("e") - # b=2 is the limit... - assert xpath("e:nth-child(n+2)") == ("e[count(preceding-sibling::*) >= 1]") - # always false - assert xpath("e:nth-child(-n)") == ("e[0]") - # equivalent to first child - assert xpath("e:nth-child(-n+1)") == ("e[count(preceding-sibling::*) <= 0]") - - assert xpath("e:nth-child(3n+2)") == ( - "e[(count(preceding-sibling::*) >= 1) and " - "((count(preceding-sibling::*) +2) mod 3 = 0)]" - ) - assert xpath("e:nth-child(3n-2)") == ( - "e[count(preceding-sibling::*) mod 3 = 0]" - ) - assert xpath("e:nth-child(-n+6)") == ("e[count(preceding-sibling::*) <= 5]") - - assert xpath("e:nth-last-child(1)") == ("e[count(following-sibling::*) = 0]") - assert xpath("e:nth-last-child(2n)") == ( - "e[(count(following-sibling::*) +1) mod 2 = 0]" - ) - assert xpath("e:nth-last-child(2n+1)") == ( - "e[count(following-sibling::*) mod 2 = 0]" - ) - assert xpath("e:nth-last-child(2n+2)") == ( - "e[(count(following-sibling::*) >= 1) and " - "((count(following-sibling::*) +1) mod 2 = 0)]" - ) - assert xpath("e:nth-last-child(3n+1)") == ( - "e[count(following-sibling::*) mod 3 = 0]" - ) - # represents the two last e elements - assert xpath("e:nth-last-child(-n+2)") == ( - "e[count(following-sibling::*) <= 1]" - ) - - assert xpath("e:nth-of-type(1)") == ("e[count(preceding-sibling::e) = 0]") - assert xpath("e:nth-last-of-type(1)") == ("e[count(following-sibling::e) = 0]") - assert xpath("div e:nth-last-of-type(1) .aclass") == ( - 
"div/descendant-or-self::*/e[count(following-sibling::e) = 0]" - "/descendant-or-self::*/*[@class and contains(" - "concat(' ', normalize-space(@class), ' '), ' aclass ')]" - ) - - assert xpath("e:first-child") == ("e[count(preceding-sibling::*) = 0]") - assert xpath("e:last-child") == ("e[count(following-sibling::*) = 0]") - assert xpath("e:first-of-type") == ("e[count(preceding-sibling::e) = 0]") - assert xpath("e:last-of-type") == ("e[count(following-sibling::e) = 0]") - assert xpath("e:only-child") == ("e[count(parent::*/child::*) = 1]") - assert xpath("e:only-of-type") == ("e[count(parent::*/child::e) = 1]") - assert xpath("e:empty") == ("e[not(*) and not(string-length())]") - assert xpath("e:EmPTY") == ("e[not(*) and not(string-length())]") - assert xpath("e:root") == ("e[not(parent::*)]") - assert xpath("e:hover") == ("e[0]") # never matches - assert ( - xpath("div:has(bar.foo)") == "div[descendant::bar" - "[@class and contains(concat(' ', normalize-space(@class), ' '), ' foo ')]]" - ) - assert xpath("e:has(> f)") == "e[./f]" - assert xpath("e:has(f)") == "e[descendant::f]" - assert xpath("e:has(~ f)") == "e[following-sibling::f]" - assert ( - xpath("e:has(+ f)") - == "e[following-sibling::*[(name() = 'f') and (position() = 1)]]" - ) - assert xpath('e:contains("foo")') == ("e[contains(., 'foo')]") - assert xpath("e:ConTains(foo)") == ("e[contains(., 'foo')]") - assert xpath("e.warning") == ( - "e[@class and contains(" - "concat(' ', normalize-space(@class), ' '), ' warning ')]" - ) - assert xpath("e#myid") == ("e[@id = 'myid']") - assert xpath("e:not(:nth-child(odd))") == ( - "e[not(count(preceding-sibling::*) mod 2 = 0)]" - ) - assert xpath("e:nOT(*)") == ("e[0]") # never matches - assert xpath("e f") == ("e/descendant-or-self::*/f") - assert xpath("e > f") == ("e/f") - assert xpath("e + f") == ( - "e/following-sibling::*[(name() = 'f') and (position() = 1)]" - ) - assert xpath("e ~ f") == ("e/following-sibling::f") - assert xpath("e ~ f:nth-child(3)") == ( 
- "e/following-sibling::f[count(preceding-sibling::*) = 2]" - ) - assert xpath("div#container p") == ( - "div[@id = 'container']/descendant-or-self::*/p" - ) - assert xpath("e:where(foo)") == "e[name() = 'foo']" - assert xpath("e:where(foo, bar)") == "e[(name() = 'foo') or (name() = 'bar')]" - - # Invalid characters in XPath element names - assert xpath(r"di\a0 v") == ("*[name() = 'di v']") # di\xa0v - assert xpath(r"di\[v") == ("*[name() = 'di[v']") - assert xpath(r"[h\a0 ref]") == ("*[attribute::*[name() = 'h ref']]") # h\xa0ref - assert xpath(r"[h\]ref]") == ("*[attribute::*[name() = 'h]ref']]") - - with pytest.raises(ExpressionError): - xpath(":fİrst-child") - with pytest.raises(ExpressionError): - xpath(":first-of-type") - with pytest.raises(ExpressionError): - xpath(":only-of-type") - with pytest.raises(ExpressionError): - xpath(":last-of-type") - with pytest.raises(ExpressionError): - xpath(":nth-of-type(1)") - with pytest.raises(ExpressionError): - xpath(":nth-last-of-type(1)") - with pytest.raises(ExpressionError): - xpath(":nth-child(n-)") - with pytest.raises(ExpressionError): - xpath(":after") - with pytest.raises(ExpressionError): - xpath(":lorem-ipsum") - with pytest.raises(ExpressionError): - xpath(":lorem(ipsum)") - with pytest.raises(ExpressionError): - xpath("::lorem-ipsum") - with pytest.raises(TypeError): - GenericTranslator().css_to_xpath(4) # type: ignore[arg-type] - with pytest.raises(TypeError): - GenericTranslator().selector_to_xpath("foo") # type: ignore[arg-type] - - def test_unicode(self) -> None: - css = ".a\xc1b" - xpath = GenericTranslator().css_to_xpath(css) - assert css[1:] in xpath - xpath = xpath.encode("ascii", "xmlcharrefreplace").decode("ASCII") - assert xpath == ( - "descendant-or-self::*[@class and contains(" - "concat(' ', normalize-space(@class), ' '), ' aÁb ')]" - ) - - def test_quoting(self) -> None: - css_to_xpath = GenericTranslator().css_to_xpath - assert css_to_xpath('*[aval="\'"]') == ( - 
"""descendant-or-self::*[@aval = "'"]""" - ) - assert css_to_xpath("*[aval=\"'''\"]") == ( - """descendant-or-self::*[@aval = "'''"]""" - ) - assert css_to_xpath("*[aval='\"']") == ( - """descendant-or-self::*[@aval = '"']""" - ) - assert css_to_xpath('*[aval=\'"""\']') == ( - '''descendant-or-self::*[@aval = '"""']''' - ) - assert css_to_xpath(':scope > div[dataimg=""]') == ( - "descendant-or-self::*[1]/div[@dataimg = '']" - ) - - def test_unicode_escapes(self) -> None: - # \22 == '"' \20 == ' ' - css_to_xpath = GenericTranslator().css_to_xpath - assert css_to_xpath(r'*[aval="\'\22\'"]') == ( - """descendant-or-self::*[@aval = concat("'",'"',"'")]""" - ) - assert css_to_xpath(r'*[aval="\'\22 2\'"]') == ( - """descendant-or-self::*[@aval = concat("'",'"2',"'")]""" - ) - assert css_to_xpath(r'*[aval="\'\20 \'"]') == ( - """descendant-or-self::*[@aval = "' '"]""" - ) - assert css_to_xpath("*[aval=\"'\\20\r\n '\"]") == ( - """descendant-or-self::*[@aval = "' '"]""" - ) - - def test_xpath_pseudo_elements(self) -> None: - class CustomTranslator(GenericTranslator): - def xpath_pseudo_element( - self, xpath: XPathExpr, pseudo_element: PseudoElement - ) -> XPathExpr: - if isinstance(pseudo_element, FunctionalPseudoElement): - method_name = "xpath_{}_functional_pseudo_element".format( - pseudo_element.name.replace("-", "_") - ) - method = getattr(self, method_name, None) - if not method: - raise ExpressionError( - f"The functional pseudo-element ::{pseudo_element.name}() is unknown" - ) - xpath = method(xpath, pseudo_element.arguments) - else: - method_name = "xpath_{}_simple_pseudo_element".format( - pseudo_element.replace("-", "_") - ) - method = getattr(self, method_name, None) - if not method: - raise ExpressionError( - f"The pseudo-element ::{pseudo_element} is unknown" - ) - xpath = method(xpath) - return xpath - - # functional pseudo-class: - # elements that have a certain number of attributes - def xpath_nb_attr_function( - self, xpath: XPathExpr, function: Function 
- ) -> XPathExpr: - assert function.arguments[0].value - nb_attributes = int(function.arguments[0].value) - return xpath.add_condition(f"count(@*)={nb_attributes}") - - # pseudo-class: - # elements that have 5 attributes - def xpath_five_attributes_pseudo(self, xpath: XPathExpr) -> XPathExpr: - return xpath.add_condition("count(@*)=5") - - # functional pseudo-element: - # element's attribute by name - def xpath_attr_functional_pseudo_element( - self, xpath: XPathExpr, arguments: Sequence[Token] - ) -> XPathExpr: - attribute_name = arguments[0].value - other = XPathExpr( - f"@{attribute_name}", - "", - ) - return xpath.join("/", other) - - # pseudo-element: - # element's text() nodes - def xpath_text_node_simple_pseudo_element( - self, xpath: XPathExpr - ) -> XPathExpr: - other = XPathExpr( - "text()", - "", - ) - return xpath.join("/", other) - - # pseudo-element: - # element's href attribute - def xpath_attr_href_simple_pseudo_element( - self, xpath: XPathExpr - ) -> XPathExpr: - other = XPathExpr( - "@href", - "", - ) - return xpath.join("/", other) - - # pseudo-element: - # used to demonstrate operator precedence - def xpath_first_or_second_pseudo(self, xpath: XPathExpr) -> XPathExpr: - return xpath.add_condition("@id = 'first' or @id = 'second'") - - def xpath(css: str) -> str: - return str(CustomTranslator().css_to_xpath(css)) - - assert xpath(":five-attributes") == "descendant-or-self::*[count(@*)=5]" - assert xpath(":nb-attr(3)") == "descendant-or-self::*[count(@*)=3]" - assert xpath("::attr(href)") == "descendant-or-self::*/@href" - assert xpath("::text-node") == "descendant-or-self::*/text()" - assert xpath("::attr-href") == "descendant-or-self::*/@href" - assert xpath("p img::attr(src)") == ( - "descendant-or-self::p/descendant-or-self::*/img/@src" - ) - assert xpath(":scope") == "descendant-or-self::*[1]" - assert xpath(":first-or-second[href]") == ( - "descendant-or-self::*[(@id = 'first' or @id = 'second') and (@href)]" - ) - - assert str(XPathExpr("", 
"", condition="@href")) == "[@href]" - - document = etree.fromstring(OPERATOR_PRECEDENCE_IDS) - sort_key = {el: count for count, el in enumerate(document.iter())}.__getitem__ - - def operator_id(selector: str) -> list[str]: - xpath = CustomTranslator().css_to_xpath(selector) - items = typing.cast("list[etree._Element]", document.xpath(xpath)) - items.sort(key=sort_key) - return [element.get("id", "nil") for element in items] - - assert operator_id(":first-or-second") == ["first", "second"] - assert operator_id(":first-or-second[href]") == ["second"] - assert operator_id("[href]:first-or-second") == ["second"] - - def test_series(self) -> None: - def series(css: str) -> tuple[int, int] | None: - (selector,) = parse(f":nth-child({css})") - args = typing.cast( - "FunctionalPseudoElement", selector.parsed_tree - ).arguments - try: - return parse_series(args) - except ValueError: - return None - - assert series("1n+3") == (1, 3) - assert series("1n +3") == (1, 3) - assert series("1n + 3") == (1, 3) - assert series("1n+ 3") == (1, 3) - assert series("1n-3") == (1, -3) - assert series("1n -3") == (1, -3) - assert series("1n - 3") == (1, -3) - assert series("1n- 3") == (1, -3) - assert series("n-5") == (1, -5) - assert series("odd") == (2, 1) - assert series("even") == (2, 0) - assert series("3n") == (3, 0) - assert series("n") == (1, 0) - assert series("+n") == (1, 0) - assert series("-n") == (-1, 0) - assert series("5") == (0, 5) - assert series("foo") is None - assert series("n+") is None - - def test_lang(self) -> None: - document = etree.fromstring(XMLLANG_IDS) - sort_key = {el: count for count, el in enumerate(document.iter())}.__getitem__ - css_to_xpath = GenericTranslator().css_to_xpath - - def langid(selector: str) -> list[str]: - xpath = css_to_xpath(selector) - items = typing.cast("list[etree._Element]", document.xpath(xpath)) - items.sort(key=sort_key) - return [element.get("id", "nil") for element in items] - - assert langid(':lang("EN")') == ["first", 
"second", "third", "fourth"] - assert langid(':lang("en-us")') == ["second", "fourth"] - assert langid(":lang(en-nz)") == ["third"] - assert langid(":lang(fr)") == ["fifth"] - assert langid(":lang(ru)") == ["sixth"] - assert langid(":lang('ZH')") == ["eighth"] - assert langid(":lang(de) :lang(zh)") == ["eighth"] - assert langid(":lang(en), :lang(zh)") == [ - "first", - "second", - "third", - "fourth", - "eighth", - ] - assert langid(":lang(es)") == [] - - def test_argument_types(self) -> None: - class CustomTranslator(GenericTranslator): - def __init__(self) -> None: - self.argument_types: list[str] = [] - - def xpath_pseudo_element( - self, xpath: XPathExpr, pseudo_element: PseudoElement - ) -> XPathExpr: - self.argument_types += typing.cast( - "FunctionalPseudoElement", pseudo_element - ).argument_types() - return xpath - - def argument_types(css: str) -> list[str]: - translator = CustomTranslator() - translator.css_to_xpath(css) - return translator.argument_types - - mappings: list[tuple[str, list[str]]] = [ - ("", []), - ("ident", ["IDENT"]), - ('"string"', ["STRING"]), - ("1", ["NUMBER"]), - ] - for argument_string, argument_list in mappings: - css = f"::pseudo_element({argument_string})" - assert argument_types(css) == argument_list - - def test_select(self) -> None: - document = etree.fromstring(HTML_IDS) - sort_key = {el: count for count, el in enumerate(document.iter())}.__getitem__ - css_to_xpath = GenericTranslator().css_to_xpath - html_css_to_xpath = HTMLTranslator().css_to_xpath - - def select_ids(selector: str, html_only: bool) -> list[str]: - xpath = css_to_xpath(selector) - items = typing.cast("list[etree._Element]", document.xpath(xpath)) - if html_only: - assert items == [] - xpath = html_css_to_xpath(selector) - items = typing.cast("list[etree._Element]", document.xpath(xpath)) - items.sort(key=sort_key) - return [element.get("id", "nil") for element in items] - - def pcss(main: str, *selectors: str, **kwargs: bool) -> list[str]: - html_only = 
kwargs.pop("html_only", False) - result = select_ids(main, html_only) - for selector in selectors: - assert select_ids(selector, html_only) == result - return result - - all_ids = pcss("*") - assert all_ids[:6] == [ - "html", - "nil", - "link-href", - "link-nohref", - "nil", - "outer-div", - ] - assert all_ids[-1:] == ["foobar-span"] - assert pcss("div") == ["outer-div", "li-div", "foobar-div"] - assert pcss("DIV", html_only=True) == [ - "outer-div", - "li-div", - "foobar-div", - ] # case-insensitive in HTML - assert pcss("div div") == ["li-div"] - assert pcss("div, div div") == ["outer-div", "li-div", "foobar-div"] - assert pcss("a[name]") == ["name-anchor"] - assert pcss("a[NAme]", html_only=True) == [ - "name-anchor" - ] # case-insensitive in HTML: - assert pcss("a[rel]") == ["tag-anchor", "nofollow-anchor"] - assert pcss('a[rel="tag"]') == ["tag-anchor"] - assert pcss('a[href*="localhost"]') == ["tag-anchor"] - assert pcss('a[href*=""]') == [] - assert pcss('a[href^="http"]') == ["tag-anchor", "nofollow-anchor"] - assert pcss('a[href^="http:"]') == ["tag-anchor"] - assert pcss('a[href^=""]') == [] - assert pcss('a[href$="org"]') == ["nofollow-anchor"] - assert pcss('a[href$=""]') == [] - assert pcss('div[foobar~="bc"]', 'div[foobar~="cde"]') == ["foobar-div"] - assert pcss('[foobar~="ab bc"]', '[foobar~=""]', '[foobar~=" \t"]') == [] - assert pcss('div[foobar~="cd"]') == [] - assert pcss('*[lang|="En"]', '[lang|="En-us"]') == ["second-li"] - # Attribute values are case sensitive - assert pcss('*[lang|="en"]', '[lang|="en-US"]') == [] - assert pcss('*[lang|="e"]') == [] - # ... :lang() is not. 
- assert pcss(':lang("EN")', "*:lang(en-US)", html_only=True) == [ - "second-li", - "li-div", - ] - assert pcss(':lang("e")', html_only=True) == [] - assert pcss(":scope > div") == [] - assert pcss(":scope body") == ["nil"] - assert pcss(":scope body > div") == ["outer-div", "foobar-div"] - assert pcss(":scope head") == ["nil"] - assert pcss(":scope html") == [] - - # --- nth-* and nth-last-* ------------------------------------- - - # select nothing - assert pcss("li:nth-child(-n)") == [] - # select all children - assert pcss("li:nth-child(n)") == [ - "first-li", - "second-li", - "third-li", - "fourth-li", - "fifth-li", - "sixth-li", - "seventh-li", - ] - - assert pcss("li:nth-child(3)", "#first-li ~ :nth-child(3)") == ["third-li"] - assert pcss("li:nth-child(10)") == [] - assert pcss("li:nth-child(2n)", "li:nth-child(even)", "li:nth-child(2n+0)") == [ - "second-li", - "fourth-li", - "sixth-li", - ] - assert pcss("li:nth-child(+2n+1)", "li:nth-child(odd)") == [ - "first-li", - "third-li", - "fifth-li", - "seventh-li", - ] - assert pcss("li:nth-child(2n+4)") == ["fourth-li", "sixth-li"] - assert pcss("li:nth-child(3n+1)") == ["first-li", "fourth-li", "seventh-li"] - assert pcss("li:nth-child(-n+3)") == ["first-li", "second-li", "third-li"] - assert pcss("li:nth-child(-2n+4)") == ["second-li", "fourth-li"] - assert pcss("li:nth-last-child(0)") == [] - assert pcss("li:nth-last-child(1)") == ["seventh-li"] - assert pcss("li:nth-last-child(2n)", "li:nth-last-child(even)") == [ - "second-li", - "fourth-li", - "sixth-li", - ] - assert pcss("li:nth-last-child(2n+1)") == [ - "first-li", - "third-li", - "fifth-li", - "seventh-li", - ] - assert pcss("li:nth-last-child(2n+2)") == ["second-li", "fourth-li", "sixth-li"] - assert pcss("li:nth-last-child(3n+1)") == [ - "first-li", - "fourth-li", - "seventh-li", - ] - assert pcss("ol:first-of-type") == ["first-ol"] - assert pcss("ol:nth-child(1)") == [] - assert pcss("ol:nth-of-type(2)") == ["second-ol"] - assert 
pcss("ol:nth-last-of-type(1)") == ["second-ol"] - - # "+" and "~" tests - assert pcss("ol#first-ol li + li:nth-child(4)") == ["fourth-li"] - assert pcss("li + li:nth-child(1)") == [] - assert pcss("li ~ li:nth-child(2n+1)") == [ - "third-li", - "fifth-li", - "seventh-li", - ] # all but the first - assert pcss("li ~ li:nth-last-child(2n+1)") == [ - "third-li", - "fifth-li", - "seventh-li", - ] # all but the first - - assert pcss("span:only-child") == ["foobar-span"] - assert pcss("li div:only-child") == ["li-div"] - assert pcss("div *:only-child") == ["li-div", "foobar-span"] - with pytest.raises(ExpressionError): - pcss("p *:only-of-type") - assert pcss("p:only-of-type") == ["paragraph"] - assert pcss("a:empty", "a:EMpty") == ["name-anchor"] - assert pcss("li:empty") == ["third-li", "fourth-li", "fifth-li", "sixth-li"] - assert pcss(":root", "html:root") == ["html"] - assert pcss("li:root", "* :root") == [] - assert pcss('*:contains("link")', ':CONtains("link")') == [ - "html", - "nil", - "outer-div", - "tag-anchor", - "nofollow-anchor", - ] - assert pcss('*:contains("LInk")') == [] # case sensitive - assert pcss('*:contains("e")') == [ - "html", - "nil", - "outer-div", - "first-ol", - "first-li", - "paragraph", - "p-em", - ] - assert pcss('*:contains("E")') == [] # case-sensitive - assert pcss(".a", ".b", "*.a", "ol.a") == ["first-ol"] - assert pcss(".c", "*.c") == ["first-ol", "third-li", "fourth-li"] - assert pcss("ol *.c", "ol li.c", "li ~ li.c", "ol > li.c") == [ - "third-li", - "fourth-li", - ] - assert pcss("#first-li", "li#first-li", "*#first-li") == ["first-li"] - assert pcss("li div", "li > div", "div div") == ["li-div"] - assert pcss("div > div") == [] - assert pcss("div>.c", "div > .c") == ["first-ol"] - assert pcss("div + div") == ["foobar-div"] - assert pcss("a ~ a") == ["tag-anchor", "nofollow-anchor"] - assert pcss('a[rel="tag"] ~ a') == ["nofollow-anchor"] - assert pcss("ol#first-ol li:last-child") == ["seventh-li"] - assert pcss("ol#first-ol 
*:last-child") == ["li-div", "seventh-li"] - assert pcss("#outer-div:first-child") == ["outer-div"] - assert pcss("#outer-div :first-child") == [ - "name-anchor", - "first-li", - "li-div", - "p-b", - "checkbox-fieldset-disabled", - "area-href", - ] - assert pcss("a[href]") == ["tag-anchor", "nofollow-anchor"] - assert pcss(":not(*)") == [] - assert pcss("a:not([href])") == ["name-anchor"] - assert pcss("ol :Not(li[class])") == [ - "first-li", - "second-li", - "li-div", - "fifth-li", - "sixth-li", - "seventh-li", - ] - assert pcss("link:has(*)") == [] - assert pcss("ol:has(div)") == ["first-ol"] - assert pcss(":is(#first-li, #second-li)") == ["first-li", "second-li"] - assert pcss("a:is(#name-anchor, #tag-anchor)") == ["name-anchor", "tag-anchor"] - assert pcss(":is(.c)") == ["first-ol", "third-li", "fourth-li"] - assert pcss("ol.a.b.c > li.c:nth-child(3)") == ["third-li"] - - # Invalid characters in XPath element names, should not crash - assert pcss(r"di\a0 v", r"div\[") == [] - assert pcss(r"[h\a0 ref]", r"[h\]ref]") == [] - - # HTML-specific - assert pcss(":link", html_only=True) == [ - "link-href", - "tag-anchor", - "nofollow-anchor", - "area-href", - ] - assert pcss(":visited", html_only=True) == [] - assert pcss(":enabled", html_only=True) == [ - "link-href", - "tag-anchor", - "nofollow-anchor", - "checkbox-unchecked", - "text-checked", - "checkbox-checked", - "area-href", - ] - assert pcss(":disabled", html_only=True) == [ - "checkbox-disabled", - "checkbox-disabled-checked", - "fieldset", - "checkbox-fieldset-disabled", - ] - assert pcss(":checked", html_only=True) == [ - "checkbox-checked", - "checkbox-disabled-checked", - ] - - def test_select_shakespeare(self) -> None: - document = html.document_fromstring(HTML_SHAKESPEARE) - body = typing.cast("list[etree._Element]", document.xpath("//body"))[0] - css_to_xpath = GenericTranslator().css_to_xpath - - basestring_ = (str, bytes) - - def count(selector: str) -> int: - xpath = css_to_xpath(selector) - results 
= typing.cast("list[etree._Element]", body.xpath(xpath)) - assert not isinstance(results, basestring_) - found = set() - for item in results: - assert item not in found - found.add(item) - assert not isinstance(item, basestring_) - return len(results) - - # Data borrowed from http://mootools.net/slickspeed/ - - ## Changed from original; probably because I'm only - ## searching the body. - # assert count('*') == 252 - assert count("*") == 246 - assert count("div:contains(CELIA)") == 26 - assert count("div:only-child") == 22 # ? - assert count("div:nth-child(even)") == 106 - assert count("div:nth-child(2n)") == 106 - assert count("div:nth-child(odd)") == 137 - assert count("div:nth-child(2n+1)") == 137 - assert count("div:nth-child(n)") == 243 - assert count("div:last-child") == 53 - assert count("div:first-child") == 51 - assert count("div > div") == 242 - assert count("div + div") == 190 - assert count("div ~ div") == 190 - assert count("body") == 1 - assert count("body div") == 243 - assert count("div") == 243 - assert count("div div") == 242 - assert count("div div div") == 241 - assert count("div, div, div") == 243 - assert count("div, a, span") == 243 - assert count(".dialog") == 51 - assert count("div.dialog") == 51 - assert count("div .dialog") == 51 - assert count("div.character, div.dialog") == 99 - assert count("div.direction.dialog") == 0 - assert count("div.dialog.direction") == 0 - assert count("div.dialog.scene") == 1 - assert count("div.scene.scene") == 1 - assert count("div.scene .scene") == 0 - assert count("div.direction .dialog ") == 0 - assert count("div .dialog .direction") == 4 - assert count("div.dialog .dialog .direction") == 4 - assert count("#speech5") == 1 - assert count("div#speech5") == 1 - assert count("div #speech5") == 1 - assert count("div.scene div.dialog") == 49 - assert count("div#scene1 div.dialog div") == 142 - assert count("#scene1 #speech1") == 1 - assert count("div[class]") == 103 - assert count("div[class=dialog]") == 50 - 
assert count("div[class^=dia]") == 51 - assert count("div[class$=log]") == 50 - assert count("div[class*=sce]") == 1 - assert count("div[class|=dialog]") == 50 # ? Seems right - assert count("div[class!=madeup]") == 243 # ? Seems right - assert count("div[class~=dialog]") == 51 # ? Seems right - assert count(":scope > div") == 1 - assert count(":scope > div > div[class=dialog]") == 1 - assert count(":scope > div div") == 242 - - -OPERATOR_PRECEDENCE_IDS = """ - - - - - -""" - -XMLLANG_IDS = """ - - a - b - c - d - e - f - - - - -""" - -HTML_IDS = """ - - - - -
- - - - link -
    -
  1. content
  2. -
  3. -
    -
    -
  4. -
  5. -
  6. -
  7. -
  8. -
  9. -
-

- hi there - guy - - - - - - - -

- - -
-

-
    -
- - - - -
-
- -""" - - -HTML_SHAKESPEARE = """ - - - - - - -
-
-

As You Like It

-
- by William Shakespeare -
-
-

ACT I, SCENE III. A room in the palace.

-
-
Enter CELIA and ROSALIND
-
-
CELIA
-
-
Why, cousin! why, Rosalind! Cupid have mercy! not a word?
-
-
ROSALIND
-
-
Not one to throw at a dog.
-
-
CELIA
-
-
No, thy words are too precious to be cast away upon
-
curs; throw some of them at me; come, lame me with reasons.
-
-
ROSALIND
-
CELIA
-
-
But is all this for your father?
-
-
-
Then there were two cousins laid up; when the one
-
should be lamed with reasons and the other mad
-
without any.
-
-
ROSALIND
-
-
No, some of it is for my child's father. O, how
-
full of briers is this working-day world!
-
-
CELIA
-
-
They are but burs, cousin, thrown upon thee in
-
holiday foolery: if we walk not in the trodden
-
paths our very petticoats will catch them.
-
-
ROSALIND
-
-
I could shake them off my coat: these burs are in my heart.
-
-
CELIA
-
-
Hem them away.
-
-
ROSALIND
-
-
I would try, if I could cry 'hem' and have him.
-
-
CELIA
-
-
Come, come, wrestle with thy affections.
-
-
ROSALIND
-
-
O, they take the part of a better wrestler than myself!
-
-
CELIA
-
-
O, a good wish upon you! you will try in time, in
-
despite of a fall. But, turning these jests out of
-
service, let us talk in good earnest: is it
-
possible, on such a sudden, you should fall into so
-
strong a liking with old Sir Rowland's youngest son?
-
-
ROSALIND
-
-
The duke my father loved his father dearly.
-
-
CELIA
-
-
Doth it therefore ensue that you should love his son
-
dearly? By this kind of chase, I should hate him,
-
for my father hated his father dearly; yet I hate
-
not Orlando.
-
-
ROSALIND
-
-
No, faith, hate him not, for my sake.
-
-
CELIA
-
-
Why should I not? doth he not deserve well?
-
-
ROSALIND
-
-
Let me love him for that, and do you love him
-
because I do. Look, here comes the duke.
-
-
CELIA
-
-
With his eyes full of anger.
-
Enter DUKE FREDERICK, with Lords
-
-
DUKE FREDERICK
-
-
Mistress, dispatch you with your safest haste
-
And get you from our court.
-
-
ROSALIND
-
-
Me, uncle?
-
-
DUKE FREDERICK
-
-
You, cousin
-
Within these ten days if that thou be'st found
-
So near our public court as twenty miles,
-
Thou diest for it.
-
-
ROSALIND
-
-
I do beseech your grace,
-
Let me the knowledge of my fault bear with me:
-
If with myself I hold intelligence
-
Or have acquaintance with mine own desires,
-
If that I do not dream or be not frantic,--
-
As I do trust I am not--then, dear uncle,
-
Never so much as in a thought unborn
-
Did I offend your highness.
-
-
DUKE FREDERICK
-
-
Thus do all traitors:
-
If their purgation did consist in words,
-
They are as innocent as grace itself:
-
Let it suffice thee that I trust thee not.
-
-
ROSALIND
-
-
Yet your mistrust cannot make me a traitor:
-
Tell me whereon the likelihood depends.
-
-
DUKE FREDERICK
-
-
Thou art thy father's daughter; there's enough.
-
-
ROSALIND
-
-
So was I when your highness took his dukedom;
-
So was I when your highness banish'd him:
-
Treason is not inherited, my lord;
-
Or, if we did derive it from our friends,
-
What's that to me? my father was no traitor:
-
Then, good my liege, mistake me not so much
-
To think my poverty is treacherous.
-
-
CELIA
-
-
Dear sovereign, hear me speak.
-
-
DUKE FREDERICK
-
-
Ay, Celia; we stay'd her for your sake,
-
Else had she with her father ranged along.
-
-
CELIA
-
-
I did not then entreat to have her stay;
-
It was your pleasure and your own remorse:
-
I was too young that time to value her;
-
But now I know her: if she be a traitor,
-
Why so am I; we still have slept together,
-
Rose at an instant, learn'd, play'd, eat together,
-
And wheresoever we went, like Juno's swans,
-
Still we went coupled and inseparable.
-
-
DUKE FREDERICK
-
-
She is too subtle for thee; and her smoothness,
-
Her very silence and her patience
-
Speak to the people, and they pity her.
-
Thou art a fool: she robs thee of thy name;
-
And thou wilt show more bright and seem more virtuous
-
When she is gone. Then open not thy lips:
-
Firm and irrevocable is my doom
-
Which I have pass'd upon her; she is banish'd.
-
-
CELIA
-
-
Pronounce that sentence then on me, my liege:
-
I cannot live out of her company.
-
-
DUKE FREDERICK
-
-
You are a fool. You, niece, provide yourself:
-
If you outstay the time, upon mine honour,
-
And in the greatness of my word, you die.
-
Exeunt DUKE FREDERICK and Lords
-
-
CELIA
-
-
O my poor Rosalind, whither wilt thou go?
-
Wilt thou change fathers? I will give thee mine.
-
I charge thee, be not thou more grieved than I am.
-
-
ROSALIND
-
-
I have more cause.
-
-
CELIA
-
-
Thou hast not, cousin;
-
Prithee be cheerful: know'st thou not, the duke
-
Hath banish'd me, his daughter?
-
-
ROSALIND
-
-
That he hath not.
-
-
CELIA
-
-
No, hath not? Rosalind lacks then the love
-
Which teacheth thee that thou and I am one:
-
Shall we be sunder'd? shall we part, sweet girl?
-
No: let my father seek another heir.
-
Therefore devise with me how we may fly,
-
Whither to go and what to bear with us;
-
And do not seek to take your change upon you,
-
To bear your griefs yourself and leave me out;
-
For, by this heaven, now at our sorrows pale,
-
Say what thou canst, I'll go along with thee.
-
-
ROSALIND
-
-
Why, whither shall we go?
-
-
CELIA
-
-
To seek my uncle in the forest of Arden.
-
-
ROSALIND
-
-
Alas, what danger will it be to us,
-
Maids as we are, to travel forth so far!
-
Beauty provoketh thieves sooner than gold.
-
-
CELIA
-
-
I'll put myself in poor and mean attire
-
And with a kind of umber smirch my face;
-
The like do you: so shall we pass along
-
And never stir assailants.
-
-
ROSALIND
-
-
Were it not better,
-
Because that I am more than common tall,
-
That I did suit me all points like a man?
-
A gallant curtle-axe upon my thigh,
-
A boar-spear in my hand; and--in my heart
-
Lie there what hidden woman's fear there will--
-
We'll have a swashing and a martial outside,
-
As many other mannish cowards have
-
That do outface it with their semblances.
-
-
CELIA
-
-
What shall I call thee when thou art a man?
-
-
ROSALIND
-
-
I'll have no worse a name than Jove's own page;
-
And therefore look you call me Ganymede.
-
But what will you be call'd?
-
-
CELIA
-
-
Something that hath a reference to my state
-
No longer Celia, but Aliena.
-
-
ROSALIND
-
-
But, cousin, what if we assay'd to steal
-
The clownish fool out of your father's court?
-
Would he not be a comfort to our travel?
-
-
CELIA
-
-
He'll go along o'er the wide world with me;
-
Leave me alone to woo him. Let's away,
-
And get our jewels and our wealth together,
-
Devise the fittest time and safest way
-
To hide us from pursuit that will be made
-
After my flight. Now go we in content
-
To liberty and not to banishment.
-
Exeunt
-
-
-
-
- - -""" - - -if __name__ == "__main__": - unittest.main() diff --git a/tox.ini b/tox.ini index 9ff54cf..ca053d8 100644 --- a/tox.ini +++ b/tox.ini @@ -1,49 +1,10 @@ [tox] -envlist = pre-commit,pylint,py,docs,typing +envlist = py25,py26,py27,py32,py33 [testenv] -deps = - lxml>=4.4 - pytest-cov>=7.0.0 - pytest>=5.4 - sybil -commands = - pytest --cov=cssselect \ - --cov-report=term-missing --cov-report=html --cov-report=xml \ - {posargs: cssselect tests docs} +deps=lxml +commands = python cssselect/tests.py -[testenv:pylint] -deps = - {[testenv]deps} - pylint==4.0.4 -commands = - pylint {posargs: cssselect tests docs} - -[testenv:docs] -changedir = docs -deps = - -r docs/requirements.txt -commands = - sphinx-build -W -b html . {envtmpdir}/html - -[testenv:typing] -deps = - {[testenv]deps} - mypy==1.19.1 - types-lxml==2026.1.1 -commands = - mypy {posargs: cssselect tests} - -[testenv:pre-commit] -deps = pre-commit -commands = pre-commit run --all-files --show-diff-on-failure -skip_install = true - -[testenv:twinecheck] -basepython = python3 -deps = - twine==6.2.0 - build==1.4.0 -commands = - python -m build --sdist - twine check dist/* +[testenv:py25] +setenv = + PIP_INSECURE = 1