html-text
Advanced tools
+152
| [tool.bumpversion] | ||
| current_version = "0.7.0" | ||
| commit = true | ||
| tag = true | ||
| tag_name = "{new_version}" | ||
| [[tool.bumpversion.files]] | ||
| filename = "setup.py" | ||
| search = "version=\"{current_version}\"" | ||
| replace = "version=\"{new_version}\"" | ||
| [[tool.bumpversion.files]] | ||
| filename = "html_text/__init__.py" | ||
| search = "__version__ = \"{current_version}\"" | ||
| replace = "__version__ = \"{new_version}\"" | ||
| [tool.coverage.run] | ||
| branch = true | ||
| [tool.coverage.report] | ||
| exclude_also = [ | ||
| "if TYPE_CHECKING:", | ||
| ] | ||
| [[tool.mypy.overrides]] | ||
| module = "tests.*" | ||
| check_untyped_defs = true | ||
| allow_untyped_defs = true | ||
| [tool.ruff.lint] | ||
| extend-select = [ | ||
| # flake8-bugbear | ||
| "B", | ||
| # flake8-comprehensions | ||
| "C4", | ||
| # pydocstyle | ||
| "D", | ||
| # flake8-future-annotations | ||
| "FA", | ||
| # flynt | ||
| "FLY", | ||
| # refurb | ||
| "FURB", | ||
| # isort | ||
| "I", | ||
| # flake8-implicit-str-concat | ||
| "ISC", | ||
| # flake8-logging | ||
| "LOG", | ||
| # Perflint | ||
| "PERF", | ||
| # pygrep-hooks | ||
| "PGH", | ||
| # flake8-pie | ||
| "PIE", | ||
| # pylint | ||
| "PL", | ||
| # flake8-pytest-style | ||
| "PT", | ||
| # flake8-use-pathlib | ||
| "PTH", | ||
| # flake8-pyi | ||
| "PYI", | ||
| # flake8-quotes | ||
| "Q", | ||
| # flake8-return | ||
| "RET", | ||
| # flake8-raise | ||
| "RSE", | ||
| # Ruff-specific rules | ||
| "RUF", | ||
| # flake8-bandit | ||
| "S", | ||
| # flake8-simplify | ||
| "SIM", | ||
| # flake8-slots | ||
| "SLOT", | ||
| # flake8-debugger | ||
| "T10", | ||
| # flake8-type-checking | ||
| "TC", | ||
| # pyupgrade | ||
| "UP", | ||
| # pycodestyle warnings | ||
| "W", | ||
| # flake8-2020 | ||
| "YTT", | ||
| ] | ||
| ignore = [ | ||
| # Missing docstring in public module | ||
| "D100", | ||
| # Missing docstring in public class | ||
| "D101", | ||
| # Missing docstring in public method | ||
| "D102", | ||
| # Missing docstring in public function | ||
| "D103", | ||
| # Missing docstring in public package | ||
| "D104", | ||
| # Missing docstring in magic method | ||
| "D105", | ||
| # Missing docstring in public nested class | ||
| "D106", | ||
| # Missing docstring in __init__ | ||
| "D107", | ||
| # One-line docstring should fit on one line with quotes | ||
| "D200", | ||
| # No blank lines allowed after function docstring | ||
| "D202", | ||
| # 1 blank line required between summary line and description | ||
| "D205", | ||
| # Multi-line docstring closing quotes should be on a separate line | ||
| "D209", | ||
| # First line should end with a period | ||
| "D400", | ||
| # First line should be in imperative mood; try rephrasing | ||
| "D401", | ||
| # First line should not be the function's "signature" | ||
| "D402", | ||
| # First word of the first line should be properly capitalized | ||
| "D403", | ||
| # No blank lines allowed between a section header and its content | ||
| "D412", | ||
| # Too many return statements | ||
| "PLR0911", | ||
| # Too many branches | ||
| "PLR0912", | ||
| # Too many arguments in function definition | ||
| "PLR0913", | ||
| # Too many statements | ||
| "PLR0915", | ||
| # Magic value used in comparison | ||
| "PLR2004", | ||
| # String contains ambiguous {}. | ||
| "RUF001", | ||
| # Docstring contains ambiguous {}. | ||
| "RUF002", | ||
| # Comment contains ambiguous {}. | ||
| "RUF003", | ||
| # Mutable class attributes should be annotated with `typing.ClassVar` | ||
| "RUF012", | ||
| # Use of `assert` detected | ||
| "S101", | ||
| # Using lxml to parse untrusted data is known to be vulnerable to XML attacks | ||
| "S320", | ||
| ] | ||
| [tool.ruff.lint.per-file-ignores] | ||
| "html_text/__init__.py" = ["F401"] | ||
| [tool.ruff.lint.pydocstyle] | ||
| convention = "pep257" |
+7
-0
@@ -5,2 +5,9 @@ ======= | ||
| 0.7.0 (2025-02-10) | ||
| ------------------ | ||
| * Removed support for Python 3.8. | ||
| * Added support for Python 3.13. | ||
| * Added type hints and ``py.typed``. | ||
| * CI improvements. | ||
| 0.6.2 (2024-05-01) | ||
@@ -7,0 +14,0 @@ ------------------ |
@@ -1,4 +0,4 @@ | ||
| Metadata-Version: 2.1 | ||
| Metadata-Version: 2.2 | ||
| Name: html_text | ||
| Version: 0.6.2 | ||
| Version: 0.7.0 | ||
| Summary: Extract text from HTML | ||
@@ -14,3 +14,2 @@ Home-page: https://github.com/zytedata/html-text | ||
| Classifier: Programming Language :: Python :: 3 | ||
| Classifier: Programming Language :: Python :: 3.8 | ||
| Classifier: Programming Language :: Python :: 3.9 | ||
@@ -20,5 +19,16 @@ Classifier: Programming Language :: Python :: 3.10 | ||
| Classifier: Programming Language :: Python :: 3.12 | ||
| Classifier: Programming Language :: Python :: 3.13 | ||
| Description-Content-Type: text/x-rst | ||
| License-File: LICENSE | ||
| Requires-Dist: lxml | ||
| Requires-Dist: lxml-html-clean | ||
| Dynamic: author | ||
| Dynamic: author-email | ||
| Dynamic: classifier | ||
| Dynamic: description | ||
| Dynamic: description-content-type | ||
| Dynamic: home-page | ||
| Dynamic: license | ||
| Dynamic: requires-dist | ||
| Dynamic: summary | ||
@@ -160,2 +170,9 @@ ============ | ||
| 0.7.0 (2025-02-10) | ||
| ------------------ | ||
| * Removed support for Python 3.8. | ||
| * Added support for Python 3.13. | ||
| * Added type hints and ``py.typed``. | ||
| * CI improvements. | ||
| 0.6.2 (2024-05-01) | ||
@@ -162,0 +179,0 @@ ------------------ |
@@ -5,6 +5,7 @@ CHANGES.rst | ||
| README.rst | ||
| setup.cfg | ||
| pyproject.toml | ||
| setup.py | ||
| html_text/__init__.py | ||
| html_text/html_text.py | ||
| html_text/py.typed | ||
| html_text.egg-info/PKG-INFO | ||
@@ -11,0 +12,0 @@ html_text.egg-info/SOURCES.txt |
@@ -1,6 +0,12 @@ | ||
| # -*- coding: utf-8 -*- | ||
| __version__ = '0.6.2' | ||
| __version__ = "0.7.0" | ||
| from .html_text import (etree_to_text, extract_text, selector_to_text, | ||
| parse_html, cleaned_selector, cleaner, | ||
| NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS) | ||
| from .html_text import ( | ||
| DOUBLE_NEWLINE_TAGS, | ||
| NEWLINE_TAGS, | ||
| cleaned_selector, | ||
| cleaner, | ||
| etree_to_text, | ||
| extract_text, | ||
| parse_html, | ||
| selector_to_text, | ||
| ) |
+122
-78
@@ -1,3 +0,5 @@ | ||
| # -*- coding: utf-8 -*- | ||
| from __future__ import annotations | ||
| import re | ||
| from typing import TYPE_CHECKING | ||
@@ -8,13 +10,49 @@ import lxml | ||
| if TYPE_CHECKING: | ||
| from collections.abc import Iterable | ||
| NEWLINE_TAGS = frozenset([ | ||
| 'article', 'aside', 'br', 'dd', 'details', 'div', 'dt', 'fieldset', | ||
| 'figcaption', 'footer', 'form', 'header', 'hr', 'legend', 'li', 'main', | ||
| 'nav', 'table', 'tr' | ||
| ]) | ||
| DOUBLE_NEWLINE_TAGS = frozenset([ | ||
| 'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', | ||
| 'p', 'pre', 'title', 'ul' | ||
| ]) | ||
| import parsel | ||
| NEWLINE_TAGS: frozenset[str] = frozenset( | ||
| [ | ||
| "article", | ||
| "aside", | ||
| "br", | ||
| "dd", | ||
| "details", | ||
| "div", | ||
| "dt", | ||
| "fieldset", | ||
| "figcaption", | ||
| "footer", | ||
| "form", | ||
| "header", | ||
| "hr", | ||
| "legend", | ||
| "li", | ||
| "main", | ||
| "nav", | ||
| "table", | ||
| "tr", | ||
| ] | ||
| ) | ||
| DOUBLE_NEWLINE_TAGS: frozenset[str] = frozenset( | ||
| [ | ||
| "blockquote", | ||
| "dl", | ||
| "figure", | ||
| "h1", | ||
| "h2", | ||
| "h3", | ||
| "h4", | ||
| "h5", | ||
| "h6", | ||
| "ol", | ||
| "p", | ||
| "pre", | ||
| "title", | ||
| "ul", | ||
| ] | ||
| ) | ||
| cleaner = Cleaner( | ||
@@ -38,7 +76,4 @@ scripts=True, | ||
| def _cleaned_html_tree(html): | ||
| if isinstance(html, lxml.html.HtmlElement): | ||
| tree = html | ||
| else: | ||
| tree = parse_html(html) | ||
| def _cleaned_html_tree(html: lxml.html.HtmlElement | str) -> lxml.html.HtmlElement: | ||
| tree = html if isinstance(html, lxml.html.HtmlElement) else parse_html(html) | ||
@@ -54,29 +89,31 @@ # we need this as https://bugs.launchpad.net/lxml/+bug/1838497 | ||
| def parse_html(html): | ||
| """ Create an lxml.html.HtmlElement from a string with html. | ||
| def parse_html(html: str) -> lxml.html.HtmlElement: | ||
| """Create an lxml.html.HtmlElement from a string with html. | ||
| XXX: mostly copy-pasted from parsel.selector.create_root_node | ||
| """ | ||
| body = html.strip().replace('\x00', '').encode('utf8') or b'<html/>' | ||
| parser = lxml.html.HTMLParser(recover=True, encoding='utf8') | ||
| body = html.strip().replace("\x00", "").encode("utf-8") or b"<html/>" | ||
| parser = lxml.html.HTMLParser(recover=True, encoding="utf-8") | ||
| root = lxml.etree.fromstring(body, parser=parser) | ||
| if root is None: | ||
| root = lxml.etree.fromstring(b'<html/>', parser=parser) | ||
| root = lxml.etree.fromstring(b"<html/>", parser=parser) | ||
| return root | ||
| _whitespace = re.compile(r'\s+') | ||
| _has_trailing_whitespace = re.compile(r'\s$').search | ||
| _whitespace = re.compile(r"\s+") | ||
| _has_trailing_whitespace = re.compile(r"\s$").search | ||
| _has_punct_after = re.compile(r'^[,:;.!?")]').search | ||
| _has_open_bracket_before = re.compile(r'\($').search | ||
| _has_open_bracket_before = re.compile(r"\($").search | ||
| def _normalize_whitespace(text): | ||
| return _whitespace.sub(' ', text.strip()) | ||
| def _normalize_whitespace(text: str) -> str: | ||
| return _whitespace.sub(" ", text.strip()) | ||
| def etree_to_text(tree, | ||
| guess_punct_space=True, | ||
| guess_layout=True, | ||
| newline_tags=NEWLINE_TAGS, | ||
| double_newline_tags=DOUBLE_NEWLINE_TAGS): | ||
| def etree_to_text( | ||
| tree: lxml.html.HtmlElement, | ||
| guess_punct_space: bool = True, | ||
| guess_layout: bool = True, | ||
| newline_tags: Iterable[str] = NEWLINE_TAGS, | ||
| double_newline_tags: Iterable[str] = DOUBLE_NEWLINE_TAGS, | ||
| ) -> str: | ||
| """ | ||
@@ -96,4 +133,4 @@ Convert a html tree to text. Tree should be cleaned with | ||
| def should_add_space(text): | ||
| """ Return True if extra whitespace should be added before text """ | ||
| def should_add_space(text: str) -> bool: | ||
| """Return True if extra whitespace should be added before text""" | ||
| if prev in {_NEWLINE, _DOUBLE_NEWLINE}: | ||
@@ -103,13 +140,14 @@ return False | ||
| return True | ||
| if not _has_trailing_whitespace(prev): | ||
| if _has_punct_after(text) or _has_open_bracket_before(prev): | ||
| return False | ||
| return True | ||
| assert isinstance(prev, str) | ||
| return bool( | ||
| _has_trailing_whitespace(prev) | ||
| or (not _has_punct_after(text) and not _has_open_bracket_before(prev)) | ||
| ) | ||
| def get_space_between(text): | ||
| def get_space_between(text: str) -> str: | ||
| if not text: | ||
| return ' ' | ||
| return ' ' if should_add_space(text) else '' | ||
| return " " | ||
| return " " if should_add_space(text) else "" | ||
| def add_newlines(tag): | ||
| def add_newlines(tag: str) -> None: | ||
| nonlocal prev | ||
@@ -121,12 +159,12 @@ if not guess_layout: | ||
| if tag in double_newline_tags: | ||
| chunks.append('\n' if prev is _NEWLINE else '\n\n') | ||
| chunks.append("\n" if prev is _NEWLINE else "\n\n") | ||
| prev = _DOUBLE_NEWLINE | ||
| elif tag in newline_tags: | ||
| if prev is not _NEWLINE: | ||
| chunks.append('\n') | ||
| chunks.append("\n") | ||
| prev = _NEWLINE | ||
| def add_text(text_content): | ||
| def add_text(text_content: str | None) -> None: | ||
| nonlocal prev | ||
| text = _normalize_whitespace(text_content) if text_content else '' | ||
| text = _normalize_whitespace(text_content) if text_content else "" | ||
| if not text: | ||
@@ -139,7 +177,9 @@ return | ||
| # Extract text from the ``tree``: fill ``chunks`` variable | ||
| for event, el in lxml.etree.iterwalk(tree, events=('start', 'end')): | ||
| if event == 'start': | ||
| for event, el in lxml.etree.iterwalk(tree, events=("start", "end")): | ||
| if event == "start": | ||
| assert isinstance(el.tag, str) | ||
| add_newlines(el.tag) | ||
| add_text(el.text) | ||
| elif event == 'end': | ||
| elif event == "end": | ||
| assert isinstance(el.tag, str) | ||
| add_newlines(el.tag) | ||
@@ -149,7 +189,11 @@ if el is not tree: | ||
| return ''.join(chunks).strip() | ||
| return "".join(chunks).strip() | ||
| def selector_to_text(sel, guess_punct_space=True, guess_layout=True): | ||
| """ Convert a cleaned parsel.Selector to text. | ||
| def selector_to_text( | ||
| sel: parsel.Selector | parsel.SelectorList[parsel.Selector], | ||
| guess_punct_space: bool = True, | ||
| guess_layout: bool = True, | ||
| ) -> str: | ||
| """Convert a cleaned parsel.Selector to text. | ||
| See html_text.extract_text docstring for description of the approach | ||
@@ -159,2 +203,3 @@ and options. | ||
| import parsel | ||
| if isinstance(sel, parsel.SelectorList): | ||
@@ -165,27 +210,27 @@ # if selecting a specific xpath | ||
| extracted = etree_to_text( | ||
| s.root, | ||
| guess_punct_space=guess_punct_space, | ||
| guess_layout=guess_layout) | ||
| s.root, guess_punct_space=guess_punct_space, guess_layout=guess_layout | ||
| ) | ||
| if extracted: | ||
| text.append(extracted) | ||
| return ' '.join(text) | ||
| else: | ||
| return etree_to_text( | ||
| sel.root, | ||
| guess_punct_space=guess_punct_space, | ||
| guess_layout=guess_layout) | ||
| return " ".join(text) | ||
| return etree_to_text( | ||
| sel.root, guess_punct_space=guess_punct_space, guess_layout=guess_layout | ||
| ) | ||
| def cleaned_selector(html): | ||
| """ Clean parsel.selector. | ||
| """ | ||
| def cleaned_selector(html: lxml.html.HtmlElement | str) -> parsel.Selector: | ||
| """Clean parsel.selector.""" | ||
| import parsel | ||
| try: | ||
| tree = _cleaned_html_tree(html) | ||
| sel = parsel.Selector(root=tree, type='html') | ||
| except (lxml.etree.XMLSyntaxError, | ||
| lxml.etree.ParseError, | ||
| lxml.etree.ParserError, | ||
| UnicodeEncodeError): | ||
| sel = parsel.Selector(root=tree, type="html") | ||
| except ( | ||
| lxml.etree.XMLSyntaxError, | ||
| lxml.etree.ParseError, | ||
| lxml.etree.ParserError, | ||
| UnicodeEncodeError, | ||
| ): | ||
| # likely plain text | ||
| assert isinstance(html, str) | ||
| sel = parsel.Selector(html) | ||
@@ -195,7 +240,9 @@ return sel | ||
| def extract_text(html, | ||
| guess_punct_space=True, | ||
| guess_layout=True, | ||
| newline_tags=NEWLINE_TAGS, | ||
| double_newline_tags=DOUBLE_NEWLINE_TAGS): | ||
| def extract_text( | ||
| html: lxml.html.HtmlElement | str | None, | ||
| guess_punct_space: bool = True, | ||
| guess_layout: bool = True, | ||
| newline_tags: Iterable[str] = NEWLINE_TAGS, | ||
| double_newline_tags: Iterable[str] = DOUBLE_NEWLINE_TAGS, | ||
| ) -> str: | ||
| """ | ||
@@ -227,9 +274,6 @@ Convert html to text, cleaning invisible content such as styles. | ||
| if html is None: | ||
| return '' | ||
| no_content_nodes = ( | ||
| lxml.html.HtmlComment, | ||
| lxml.html.HtmlProcessingInstruction | ||
| ) | ||
| return "" | ||
| no_content_nodes = (lxml.html.HtmlComment, lxml.html.HtmlProcessingInstruction) | ||
| if isinstance(html, no_content_nodes): | ||
| return '' | ||
| return "" | ||
| cleaned = _cleaned_html_tree(html) | ||
@@ -236,0 +280,0 @@ return etree_to_text( |
+0
-4
@@ -1,3 +0,1 @@ | ||
| include CONTRIBUTING.rst | ||
| include CHANGES.rst | ||
@@ -10,3 +8,1 @@ include LICENSE | ||
| recursive-exclude * *.py[co] | ||
| recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif |
+20
-3
@@ -1,4 +0,4 @@ | ||
| Metadata-Version: 2.1 | ||
| Metadata-Version: 2.2 | ||
| Name: html_text | ||
| Version: 0.6.2 | ||
| Version: 0.7.0 | ||
| Summary: Extract text from HTML | ||
@@ -14,3 +14,2 @@ Home-page: https://github.com/zytedata/html-text | ||
| Classifier: Programming Language :: Python :: 3 | ||
| Classifier: Programming Language :: Python :: 3.8 | ||
| Classifier: Programming Language :: Python :: 3.9 | ||
@@ -20,5 +19,16 @@ Classifier: Programming Language :: Python :: 3.10 | ||
| Classifier: Programming Language :: Python :: 3.12 | ||
| Classifier: Programming Language :: Python :: 3.13 | ||
| Description-Content-Type: text/x-rst | ||
| License-File: LICENSE | ||
| Requires-Dist: lxml | ||
| Requires-Dist: lxml-html-clean | ||
| Dynamic: author | ||
| Dynamic: author-email | ||
| Dynamic: classifier | ||
| Dynamic: description | ||
| Dynamic: description-content-type | ||
| Dynamic: home-page | ||
| Dynamic: license | ||
| Dynamic: requires-dist | ||
| Dynamic: summary | ||
@@ -160,2 +170,9 @@ ============ | ||
| 0.7.0 (2025-02-10) | ||
| ------------------ | ||
| * Removed support for Python 3.8. | ||
| * Added support for Python 3.13. | ||
| * Added type hints and ``py.typed``. | ||
| * CI improvements. | ||
| 0.6.2 (2024-05-01) | ||
@@ -162,0 +179,0 @@ ------------------ |
+0
-17
@@ -1,18 +0,1 @@ | ||
| [bumpversion] | ||
| current_version = 0.6.2 | ||
| commit = True | ||
| tag = True | ||
| tag_name = {new_version} | ||
| [bumpversion:file:setup.py] | ||
| search = version='{current_version}' | ||
| replace = version='{new_version}' | ||
| [bumpversion:file:html_text/__init__.py] | ||
| search = __version__ = '{current_version}' | ||
| replace = __version__ = '{new_version}' | ||
| [bdist_wheel] | ||
| universal = 1 | ||
| [egg_info] | ||
@@ -19,0 +2,0 @@ tag_build = |
+25
-26
| #!/usr/bin/env python | ||
| # -*- coding: utf-8 -*- | ||
| from pathlib import Path | ||
| from setuptools import setup | ||
| with open('README.rst') as readme_file: | ||
| readme = readme_file.read() | ||
| readme = Path("README.rst").read_text(encoding="utf-8") | ||
| history = Path("CHANGES.rst").read_text(encoding="utf-8") | ||
| with open('CHANGES.rst') as history_file: | ||
| history = history_file.read() | ||
| setup( | ||
| name='html_text', | ||
| version='0.6.2', | ||
| name="html_text", | ||
| version="0.7.0", | ||
| description="Extract text from HTML", | ||
| long_description=readme + '\n\n' + history, | ||
| long_description=readme + "\n\n" + history, | ||
| long_description_content_type="text/x-rst", | ||
| author="Konstantin Lopukhin", | ||
| author_email='kostia.lopuhin@gmail.com', | ||
| url='https://github.com/zytedata/html-text', | ||
| packages=['html_text'], | ||
| author_email="kostia.lopuhin@gmail.com", | ||
| url="https://github.com/zytedata/html-text", | ||
| packages=["html_text"], | ||
| package_data={ | ||
| "html_text": ["py.typed"], | ||
| }, | ||
| include_package_data=True, | ||
| install_requires=[ | ||
| 'lxml', | ||
| 'lxml-html-clean', | ||
| "lxml", | ||
| "lxml-html-clean", | ||
| ], | ||
@@ -30,15 +31,13 @@ license="MIT license", | ||
| classifiers=[ | ||
| 'Development Status :: 4 - Beta', | ||
| 'Intended Audience :: Developers', | ||
| 'License :: OSI Approved :: MIT License', | ||
| 'Natural Language :: English', | ||
| 'Programming Language :: Python :: 3', | ||
| 'Programming Language :: Python :: 3.8', | ||
| 'Programming Language :: Python :: 3.9', | ||
| 'Programming Language :: Python :: 3.10', | ||
| 'Programming Language :: Python :: 3.11', | ||
| 'Programming Language :: Python :: 3.12', | ||
| "Development Status :: 4 - Beta", | ||
| "Intended Audience :: Developers", | ||
| "License :: OSI Approved :: MIT License", | ||
| "Natural Language :: English", | ||
| "Programming Language :: Python :: 3", | ||
| "Programming Language :: Python :: 3.9", | ||
| "Programming Language :: Python :: 3.10", | ||
| "Programming Language :: Python :: 3.11", | ||
| "Programming Language :: Python :: 3.12", | ||
| "Programming Language :: Python :: 3.13", | ||
| ], | ||
| test_suite='tests', | ||
| tests_require=['pytest'], | ||
| ) |
+132
-109
@@ -1,22 +0,30 @@ | ||
| # -*- coding: utf-8 -*- | ||
| import glob | ||
| import os | ||
| from __future__ import annotations | ||
| from pathlib import Path | ||
| import lxml.html | ||
| import pytest | ||
| from html_text import (extract_text, parse_html, cleaned_selector, | ||
| etree_to_text, cleaner, selector_to_text, NEWLINE_TAGS, | ||
| DOUBLE_NEWLINE_TAGS) | ||
| from html_text import ( | ||
| DOUBLE_NEWLINE_TAGS, | ||
| NEWLINE_TAGS, | ||
| cleaned_selector, | ||
| cleaner, | ||
| etree_to_text, | ||
| extract_text, | ||
| parse_html, | ||
| selector_to_text, | ||
| ) | ||
| ROOT = Path(__file__).parent | ||
| ROOT = os.path.dirname(os.path.abspath(__file__)) | ||
| @pytest.fixture(params=[ | ||
| {'guess_punct_space': True, 'guess_layout': False}, | ||
| {'guess_punct_space': False, 'guess_layout': False}, | ||
| {'guess_punct_space': True, 'guess_layout': True}, | ||
| {'guess_punct_space': False, 'guess_layout': True} | ||
| ]) | ||
| @pytest.fixture( | ||
| params=[ | ||
| {"guess_punct_space": True, "guess_layout": False}, | ||
| {"guess_punct_space": False, "guess_layout": False}, | ||
| {"guess_punct_space": True, "guess_layout": True}, | ||
| {"guess_punct_space": False, "guess_layout": True}, | ||
| ] | ||
| ) | ||
| def all_options(request): | ||
@@ -27,29 +35,32 @@ return request.param | ||
| def test_extract_no_text_html(all_options): | ||
| html = (u'<!DOCTYPE html><html><body><p><video width="320" height="240" ' | ||
| 'controls><source src="movie.mp4" type="video/mp4"><source ' | ||
| 'src="movie.ogg" type="video/ogg"></video></p></body></html>') | ||
| assert extract_text(html, **all_options) == u'' | ||
| html = ( | ||
| '<!DOCTYPE html><html><body><p><video width="320" height="240" ' | ||
| 'controls><source src="movie.mp4" type="video/mp4"><source ' | ||
| 'src="movie.ogg" type="video/ogg"></video></p></body></html>' | ||
| ) | ||
| assert extract_text(html, **all_options) == "" | ||
| def test_extract_text(all_options): | ||
| html = (u'<html><style>.div {}</style>' | ||
| '<body><p>Hello, world!</body></html>') | ||
| assert extract_text(html, **all_options) == u'Hello, world!' | ||
| html = "<html><style>.div {}</style><body><p>Hello, world!</body></html>" | ||
| assert extract_text(html, **all_options) == "Hello, world!" | ||
| def test_declared_encoding(all_options): | ||
| html = (u'<?xml version="1.0" encoding="utf-8" ?>' | ||
| u'<html><style>.div {}</style>' | ||
| u'<body>Hello, world!</p></body></html>') | ||
| assert extract_text(html, **all_options) == u'Hello, world!' | ||
| html = ( | ||
| '<?xml version="1.0" encoding="utf-8" ?>' | ||
| "<html><style>.div {}</style>" | ||
| "<body>Hello, world!</p></body></html>" | ||
| ) | ||
| assert extract_text(html, **all_options) == "Hello, world!" | ||
| def test_empty(all_options): | ||
| assert extract_text(u'', **all_options) == '' | ||
| assert extract_text(u' ', **all_options) == '' | ||
| assert extract_text(None, **all_options) == '' | ||
| assert extract_text("", **all_options) == "" | ||
| assert extract_text(" ", **all_options) == "" | ||
| assert extract_text(None, **all_options) == "" | ||
| def test_comment(all_options): | ||
| assert extract_text(u"<!-- hello world -->", **all_options) == '' | ||
| assert extract_text("<!-- hello world -->", **all_options) == "" | ||
@@ -59,7 +70,7 @@ | ||
| node = lxml.html.fragment_fromstring("<!-- hello world -->") | ||
| assert extract_text(node, **all_options) == '' | ||
| assert extract_text(node, **all_options) == "" | ||
| def test_processing_instruction(all_options): | ||
| assert extract_text('<?dbfo label-width="width"?>', **all_options) == '' | ||
| assert extract_text('<?dbfo label-width="width"?>', **all_options) == "" | ||
@@ -69,23 +80,21 @@ | ||
| node = lxml.html.fragment_fromstring('<?dbfo label-width="width"?>') | ||
| assert extract_text(node, **all_options) == '' | ||
| assert extract_text(node, **all_options) == "" | ||
| def test_extract_text_from_tree(all_options): | ||
| html = (u'<html><style>.div {}</style>' | ||
| '<body><p>Hello, world!</body></html>') | ||
| html = "<html><style>.div {}</style><body><p>Hello, world!</body></html>" | ||
| tree = parse_html(html) | ||
| assert extract_text(tree, **all_options) == u'Hello, world!' | ||
| assert extract_text(tree, **all_options) == "Hello, world!" | ||
| def test_extract_text_from_node(all_options): | ||
| html = (u'<html><style>.div {}</style>' | ||
| '<body><p>Hello, world!</p></body></html>') | ||
| html = "<html><style>.div {}</style><body><p>Hello, world!</p></body></html>" | ||
| tree = parse_html(html) | ||
| node = tree.xpath('//p')[0] | ||
| assert extract_text(node, **all_options) == u'Hello, world!' | ||
| node = tree.xpath("//p")[0] | ||
| assert extract_text(node, **all_options) == "Hello, world!" | ||
| def test_inline_tags_whitespace(all_options): | ||
| html = u'<span>field</span><span>value of</span><span></span>' | ||
| assert extract_text(html, **all_options) == u'field value of' | ||
| html = "<span>field</span><span>value of</span><span></span>" | ||
| assert extract_text(html, **all_options) == "field value of" | ||
@@ -96,17 +105,19 @@ | ||
| tree = parse_html(html) | ||
| node = tree.xpath('/html/frameset')[0] | ||
| assert extract_text(node) == u'' | ||
| node = tree.xpath("/html/frameset")[0] | ||
| assert extract_text(node) == "" | ||
| def test_punct_whitespace(): | ||
| html = u'<div><span>field</span>, and more</div>' | ||
| assert extract_text(html, guess_punct_space=False) == u'field , and more' | ||
| assert extract_text(html, guess_punct_space=True) == u'field, and more' | ||
| html = "<div><span>field</span>, and more</div>" | ||
| assert extract_text(html, guess_punct_space=False) == "field , and more" | ||
| assert extract_text(html, guess_punct_space=True) == "field, and more" | ||
| def test_punct_whitespace_preserved(): | ||
| html = (u'<div><span>по</span><span>ле</span>, and , ' | ||
| u'<span>more </span>!<span>now</div>a (<b>boo</b>)') | ||
| html = ( | ||
| "<div><span>по</span><span>ле</span>, and , " | ||
| "<span>more </span>!<span>now</div>a (<b>boo</b>)" | ||
| ) | ||
| text = extract_text(html, guess_punct_space=True, guess_layout=False) | ||
| assert text == u'по ле, and , more ! now a (boo)' | ||
| assert text == "по ле, and , more ! now a (boo)" | ||
@@ -116,12 +127,14 @@ | ||
| def test_bad_punct_whitespace(): | ||
| html = (u'<pre><span>trees</span> ' | ||
| '<span>=</span> <span>webstruct</span>' | ||
| '<span>.</span><span>load_trees</span>' | ||
| '<span>(</span><span>"train/*.html"</span>' | ||
| '<span>)</span></pre>') | ||
| html = ( | ||
| "<pre><span>trees</span> " | ||
| "<span>=</span> <span>webstruct</span>" | ||
| "<span>.</span><span>load_trees</span>" | ||
| "<span>(</span><span>&quot;train/*.html&quot;</span>" | ||
| "<span>)</span></pre>" | ||
| ) | ||
| text = extract_text(html, guess_punct_space=False, guess_layout=False) | ||
| assert text == u'trees = webstruct . load_trees ( "train/*.html" )' | ||
| assert text == 'trees = webstruct . load_trees ( "train/*.html" )' | ||
| text = extract_text(html, guess_punct_space=True, guess_layout=False) | ||
| assert text == u'trees = webstruct.load_trees("train/*.html")' | ||
| assert text == 'trees = webstruct.load_trees("train/*.html")' | ||
@@ -131,17 +144,21 @@ | ||
| pytest.importorskip("parsel") | ||
| html = (u'<span><span id="extract-me">text<a>more</a>' | ||
| '</span>and more text <a> and some more</a> <a></a> </span>') | ||
| html = ( | ||
| '<span><span id="extract-me">text<a>more</a>' | ||
| "</span>and more text <a> and some more</a> <a></a> </span>" | ||
| ) | ||
| # Selector | ||
| sel = cleaned_selector(html) | ||
| assert selector_to_text(sel, **all_options) == 'text more and more text and some more' | ||
| assert ( | ||
| selector_to_text(sel, **all_options) == "text more and more text and some more" | ||
| ) | ||
| # SelectorList | ||
| subsel = sel.xpath('//span[@id="extract-me"]') | ||
| assert selector_to_text(subsel, **all_options) == 'text more' | ||
| subsel = sel.xpath('//a') | ||
| assert selector_to_text(subsel, **all_options) == 'more and some more' | ||
| assert selector_to_text(subsel, **all_options) == "text more" | ||
| subsel = sel.xpath("//a") | ||
| assert selector_to_text(subsel, **all_options) == "more and some more" | ||
| subsel = sel.xpath('//a[@id="extract-me"]') | ||
| assert selector_to_text(subsel, **all_options) == '' | ||
| subsel = sel.xpath('//foo') | ||
| assert selector_to_text(subsel, **all_options) == '' | ||
| assert selector_to_text(subsel, **all_options) == "" | ||
| subsel = sel.xpath("//foo") | ||
| assert selector_to_text(subsel, **all_options) == "" | ||
@@ -155,22 +172,32 @@ | ||
| def test_guess_layout(): | ||
| html = (u'<title> title </title><div>text_1.<p>text_2 text_3</p>' | ||
| '<p id="demo"></p><ul><li>text_4</li><li>text_5</li></ul>' | ||
| '<p>text_6<em>text_7</em>text_8</p>text_9</div>' | ||
| '<script>document.getElementById("demo").innerHTML = ' | ||
| '"This should be skipped";</script> <p>...text_10</p>') | ||
| html = ( | ||
| "<title> title </title><div>text_1.<p>text_2 text_3</p>" | ||
| '<p id="demo"></p><ul><li>text_4</li><li>text_5</li></ul>' | ||
| "<p>text_6<em>text_7</em>text_8</p>text_9</div>" | ||
| '<script>document.getElementById("demo").innerHTML = ' | ||
| '"This should be skipped";</script> <p>...text_10</p>' | ||
| ) | ||
| text = 'title text_1. text_2 text_3 text_4 text_5 text_6 text_7 ' \ | ||
| 'text_8 text_9 ...text_10' | ||
| text = ( | ||
| "title text_1. text_2 text_3 text_4 text_5 text_6 text_7 " | ||
| "text_8 text_9 ...text_10" | ||
| ) | ||
| assert extract_text(html, guess_punct_space=False, guess_layout=False) == text | ||
| text = ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' | ||
| '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10') | ||
| text = ( | ||
| "title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5" | ||
| "\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10" | ||
| ) | ||
| assert extract_text(html, guess_punct_space=False, guess_layout=True) == text | ||
| text = 'title text_1. text_2 text_3 text_4 text_5 text_6 text_7 ' \ | ||
| 'text_8 text_9...text_10' | ||
| text = ( | ||
| "title text_1. text_2 text_3 text_4 text_5 text_6 text_7 " | ||
| "text_8 text_9...text_10" | ||
| ) | ||
| assert extract_text(html, guess_punct_space=True, guess_layout=False) == text | ||
| text = 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5\n\n' \ | ||
| 'text_6 text_7 text_8\n\ntext_9\n\n...text_10' | ||
| text = ( | ||
| "title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5\n\n" | ||
| "text_6 text_7 text_8\n\ntext_9\n\n...text_10" | ||
| ) | ||
| assert extract_text(html, guess_punct_space=True, guess_layout=True) == text | ||
@@ -180,42 +207,39 @@ | ||
| def test_basic_newline(): | ||
| html = u'<div>a</div><div>b</div>' | ||
| assert extract_text(html, guess_punct_space=False, guess_layout=False) == 'a b' | ||
| assert extract_text(html, guess_punct_space=False, guess_layout=True) == 'a\nb' | ||
| assert extract_text(html, guess_punct_space=True, guess_layout=False) == 'a b' | ||
| assert extract_text(html, guess_punct_space=True, guess_layout=True) == 'a\nb' | ||
| html = "<div>a</div><div>b</div>" | ||
| assert extract_text(html, guess_punct_space=False, guess_layout=False) == "a b" | ||
| assert extract_text(html, guess_punct_space=False, guess_layout=True) == "a\nb" | ||
| assert extract_text(html, guess_punct_space=True, guess_layout=False) == "a b" | ||
| assert extract_text(html, guess_punct_space=True, guess_layout=True) == "a\nb" | ||
| def test_adjust_newline(): | ||
| html = u'<div>text 1</div><p><div>text 2</div></p>' | ||
| assert extract_text(html, guess_layout=True) == 'text 1\n\ntext 2' | ||
| html = "<div>text 1</div><p><div>text 2</div></p>" | ||
| assert extract_text(html, guess_layout=True) == "text 1\n\ntext 2" | ||
| def test_personalize_newlines_sets(): | ||
| html = (u'<span><span>text<a>more</a>' | ||
| '</span>and more text <a> and some more</a> <a></a> </span>') | ||
| html = ( | ||
| "<span><span>text<a>more</a>" | ||
| "</span>and more text <a> and some more</a> <a></a> </span>" | ||
| ) | ||
| text = extract_text(html, guess_layout=True, | ||
| newline_tags=NEWLINE_TAGS | {'a'}) | ||
| assert text == 'text\nmore\nand more text\nand some more' | ||
| text = extract_text(html, guess_layout=True, newline_tags=NEWLINE_TAGS | {"a"}) | ||
| assert text == "text\nmore\nand more text\nand some more" | ||
| text = extract_text(html, guess_layout=True, | ||
| double_newline_tags=DOUBLE_NEWLINE_TAGS | {'a'}) | ||
| assert text == 'text\n\nmore\n\nand more text\n\nand some more' | ||
| text = extract_text( | ||
| html, guess_layout=True, double_newline_tags=DOUBLE_NEWLINE_TAGS | {"a"} | ||
| ) | ||
| assert text == "text\n\nmore\n\nand more text\n\nand some more" | ||
| def _webpage_paths(): | ||
| webpages = sorted(glob.glob(os.path.join(ROOT, 'test_webpages', '*.html'))) | ||
| extracted = sorted(glob.glob(os.path.join(ROOT, 'test_webpages','*.txt'))) | ||
| def _webpage_paths() -> list[tuple[Path, Path]]: | ||
| webpages = sorted((ROOT / "test_webpages").glob("*.html")) | ||
| extracted = sorted((ROOT / "test_webpages").glob("*.txt")) | ||
| return list(zip(webpages, extracted)) | ||
| def _load_file(path): | ||
| with open(path, 'rb') as f: | ||
| return f.read().decode('utf8') | ||
| @pytest.mark.parametrize(['page', 'extracted'], _webpage_paths()) | ||
| @pytest.mark.parametrize(("page", "extracted"), _webpage_paths()) | ||
| def test_webpages(page, extracted): | ||
| html = _load_file(page) | ||
| expected = _load_file(extracted) | ||
| html = page.read_text(encoding="utf-8") | ||
| expected = extracted.read_text(encoding="utf-8") | ||
| assert extract_text(html) == expected | ||
@@ -228,4 +252,3 @@ | ||
| def test_deep_html(): | ||
| """ Make sure we don't crash due to recursion limit. | ||
| """ | ||
| """Make sure we don't crash due to recursion limit.""" | ||
| # Build a deep tree manually as default parser would only allow | ||
@@ -236,4 +259,4 @@ # for 255 depth, but deeper trees are possible with other parsers | ||
| for _ in range(n): | ||
| el = lxml.html.Element('div') | ||
| el.text = 'foo' | ||
| el = lxml.html.Element("div") | ||
| el.text = "foo" | ||
| if parent is None: | ||
@@ -246,2 +269,2 @@ root = el | ||
| assert extract_text(root) == ('foo\n' * n).strip() | ||
| assert extract_text(root) == ("foo\n" * n).strip() |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
206354
2.26%29
7.41%478
17.16%