Latest Threat Research: SANDWORM_MODE: Shai-Hulud-Style npm Worm Hijacks CI Workflows and Poisons AI Toolchains. Details
Socket
Book a Demo | Sign in
Socket

html-text

Package Overview
Dependencies
Maintainers
3
Versions
15
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

html-text - pypi Package Compare versions

Comparing version
0.6.2
to
0.7.0
html_text/py.typed
+152
[tool.bumpversion]
current_version = "0.7.0"
commit = true
tag = true
tag_name = "{new_version}"
[[tool.bumpversion.files]]
filename = "setup.py"
search = "version=\"{current_version}\""
replace = "version=\"{new_version}\""
[[tool.bumpversion.files]]
filename = "html_text/__init__.py"
search = "__version__ = \"{current_version}\""
replace = "__version__ = \"{new_version}\""
[tool.coverage.run]
branch = true
[tool.coverage.report]
exclude_also = [
"if TYPE_CHECKING:",
]
[[tool.mypy.overrides]]
module = "tests.*"
check_untyped_defs = true
allow_untyped_defs = true
[tool.ruff.lint]
extend-select = [
# flake8-bugbear
"B",
# flake8-comprehensions
"C4",
# pydocstyle
"D",
# flake8-future-annotations
"FA",
# flynt
"FLY",
# refurb
"FURB",
# isort
"I",
# flake8-implicit-str-concat
"ISC",
# flake8-logging
"LOG",
# Perflint
"PERF",
# pygrep-hooks
"PGH",
# flake8-pie
"PIE",
# pylint
"PL",
# flake8-pytest-style
"PT",
# flake8-use-pathlib
"PTH",
# flake8-pyi
"PYI",
# flake8-quotes
"Q",
# flake8-return
"RET",
# flake8-raise
"RSE",
# Ruff-specific rules
"RUF",
# flake8-bandit
"S",
# flake8-simplify
"SIM",
# flake8-slots
"SLOT",
# flake8-debugger
"T10",
# flake8-type-checking
"TC",
# pyupgrade
"UP",
# pycodestyle warnings
"W",
# flake8-2020
"YTT",
]
ignore = [
# Missing docstring in public module
"D100",
# Missing docstring in public class
"D101",
# Missing docstring in public method
"D102",
# Missing docstring in public function
"D103",
# Missing docstring in public package
"D104",
# Missing docstring in magic method
"D105",
# Missing docstring in public nested class
"D106",
# Missing docstring in __init__
"D107",
# One-line docstring should fit on one line with quotes
"D200",
# No blank lines allowed after function docstring
"D202",
# 1 blank line required between summary line and description
"D205",
# Multi-line docstring closing quotes should be on a separate line
"D209",
# First line should end with a period
"D400",
# First line should be in imperative mood; try rephrasing
"D401",
# First line should not be the function's "signature"
"D402",
# First word of the first line should be properly capitalized
"D403",
# No blank lines allowed between a section header and its content
"D412",
# Too many return statements
"PLR0911",
# Too many branches
"PLR0912",
# Too many arguments in function definition
"PLR0913",
# Too many statements
"PLR0915",
# Magic value used in comparison
"PLR2004",
# String contains ambiguous {}.
"RUF001",
# Docstring contains ambiguous {}.
"RUF002",
# Comment contains ambiguous {}.
"RUF003",
# Mutable class attributes should be annotated with `typing.ClassVar`
"RUF012",
# Use of `assert` detected
"S101",
# Using lxml to parse untrusted data is known to be vulnerable to XML attacks
"S320",
]
[tool.ruff.lint.per-file-ignores]
"html_text/__init__.py" = ["F401"]
[tool.ruff.lint.pydocstyle]
convention = "pep257"
+7
-0

@@ -5,2 +5,9 @@ =======

0.7.0 (2025-02-10)
------------------
* Removed support for Python 3.8.
* Added support for Python 3.13.
* Added type hints and ``py.typed``.
* CI improvements.
0.6.2 (2024-05-01)

@@ -7,0 +14,0 @@ ------------------

+20
-3

@@ -1,4 +0,4 @@

Metadata-Version: 2.1
Metadata-Version: 2.2
Name: html_text
Version: 0.6.2
Version: 0.7.0
Summary: Extract text from HTML

@@ -14,3 +14,2 @@ Home-page: https://github.com/zytedata/html-text

Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9

@@ -20,5 +19,16 @@ Classifier: Programming Language :: Python :: 3.10

Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Description-Content-Type: text/x-rst
License-File: LICENSE
Requires-Dist: lxml
Requires-Dist: lxml-html-clean
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: license
Dynamic: requires-dist
Dynamic: summary

@@ -160,2 +170,9 @@ ============

0.7.0 (2025-02-10)
------------------
* Removed support for Python 3.8.
* Added support for Python 3.13.
* Added type hints and ``py.typed``.
* CI improvements.
0.6.2 (2024-05-01)

@@ -162,0 +179,0 @@ ------------------

+2
-1

@@ -5,6 +5,7 @@ CHANGES.rst

README.rst
setup.cfg
pyproject.toml
setup.py
html_text/__init__.py
html_text/html_text.py
html_text/py.typed
html_text.egg-info/PKG-INFO

@@ -11,0 +12,0 @@ html_text.egg-info/SOURCES.txt

@@ -1,6 +0,12 @@

# -*- coding: utf-8 -*-
__version__ = '0.6.2'
__version__ = "0.7.0"
from .html_text import (etree_to_text, extract_text, selector_to_text,
parse_html, cleaned_selector, cleaner,
NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
from .html_text import (
DOUBLE_NEWLINE_TAGS,
NEWLINE_TAGS,
cleaned_selector,
cleaner,
etree_to_text,
extract_text,
parse_html,
selector_to_text,
)

@@ -1,3 +0,5 @@

# -*- coding: utf-8 -*-
from __future__ import annotations
import re
from typing import TYPE_CHECKING

@@ -8,13 +10,49 @@ import lxml

if TYPE_CHECKING:
from collections.abc import Iterable
NEWLINE_TAGS = frozenset([
'article', 'aside', 'br', 'dd', 'details', 'div', 'dt', 'fieldset',
'figcaption', 'footer', 'form', 'header', 'hr', 'legend', 'li', 'main',
'nav', 'table', 'tr'
])
DOUBLE_NEWLINE_TAGS = frozenset([
'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol',
'p', 'pre', 'title', 'ul'
])
import parsel
NEWLINE_TAGS: frozenset[str] = frozenset(
[
"article",
"aside",
"br",
"dd",
"details",
"div",
"dt",
"fieldset",
"figcaption",
"footer",
"form",
"header",
"hr",
"legend",
"li",
"main",
"nav",
"table",
"tr",
]
)
DOUBLE_NEWLINE_TAGS: frozenset[str] = frozenset(
[
"blockquote",
"dl",
"figure",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"ol",
"p",
"pre",
"title",
"ul",
]
)
cleaner = Cleaner(

@@ -38,7 +76,4 @@ scripts=True,

def _cleaned_html_tree(html):
if isinstance(html, lxml.html.HtmlElement):
tree = html
else:
tree = parse_html(html)
def _cleaned_html_tree(html: lxml.html.HtmlElement | str) -> lxml.html.HtmlElement:
tree = html if isinstance(html, lxml.html.HtmlElement) else parse_html(html)

@@ -54,29 +89,31 @@ # we need this as https://bugs.launchpad.net/lxml/+bug/1838497

def parse_html(html):
""" Create an lxml.html.HtmlElement from a string with html.
def parse_html(html: str) -> lxml.html.HtmlElement:
"""Create an lxml.html.HtmlElement from a string with html.
XXX: mostly copy-pasted from parsel.selector.create_root_node
"""
body = html.strip().replace('\x00', '').encode('utf8') or b'<html/>'
parser = lxml.html.HTMLParser(recover=True, encoding='utf8')
body = html.strip().replace("\x00", "").encode("utf-8") or b"<html/>"
parser = lxml.html.HTMLParser(recover=True, encoding="utf-8")
root = lxml.etree.fromstring(body, parser=parser)
if root is None:
root = lxml.etree.fromstring(b'<html/>', parser=parser)
root = lxml.etree.fromstring(b"<html/>", parser=parser)
return root
_whitespace = re.compile(r'\s+')
_has_trailing_whitespace = re.compile(r'\s$').search
_whitespace = re.compile(r"\s+")
_has_trailing_whitespace = re.compile(r"\s$").search
_has_punct_after = re.compile(r'^[,:;.!?")]').search
_has_open_bracket_before = re.compile(r'\($').search
_has_open_bracket_before = re.compile(r"\($").search
def _normalize_whitespace(text):
return _whitespace.sub(' ', text.strip())
def _normalize_whitespace(text: str) -> str:
return _whitespace.sub(" ", text.strip())
def etree_to_text(tree,
guess_punct_space=True,
guess_layout=True,
newline_tags=NEWLINE_TAGS,
double_newline_tags=DOUBLE_NEWLINE_TAGS):
def etree_to_text(
tree: lxml.html.HtmlElement,
guess_punct_space: bool = True,
guess_layout: bool = True,
newline_tags: Iterable[str] = NEWLINE_TAGS,
double_newline_tags: Iterable[str] = DOUBLE_NEWLINE_TAGS,
) -> str:
"""

@@ -96,4 +133,4 @@ Convert a html tree to text. Tree should be cleaned with

def should_add_space(text):
""" Return True if extra whitespace should be added before text """
def should_add_space(text: str) -> bool:
"""Return True if extra whitespace should be added before text"""
if prev in {_NEWLINE, _DOUBLE_NEWLINE}:

@@ -103,13 +140,14 @@ return False

return True
if not _has_trailing_whitespace(prev):
if _has_punct_after(text) or _has_open_bracket_before(prev):
return False
return True
assert isinstance(prev, str)
return bool(
_has_trailing_whitespace(prev)
or (not _has_punct_after(text) and not _has_open_bracket_before(prev))
)
def get_space_between(text):
def get_space_between(text: str) -> str:
if not text:
return ' '
return ' ' if should_add_space(text) else ''
return " "
return " " if should_add_space(text) else ""
def add_newlines(tag):
def add_newlines(tag: str) -> None:
nonlocal prev

@@ -121,12 +159,12 @@ if not guess_layout:

if tag in double_newline_tags:
chunks.append('\n' if prev is _NEWLINE else '\n\n')
chunks.append("\n" if prev is _NEWLINE else "\n\n")
prev = _DOUBLE_NEWLINE
elif tag in newline_tags:
if prev is not _NEWLINE:
chunks.append('\n')
chunks.append("\n")
prev = _NEWLINE
def add_text(text_content):
def add_text(text_content: str | None) -> None:
nonlocal prev
text = _normalize_whitespace(text_content) if text_content else ''
text = _normalize_whitespace(text_content) if text_content else ""
if not text:

@@ -139,7 +177,9 @@ return

# Extract text from the ``tree``: fill ``chunks`` variable
for event, el in lxml.etree.iterwalk(tree, events=('start', 'end')):
if event == 'start':
for event, el in lxml.etree.iterwalk(tree, events=("start", "end")):
if event == "start":
assert isinstance(el.tag, str)
add_newlines(el.tag)
add_text(el.text)
elif event == 'end':
elif event == "end":
assert isinstance(el.tag, str)
add_newlines(el.tag)

@@ -149,7 +189,11 @@ if el is not tree:

return ''.join(chunks).strip()
return "".join(chunks).strip()
def selector_to_text(sel, guess_punct_space=True, guess_layout=True):
""" Convert a cleaned parsel.Selector to text.
def selector_to_text(
sel: parsel.Selector | parsel.SelectorList[parsel.Selector],
guess_punct_space: bool = True,
guess_layout: bool = True,
) -> str:
"""Convert a cleaned parsel.Selector to text.
See html_text.extract_text docstring for description of the approach

@@ -159,2 +203,3 @@ and options.

import parsel
if isinstance(sel, parsel.SelectorList):

@@ -165,27 +210,27 @@ # if selecting a specific xpath

extracted = etree_to_text(
s.root,
guess_punct_space=guess_punct_space,
guess_layout=guess_layout)
s.root, guess_punct_space=guess_punct_space, guess_layout=guess_layout
)
if extracted:
text.append(extracted)
return ' '.join(text)
else:
return etree_to_text(
sel.root,
guess_punct_space=guess_punct_space,
guess_layout=guess_layout)
return " ".join(text)
return etree_to_text(
sel.root, guess_punct_space=guess_punct_space, guess_layout=guess_layout
)
def cleaned_selector(html):
""" Clean parsel.selector.
"""
def cleaned_selector(html: lxml.html.HtmlElement | str) -> parsel.Selector:
"""Clean parsel.selector."""
import parsel
try:
tree = _cleaned_html_tree(html)
sel = parsel.Selector(root=tree, type='html')
except (lxml.etree.XMLSyntaxError,
lxml.etree.ParseError,
lxml.etree.ParserError,
UnicodeEncodeError):
sel = parsel.Selector(root=tree, type="html")
except (
lxml.etree.XMLSyntaxError,
lxml.etree.ParseError,
lxml.etree.ParserError,
UnicodeEncodeError,
):
# likely plain text
assert isinstance(html, str)
sel = parsel.Selector(html)

@@ -195,7 +240,9 @@ return sel

def extract_text(html,
guess_punct_space=True,
guess_layout=True,
newline_tags=NEWLINE_TAGS,
double_newline_tags=DOUBLE_NEWLINE_TAGS):
def extract_text(
html: lxml.html.HtmlElement | str | None,
guess_punct_space: bool = True,
guess_layout: bool = True,
newline_tags: Iterable[str] = NEWLINE_TAGS,
double_newline_tags: Iterable[str] = DOUBLE_NEWLINE_TAGS,
) -> str:
"""

@@ -227,9 +274,6 @@ Convert html to text, cleaning invisible content such as styles.

if html is None:
return ''
no_content_nodes = (
lxml.html.HtmlComment,
lxml.html.HtmlProcessingInstruction
)
return ""
no_content_nodes = (lxml.html.HtmlComment, lxml.html.HtmlProcessingInstruction)
if isinstance(html, no_content_nodes):
return ''
return ""
cleaned = _cleaned_html_tree(html)

@@ -236,0 +280,0 @@ return etree_to_text(

@@ -1,3 +0,1 @@

include CONTRIBUTING.rst
include CHANGES.rst

@@ -10,3 +8,1 @@ include LICENSE

recursive-exclude * *.py[co]
recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif

@@ -1,4 +0,4 @@

Metadata-Version: 2.1
Metadata-Version: 2.2
Name: html_text
Version: 0.6.2
Version: 0.7.0
Summary: Extract text from HTML

@@ -14,3 +14,2 @@ Home-page: https://github.com/zytedata/html-text

Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9

@@ -20,5 +19,16 @@ Classifier: Programming Language :: Python :: 3.10

Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Description-Content-Type: text/x-rst
License-File: LICENSE
Requires-Dist: lxml
Requires-Dist: lxml-html-clean
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: license
Dynamic: requires-dist
Dynamic: summary

@@ -160,2 +170,9 @@ ============

0.7.0 (2025-02-10)
------------------
* Removed support for Python 3.8.
* Added support for Python 3.13.
* Added type hints and ``py.typed``.
* CI improvements.
0.6.2 (2024-05-01)

@@ -162,0 +179,0 @@ ------------------

@@ -1,18 +0,1 @@

[bumpversion]
current_version = 0.6.2
commit = True
tag = True
tag_name = {new_version}
[bumpversion:file:setup.py]
search = version='{current_version}'
replace = version='{new_version}'
[bumpversion:file:html_text/__init__.py]
search = __version__ = '{current_version}'
replace = __version__ = '{new_version}'
[bdist_wheel]
universal = 1
[egg_info]

@@ -19,0 +2,0 @@ tag_build =

+25
-26
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pathlib import Path
from setuptools import setup
with open('README.rst') as readme_file:
readme = readme_file.read()
readme = Path("README.rst").read_text(encoding="utf-8")
history = Path("CHANGES.rst").read_text(encoding="utf-8")
with open('CHANGES.rst') as history_file:
history = history_file.read()
setup(
name='html_text',
version='0.6.2',
name="html_text",
version="0.7.0",
description="Extract text from HTML",
long_description=readme + '\n\n' + history,
long_description=readme + "\n\n" + history,
long_description_content_type="text/x-rst",
author="Konstantin Lopukhin",
author_email='kostia.lopuhin@gmail.com',
url='https://github.com/zytedata/html-text',
packages=['html_text'],
author_email="kostia.lopuhin@gmail.com",
url="https://github.com/zytedata/html-text",
packages=["html_text"],
package_data={
"html_text": ["py.typed"],
},
include_package_data=True,
install_requires=[
'lxml',
'lxml-html-clean',
"lxml",
"lxml-html-clean",
],

@@ -30,15 +31,13 @@ license="MIT license",

classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Developers',
'License :: OSI Approved :: MIT License',
'Natural Language :: English',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: 3.12',
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
],
test_suite='tests',
tests_require=['pytest'],
)

@@ -1,22 +0,30 @@

# -*- coding: utf-8 -*-
import glob
import os
from __future__ import annotations
from pathlib import Path
import lxml.html
import pytest
from html_text import (extract_text, parse_html, cleaned_selector,
etree_to_text, cleaner, selector_to_text, NEWLINE_TAGS,
DOUBLE_NEWLINE_TAGS)
from html_text import (
DOUBLE_NEWLINE_TAGS,
NEWLINE_TAGS,
cleaned_selector,
cleaner,
etree_to_text,
extract_text,
parse_html,
selector_to_text,
)
ROOT = Path(__file__).parent
ROOT = os.path.dirname(os.path.abspath(__file__))
@pytest.fixture(params=[
{'guess_punct_space': True, 'guess_layout': False},
{'guess_punct_space': False, 'guess_layout': False},
{'guess_punct_space': True, 'guess_layout': True},
{'guess_punct_space': False, 'guess_layout': True}
])
@pytest.fixture(
params=[
{"guess_punct_space": True, "guess_layout": False},
{"guess_punct_space": False, "guess_layout": False},
{"guess_punct_space": True, "guess_layout": True},
{"guess_punct_space": False, "guess_layout": True},
]
)
def all_options(request):

@@ -27,29 +35,32 @@ return request.param

def test_extract_no_text_html(all_options):
html = (u'<!DOCTYPE html><html><body><p><video width="320" height="240" '
'controls><source src="movie.mp4" type="video/mp4"><source '
'src="movie.ogg" type="video/ogg"></video></p></body></html>')
assert extract_text(html, **all_options) == u''
html = (
'<!DOCTYPE html><html><body><p><video width="320" height="240" '
'controls><source src="movie.mp4" type="video/mp4"><source '
'src="movie.ogg" type="video/ogg"></video></p></body></html>'
)
assert extract_text(html, **all_options) == ""
def test_extract_text(all_options):
html = (u'<html><style>.div {}</style>'
'<body><p>Hello, world!</body></html>')
assert extract_text(html, **all_options) == u'Hello, world!'
html = "<html><style>.div {}</style><body><p>Hello, world!</body></html>"
assert extract_text(html, **all_options) == "Hello, world!"
def test_declared_encoding(all_options):
html = (u'<?xml version="1.0" encoding="utf-8" ?>'
u'<html><style>.div {}</style>'
u'<body>Hello, world!</p></body></html>')
assert extract_text(html, **all_options) == u'Hello, world!'
html = (
'<?xml version="1.0" encoding="utf-8" ?>'
"<html><style>.div {}</style>"
"<body>Hello, world!</p></body></html>"
)
assert extract_text(html, **all_options) == "Hello, world!"
def test_empty(all_options):
assert extract_text(u'', **all_options) == ''
assert extract_text(u' ', **all_options) == ''
assert extract_text(None, **all_options) == ''
assert extract_text("", **all_options) == ""
assert extract_text(" ", **all_options) == ""
assert extract_text(None, **all_options) == ""
def test_comment(all_options):
assert extract_text(u"<!-- hello world -->", **all_options) == ''
assert extract_text("<!-- hello world -->", **all_options) == ""

@@ -59,7 +70,7 @@

node = lxml.html.fragment_fromstring("<!-- hello world -->")
assert extract_text(node, **all_options) == ''
assert extract_text(node, **all_options) == ""
def test_processing_instruction(all_options):
assert extract_text('<?dbfo label-width="width"?>', **all_options) == ''
assert extract_text('<?dbfo label-width="width"?>', **all_options) == ""

@@ -69,23 +80,21 @@

node = lxml.html.fragment_fromstring('<?dbfo label-width="width"?>')
assert extract_text(node, **all_options) == ''
assert extract_text(node, **all_options) == ""
def test_extract_text_from_tree(all_options):
html = (u'<html><style>.div {}</style>'
'<body><p>Hello, world!</body></html>')
html = "<html><style>.div {}</style><body><p>Hello, world!</body></html>"
tree = parse_html(html)
assert extract_text(tree, **all_options) == u'Hello, world!'
assert extract_text(tree, **all_options) == "Hello, world!"
def test_extract_text_from_node(all_options):
html = (u'<html><style>.div {}</style>'
'<body><p>Hello, world!</p></body></html>')
html = "<html><style>.div {}</style><body><p>Hello, world!</p></body></html>"
tree = parse_html(html)
node = tree.xpath('//p')[0]
assert extract_text(node, **all_options) == u'Hello, world!'
node = tree.xpath("//p")[0]
assert extract_text(node, **all_options) == "Hello, world!"
def test_inline_tags_whitespace(all_options):
html = u'<span>field</span><span>value of</span><span></span>'
assert extract_text(html, **all_options) == u'field value of'
html = "<span>field</span><span>value of</span><span></span>"
assert extract_text(html, **all_options) == "field value of"

@@ -96,17 +105,19 @@

tree = parse_html(html)
node = tree.xpath('/html/frameset')[0]
assert extract_text(node) == u''
node = tree.xpath("/html/frameset")[0]
assert extract_text(node) == ""
def test_punct_whitespace():
html = u'<div><span>field</span>, and more</div>'
assert extract_text(html, guess_punct_space=False) == u'field , and more'
assert extract_text(html, guess_punct_space=True) == u'field, and more'
html = "<div><span>field</span>, and more</div>"
assert extract_text(html, guess_punct_space=False) == "field , and more"
assert extract_text(html, guess_punct_space=True) == "field, and more"
def test_punct_whitespace_preserved():
html = (u'<div><span>по</span><span>ле</span>, and , '
u'<span>more </span>!<span>now</div>a (<b>boo</b>)')
html = (
"<div><span>по</span><span>ле</span>, and , "
"<span>more </span>!<span>now</div>a (<b>boo</b>)"
)
text = extract_text(html, guess_punct_space=True, guess_layout=False)
assert text == u'по ле, and , more ! now a (boo)'
assert text == "по ле, and , more ! now a (boo)"

@@ -116,12 +127,14 @@

def test_bad_punct_whitespace():
html = (u'<pre><span>trees</span> '
'<span>=</span> <span>webstruct</span>'
'<span>.</span><span>load_trees</span>'
'<span>(</span><span>&quot;train/*.html&quot;</span>'
'<span>)</span></pre>')
html = (
"<pre><span>trees</span> "
"<span>=</span> <span>webstruct</span>"
"<span>.</span><span>load_trees</span>"
"<span>(</span><span>&quot;train/*.html&quot;</span>"
"<span>)</span></pre>"
)
text = extract_text(html, guess_punct_space=False, guess_layout=False)
assert text == u'trees = webstruct . load_trees ( "train/*.html" )'
assert text == 'trees = webstruct . load_trees ( "train/*.html" )'
text = extract_text(html, guess_punct_space=True, guess_layout=False)
assert text == u'trees = webstruct.load_trees("train/*.html")'
assert text == 'trees = webstruct.load_trees("train/*.html")'

@@ -131,17 +144,21 @@

pytest.importorskip("parsel")
html = (u'<span><span id="extract-me">text<a>more</a>'
'</span>and more text <a> and some more</a> <a></a> </span>')
html = (
'<span><span id="extract-me">text<a>more</a>'
"</span>and more text <a> and some more</a> <a></a> </span>"
)
# Selector
sel = cleaned_selector(html)
assert selector_to_text(sel, **all_options) == 'text more and more text and some more'
assert (
selector_to_text(sel, **all_options) == "text more and more text and some more"
)
# SelectorList
subsel = sel.xpath('//span[@id="extract-me"]')
assert selector_to_text(subsel, **all_options) == 'text more'
subsel = sel.xpath('//a')
assert selector_to_text(subsel, **all_options) == 'more and some more'
assert selector_to_text(subsel, **all_options) == "text more"
subsel = sel.xpath("//a")
assert selector_to_text(subsel, **all_options) == "more and some more"
subsel = sel.xpath('//a[@id="extract-me"]')
assert selector_to_text(subsel, **all_options) == ''
subsel = sel.xpath('//foo')
assert selector_to_text(subsel, **all_options) == ''
assert selector_to_text(subsel, **all_options) == ""
subsel = sel.xpath("//foo")
assert selector_to_text(subsel, **all_options) == ""

@@ -155,22 +172,32 @@

def test_guess_layout():
html = (u'<title> title </title><div>text_1.<p>text_2 text_3</p>'
'<p id="demo"></p><ul><li>text_4</li><li>text_5</li></ul>'
'<p>text_6<em>text_7</em>text_8</p>text_9</div>'
'<script>document.getElementById("demo").innerHTML = '
'"This should be skipped";</script> <p>...text_10</p>')
html = (
"<title> title </title><div>text_1.<p>text_2 text_3</p>"
'<p id="demo"></p><ul><li>text_4</li><li>text_5</li></ul>'
"<p>text_6<em>text_7</em>text_8</p>text_9</div>"
'<script>document.getElementById("demo").innerHTML = '
'"This should be skipped";</script> <p>...text_10</p>'
)
text = 'title text_1. text_2 text_3 text_4 text_5 text_6 text_7 ' \
'text_8 text_9 ...text_10'
text = (
"title text_1. text_2 text_3 text_4 text_5 text_6 text_7 "
"text_8 text_9 ...text_10"
)
assert extract_text(html, guess_punct_space=False, guess_layout=False) == text
text = ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5'
'\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')
text = (
"title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5"
"\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10"
)
assert extract_text(html, guess_punct_space=False, guess_layout=True) == text
text = 'title text_1. text_2 text_3 text_4 text_5 text_6 text_7 ' \
'text_8 text_9...text_10'
text = (
"title text_1. text_2 text_3 text_4 text_5 text_6 text_7 "
"text_8 text_9...text_10"
)
assert extract_text(html, guess_punct_space=True, guess_layout=False) == text
text = 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5\n\n' \
'text_6 text_7 text_8\n\ntext_9\n\n...text_10'
text = (
"title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5\n\n"
"text_6 text_7 text_8\n\ntext_9\n\n...text_10"
)
assert extract_text(html, guess_punct_space=True, guess_layout=True) == text

@@ -180,42 +207,39 @@

def test_basic_newline():
html = u'<div>a</div><div>b</div>'
assert extract_text(html, guess_punct_space=False, guess_layout=False) == 'a b'
assert extract_text(html, guess_punct_space=False, guess_layout=True) == 'a\nb'
assert extract_text(html, guess_punct_space=True, guess_layout=False) == 'a b'
assert extract_text(html, guess_punct_space=True, guess_layout=True) == 'a\nb'
html = "<div>a</div><div>b</div>"
assert extract_text(html, guess_punct_space=False, guess_layout=False) == "a b"
assert extract_text(html, guess_punct_space=False, guess_layout=True) == "a\nb"
assert extract_text(html, guess_punct_space=True, guess_layout=False) == "a b"
assert extract_text(html, guess_punct_space=True, guess_layout=True) == "a\nb"
def test_adjust_newline():
html = u'<div>text 1</div><p><div>text 2</div></p>'
assert extract_text(html, guess_layout=True) == 'text 1\n\ntext 2'
html = "<div>text 1</div><p><div>text 2</div></p>"
assert extract_text(html, guess_layout=True) == "text 1\n\ntext 2"
def test_personalize_newlines_sets():
html = (u'<span><span>text<a>more</a>'
'</span>and more text <a> and some more</a> <a></a> </span>')
html = (
"<span><span>text<a>more</a>"
"</span>and more text <a> and some more</a> <a></a> </span>"
)
text = extract_text(html, guess_layout=True,
newline_tags=NEWLINE_TAGS | {'a'})
assert text == 'text\nmore\nand more text\nand some more'
text = extract_text(html, guess_layout=True, newline_tags=NEWLINE_TAGS | {"a"})
assert text == "text\nmore\nand more text\nand some more"
text = extract_text(html, guess_layout=True,
double_newline_tags=DOUBLE_NEWLINE_TAGS | {'a'})
assert text == 'text\n\nmore\n\nand more text\n\nand some more'
text = extract_text(
html, guess_layout=True, double_newline_tags=DOUBLE_NEWLINE_TAGS | {"a"}
)
assert text == "text\n\nmore\n\nand more text\n\nand some more"
def _webpage_paths():
webpages = sorted(glob.glob(os.path.join(ROOT, 'test_webpages', '*.html')))
extracted = sorted(glob.glob(os.path.join(ROOT, 'test_webpages','*.txt')))
def _webpage_paths() -> list[tuple[Path, Path]]:
webpages = sorted((ROOT / "test_webpages").glob("*.html"))
extracted = sorted((ROOT / "test_webpages").glob("*.txt"))
return list(zip(webpages, extracted))
def _load_file(path):
with open(path, 'rb') as f:
return f.read().decode('utf8')
@pytest.mark.parametrize(['page', 'extracted'], _webpage_paths())
@pytest.mark.parametrize(("page", "extracted"), _webpage_paths())
def test_webpages(page, extracted):
html = _load_file(page)
expected = _load_file(extracted)
html = page.read_text(encoding="utf-8")
expected = extracted.read_text(encoding="utf-8")
assert extract_text(html) == expected

@@ -228,4 +252,3 @@

def test_deep_html():
""" Make sure we don't crash due to recursion limit.
"""
"""Make sure we don't crash due to recursion limit."""
# Build a deep tree manually as default parser would only allow

@@ -236,4 +259,4 @@ # for 255 depth, but deeper trees are possible with other parsers

for _ in range(n):
el = lxml.html.Element('div')
el.text = 'foo'
el = lxml.html.Element("div")
el.text = "foo"
if parent is None:

@@ -246,2 +269,2 @@ root = el

assert extract_text(root) == ('foo\n' * n).strip()
assert extract_text(root) == ("foo\n" * n).strip()