html-text - PyPI Package Compare versions

html_text/py.typed

+152

pyproject.toml

		[tool.bumpversion]
		current_version = "0.7.0"
		commit = true
		tag = true
		tag_name = "{new_version}"

		[[tool.bumpversion.files]]
		filename = "setup.py"
		search = "version=\"{current_version}\""
		replace = "version=\"{new_version}\""

		[[tool.bumpversion.files]]
		filename = "html_text/__init__.py"
		search = "__version__ = \"{current_version}\""
		replace = "__version__ = \"{new_version}\""

		[tool.coverage.run]
		branch = true

		[tool.coverage.report]
		exclude_also = [
		"if TYPE_CHECKING:",
		]

		[[tool.mypy.overrides]]
		module = "tests.*"
		check_untyped_defs = true
		allow_untyped_defs = true

		[tool.ruff.lint]
		extend-select = [
		# flake8-bugbear
		"B",
		# flake8-comprehensions
		"C4",
		# pydocstyle
		"D",
		# flake8-future-annotations
		"FA",
		# flynt
		"FLY",
		# refurb
		"FURB",
		# isort
		"I",
		# flake8-implicit-str-concat
		"ISC",
		# flake8-logging
		"LOG",
		# Perflint
		"PERF",
		# pygrep-hooks
		"PGH",
		# flake8-pie
		"PIE",
		# pylint
		"PL",
		# flake8-pytest-style
		"PT",
		# flake8-use-pathlib
		"PTH",
		# flake8-pyi
		"PYI",
		# flake8-quotes
		"Q",
		# flake8-return
		"RET",
		# flake8-raise
		"RSE",
		# Ruff-specific rules
		"RUF",
		# flake8-bandit
		"S",
		# flake8-simplify
		"SIM",
		# flake8-slots
		"SLOT",
		# flake8-debugger
		"T10",
		# flake8-type-checking
		"TC",
		# pyupgrade
		"UP",
		# pycodestyle warnings
		"W",
		# flake8-2020
		"YTT",
		]
		ignore = [
		# Missing docstring in public module
		"D100",
		# Missing docstring in public class
		"D101",
		# Missing docstring in public method
		"D102",
		# Missing docstring in public function
		"D103",
		# Missing docstring in public package
		"D104",
		# Missing docstring in magic method
		"D105",
		# Missing docstring in public nested class
		"D106",
		# Missing docstring in __init__
		"D107",
		# One-line docstring should fit on one line with quotes
		"D200",
		# No blank lines allowed after function docstring
		"D202",
		# 1 blank line required between summary line and description
		"D205",
		# Multi-line docstring closing quotes should be on a separate line
		"D209",
		# First line should end with a period
		"D400",
		# First line should be in imperative mood; try rephrasing
		"D401",
		# First line should not be the function's "signature"
		"D402",
		# First word of the first line should be properly capitalized
		"D403",
		# No blank lines allowed between a section header and its content
		"D412",
		# Too many return statements
		"PLR0911",
		# Too many branches
		"PLR0912",
		# Too many arguments in function definition
		"PLR0913",
		# Too many statements
		"PLR0915",
		# Magic value used in comparison
		"PLR2004",
		# String contains ambiguous {}.
		"RUF001",
		# Docstring contains ambiguous {}.
		"RUF002",
		# Comment contains ambiguous {}.
		"RUF003",
		# Mutable class attributes should be annotated with `typing.ClassVar`
		"RUF012",
		# Use of `assert` detected
		"S101",
		# Using lxml to parse untrusted data is known to be vulnerable to XML attacks
		"S320",
		]

		[tool.ruff.lint.per-file-ignores]
		"html_text/__init__.py" = ["F401"]

		[tool.ruff.lint.pydocstyle]
		convention = "pep257"

+7

-0

CHANGES.rst

		@@ -5,2 +5,9 @@ =======

		0.7.0 (2025-02-10)
		------------------
		* Removed support for Python 3.8.
		* Added support for Python 3.13.
		* Added type hints and ``py.typed``.
		* CI improvements.

		0.6.2 (2024-05-01)
		@@ -7,0 +14,0 @@ ------------------

+20

-3

html_text.egg-info/PKG-INFO

		@@ -1,4 +0,4 @@
		Metadata-Version: 2.1
		Metadata-Version: 2.2
		Name: html_text
		Version: 0.6.2
		Version: 0.7.0
		Summary: Extract text from HTML
		@@ -14,3 +14,2 @@ Home-page: https://github.com/zytedata/html-text
		Classifier: Programming Language :: Python :: 3
		Classifier: Programming Language :: Python :: 3.8
		Classifier: Programming Language :: Python :: 3.9
		@@ -20,5 +19,16 @@ Classifier: Programming Language :: Python :: 3.10
		Classifier: Programming Language :: Python :: 3.12
		Classifier: Programming Language :: Python :: 3.13
		Description-Content-Type: text/x-rst
		License-File: LICENSE
		Requires-Dist: lxml
		Requires-Dist: lxml-html-clean
		Dynamic: author
		Dynamic: author-email
		Dynamic: classifier
		Dynamic: description
		Dynamic: description-content-type
		Dynamic: home-page
		Dynamic: license
		Dynamic: requires-dist
		Dynamic: summary

		@@ -160,2 +170,9 @@ ============

		0.7.0 (2025-02-10)
		------------------
		* Removed support for Python 3.8.
		* Added support for Python 3.13.
		* Added type hints and ``py.typed``.
		* CI improvements.

		0.6.2 (2024-05-01)
		@@ -162,0 +179,0 @@ ------------------

+2

-1

html_text.egg-info/SOURCES.txt

		@@ -5,6 +5,7 @@ CHANGES.rst
		README.rst
		setup.cfg
		pyproject.toml
		setup.py
		html_text/__init__.py
		html_text/html_text.py
		html_text/py.typed
		html_text.egg-info/PKG-INFO
		@@ -11,0 +12,0 @@ html_text.egg-info/SOURCES.txt

+11

-5

html_text/__init__.py

		@@ -1,6 +0,12 @@
		# -*- coding: utf-8 -*-
		__version__ = '0.6.2'
		__version__ = "0.7.0"

		from .html_text import (etree_to_text, extract_text, selector_to_text,
		parse_html, cleaned_selector, cleaner,
		NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
		from .html_text import (
		DOUBLE_NEWLINE_TAGS,
		NEWLINE_TAGS,
		cleaned_selector,
		cleaner,
		etree_to_text,
		extract_text,
		parse_html,
		selector_to_text,
		)

+122

-78

html_text/html_text.py

		@@ -1,3 +0,5 @@
		# -*- coding: utf-8 -*-
		from __future__ import annotations

		import re
		from typing import TYPE_CHECKING

		@@ -8,13 +10,49 @@ import lxml

		if TYPE_CHECKING:
		from collections.abc import Iterable

		NEWLINE_TAGS = frozenset([
		'article', 'aside', 'br', 'dd', 'details', 'div', 'dt', 'fieldset',
		'figcaption', 'footer', 'form', 'header', 'hr', 'legend', 'li', 'main',
		'nav', 'table', 'tr'
		])
		DOUBLE_NEWLINE_TAGS = frozenset([
		'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol',
		'p', 'pre', 'title', 'ul'
		])
		import parsel

		NEWLINE_TAGS: frozenset[str] = frozenset(
		[
		"article",
		"aside",
		"br",
		"dd",
		"details",
		"div",
		"dt",
		"fieldset",
		"figcaption",
		"footer",
		"form",
		"header",
		"hr",
		"legend",
		"li",
		"main",
		"nav",
		"table",
		"tr",
		]
		)
		DOUBLE_NEWLINE_TAGS: frozenset[str] = frozenset(
		[
		"blockquote",
		"dl",
		"figure",
		"h1",
		"h2",
		"h3",
		"h4",
		"h5",
		"h6",
		"ol",
		"p",
		"pre",
		"title",
		"ul",
		]
		)

		cleaner = Cleaner(
		@@ -38,7 +76,4 @@ scripts=True,

		def _cleaned_html_tree(html):
		if isinstance(html, lxml.html.HtmlElement):
		tree = html
		else:
		tree = parse_html(html)
		def _cleaned_html_tree(html: lxml.html.HtmlElement | str) -> lxml.html.HtmlElement:
		tree = html if isinstance(html, lxml.html.HtmlElement) else parse_html(html)

		@@ -54,29 +89,31 @@ # we need this as https://bugs.launchpad.net/lxml/+bug/1838497

		def parse_html(html):
		""" Create an lxml.html.HtmlElement from a string with html.
		def parse_html(html: str) -> lxml.html.HtmlElement:
		"""Create an lxml.html.HtmlElement from a string with html.
		XXX: mostly copy-pasted from parsel.selector.create_root_node
		"""
		body = html.strip().replace('\x00', '').encode('utf8') or b'<html/>'
		parser = lxml.html.HTMLParser(recover=True, encoding='utf8')
		body = html.strip().replace("\x00", "").encode("utf-8") or b"<html/>"
		parser = lxml.html.HTMLParser(recover=True, encoding="utf-8")
		root = lxml.etree.fromstring(body, parser=parser)
		if root is None:
		root = lxml.etree.fromstring(b'<html/>', parser=parser)
		root = lxml.etree.fromstring(b"<html/>", parser=parser)
		return root


		_whitespace = re.compile(r'\s+')
		_has_trailing_whitespace = re.compile(r'\s$').search
		_whitespace = re.compile(r"\s+")
		_has_trailing_whitespace = re.compile(r"\s$").search
		_has_punct_after = re.compile(r'^[,:;.!?")]').search
		_has_open_bracket_before = re.compile(r'\($').search
		_has_open_bracket_before = re.compile(r"\($").search


		def _normalize_whitespace(text):
		return _whitespace.sub(' ', text.strip())
		def _normalize_whitespace(text: str) -> str:
		return _whitespace.sub(" ", text.strip())


		def etree_to_text(tree,
		guess_punct_space=True,
		guess_layout=True,
		newline_tags=NEWLINE_TAGS,
		double_newline_tags=DOUBLE_NEWLINE_TAGS):
		def etree_to_text(
		tree: lxml.html.HtmlElement,
		guess_punct_space: bool = True,
		guess_layout: bool = True,
		newline_tags: Iterable[str] = NEWLINE_TAGS,
		double_newline_tags: Iterable[str] = DOUBLE_NEWLINE_TAGS,
		) -> str:
		"""
		@@ -96,4 +133,4 @@ Convert a html tree to text. Tree should be cleaned with

		def should_add_space(text):
		""" Return True if extra whitespace should be added before text """
		def should_add_space(text: str) -> bool:
		"""Return True if extra whitespace should be added before text"""
		if prev in {_NEWLINE, _DOUBLE_NEWLINE}:
		@@ -103,13 +140,14 @@ return False
		return True
		if not _has_trailing_whitespace(prev):
		if _has_punct_after(text) or _has_open_bracket_before(prev):
		return False
		return True
		assert isinstance(prev, str)
		return bool(
		_has_trailing_whitespace(prev)
		or (not _has_punct_after(text) and not _has_open_bracket_before(prev))
		)

		def get_space_between(text):
		def get_space_between(text: str) -> str:
		if not text:
		return ' '
		return ' ' if should_add_space(text) else ''
		return " "
		return " " if should_add_space(text) else ""

		def add_newlines(tag):
		def add_newlines(tag: str) -> None:
		nonlocal prev
		@@ -121,12 +159,12 @@ if not guess_layout:
		if tag in double_newline_tags:
		chunks.append('\n' if prev is _NEWLINE else '\n\n')
		chunks.append("\n" if prev is _NEWLINE else "\n\n")
		prev = _DOUBLE_NEWLINE
		elif tag in newline_tags:
		if prev is not _NEWLINE:
		chunks.append('\n')
		chunks.append("\n")
		prev = _NEWLINE

		def add_text(text_content):
		def add_text(text_content: str | None) -> None:
		nonlocal prev
		text = _normalize_whitespace(text_content) if text_content else ''
		text = _normalize_whitespace(text_content) if text_content else ""
		if not text:
		@@ -139,7 +177,9 @@ return
		# Extract text from the ``tree``: fill ``chunks`` variable
		for event, el in lxml.etree.iterwalk(tree, events=('start', 'end')):
		if event == 'start':
		for event, el in lxml.etree.iterwalk(tree, events=("start", "end")):
		if event == "start":
		assert isinstance(el.tag, str)
		add_newlines(el.tag)
		add_text(el.text)
		elif event == 'end':
		elif event == "end":
		assert isinstance(el.tag, str)
		add_newlines(el.tag)
		@@ -149,7 +189,11 @@ if el is not tree:

		return ''.join(chunks).strip()
		return "".join(chunks).strip()


		def selector_to_text(sel, guess_punct_space=True, guess_layout=True):
		""" Convert a cleaned parsel.Selector to text.
		def selector_to_text(
		sel: parsel.Selector | parsel.SelectorList[parsel.Selector],
		guess_punct_space: bool = True,
		guess_layout: bool = True,
		) -> str:
		"""Convert a cleaned parsel.Selector to text.
		See html_text.extract_text docstring for description of the approach
		@@ -159,2 +203,3 @@ and options.
		import parsel

		if isinstance(sel, parsel.SelectorList):
		@@ -165,27 +210,27 @@ # if selecting a specific xpath
		extracted = etree_to_text(
		s.root,
		guess_punct_space=guess_punct_space,
		guess_layout=guess_layout)
		s.root, guess_punct_space=guess_punct_space, guess_layout=guess_layout
		)
		if extracted:
		text.append(extracted)
		return ' '.join(text)
		else:
		return etree_to_text(
		sel.root,
		guess_punct_space=guess_punct_space,
		guess_layout=guess_layout)
		return " ".join(text)
		return etree_to_text(
		sel.root, guess_punct_space=guess_punct_space, guess_layout=guess_layout
		)


		def cleaned_selector(html):
		""" Clean parsel.selector.
		"""
		def cleaned_selector(html: lxml.html.HtmlElement | str) -> parsel.Selector:
		"""Clean parsel.selector."""
		import parsel

		try:
		tree = _cleaned_html_tree(html)
		sel = parsel.Selector(root=tree, type='html')
		except (lxml.etree.XMLSyntaxError,
		lxml.etree.ParseError,
		lxml.etree.ParserError,
		UnicodeEncodeError):
		sel = parsel.Selector(root=tree, type="html")
		except (
		lxml.etree.XMLSyntaxError,
		lxml.etree.ParseError,
		lxml.etree.ParserError,
		UnicodeEncodeError,
		):
		# likely plain text
		assert isinstance(html, str)
		sel = parsel.Selector(html)
		@@ -195,7 +240,9 @@ return sel

		def extract_text(html,
		guess_punct_space=True,
		guess_layout=True,
		newline_tags=NEWLINE_TAGS,
		double_newline_tags=DOUBLE_NEWLINE_TAGS):
		def extract_text(
		html: lxml.html.HtmlElement | str | None,
		guess_punct_space: bool = True,
		guess_layout: bool = True,
		newline_tags: Iterable[str] = NEWLINE_TAGS,
		double_newline_tags: Iterable[str] = DOUBLE_NEWLINE_TAGS,
		) -> str:
		"""
		@@ -227,9 +274,6 @@ Convert html to text, cleaning invisible content such as styles.
		if html is None:
		return ''
		no_content_nodes = (
		lxml.html.HtmlComment,
		lxml.html.HtmlProcessingInstruction
		)
		return ""
		no_content_nodes = (lxml.html.HtmlComment, lxml.html.HtmlProcessingInstruction)
		if isinstance(html, no_content_nodes):
		return ''
		return ""
		cleaned = _cleaned_html_tree(html)
		@@ -236,0 +280,0 @@ return etree_to_text(

+0

-4

MANIFEST.in

		@@ -1,3 +0,1 @@

		include CONTRIBUTING.rst
		include CHANGES.rst
		@@ -10,3 +8,1 @@ include LICENSE
		recursive-exclude * *.py[co]

		recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif

+20

-3

PKG-INFO

		@@ -1,4 +0,4 @@
		Metadata-Version: 2.1
		Metadata-Version: 2.2
		Name: html_text
		Version: 0.6.2
		Version: 0.7.0
		Summary: Extract text from HTML
		@@ -14,3 +14,2 @@ Home-page: https://github.com/zytedata/html-text
		Classifier: Programming Language :: Python :: 3
		Classifier: Programming Language :: Python :: 3.8
		Classifier: Programming Language :: Python :: 3.9
		@@ -20,5 +19,16 @@ Classifier: Programming Language :: Python :: 3.10
		Classifier: Programming Language :: Python :: 3.12
		Classifier: Programming Language :: Python :: 3.13
		Description-Content-Type: text/x-rst
		License-File: LICENSE
		Requires-Dist: lxml
		Requires-Dist: lxml-html-clean
		Dynamic: author
		Dynamic: author-email
		Dynamic: classifier
		Dynamic: description
		Dynamic: description-content-type
		Dynamic: home-page
		Dynamic: license
		Dynamic: requires-dist
		Dynamic: summary

		@@ -160,2 +170,9 @@ ============

		0.7.0 (2025-02-10)
		------------------
		* Removed support for Python 3.8.
		* Added support for Python 3.13.
		* Added type hints and ``py.typed``.
		* CI improvements.

		0.6.2 (2024-05-01)
		@@ -162,0 +179,0 @@ ------------------

+0

-17

setup.cfg

		@@ -1,18 +0,1 @@
		[bumpversion]
		current_version = 0.6.2
		commit = True
		tag = True
		tag_name = {new_version}

		[bumpversion:file:setup.py]
		search = version='{current_version}'
		replace = version='{new_version}'

		[bumpversion:file:html_text/__init__.py]
		search = __version__ = '{current_version}'
		replace = __version__ = '{new_version}'

		[bdist_wheel]
		universal = 1

		[egg_info]
		@@ -19,0 +2,0 @@ tag_build =

+25

-26

setup.py

		#!/usr/bin/env python
		# -*- coding: utf-8 -*-
		from pathlib import Path

		from setuptools import setup

		with open('README.rst') as readme_file:
		readme = readme_file.read()
		readme = Path("README.rst").read_text(encoding="utf-8")
		history = Path("CHANGES.rst").read_text(encoding="utf-8")

		with open('CHANGES.rst') as history_file:
		history = history_file.read()


		setup(
		name='html_text',
		version='0.6.2',
		name="html_text",
		version="0.7.0",
		description="Extract text from HTML",
		long_description=readme + '\n\n' + history,
		long_description=readme + "\n\n" + history,
		long_description_content_type="text/x-rst",
		author="Konstantin Lopukhin",
		author_email='kostia.lopuhin@gmail.com',
		url='https://github.com/zytedata/html-text',
		packages=['html_text'],
		author_email="kostia.lopuhin@gmail.com",
		url="https://github.com/zytedata/html-text",
		packages=["html_text"],
		package_data={
		"html_text": ["py.typed"],
		},
		include_package_data=True,
		install_requires=[
		'lxml',
		'lxml-html-clean',
		"lxml",
		"lxml-html-clean",
		],
		@@ -30,15 +31,13 @@ license="MIT license",
		classifiers=[
		'Development Status :: 4 - Beta',
		'Intended Audience :: Developers',
		'License :: OSI Approved :: MIT License',
		'Natural Language :: English',
		'Programming Language :: Python :: 3',
		'Programming Language :: Python :: 3.8',
		'Programming Language :: Python :: 3.9',
		'Programming Language :: Python :: 3.10',
		'Programming Language :: Python :: 3.11',
		'Programming Language :: Python :: 3.12',
		"Development Status :: 4 - Beta",
		"Intended Audience :: Developers",
		"License :: OSI Approved :: MIT License",
		"Natural Language :: English",
		"Programming Language :: Python :: 3",
		"Programming Language :: Python :: 3.9",
		"Programming Language :: Python :: 3.10",
		"Programming Language :: Python :: 3.11",
		"Programming Language :: Python :: 3.12",
		"Programming Language :: Python :: 3.13",
		],
		test_suite='tests',
		tests_require=['pytest'],
		)

+132

-109

tests/test_html_text.py

		@@ -1,22 +0,30 @@
		# -*- coding: utf-8 -*-
		import glob
		import os
		from __future__ import annotations

		from pathlib import Path

		import lxml.html
		import pytest

		from html_text import (extract_text, parse_html, cleaned_selector,
		etree_to_text, cleaner, selector_to_text, NEWLINE_TAGS,
		DOUBLE_NEWLINE_TAGS)
		from html_text import (
		DOUBLE_NEWLINE_TAGS,
		NEWLINE_TAGS,
		cleaned_selector,
		cleaner,
		etree_to_text,
		extract_text,
		parse_html,
		selector_to_text,
		)

		ROOT = Path(__file__).parent

		ROOT = os.path.dirname(os.path.abspath(__file__))


		@pytest.fixture(params=[
		{'guess_punct_space': True, 'guess_layout': False},
		{'guess_punct_space': False, 'guess_layout': False},
		{'guess_punct_space': True, 'guess_layout': True},
		{'guess_punct_space': False, 'guess_layout': True}
		])
		@pytest.fixture(
		params=[
		{"guess_punct_space": True, "guess_layout": False},
		{"guess_punct_space": False, "guess_layout": False},
		{"guess_punct_space": True, "guess_layout": True},
		{"guess_punct_space": False, "guess_layout": True},
		]
		)
		def all_options(request):
		@@ -27,29 +35,32 @@ return request.param
		def test_extract_no_text_html(all_options):
		html = (u'<!DOCTYPE html><html><body><p><video width="320" height="240" '
		'controls><source src="movie.mp4" type="video/mp4"><source '
		'src="movie.ogg" type="video/ogg"></video></p></body></html>')
		assert extract_text(html, **all_options) == u''
		html = (
		'<!DOCTYPE html><html><body><p><video width="320" height="240" '
		'controls><source src="movie.mp4" type="video/mp4"><source '
		'src="movie.ogg" type="video/ogg"></video></p></body></html>'
		)
		assert extract_text(html, **all_options) == ""


		def test_extract_text(all_options):
		html = (u'<html><style>.div {}</style>'
		'<body><p>Hello, world!</body></html>')
		assert extract_text(html, **all_options) == u'Hello, world!'
		html = "<html><style>.div {}</style><body><p>Hello, world!</body></html>"
		assert extract_text(html, **all_options) == "Hello, world!"


		def test_declared_encoding(all_options):
		html = (u'<?xml version="1.0" encoding="utf-8" ?>'
		u'<html><style>.div {}</style>'
		u'<body>Hello, world!</p></body></html>')
		assert extract_text(html, **all_options) == u'Hello, world!'
		html = (
		'<?xml version="1.0" encoding="utf-8" ?>'
		"<html><style>.div {}</style>"
		"<body>Hello, world!</p></body></html>"
		)
		assert extract_text(html, **all_options) == "Hello, world!"


		def test_empty(all_options):
		assert extract_text(u'', **all_options) == ''
		assert extract_text(u' ', **all_options) == ''
		assert extract_text(None, **all_options) == ''
		assert extract_text("", **all_options) == ""
		assert extract_text(" ", **all_options) == ""
		assert extract_text(None, **all_options) == ""


		def test_comment(all_options):
		assert extract_text(u"<!-- hello world -->", **all_options) == ''
		assert extract_text("<!-- hello world -->", **all_options) == ""

		@@ -59,7 +70,7 @@
		node = lxml.html.fragment_fromstring("<!-- hello world -->")
		assert extract_text(node, **all_options) == ''
		assert extract_text(node, **all_options) == ""


		def test_processing_instruction(all_options):
		assert extract_text('<?dbfo label-width="width"?>', **all_options) == ''
		assert extract_text('<?dbfo label-width="width"?>', **all_options) == ""

		@@ -69,23 +80,21 @@
		node = lxml.html.fragment_fromstring('<?dbfo label-width="width"?>')
		assert extract_text(node, **all_options) == ''
		assert extract_text(node, **all_options) == ""


		def test_extract_text_from_tree(all_options):
		html = (u'<html><style>.div {}</style>'
		'<body><p>Hello, world!</body></html>')
		html = "<html><style>.div {}</style><body><p>Hello, world!</body></html>"
		tree = parse_html(html)
		assert extract_text(tree, **all_options) == u'Hello, world!'
		assert extract_text(tree, **all_options) == "Hello, world!"


		def test_extract_text_from_node(all_options):
		html = (u'<html><style>.div {}</style>'
		'<body><p>Hello, world!</p></body></html>')
		html = "<html><style>.div {}</style><body><p>Hello, world!</p></body></html>"
		tree = parse_html(html)
		node = tree.xpath('//p')[0]
		assert extract_text(node, **all_options) == u'Hello, world!'
		node = tree.xpath("//p")[0]
		assert extract_text(node, **all_options) == "Hello, world!"


		def test_inline_tags_whitespace(all_options):
		html = u'<span>field</span><span>value of</span><span></span>'
		assert extract_text(html, **all_options) == u'field value of'
		html = "<span>field</span><span>value of</span><span></span>"
		assert extract_text(html, **all_options) == "field value of"

		@@ -96,17 +105,19 @@
		tree = parse_html(html)
		node = tree.xpath('/html/frameset')[0]
		assert extract_text(node) == u''
		node = tree.xpath("/html/frameset")[0]
		assert extract_text(node) == ""


		def test_punct_whitespace():
		html = u'<div><span>field</span>, and more</div>'
		assert extract_text(html, guess_punct_space=False) == u'field , and more'
		assert extract_text(html, guess_punct_space=True) == u'field, and more'
		html = "<div><span>field</span>, and more</div>"
		assert extract_text(html, guess_punct_space=False) == "field , and more"
		assert extract_text(html, guess_punct_space=True) == "field, and more"


		def test_punct_whitespace_preserved():
		html = (u'<div><span>по</span><span>ле</span>, and , '
		u'<span>more </span>!<span>now</div>a (<b>boo</b>)')
		html = (
		"<div><span>по</span><span>ле</span>, and , "
		"<span>more </span>!<span>now</div>a (<b>boo</b>)"
		)
		text = extract_text(html, guess_punct_space=True, guess_layout=False)
		assert text == u'по ле, and , more ! now a (boo)'
		assert text == "по ле, and , more ! now a (boo)"

		@@ -116,12 +127,14 @@
		def test_bad_punct_whitespace():
		html = (u'<pre><span>trees</span> '
		'<span>=</span> <span>webstruct</span>'
		'<span>.</span><span>load_trees</span>'
		'<span>(</span><span>&quot;train/*.html&quot;</span>'
		'<span>)</span></pre>')
		html = (
		"<pre><span>trees</span> "
		"<span>=</span> <span>webstruct</span>"
		"<span>.</span><span>load_trees</span>"
		"<span>(</span><span>&quot;train/*.html&quot;</span>"
		"<span>)</span></pre>"
		)
		text = extract_text(html, guess_punct_space=False, guess_layout=False)
		assert text == u'trees = webstruct . load_trees ( "train/*.html" )'
		assert text == 'trees = webstruct . load_trees ( "train/*.html" )'

		text = extract_text(html, guess_punct_space=True, guess_layout=False)
		assert text == u'trees = webstruct.load_trees("train/*.html")'
		assert text == 'trees = webstruct.load_trees("train/*.html")'

		@@ -131,17 +144,21 @@
		pytest.importorskip("parsel")
		html = (u'<span><span id="extract-me">text<a>more</a>'
		'</span>and more text <a> and some more</a> <a></a> </span>')
		html = (
		'<span><span id="extract-me">text<a>more</a>'
		"</span>and more text <a> and some more</a> <a></a> </span>"
		)
		# Selector
		sel = cleaned_selector(html)
		assert selector_to_text(sel, **all_options) == 'text more and more text and some more'
		assert (
		selector_to_text(sel, **all_options) == "text more and more text and some more"
		)

		# SelectorList
		subsel = sel.xpath('//span[@id="extract-me"]')
		assert selector_to_text(subsel, **all_options) == 'text more'
		subsel = sel.xpath('//a')
		assert selector_to_text(subsel, **all_options) == 'more and some more'
		assert selector_to_text(subsel, **all_options) == "text more"
		subsel = sel.xpath("//a")
		assert selector_to_text(subsel, **all_options) == "more and some more"
		subsel = sel.xpath('//a[@id="extract-me"]')
		assert selector_to_text(subsel, **all_options) == ''
		subsel = sel.xpath('//foo')
		assert selector_to_text(subsel, **all_options) == ''
		assert selector_to_text(subsel, **all_options) == ""
		subsel = sel.xpath("//foo")
		assert selector_to_text(subsel, **all_options) == ""

		@@ -155,22 +172,32 @@
		def test_guess_layout():
		html = (u'<title> title </title><div>text_1.<p>text_2 text_3</p>'
		'<p id="demo"></p><ul><li>text_4</li><li>text_5</li></ul>'
		'<p>text_6<em>text_7</em>text_8</p>text_9</div>'
		'<script>document.getElementById("demo").innerHTML = '
		'"This should be skipped";</script> <p>...text_10</p>')
		html = (
		"<title> title </title><div>text_1.<p>text_2 text_3</p>"
		'<p id="demo"></p><ul><li>text_4</li><li>text_5</li></ul>'
		"<p>text_6<em>text_7</em>text_8</p>text_9</div>"
		'<script>document.getElementById("demo").innerHTML = '
		'"This should be skipped";</script> <p>...text_10</p>'
		)

		text = 'title text_1. text_2 text_3 text_4 text_5 text_6 text_7 ' \
		'text_8 text_9 ...text_10'
		text = (
		"title text_1. text_2 text_3 text_4 text_5 text_6 text_7 "
		"text_8 text_9 ...text_10"
		)
		assert extract_text(html, guess_punct_space=False, guess_layout=False) == text

		text = ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5'
		'\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')
		text = (
		"title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5"
		"\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10"
		)
		assert extract_text(html, guess_punct_space=False, guess_layout=True) == text

		text = 'title text_1. text_2 text_3 text_4 text_5 text_6 text_7 ' \
		'text_8 text_9...text_10'
		text = (
		"title text_1. text_2 text_3 text_4 text_5 text_6 text_7 "
		"text_8 text_9...text_10"
		)
		assert extract_text(html, guess_punct_space=True, guess_layout=False) == text

		text = 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5\n\n' \
		'text_6 text_7 text_8\n\ntext_9\n\n...text_10'
		text = (
		"title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5\n\n"
		"text_6 text_7 text_8\n\ntext_9\n\n...text_10"
		)
		assert extract_text(html, guess_punct_space=True, guess_layout=True) == text
		@@ -180,42 +207,39 @@
		def test_basic_newline():
		html = u'<div>a</div><div>b</div>'
		assert extract_text(html, guess_punct_space=False, guess_layout=False) == 'a b'
		assert extract_text(html, guess_punct_space=False, guess_layout=True) == 'a\nb'
		assert extract_text(html, guess_punct_space=True, guess_layout=False) == 'a b'
		assert extract_text(html, guess_punct_space=True, guess_layout=True) == 'a\nb'
		html = "<div>a</div><div>b</div>"
		assert extract_text(html, guess_punct_space=False, guess_layout=False) == "a b"
		assert extract_text(html, guess_punct_space=False, guess_layout=True) == "a\nb"
		assert extract_text(html, guess_punct_space=True, guess_layout=False) == "a b"
		assert extract_text(html, guess_punct_space=True, guess_layout=True) == "a\nb"


		def test_adjust_newline():
		html = u'<div>text 1</div><p><div>text 2</div></p>'
		assert extract_text(html, guess_layout=True) == 'text 1\n\ntext 2'
		html = "<div>text 1</div><p><div>text 2</div></p>"
		assert extract_text(html, guess_layout=True) == "text 1\n\ntext 2"


		def test_personalize_newlines_sets():
		html = (u'<span><span>text<a>more</a>'
		'</span>and more text <a> and some more</a> <a></a> </span>')
		html = (
		"<span><span>text<a>more</a>"
		"</span>and more text <a> and some more</a> <a></a> </span>"
		)

		text = extract_text(html, guess_layout=True,
		newline_tags=NEWLINE_TAGS | {'a'})
		assert text == 'text\nmore\nand more text\nand some more'
		text = extract_text(html, guess_layout=True, newline_tags=NEWLINE_TAGS | {"a"})
		assert text == "text\nmore\nand more text\nand some more"

		text = extract_text(html, guess_layout=True,
		double_newline_tags=DOUBLE_NEWLINE_TAGS | {'a'})
		assert text == 'text\n\nmore\n\nand more text\n\nand some more'
		text = extract_text(
		html, guess_layout=True, double_newline_tags=DOUBLE_NEWLINE_TAGS | {"a"}
		)
		assert text == "text\n\nmore\n\nand more text\n\nand some more"


		def _webpage_paths():
		webpages = sorted(glob.glob(os.path.join(ROOT, 'test_webpages', '*.html')))
		extracted = sorted(glob.glob(os.path.join(ROOT, 'test_webpages','*.txt')))
		def _webpage_paths() -> list[tuple[Path, Path]]:
		webpages = sorted((ROOT / "test_webpages").glob("*.html"))
		extracted = sorted((ROOT / "test_webpages").glob("*.txt"))
		return list(zip(webpages, extracted))


		def _load_file(path):
		with open(path, 'rb') as f:
		return f.read().decode('utf8')


		@pytest.mark.parametrize(['page', 'extracted'], _webpage_paths())
		@pytest.mark.parametrize(("page", "extracted"), _webpage_paths())
		def test_webpages(page, extracted):
		html = _load_file(page)
		expected = _load_file(extracted)
		html = page.read_text(encoding="utf-8")
		expected = extracted.read_text(encoding="utf-8")
		assert extract_text(html) == expected
		@@ -228,4 +252,3 @@
		def test_deep_html():
		""" Make sure we don't crash due to recursion limit.
		"""
		"""Make sure we don't crash due to recursion limit."""
		# Build a deep tree manually as default parser would only allow
		@@ -236,4 +259,4 @@ # for 255 depth, but deeper trees are possible with other parsers
		for _ in range(n):
		el = lxml.html.Element('div')
		el.text = 'foo'
		el = lxml.html.Element("div")
		el.text = "foo"
		if parent is None:
		@@ -246,2 +269,2 @@ root = el

		assert extract_text(root) == ('foo\n' * n).strip()
		assert extract_text(root) == ("foo\n" * n).strip()

tests/__init__.py→tests/__init__.py

html-text - pypi Package Compare versions

Improved metrics