@@ -5,2 +5,6 @@ =======

		0.6.2 (2024-05-01)
		------------------
		* Support deeper trees by using iteration instead of recursion.

		0.6.1 (2024-04-23)
		@@ -7,0 +11,0 @@ ------------------

+5

-7

html_text.egg-info/PKG-INFO

		Metadata-Version: 2.1
		Name: html_text
		Version: 0.6.1
		Version: 0.6.2
		Summary: Extract text from HTML
		@@ -153,9 +153,3 @@ Home-page: https://github.com/zytedata/html-text

		----

		.. image:: https://hyperiongray.s3.amazonaws.com/define-hg.svg
		:target: https://www.hyperiongray.com/?pk_campaign=github&pk_kwd=html-text
		:alt: define hyperiongray


		=======
		@@ -165,2 +159,6 @@ History

		0.6.2 (2024-05-01)
		------------------
		* Support deeper trees by using iteration instead of recursion.

		0.6.1 (2024-04-23)
		@@ -167,0 +165,0 @@ ------------------

+1

-1

html_text/__init__.py

		# -*- coding: utf-8 -*-
		__version__ = '0.6.1'
		__version__ = '0.6.2'

		@@ -4,0 +4,0 @@ from .html_text import (etree_to_text, extract_text, selector_to_text,

+21

-25

html_text/html_text.py

		@@ -91,9 +91,5 @@ # -*- coding: utf-8 -*-
		_DOUBLE_NEWLINE = object()
		prev = _DOUBLE_NEWLINE # _NEWLINE, _DOUBLE_NEWLINE or content of the previous chunk (str)

		class Context:
		""" workaround for missing `nonlocal` in Python 2 """
		# _NEWLINE, _DOUBLE_NEWLINE or content of the previous chunk (str)
		prev = _DOUBLE_NEWLINE

		def should_add_space(text, prev):
		def should_add_space(text):
		""" Return True if extra whitespace should be added before text """
		@@ -109,40 +105,40 @@ if prev in {_NEWLINE, _DOUBLE_NEWLINE}:

		def get_space_between(text, prev):
		def get_space_between(text):
		if not text:
		return ' '
		return ' ' if should_add_space(text, prev) else ''
		return ' ' if should_add_space(text) else ''

		def add_newlines(tag, context):
		def add_newlines(tag):
		nonlocal prev
		if not guess_layout:
		return
		prev = context.prev
		if prev is _DOUBLE_NEWLINE: # don't output more than 1 blank line
		return
		if tag in double_newline_tags:
		context.prev = _DOUBLE_NEWLINE
		chunks.append('\n' if prev is _NEWLINE else '\n\n')
		prev = _DOUBLE_NEWLINE
		elif tag in newline_tags:
		context.prev = _NEWLINE
		if prev is not _NEWLINE:
		chunks.append('\n')
		prev = _NEWLINE

		def add_text(text_content, context):
		def add_text(text_content):
		nonlocal prev
		text = _normalize_whitespace(text_content) if text_content else ''
		if not text:
		return
		space = get_space_between(text, context.prev)
		space = get_space_between(text)
		chunks.extend([space, text])
		context.prev = text_content
		prev = text_content

		def traverse_text_fragments(tree, context, handle_tail=True):
		""" Extract text from the ``tree``: fill ``chunks`` variable """
		add_newlines(tree.tag, context)
		add_text(tree.text, context)
		for child in tree:
		traverse_text_fragments(child, context)
		add_newlines(tree.tag, context)
		if handle_tail:
		add_text(tree.tail, context)
		# Extract text from the ``tree``: fill ``chunks`` variable
		for event, el in lxml.etree.iterwalk(tree, events=('start', 'end')):
		if event == 'start':
		add_newlines(el.tag)
		add_text(el.text)
		elif event == 'end':
		add_newlines(el.tag)
		if el is not tree:
		add_text(el.tail)

		traverse_text_fragments(tree, context=Context(), handle_tail=False)
		return ''.join(chunks).strip()
		@@ -149,0 +145,0 @@

+5

-7

PKG-INFO

		Metadata-Version: 2.1
		Name: html_text
		Version: 0.6.1
		Version: 0.6.2
		Summary: Extract text from HTML
		@@ -153,9 +153,3 @@ Home-page: https://github.com/zytedata/html-text

		----

		.. image:: https://hyperiongray.s3.amazonaws.com/define-hg.svg
		:target: https://www.hyperiongray.com/?pk_campaign=github&pk_kwd=html-text
		:alt: define hyperiongray


		=======
		@@ -165,2 +159,6 @@ History

		0.6.2 (2024-05-01)
		------------------
		* Support deeper trees by using iteration instead of recursion.

		0.6.1 (2024-04-23)
		@@ -167,0 +165,0 @@ ------------------

+0

-6

README.rst

		@@ -130,7 +130,1 @@ ============
		`webstruct <http://webstruct.readthedocs.io/en/latest/>`_ library.

		----

		.. image:: https://hyperiongray.s3.amazonaws.com/define-hg.svg
		:target: https://www.hyperiongray.com/?pk_campaign=github&pk_kwd=html-text
		:alt: define hyperiongray

+1

-1

setup.cfg

		[bumpversion]
		current_version = 0.6.1
		current_version = 0.6.2
		commit = True
		@@ -4,0 +4,0 @@ tag = True

+1

-1

setup.py

		@@ -15,3 +15,3 @@ #!/usr/bin/env python
		name='html_text',
		version='0.6.1',
		version='0.6.2',
		description="Extract text from HTML",
		@@ -18,0 +18,0 @@ long_description=readme + '\n\n' + history,

+20

-0

tests/test_html_text.py

		@@ -217,1 +217,21 @@ # -*- coding: utf-8 -*-
		assert etree_to_text(tree) == expected


		def test_deep_html():
		    """ Extracting text from a very deep tree must not hit the
		    recursion limit.
		    """
		    # The default parser only allows a depth of 255, so the chain of
		    # nested <div> elements is assembled by hand; other parsers can
		    # produce trees that are deeper than that.
		    depth = 5000
		    root = None
		    innermost = None
		    for _ in range(depth):
		        node = lxml.html.Element('div')
		        node.text = 'foo'
		        if innermost is not None:
		            # Hang the new element under the deepest one built so far.
		            innermost.append(node)
		        else:
		            # The very first element is the root of the whole chain.
		            root = node
		        innermost = node

		    expected = ('foo\n' * depth).strip()
		    assert extract_text(root) == expected

html-text - pypi Package Compare versions

Improved metrics