Latest Threat Research: SANDWORM_MODE: Shai-Hulud-Style npm Worm Hijacks CI Workflows and Poisons AI Toolchains. Details
Socket
Book a Demo | Sign in
Socket

html-text

Package Overview
Dependencies
Maintainers
3
Versions
15
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

html-text - pypi Package Compare versions

Comparing version
0.6.1
to
0.6.2
+4
-0
CHANGES.rst

@@ -5,2 +5,6 @@ =======

0.6.2 (2024-05-01)
------------------
* Support deeper trees by using iteration instead of recursion.
0.6.1 (2024-04-23)

@@ -7,0 +11,0 @@ ------------------

+5
-7
Metadata-Version: 2.1
Name: html_text
Version: 0.6.1
Version: 0.6.2
Summary: Extract text from HTML

@@ -153,9 +153,3 @@ Home-page: https://github.com/zytedata/html-text

----
.. image:: https://hyperiongray.s3.amazonaws.com/define-hg.svg
:target: https://www.hyperiongray.com/?pk_campaign=github&pk_kwd=html-text
:alt: define hyperiongray
=======

@@ -165,2 +159,6 @@ History

0.6.2 (2024-05-01)
------------------
* Support deeper trees by using iteration instead of recursion.
0.6.1 (2024-04-23)

@@ -167,0 +165,0 @@ ------------------

# -*- coding: utf-8 -*-
__version__ = '0.6.1'
__version__ = '0.6.2'

@@ -4,0 +4,0 @@ from .html_text import (etree_to_text, extract_text, selector_to_text,

@@ -91,9 +91,5 @@ # -*- coding: utf-8 -*-

_DOUBLE_NEWLINE = object()
prev = _DOUBLE_NEWLINE # _NEWLINE, _DOUBLE_NEWLINE or content of the previous chunk (str)
class Context:
""" workaround for missing `nonlocal` in Python 2 """
# _NEWLINE, _DOUBLE_NEWLINE or content of the previous chunk (str)
prev = _DOUBLE_NEWLINE
def should_add_space(text, prev):
def should_add_space(text):
""" Return True if extra whitespace should be added before text """

@@ -109,40 +105,40 @@ if prev in {_NEWLINE, _DOUBLE_NEWLINE}:

def get_space_between(text, prev):
def get_space_between(text):
if not text:
return ' '
return ' ' if should_add_space(text, prev) else ''
return ' ' if should_add_space(text) else ''
def add_newlines(tag, context):
def add_newlines(tag):
nonlocal prev
if not guess_layout:
return
prev = context.prev
if prev is _DOUBLE_NEWLINE: # don't output more than 1 blank line
return
if tag in double_newline_tags:
context.prev = _DOUBLE_NEWLINE
chunks.append('\n' if prev is _NEWLINE else '\n\n')
prev = _DOUBLE_NEWLINE
elif tag in newline_tags:
context.prev = _NEWLINE
if prev is not _NEWLINE:
chunks.append('\n')
prev = _NEWLINE
def add_text(text_content, context):
def add_text(text_content):
nonlocal prev
text = _normalize_whitespace(text_content) if text_content else ''
if not text:
return
space = get_space_between(text, context.prev)
space = get_space_between(text)
chunks.extend([space, text])
context.prev = text_content
prev = text_content
def traverse_text_fragments(tree, context, handle_tail=True):
""" Extract text from the ``tree``: fill ``chunks`` variable """
add_newlines(tree.tag, context)
add_text(tree.text, context)
for child in tree:
traverse_text_fragments(child, context)
add_newlines(tree.tag, context)
if handle_tail:
add_text(tree.tail, context)
# Extract text from the ``tree``: fill ``chunks`` variable
for event, el in lxml.etree.iterwalk(tree, events=('start', 'end')):
if event == 'start':
add_newlines(el.tag)
add_text(el.text)
elif event == 'end':
add_newlines(el.tag)
if el is not tree:
add_text(el.tail)
traverse_text_fragments(tree, context=Context(), handle_tail=False)
return ''.join(chunks).strip()

@@ -149,0 +145,0 @@

Metadata-Version: 2.1
Name: html_text
Version: 0.6.1
Version: 0.6.2
Summary: Extract text from HTML

@@ -153,9 +153,3 @@ Home-page: https://github.com/zytedata/html-text

----
.. image:: https://hyperiongray.s3.amazonaws.com/define-hg.svg
:target: https://www.hyperiongray.com/?pk_campaign=github&pk_kwd=html-text
:alt: define hyperiongray
=======

@@ -165,2 +159,6 @@ History

0.6.2 (2024-05-01)
------------------
* Support deeper trees by using iteration instead of recursion.
0.6.1 (2024-04-23)

@@ -167,0 +165,0 @@ ------------------

@@ -130,7 +130,1 @@ ============

`webstruct <http://webstruct.readthedocs.io/en/latest/>`_ library.
----
.. image:: https://hyperiongray.s3.amazonaws.com/define-hg.svg
:target: https://www.hyperiongray.com/?pk_campaign=github&pk_kwd=html-text
:alt: define hyperiongray
[bumpversion]
current_version = 0.6.1
current_version = 0.6.2
commit = True

@@ -4,0 +4,0 @@ tag = True

@@ -15,3 +15,3 @@ #!/usr/bin/env python

name='html_text',
version='0.6.1',
version='0.6.2',
description="Extract text from HTML",

@@ -18,0 +18,0 @@ long_description=readme + '\n\n' + history,

@@ -217,1 +217,21 @@ # -*- coding: utf-8 -*-

assert etree_to_text(tree) == expected
def test_deep_html():
""" Make sure we don't crash due to recursion limit.
"""
# Build a deep tree manually as default parser would only allow
# for 255 depth, but deeper trees are possible with other parsers
n = 5000
parent = root = None
for _ in range(n):
el = lxml.html.Element('div')
el.text = 'foo'
if parent is None:
root = el
parent = el
else:
parent.append(el)
parent = el
assert extract_text(root) == ('foo\n' * n).strip()