html-text
Advanced tools
+4
-0
@@ -5,2 +5,6 @@ ======= | ||
| 0.6.2 (2024-05-01) | ||
| ------------------ | ||
| * Support deeper trees by using iteration instead of recursion. | ||
| 0.6.1 (2024-04-23) | ||
@@ -7,0 +11,0 @@ ------------------ |
| Metadata-Version: 2.1 | ||
| Name: html_text | ||
| Version: 0.6.1 | ||
| Version: 0.6.2 | ||
| Summary: Extract text from HTML | ||
@@ -153,9 +153,3 @@ Home-page: https://github.com/zytedata/html-text | ||
| ---- | ||
| .. image:: https://hyperiongray.s3.amazonaws.com/define-hg.svg | ||
| :target: https://www.hyperiongray.com/?pk_campaign=github&pk_kwd=html-text | ||
| :alt: define hyperiongray | ||
| ======= | ||
@@ -165,2 +159,6 @@ History | ||
| 0.6.2 (2024-05-01) | ||
| ------------------ | ||
| * Support deeper trees by using iteration instead of recursion. | ||
| 0.6.1 (2024-04-23) | ||
@@ -167,0 +165,0 @@ ------------------ |
| # -*- coding: utf-8 -*- | ||
| __version__ = '0.6.1' | ||
| __version__ = '0.6.2' | ||
@@ -4,0 +4,0 @@ from .html_text import (etree_to_text, extract_text, selector_to_text, |
+21
-25
@@ -91,9 +91,5 @@ # -*- coding: utf-8 -*- | ||
| _DOUBLE_NEWLINE = object() | ||
| prev = _DOUBLE_NEWLINE # _NEWLINE, _DOUBLE_NEWLINE or content of the previous chunk (str) | ||
| class Context: | ||
| """ workaround for missing `nonlocal` in Python 2 """ | ||
| # _NEWLINE, _DOUBLE_NEWLINE or content of the previous chunk (str) | ||
| prev = _DOUBLE_NEWLINE | ||
| def should_add_space(text, prev): | ||
| def should_add_space(text): | ||
| """ Return True if extra whitespace should be added before text """ | ||
@@ -109,40 +105,40 @@ if prev in {_NEWLINE, _DOUBLE_NEWLINE}: | ||
| def get_space_between(text, prev): | ||
| def get_space_between(text): | ||
| if not text: | ||
| return ' ' | ||
| return ' ' if should_add_space(text, prev) else '' | ||
| return ' ' if should_add_space(text) else '' | ||
| def add_newlines(tag, context): | ||
| def add_newlines(tag): | ||
| nonlocal prev | ||
| if not guess_layout: | ||
| return | ||
| prev = context.prev | ||
| if prev is _DOUBLE_NEWLINE: # don't output more than 1 blank line | ||
| return | ||
| if tag in double_newline_tags: | ||
| context.prev = _DOUBLE_NEWLINE | ||
| chunks.append('\n' if prev is _NEWLINE else '\n\n') | ||
| prev = _DOUBLE_NEWLINE | ||
| elif tag in newline_tags: | ||
| context.prev = _NEWLINE | ||
| if prev is not _NEWLINE: | ||
| chunks.append('\n') | ||
| prev = _NEWLINE | ||
| def add_text(text_content, context): | ||
| def add_text(text_content): | ||
| nonlocal prev | ||
| text = _normalize_whitespace(text_content) if text_content else '' | ||
| if not text: | ||
| return | ||
| space = get_space_between(text, context.prev) | ||
| space = get_space_between(text) | ||
| chunks.extend([space, text]) | ||
| context.prev = text_content | ||
| prev = text_content | ||
| def traverse_text_fragments(tree, context, handle_tail=True): | ||
| """ Extract text from the ``tree``: fill ``chunks`` variable """ | ||
| add_newlines(tree.tag, context) | ||
| add_text(tree.text, context) | ||
| for child in tree: | ||
| traverse_text_fragments(child, context) | ||
| add_newlines(tree.tag, context) | ||
| if handle_tail: | ||
| add_text(tree.tail, context) | ||
| # Extract text from the ``tree``: fill ``chunks`` variable | ||
| for event, el in lxml.etree.iterwalk(tree, events=('start', 'end')): | ||
| if event == 'start': | ||
| add_newlines(el.tag) | ||
| add_text(el.text) | ||
| elif event == 'end': | ||
| add_newlines(el.tag) | ||
| if el is not tree: | ||
| add_text(el.tail) | ||
| traverse_text_fragments(tree, context=Context(), handle_tail=False) | ||
| return ''.join(chunks).strip() | ||
@@ -149,0 +145,0 @@ |
+5
-7
| Metadata-Version: 2.1 | ||
| Name: html_text | ||
| Version: 0.6.1 | ||
| Version: 0.6.2 | ||
| Summary: Extract text from HTML | ||
@@ -153,9 +153,3 @@ Home-page: https://github.com/zytedata/html-text | ||
| ---- | ||
| .. image:: https://hyperiongray.s3.amazonaws.com/define-hg.svg | ||
| :target: https://www.hyperiongray.com/?pk_campaign=github&pk_kwd=html-text | ||
| :alt: define hyperiongray | ||
| ======= | ||
@@ -165,2 +159,6 @@ History | ||
| 0.6.2 (2024-05-01) | ||
| ------------------ | ||
| * Support deeper trees by using iteration instead of recursion. | ||
| 0.6.1 (2024-04-23) | ||
@@ -167,0 +165,0 @@ ------------------ |
+0
-6
@@ -130,7 +130,1 @@ ============ | ||
| `webstruct <http://webstruct.readthedocs.io/en/latest/>`_ library. | ||
| ---- | ||
| .. image:: https://hyperiongray.s3.amazonaws.com/define-hg.svg | ||
| :target: https://www.hyperiongray.com/?pk_campaign=github&pk_kwd=html-text | ||
| :alt: define hyperiongray |
+1
-1
| [bumpversion] | ||
| current_version = 0.6.1 | ||
| current_version = 0.6.2 | ||
| commit = True | ||
@@ -4,0 +4,0 @@ tag = True |
+1
-1
@@ -15,3 +15,3 @@ #!/usr/bin/env python | ||
| name='html_text', | ||
| version='0.6.1', | ||
| version='0.6.2', | ||
| description="Extract text from HTML", | ||
@@ -18,0 +18,0 @@ long_description=readme + '\n\n' + history, |
@@ -217,1 +217,21 @@ # -*- coding: utf-8 -*- | ||
| assert etree_to_text(tree) == expected | ||
| def test_deep_html(): | ||
| """ Make sure we don't crash due to recursion limit. | ||
| """ | ||
| # Build a deep tree manually: the default parser only allows a depth | ||
| # of 255, but other parsers can produce deeper trees. | ||
| n = 5000 | ||
| parent = root = None | ||
| for _ in range(n): | ||
| el = lxml.html.Element('div') | ||
| el.text = 'foo' | ||
| if parent is None: | ||
| root = el | ||
| parent = el | ||
| else: | ||
| parent.append(el) | ||
| parent = el | ||
| assert extract_text(root) == ('foo\n' * n).strip() |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
201799
0.03%408
3.55%