html-text
Advanced tools
+8
-0
@@ -5,2 +5,10 @@ ======= | ||
| 0.5.2 (2020-07-22) | ||
| ------------------ | ||
| * Handle lxml Cleaner exceptions (a workaround for | ||
| https://bugs.launchpad.net/lxml/+bug/1838497 ); | ||
| * Python 3.8 support; | ||
| * testing improvements. | ||
| 0.5.1 (2019-05-27) | ||
@@ -7,0 +15,0 @@ ------------------ |
| Metadata-Version: 1.1 | ||
| Name: html-text | ||
| Version: 0.5.1 | ||
| Version: 0.5.2 | ||
| Summary: Extract text from HTML | ||
@@ -146,2 +146,10 @@ Home-page: https://github.com/TeamHG-Memex/html-text | ||
| 0.5.2 (2020-07-22) | ||
| ------------------ | ||
| * Handle lxml Cleaner exceptions (a workaround for | ||
| https://bugs.launchpad.net/lxml/+bug/1838497 ); | ||
| * Python 3.8 support; | ||
| * testing improvements. | ||
| 0.5.1 (2019-05-27) | ||
@@ -231,1 +239,2 @@ ------------------ | ||
| Classifier: Programming Language :: Python :: 3.7 | ||
| Classifier: Programming Language :: Python :: 3.8 |
| # -*- coding: utf-8 -*- | ||
| __version__ = '0.5.1' | ||
| __version__ = '0.5.2' | ||
@@ -4,0 +4,0 @@ from .html_text import (etree_to_text, extract_text, selector_to_text, |
@@ -42,5 +42,12 @@ # -*- coding: utf-8 -*- | ||
| tree = parse_html(html) | ||
| return cleaner.clean_html(tree) | ||
| # we need this as https://bugs.launchpad.net/lxml/+bug/1838497 | ||
| try: | ||
| cleaned = cleaner.clean_html(tree) | ||
| except AssertionError: | ||
| cleaned = tree | ||
| return cleaned | ||
| def parse_html(html): | ||
@@ -47,0 +54,0 @@ """ Create an lxml.html.HtmlElement from a string with html. |
+10
-1
| Metadata-Version: 1.1 | ||
| Name: html_text | ||
| Version: 0.5.1 | ||
| Version: 0.5.2 | ||
| Summary: Extract text from HTML | ||
@@ -146,2 +146,10 @@ Home-page: https://github.com/TeamHG-Memex/html-text | ||
| 0.5.2 (2020-07-22) | ||
| ------------------ | ||
| * Handle lxml Cleaner exceptions (a workaround for | ||
| https://bugs.launchpad.net/lxml/+bug/1838497 ); | ||
| * Python 3.8 support; | ||
| * testing improvements. | ||
| 0.5.1 (2019-05-27) | ||
@@ -231,1 +239,2 @@ ------------------ | ||
| Classifier: Programming Language :: Python :: 3.7 | ||
| Classifier: Programming Language :: Python :: 3.8 |
+1
-1
| [bumpversion] | ||
| current_version = 0.5.1 | ||
| current_version = 0.5.2 | ||
| commit = True | ||
@@ -4,0 +4,0 @@ tag = True |
+2
-1
@@ -15,3 +15,3 @@ #!/usr/bin/env python | ||
| name='html_text', | ||
| version='0.5.1', | ||
| version='0.5.2', | ||
| description="Extract text from HTML", | ||
@@ -38,2 +38,3 @@ long_description=readme + '\n\n' + history, | ||
| 'Programming Language :: Python :: 3.7', | ||
| 'Programming Language :: Python :: 3.8', | ||
| ], | ||
@@ -40,0 +41,0 @@ test_suite='tests', |
@@ -76,2 +76,9 @@ # -*- coding: utf-8 -*- | ||
| def test_extract_text_from_fail_html(): | ||
| html = "<html><frameset><frame></frameset></html>" | ||
| tree = parse_html(html) | ||
| node = tree.xpath('/html/frameset')[0] | ||
| assert extract_text(node) == u'' | ||
| def test_punct_whitespace(): | ||
@@ -78,0 +85,0 @@ html = u'<div><span>field</span>, and more</div>' |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
202768
0.59%386
2.93%