html-wrapper
Advanced tools
| Metadata-Version: 1.0 | ||
| Name: html-wrapper | ||
| Version: 0.3.1 | ||
| Version: 0.3.2 | ||
| Summary: HTML parser with an lxml backend. Implements a subset of BeautifulSoup API and is an order of magnitude faster | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/thismachinechills/html_wrapper |
+17
-16
| from typing import Union, Dict, AnyStr, Any, Optional, \ | ||
| Iterable, Tuple | ||
| Iterable, Tuple, List | ||
| from functools import lru_cache | ||
@@ -10,11 +10,12 @@ from abc import ABC | ||
| BS4_TYPES = "Tag", "BeautifulSoup" | ||
| STR_ENCODING = 'unicode' | ||
| BS4_TYPES: Tuple[str] = "Tag", "BeautifulSoup" | ||
| STR_ENCODING: str = 'unicode' | ||
| NO_ATTRS: Dict[str, str] = {} | ||
| NO_TEXT = '' | ||
| NO_TEXT: str = '' | ||
| SKIP_COMMA: int = -len(', ') | ||
| COLLECTIONS: Tuple[type] = set, list, tuple | ||
| Attrs = Union[str, Dict] | ||
| Attrs = Union[str, Dict[str, str]] | ||
| CssClassType = str | ||
@@ -215,3 +216,3 @@ | ||
| def get_xpath_str(tag: str, class_: CssClassType = None, **kwargs) -> str: | ||
| tag_xp = f'.//{tag}' | ||
| tags: List[str] = [f'.//{tag}'] | ||
@@ -222,3 +223,3 @@ if class_: | ||
| for attr, val in kwargs.items(): | ||
| tag_xp += '[' | ||
| tags.append('[') | ||
| attr_xp = f'@{attr}' | ||
@@ -228,23 +229,23 @@ | ||
| if val: | ||
| tag_xp += attr_xp | ||
| tags.append(attr_xp) | ||
| else: | ||
| tag_xp += f'not({attr_xp})' | ||
| tags.append(f'not({attr_xp})') | ||
| elif isinstance(val, (set, list, tuple)): | ||
| elif isinstance(val, COLLECTIONS): | ||
| for item in val: | ||
| val_xp = f'"{item}", ' | ||
| val_xp = val_xp[:SKIP_COMMA] if val else '' | ||
| tag_xp += f'contains({attr_xp}, {val_xp})' | ||
| val_xp = val_xp[:SKIP_COMMA] if val else NO_TEXT | ||
| tags.append(f'contains({attr_xp}, {val_xp})') | ||
| elif isinstance(val, str): | ||
| tag_xp += f'contains({attr_xp}, "{val}")' | ||
| tags.append(f'contains({attr_xp}, "{val}")') | ||
| else: | ||
| tag_xp += "{attr_xp}='{val}'" | ||
| tags.append("{attr_xp}='{val}'") | ||
| tag_xp += ']' | ||
| tags.append(']') | ||
| return tag_xp | ||
| return ''.join(tags) | ||
@@ -251,0 +252,0 @@ |
+1
-1
| Metadata-Version: 1.0 | ||
| Name: html_wrapper | ||
| Version: 0.3.1 | ||
| Version: 0.3.2 | ||
| Summary: HTML parser with an lxml backend. Implements a subset of BeautifulSoup API and is an order of magnitude faster | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/thismachinechills/html_wrapper |
+13
-10
| # html_wrapper | ||
| html_wrapper implements a small subset of the BeautifulSoup API that I use. It's anywhere from 10x-100x faster than bs4. | ||
| `html_wrapper` implements a small subset of the `BeautifulSoup4` API. It can be anywhere from 10x-100x faster than `bs4` for some use cases. | ||
| ## Installation | ||
| `pip3 install html_wrapper` | ||
| `python3 -m pip install html_wrapper` | ||
| ## Example | ||
| Faster to instantiate and parse HTML. Suits my needs. | ||
| It's faster to instantiate and parse HTML. Suits my needs. | ||
| ``` | ||
| In [1]: import bs4 | ||
| ```python3 | ||
| In [1]: from html_wrapper import HtmlWrapper | ||
| In [2]: import html_wrapper | ||
| In [2]: from bs4 import BeautifulSoup | ||
| In [3]: %timeit html_wrapper.HtmlWrapper("<html><body><p>hi</p></body></html>").text | ||
| 10000 loops, best of 3: 20.4 µs per loop | ||
| In [3]: from requests import get | ||
| In [4]: %timeit bs4.BeautifulSoup("<html><body><p>hi</p></body></html>", "lxml").text | ||
| 1000 loops, best of 3: 232 µs per loop | ||
| In [4]: html: bytes = get("https://en.wikipedia.org/wiki/HTML").content | ||
| In [5]: %timeit HtmlWrapper(html).text | ||
| 23.4 ms ± 563 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) | ||
| In [6]: %timeit BeautifulSoup(html).text | ||
| 190 ms ± 29.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) | ||
| ``` | ||
@@ -25,0 +28,0 @@ |
+1
-1
@@ -5,3 +5,3 @@ from setuptools import setup | ||
| setup(name="html_wrapper", | ||
| version="0.3.1", | ||
| version="0.3.2", | ||
| description="HTML parser with an lxml backend. Implements a subset of BeautifulSoup API and is an order of magnitude faster", | ||
@@ -8,0 +8,0 @@ url="https://github.com/thismachinechills/html_wrapper", |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
8193
3.47%
194
0.52%