🕷️ scrapery

A blazing fast, lightweight, and modern parsing library for HTML, XML, and JSON, designed for web scraping and data extraction.
It supports both XPath and CSS selectors, along with seamless DOM navigation, making parsing and extracting data straightforward and intuitive.
✨ Features
- ⚡ Blazing Fast Performance – Optimized for high-speed HTML, XML, and JSON parsing
- 🎯 Dual Selector Support – Use XPath or CSS selectors for flexible extraction
- 🛡 Comprehensive Error Handling – Detailed exceptions for different error scenarios
- 🧩 Robust Parsing – Encoding detection and content normalization for reliable results
- 🧑‍💻 Function-Based API – Clean and intuitive interface for ease of use
- 📦 Multi-Format Support – Parse HTML, XML, and JSON in a single library
- ⚙️ Versatile File Management – Create directories, list files, and handle paths effortlessly
- 📝 Smart String Normalization – Clean text by fixing encodings, removing HTML tags, and standardizing whitespace
- 🔍 Flexible CSV & Excel Handling – Read, filter, save, and append data
- 🔄 Efficient JSON Streaming & Reading – Stream large JSON files or load fully with encoding detection
- 💾 Robust File Reading & Writing – Auto-detect encoding, support large files with mmap, and save JSON or plain text cleanly
- 🌐 URL & Domain Utilities – Extract base domains accurately using industry-standard parsing (see the sketch after this list)
- 🛡 Input Validation & Error Handling – Custom validations to ensure reliable data processing
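The base-domain feature, for instance, has to handle multi-part suffixes like `co.uk` that naive string splitting gets wrong. A minimal sketch of the idea, assuming the third-party `tldextract` package (which consults the Public Suffix List); scrapery's own utility may be implemented differently:

```python
# Illustration only: base-domain extraction via tldextract,
# not necessarily how scrapery does it internally.
import tldextract

parts = tldextract.extract("https://blog.sub.example.co.uk/post/1")
print(f"{parts.domain}.{parts.suffix}")  # example.co.uk
```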
⚡ Performance Comparison
The following benchmarks were run on sample HTML and JSON data to compare scrapery with other popular Python libraries.
| Library        | HTML parsing | JSON parsing |
|----------------|--------------|--------------|
| scrapery       | 12 ms        | 8 ms         |
| Other library  | 120 ms       | N/A          |
⚠️ These results are illustrative only; actual performance depends on your environment. No other library is affiliated with or endorsed by scrapery.
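To check the numbers in your own environment, a minimal timing harness along these lines works (`sample.html` is a placeholder file; adjust the iteration count to taste):

```python
# Rough benchmark sketch: average wall-clock time per parse.
import time
from scrapery import parse_html

html = open("sample.html", encoding="utf-8").read()
start = time.perf_counter()
for _ in range(100):
    parse_html(html)
print(f"parse_html: {(time.perf_counter() - start) * 10:.1f} ms per parse")
```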
📦 Installation
```bash
pip install scrapery
```
🚀 Quick Start

```python
from scrapery import *

html_content = """
<html>
  <body>
    <h1>Welcome</h1>
    <p>Hello<br>World</p>
    <a href="/about">About Us</a>
    <table>
      <tr><th>Name</th><th>Age</th></tr>
      <tr><td>John</td><td>30</td></tr>
      <tr><td>Jane</td><td>25</td></tr>
    </table>
  </body>
</html>
"""

doc = parse_html(html_content)

# CSS selector: grab every table row
rows = select_all(doc, "table tr")
print("All table rows:")
for row in rows:
    print(selector_content(row))
```
Output:
```
All table rows:
NameAge
John30
Jane25
```
```python
paragraph = select_one(doc, "p")
print("\nFirst paragraph text:", selector_content(paragraph))

# CSS and XPath selectors are interchangeable
print(selector_content(doc, selector="h1"))
print(selector_content(doc, selector="//h1"))

# Extract attribute values
print(selector_content(doc, selector="a", attr="href"))
print(selector_content(doc, selector="//a", attr="href"))

print(selector_content(doc, selector="td"))
print(selector_content(doc, selector="//td[2]"))
print(selector_content(doc, selector="//tr[3]/td[2]"))

# Whole-document text, or an attribute of the root element
print(selector_content(doc))
print(selector_content(doc, attr="lang"))
```
```python
p_elem = select_one(doc, "p")
print("Parent tag of <p>:", parent(p_elem).tag)
print("Children of <p>:", [c.tag for c in children(p_elem)])
print("Siblings of <p>:", [s.tag for s in siblings(p_elem)])
print("Next sibling of <p>:", next_sibling(p_elem).tag)

h1_elem = select_one(doc, "h1")
print("Next sibling of <h1>:", next_sibling(h1_elem).tag)

ancs = ancestors(p_elem)
print("Ancestor tags of <p>:", [a.tag for a in ancs])

desc = descendants(select_one(doc, "table"))
print("Descendant tags of <table>:", [d.tag for d in desc])
```
```python
div_html = '<div class="card primary"></div>'
div_elem = parse_html(div_html)
print("Has class 'card'? ->", has_class(div_elem, "card"))
print("Classes:", get_classes(div_elem))
```
html = """
<html>
<body>
<a href="/about">About</a>
<img src="/images/logo.png">
</body>
</html>
"""
doc = parse_html(html)
base = "https://example.com"
print(absolute_url(doc, "a", base_url=base))
print(absolute_url(doc, "img", base_url=base, attr="src"))
xml_content = """
<users>
<user id="1"><name>John</name></user>
<user id="2"><name>Jane</name></user>
</users>
"""
xml_doc = parse_xml(xml_content)
users = find_xml_all(xml_doc, "//user")
for u in users:
print(u.attrib, u.xpath("./name/text()")[0])
xml_dict = xml_to_dict(xml_doc)
print(xml_dict)
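For intuition, a conversion like `xml_to_dict` can be sketched as a simple recursion over elements; this is an illustration of the technique, not scrapery's actual implementation:

```python
def element_to_dict(elem):
    """Recursively convert an lxml/ElementTree element to a dict."""
    node = dict(elem.attrib)          # keep attributes
    children = list(elem)
    if not children:                  # leaf: return its text
        return elem.text if not node else {**node, "text": elem.text}
    for child in children:            # group children by tag name
        node.setdefault(child.tag, []).append(element_to_dict(child))
    return node

# Assuming xml_doc is the root element:
print(element_to_dict(xml_doc))
```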
json_content = '{"users":[{"name":"John","age":30},{"name":"Jane","age":25}]}'
data = parse_json(json_content)
john_age = json_get_value(data, "users.0.age")
print("John's age:", john_age)
names = json_extract_values(data, "name")
print("Names:", names)
flat = json_flatten(data)
print("Flattened JSON:", flat)
⚙️ Utility Functions

1. Create a Directory

```python
from scrapery import create_directory

create_directory("new_folder")
create_directory("parent_folder/sub_folder")  # nested paths are created too
```
2. Standardize a String

```python
from scrapery import standardized_string

input_string_1 = "<html><body> Hello \nWorld! \tThis is a test. </body></html>"
print("Standardized String 1:", standardized_string(input_string_1))

input_string_2 = " This is a \n\n string with spaces and \t tabs. "
print("Standardized String 2:", standardized_string(input_string_2))

input_string_3 = ""
print("Standardized String 3:", standardized_string(input_string_3))

input_string_4 = None
print("Standardized String 4:", standardized_string(input_string_4))
```
================================================================
3. Read CSV

```python
from scrapery import read_csv

csv_file_path = 'data.csv'
get_value_by_col_name = 'URL'
filter_col_name = 'Category'
include_filter_col_values = ['Tech']

result = read_csv(csv_file_path, get_value_by_col_name, filter_col_name, include_filter_col_values)
print(result)
```

Sample CSV:
```
Category,URL
Tech,https://tech1.com
Tech,https://tech2.com
Science,https://science1.com
```

Result:
```
['https://tech1.com', 'https://tech2.com']
```
================================================================
4. Save to CSV

```python
from scrapery import save_to_csv

data_list = [[1, 'Alice', 23], [2, 'Bob', 30], [3, 'Charlie', 25]]
headers = ['ID', 'Name', 'Age']
output_file_path = 'output_data.csv'

save_to_csv(data_list, headers, output_file_path)
save_to_csv(data_list, headers, output_file_path, sep="\t")
save_to_csv(data_list, headers, output_file_path, sep=";")
```

Output (default, sep=","):
```
ID,Name,Age
1,Alice,23
2,Bob,30
3,Charlie,25
```

Output (sep="\t"):
```
ID	Name	Age
1	Alice	23
2	Bob	30
3	Charlie	25
```
================================================================
5. Save to Excel file

```python
from scrapery import save_to_xls

# Reuses data_list and headers from the CSV example above
save_to_xls(data_list, headers, "output_data.xlsx")
```
================================================================
6. List files in a directory

```python
from scrapery import list_files

files = list_files(directory="output", extension="csv")
print("CSV files in output directory:", files)
```
================================================================
7. Read back file content

```python
from typing import Generator
from scrapery import read_file_content

# Small JSON file: load fully into memory
file_path_small_json = 'small_data.json'
content = read_file_content(file_path_small_json, stream_json=False)
print("Small JSON file content (fully loaded):")
print(content)

# Large JSON file: stream item by item
file_path_large_json = 'large_data.json'
json_stream: Generator[dict, None, None] = read_file_content(file_path_large_json, stream_json=True)
print("\nLarge JSON file content streamed:")
for item in json_stream:
    print(item)

# Large text file (read via mmap)
file_path_large_txt = 'large_text.txt'
text_content = read_file_content(file_path_large_txt)
print("\nLarge text file content (using mmap):")
print(text_content[:500])

# Small text file (with encoding detection)
file_path_small_txt = 'small_text.txt'
text_content = read_file_content(file_path_small_txt)
print("\nSmall text file content (with encoding detection):")
print(text_content)
```
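Streaming a large JSON file item by item generally requires an incremental parser rather than `json.load`. A sketch with the third-party `ijson` package, assuming the file holds a top-level JSON array (scrapery's internals may differ):

```python
import ijson

with open("large_data.json", "rb") as f:
    # "item" selects each element of the top-level array in turn.
    for item in ijson.items(f, "item"):
        print(item)
```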
================================================================
8. Save to file

```python
from scrapery import save_file_content

# Plain text
text_content = "Hello, this is a sample text file.\nWelcome to file handling in Python!"
save_file_content("output/text_file.txt", text_content)

# A dict is saved as JSON
json_content = {
    "name": "Alice",
    "age": 30,
    "skills": ["Python", "Data Science", "Machine Learning"]
}
save_file_content("output/data.json", json_content)

# Non-string content is written as text
number_content = 12345
save_file_content("output/number.txt", number_content)

# Append with mode="a"
append_text = "\nThis line is appended."
save_file_content("output/text_file.txt", append_text, mode="a")
```