You're Invited: Meet the Socket Team at RSAC and BSidesSF 2026, March 23–26. RSVP
Socket
Book a DemoSign in
Socket

email-scraper

Package Overview
Dependencies
Maintainers
1
Versions
6
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

email-scraper - pypi Package Compare versions

Comparing version
0.5
to
0.6
+38
.github/workflows/test-and-publish.yml
name: Build
on: push
jobs:
build-n-publish:
name: Test, build and publish
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"]
permissions:
id-token: write # required for trusted publishing
steps:
- uses: actions/checkout@v5
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v6
with:
python-version: "${{ matrix.python-version }}"
- name: Dependencies
run: |
pip install build
pip install -e .[dev]
- name: Test
run: |
python -m unittest discover -s tests
- name: Test readme
run: |
rstcheck README.rst
- name: Build a binary wheel and a source tarball
run: |
python -m build
- name: Publish to PyPI
if: startsWith(github.event.ref, 'refs/tags') && matrix.python-version == '3.13'
uses: pypa/gh-action-pypi-publish@release/v1
.idea
*.egg-info
build
dist
*.pyc
.eggs
Copyright 2017 Amir Szekely
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
[build-system]
requires = ["setuptools>=80", "setuptools-scm[simple]>=8", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "email-scraper"
dynamic = ["version"]
description = "Simple utility to extract email addresses from HTML, including obfuscated email addresses"
readme = "README.rst"
authors = [
{ name = "Amir Szekely", email = "kichik@gmail.com" }
]
license = "MIT"
requires-python = ">=3.9"
keywords = ["email", "scraping", "web", "obfuscate"]
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Topic :: Communications :: Email",
"Topic :: Text Processing :: Markup :: HTML"
]
dependencies = [
"tlds"
]
[project.optional-dependencies]
dev = [
"rstcheck"
]
[project.urls]
Homepage = "https://github.com/kichik/email-scraper"
import unittest
from email_scraper.scrape import extract_emails, deobfuscate_html, scrape_emails
class TestExtractor(unittest.TestCase):
    """Exercise ``extract_emails`` on plain text, HTML links, TLDs and letter case."""

    def _assert_all(self, expectations):
        """Assert ``extract_emails(text) == expected`` for every ``(text, expected)`` pair."""
        for text, expected in expectations:
            self.assertEqual(extract_emails(text), expected)

    def test_basic(self):
        """Addresses embedded in plain text, including duplicates and separators."""
        self._assert_all([
            ('hello world', []),
            ('hello test@test.com world', ['test@test.com']),
            ('test@test.com test@test.com', ['test@test.com', 'test@test.com']),
            ('test@test.com test@example.com', ['test@test.com', 'test@example.com']),
            ('test@test.com,test@example.com', ['test@test.com', 'test@example.com']),
            ('hello test@test.com. i have been waiting for you.', ['test@test.com']),
        ])

    def test_basic_html(self):
        """Addresses inside ``mailto:`` links with either quote style or a query string."""
        self._assert_all([
            ('<a href="mailto:test@test.com">boo</a>', ['test@test.com']),
            ('<a href=\'mailto:test@test.com\'>boo</a>', ['test@test.com']),
            ('<a href="mailto:test@test.com?subject=meh">boo</a>', ['test@test.com']),
        ])

    def test_tlds(self):
        """Known TLDs are accepted; an unknown TLD yields no match."""
        self._assert_all([
            ('hello@something.com', ['hello@something.com']),
            ('hello@something.pizza', ['hello@something.pizza']),
            ('hello@something.notarealtld', []),
        ])

    def test_uppercase(self):
        """Upper-case characters are matched and returned unchanged."""
        addresses = (
            'HELLO@something.com',
            'HELLO@SOMETHING.com',
            'HELLO@SOMETHING.COM',
            'HELLO@SOMETHING.pizza',
            'HELLO@SOMETHING.PIZZA',
        )
        for address in addresses:
            self.assertEqual(extract_emails(address), [address])
class TestDeobfuscate(unittest.TestCase):
    """Exercise ``deobfuscate_html`` on HTML entities and ``atob()`` calls."""

    def test_entities(self):
        """Numeric character references are decoded to the plain address."""
        encoded = (
            '&#121;&#111;&#117;&#114;&#110;&#097;&#109;&#101;&#064;&#100;&#111;&#109;'
            '&#097;&#105;&#110;&#046;&#099;&#111;&#109;'
        )
        decoded = deobfuscate_html(encoded)
        self.assertEqual(decoded, 'yourname@domain.com')

    def test_atob(self):
        """A base64 payload wrapped in ``atob('...')`` is replaced by its decoded text."""
        decoded = deobfuscate_html('atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')')
        self.assertEqual(decoded, 'mailto:email@example.com')
class TestHidden(unittest.TestCase):
    """Exercise ``extract_emails`` on addresses spelled with "(at)" / "(dot)"."""

    def test_hidden(self):
        """A spelled-out address is returned in normal ``user@host.tld`` form."""
        found = extract_emails("foo johnsmith (at) yahoo (dot) com bar")
        self.assertEqual(found, ["johnsmith@yahoo.com"])
class TestScraping(unittest.TestCase):
    """Exercise ``scrape_emails`` end to end on small HTML documents."""

    def test_basic(self):
        """Both the ``mailto:`` target and the link text are collected."""
        document = """<html>
<body>
<a href="mailto:hello@test.com">something@test.com</a>
</body>
</html>"""
        found = scrape_emails(document)
        self.assertEqual(found, {'hello@test.com', 'something@test.com'})

    def test_atob(self):
        """An address hidden behind a JavaScript ``atob()`` redirect is recovered."""
        link = '<a href="javascript:window.location.href=atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')">E-Mail</a>'
        found = scrape_emails(link)
        self.assertEqual(found, {'email@example.com'})

    def test_entities(self):
        """Entity-encoded addresses in both the href and the link text collapse to one result."""
        document = """<p>For more information, send email to <A HREF="mailto:
&#121;&#111;&#117;&#114;&#110;&#097;&#109;&#101;&#064;&#100;&#111;&#109;&#097;&#105;&#110;&#046;&#099;&#111;&#109;">
&#121;&#111;&#117;&#114;&#110;&#097;&#109;&#101;&#064;&#100;&#111;&#109;&#097;&#105;&#110;&#046;&#099;&#111;&#109;
</A></p>"""
        found = scrape_emails(document)
        self.assertEqual(found, {'yourname@domain.com'})
+40
-35

@@ -1,41 +0,46 @@

Metadata-Version: 1.1
Metadata-Version: 2.4
Name: email-scraper
Version: 0.5
Version: 0.6
Summary: Simple utility to extract email addresses from HTML, including obfuscated email addresses
Home-page: https://github.com/kichik/email-scraper
Author: Amir Szekely
Author-email: kichik@gmail.com
License: MIT
Description: ####################################################
Python Module for Scraping Email Addresses from HTML
####################################################
The `email_scraper` module provides a simple method that extracts email addresses from HTML. It is able to find emails
in plain text, links, `atob()` obfuscation and HTML entities obfuscation.
Available on PyPI_.
.. _PyPI: https://pypi.org/pypi/email-scraper/
.. image:: https://github.com/kichik/email-scraper/workflows/Build/badge.svg
:target: https://github.com/kichik/email-scraper/actions
.. image:: https://badge.fury.io/py/email-scraper.svg
:target: https://badge.fury.io/py/email-scraper
Usage
-----
>>> from email_scraper import scrape_emails
>>> scrape_emails('<html><body><a href="mailto:hello@world.com">email me</a></body></html>')
{'hello@world.com'}
>>> scrape_emails('<a href="javascript:window.location.href=atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')">E-Mail</a>')
{'email@example.com'}
Keywords: email scraping web obfuscate
Platform: UNKNOWN
Author-email: Amir Szekely <kichik@gmail.com>
License-Expression: MIT
Project-URL: Homepage, https://github.com/kichik/email-scraper
Keywords: email,scraping,web,obfuscate
Classifier: Development Status :: 4 - Beta
Classifier: Programming Language :: Python
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Topic :: Communications :: Email
Classifier: Topic :: Text Processing :: Markup :: HTML
Requires-Python: >=3.9
Description-Content-Type: text/x-rst
License-File: LICENSE
Requires-Dist: tlds
Provides-Extra: dev
Requires-Dist: rstcheck; extra == "dev"
Dynamic: license-file
####################################################
Python Module for Scraping Email Addresses from HTML
####################################################
The `email_scraper` module provides a simple method that extracts email addresses from HTML. It is able to find emails
in plain text, links, `atob()` obfuscation and HTML entities obfuscation.
Available on PyPI_.
.. _PyPI: https://pypi.org/pypi/email-scraper/
.. image:: https://github.com/kichik/email-scraper/workflows/Build/badge.svg
:target: https://github.com/kichik/email-scraper/actions
.. image:: https://badge.fury.io/py/email-scraper.svg
:target: https://badge.fury.io/py/email-scraper
Usage
-----
>>> from email_scraper import scrape_emails
>>> scrape_emails('<html><body><a href="mailto:hello@world.com">email me</a></body></html>')
{'hello@world.com'}
>>> scrape_emails('<a href="javascript:window.location.href=atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')">E-Mail</a>')
{'email@example.com'}

@@ -0,4 +1,6 @@

.gitignore
LICENSE
README.rst
setup.cfg
setup.py
pyproject.toml
.github/workflows/test-and-publish.yml
email_scraper/__init__.py

@@ -11,2 +13,3 @@ email_scraper/scrape.py

email_scraper.egg-info/top_level.txt
email_scraper.egg-info/zip-safe
tests/__init__.py
tests/test_scraping.py

@@ -8,3 +8,3 @@ import base64

'local': 'a-z0-9!#$%&\'*+\\-/=?^_`{|}~',
'domain': 'a-z0-9\-',
'domain': 'a-z0-9-',
'tlds': '|'.join(tld_set)

@@ -16,3 +16,3 @@ }

HIDDEN_REGEX = [
'(\w+({0})\w+({1})\w+)'.format(
r'(\w+({0})\w+({1})\w+)'.format(
at.replace("(", r"\(").replace(")", r"\)").replace("[", r"\[").replace("]", r"\]"),

@@ -62,3 +62,3 @@ dot.replace("(", r"\(").replace(")", r"\)").replace("[", r"\[").replace("]", r"\]"),

html = unescape(html)
html = re.sub('atob\\([\'"]([A-Za-z0-9+/]+)[\'"]\\)', replace_atob, html, 0, re.IGNORECASE)
html = re.sub('atob\\([\'"]([A-Za-z0-9+/]+)[\'"]\\)', replace_atob, html, flags=re.IGNORECASE)
return html

@@ -65,0 +65,0 @@

+40
-35

@@ -1,41 +0,46 @@

Metadata-Version: 1.1
Metadata-Version: 2.4
Name: email-scraper
Version: 0.5
Version: 0.6
Summary: Simple utility to extract email addresses from HTML, including obfuscated email addresses
Home-page: https://github.com/kichik/email-scraper
Author: Amir Szekely
Author-email: kichik@gmail.com
License: MIT
Description: ####################################################
Python Module for Scraping Email Addresses from HTML
####################################################
The `email_scraper` module provides a simple method that extracts email addresses from HTML. It is able to find emails
in plain text, links, `atob()` obfuscation and HTML entities obfuscation.
Available on PyPI_.
.. _PyPI: https://pypi.org/pypi/email-scraper/
.. image:: https://github.com/kichik/email-scraper/workflows/Build/badge.svg
:target: https://github.com/kichik/email-scraper/actions
.. image:: https://badge.fury.io/py/email-scraper.svg
:target: https://badge.fury.io/py/email-scraper
Usage
-----
>>> from email_scraper import scrape_emails
>>> scrape_emails('<html><body><a href="mailto:hello@world.com">email me</a></body></html>')
{'hello@world.com'}
>>> scrape_emails('<a href="javascript:window.location.href=atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')">E-Mail</a>')
{'email@example.com'}
Keywords: email scraping web obfuscate
Platform: UNKNOWN
Author-email: Amir Szekely <kichik@gmail.com>
License-Expression: MIT
Project-URL: Homepage, https://github.com/kichik/email-scraper
Keywords: email,scraping,web,obfuscate
Classifier: Development Status :: 4 - Beta
Classifier: Programming Language :: Python
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Topic :: Communications :: Email
Classifier: Topic :: Text Processing :: Markup :: HTML
Requires-Python: >=3.9
Description-Content-Type: text/x-rst
License-File: LICENSE
Requires-Dist: tlds
Provides-Extra: dev
Requires-Dist: rstcheck; extra == "dev"
Dynamic: license-file
####################################################
Python Module for Scraping Email Addresses from HTML
####################################################
The `email_scraper` module provides a simple method that extracts email addresses from HTML. It is able to find emails
in plain text, links, `atob()` obfuscation and HTML entities obfuscation.
Available on PyPI_.
.. _PyPI: https://pypi.org/pypi/email-scraper/
.. image:: https://github.com/kichik/email-scraper/workflows/Build/badge.svg
:target: https://github.com/kichik/email-scraper/actions
.. image:: https://badge.fury.io/py/email-scraper.svg
:target: https://badge.fury.io/py/email-scraper
Usage
-----
>>> from email_scraper import scrape_emails
>>> scrape_emails('<html><body><a href="mailto:hello@world.com">email me</a></body></html>')
{'hello@world.com'}
>>> scrape_emails('<a href="javascript:window.location.href=atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')">E-Mail</a>')
{'email@example.com'}

@@ -1,4 +0,1 @@

[bdist_wheel]
universal = 1
[egg_info]

@@ -5,0 +2,0 @@ tag_build =

Sorry, the diff of this file is not supported yet

import os
import sys
from setuptools import setup, find_packages
# Derive the package version from the GITHUB_REF environment variable when it
# names a tag (presumably set by the CI run that publishes a release -- confirm
# against the workflow); every other build gets the placeholder version '0.0'.
version = os.getenv('GITHUB_REF')
if version and version.startswith('refs/tags/'):
    # Strip only the leading prefix that was just checked, rather than every
    # occurrence of the substring anywhere in the ref.
    version = version[len('refs/tags/'):]
else:
    version = '0.0'

if sys.version_info < (2, 7):
    sys.exit('Sorry, Python < 2.7 is not supported')

# Read the long description through a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open('README.rst') as readme_file:
    long_description = readme_file.read()

setup(
    name='email-scraper',
    version=version,
    description='Simple utility to extract email addresses from HTML, including obfuscated email addresses',
    long_description=long_description,
    classifiers=[
        'Development Status :: 4 - Beta',
        'Programming Language :: Python',
        'License :: OSI Approved :: MIT License',
        'Topic :: Communications :: Email',
        'Topic :: Text Processing :: Markup :: HTML'
    ],
    keywords='email scraping web obfuscate',
    author='Amir Szekely',
    author_email='kichik@gmail.com',
    url='https://github.com/kichik/email-scraper',
    license='MIT',
    packages=find_packages(exclude=['examples', 'tests']),
    install_requires=['tlds'],
    include_package_data=True,
    zip_safe=True,
    test_suite='tests',
)