# CI workflow: on every push, test on each Python version in the matrix, check
# the README, build the distributions, and publish to PyPI for tag pushes.
name: Build

on: push

jobs:
  build-n-publish:
    name: Test, build and publish

    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Versions are quoted so that e.g. "3.10" is not parsed as the float 3.1.
        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"]

    permissions:
      id-token: write  # required for trusted publishing

    steps:
      - uses: actions/checkout@v5
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v6
        with:
          python-version: "${{ matrix.python-version }}"
      - name: Dependencies
        run: |
          pip install build
          pip install -e .[dev]
      - name: Test
        run: |
          python -m unittest discover -s tests
      - name: Test readme
        run: |
          rstcheck README.rst
      - name: Build a binary wheel and a source tarball
        run: |
          python -m build
      # Publish only for tag refs, and only from the 3.13 matrix job, so each
      # tag is uploaded a single time rather than once per Python version.
      - name: Publish to PyPI
        if: startsWith(github.event.ref, 'refs/tags') && matrix.python-version == '3.13'
        uses: pypa/gh-action-pypi-publish@release/v1

+6

.gitignore

		.idea
		*.egg-info
		build
		dist
		*.pyc
		.eggs

+7

LICENSE

		Copyright 2017 Amir Szekely

		Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

		The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

		THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+33

pyproject.toml

		# Build backend and the packages needed to build the distribution.
		# NOTE(review): confirm that the `simple` extra exists in the lowest
		# setuptools-scm release allowed by the `>=8` floor below.
		[build-system]
		build-backend = "setuptools.build_meta"
		requires = [
		    "setuptools>=80",
		    "setuptools-scm[simple]>=8",
		    "wheel",
		]

		# Core project metadata. The version is declared dynamic, so it is
		# supplied by the build backend rather than written here.
		[project]
		name = "email-scraper"
		dynamic = ["version"]
		description = "Simple utility to extract email addresses from HTML, including obfuscated email addresses"
		readme = "README.rst"
		license = "MIT"
		requires-python = ">=3.9"
		authors = [{ name = "Amir Szekely", email = "kichik@gmail.com" }]
		keywords = [
		    "email",
		    "scraping",
		    "web",
		    "obfuscate",
		]
		classifiers = [
		    "Development Status :: 4 - Beta",
		    "Programming Language :: Python",
		    "Programming Language :: Python :: 3",
		    "Topic :: Communications :: Email",
		    "Topic :: Text Processing :: Markup :: HTML",
		]
		dependencies = ["tlds"]

		# Extra installed by the CI workflow via `pip install -e .[dev]`.
		[project.optional-dependencies]
		dev = ["rstcheck"]

		[project.urls]
		Homepage = "https://github.com/kichik/email-scraper"

tests/__init__.py

+67

tests/test_scraping.py

		import unittest

		from email_scraper.scrape import extract_emails, deobfuscate_html, scrape_emails


		class TestExtractor(unittest.TestCase):
		    """Exercises ``extract_emails`` on plain text, HTML links, TLDs and upper-case input."""

		    def _expect(self, cases):
		        """Assert, in order, that each input text extracts to its expected list."""
		        for text, expected in cases:
		            self.assertEqual(extract_emails(text), expected)

		    def test_basic(self):
		        """Plain text: no address, one address, repeats, separators and a trailing period."""
		        self._expect([
		            ('hello world', []),
		            ('hello test@test.com world', ['test@test.com']),
		            ('test@test.com test@test.com', ['test@test.com', 'test@test.com']),
		            ('test@test.com test@example.com', ['test@test.com', 'test@example.com']),
		            ('test@test.com,test@example.com', ['test@test.com', 'test@example.com']),
		            ('hello test@test.com. i have been waiting for you.', ['test@test.com']),
		        ])

		    def test_basic_html(self):
		        """``mailto:`` links with double quotes, single quotes and a query string."""
		        self._expect([
		            ('<a href="mailto:test@test.com">boo</a>', ['test@test.com']),
		            ('<a href=\'mailto:test@test.com\'>boo</a>', ['test@test.com']),
		            ('<a href="mailto:test@test.com?subject=meh">boo</a>', ['test@test.com']),
		        ])

		    def test_tlds(self):
		        """Addresses under ``.com`` and ``.pizza`` match; ``.notarealtld`` does not."""
		        self._expect([
		            ('hello@something.com', ['hello@something.com']),
		            ('hello@something.pizza', ['hello@something.pizza']),
		            ('hello@something.notarealtld', []),
		        ])

		    def test_uppercase(self):
		        """Upper-case local parts, domains and TLDs are returned with their case intact."""
		        self._expect([
		            ('HELLO@something.com', ['HELLO@something.com']),
		            ('HELLO@SOMETHING.com', ['HELLO@SOMETHING.com']),
		            ('HELLO@SOMETHING.COM', ['HELLO@SOMETHING.COM']),
		            ('HELLO@SOMETHING.pizza', ['HELLO@SOMETHING.pizza']),
		            ('HELLO@SOMETHING.PIZZA', ['HELLO@SOMETHING.PIZZA']),
		        ])


		class TestDeobfuscate(unittest.TestCase):
		    """Checks the text returned by ``deobfuscate_html`` for two inputs."""

		    def test_entities(self):
		        # NOTE(review): the input below contains no HTML entities as shown
		        # here; confirm against the repository copy of this test.
		        source = ('yourname@dom'
		                  'ain.com')
		        self.assertEqual(deobfuscate_html(source), 'yourname@domain.com')

		    def test_atob(self):
		        # An ``atob('...')`` call is expected to be replaced by its decoded payload.
		        encoded_call = 'atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')'
		        decoded = deobfuscate_html(encoded_call)
		        self.assertEqual(decoded, 'mailto:email@example.com')

		class TestHidden(unittest.TestCase):
		    """Covers an address written with "(at)" and "(dot)" placeholders."""

		    def test_hidden(self):
		        text = "foo johnsmith (at) yahoo (dot) com bar"
		        found = extract_emails(text)
		        self.assertEqual(found, ["johnsmith@yahoo.com"])

		# End-to-end checks of scrape_emails(); each test compares the result to a set.
		class TestScraping(unittest.TestCase):
		# The mailto: target and the link text are different addresses; both are expected.
		def test_basic(self):
		html = """<html>
		<body>
		<a href="mailto:hello@test.com">something@test.com</a>
		</body>
		</html>"""
		self.assertEqual(scrape_emails(html), {'hello@test.com', 'something@test.com'})

		# The address appears only inside an atob('...') call in a javascript: href.
		def test_atob(self):
		atob = '<a href="javascript:window.location.href=atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')">E-Mail</a>'
		self.assertEqual(scrape_emails(atob), {'email@example.com'})

		# The same address appears in the href (after a line break) and in the link
		# text; the expected result holds it once.
		# NOTE(review): despite the test name, no HTML entities are visible in the
		# literal as shown here; confirm against the repository copy.
		def test_entities(self):
		html = """<p>For more information, send email to <A HREF="mailto:
		yourname@domain.com">
		yourname@domain.com
		</A></p>"""
		self.assertEqual(scrape_emails(html), {'yourname@domain.com'})

+40

-35

email_scraper.egg-info/PKG-INFO

		@@ -1,41 +0,46 @@
		Metadata-Version: 1.1
		Metadata-Version: 2.4
		Name: email-scraper
		Version: 0.5
		Version: 0.6
		Summary: Simple utility to extract email addresses from HTML, including obfuscated email addresses
		Home-page: https://github.com/kichik/email-scraper
		Author: Amir Szekely
		Author-email: kichik@gmail.com
		License: MIT
		Description: ####################################################
		Python Module for Scraping Email Addresses from HTML
		####################################################

		The `email_scraper` module provides a simple method that extracts email addresses from HTML. It is able to find emails
		in plain text, links, `atob()` obfuscation and HTML entities obfuscation.

		Available on PyPI_.

		.. _PyPI: https://pypi.org/pypi/email-scraper/

		.. image:: https://github.com/kichik/email-scraper/workflows/Build/badge.svg
		:target: https://github.com/kichik/email-scraper/actions

		.. image:: https://badge.fury.io/py/email-scraper.svg
		:target: https://badge.fury.io/py/email-scraper

		Usage
		-----

		>>> from email_scraper import scrape_emails
		>>> scrape_emails('<html><body><a href="mailto:hello@world.com">email me</a></body></html>')
		{'hello@world.com'}
		>>> scrape_emails('<a href="javascript:window.location.href=atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')">E-Mail</a>')
		{'email@example.com'}

		Keywords: email scraping web obfuscate
		Platform: UNKNOWN
		Author-email: Amir Szekely <kichik@gmail.com>
		License-Expression: MIT
		Project-URL: Homepage, https://github.com/kichik/email-scraper
		Keywords: email,scraping,web,obfuscate
		Classifier: Development Status :: 4 - Beta
		Classifier: Programming Language :: Python
		Classifier: License :: OSI Approved :: MIT License
		Classifier: Programming Language :: Python :: 3
		Classifier: Topic :: Communications :: Email
		Classifier: Topic :: Text Processing :: Markup :: HTML
		Requires-Python: >=3.9
		Description-Content-Type: text/x-rst
		License-File: LICENSE
		Requires-Dist: tlds
		Provides-Extra: dev
		Requires-Dist: rstcheck; extra == "dev"
		Dynamic: license-file

		####################################################
		Python Module for Scraping Email Addresses from HTML
		####################################################

		The `email_scraper` module provides a simple method that extracts email addresses from HTML. It is able to find emails
		in plain text, links, `atob()` obfuscation and HTML entities obfuscation.

		Available on PyPI_.

		.. _PyPI: https://pypi.org/pypi/email-scraper/

		.. image:: https://github.com/kichik/email-scraper/workflows/Build/badge.svg
		:target: https://github.com/kichik/email-scraper/actions

		.. image:: https://badge.fury.io/py/email-scraper.svg
		:target: https://badge.fury.io/py/email-scraper

		Usage
		-----

		>>> from email_scraper import scrape_emails
		>>> scrape_emails('<html><body><a href="mailto:hello@world.com">email me</a></body></html>')
		{'hello@world.com'}
		>>> scrape_emails('<a href="javascript:window.location.href=atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')">E-Mail</a>')
		{'email@example.com'}

+3

-0

email_scraper.egg-info/requires.txt

		tlds

		[dev]
		rstcheck

+6

-3

email_scraper.egg-info/SOURCES.txt

		@@ -0,4 +1,6 @@
		.gitignore
		LICENSE
		README.rst
		setup.cfg
		setup.py
		pyproject.toml
		.github/workflows/test-and-publish.yml
		email_scraper/__init__.py
		@@ -11,2 +13,3 @@ email_scraper/scrape.py
		email_scraper.egg-info/top_level.txt
		email_scraper.egg-info/zip-safe
		tests/__init__.py
		tests/test_scraping.py

+3

-3

email_scraper/scrape.py

		@@ -8,3 +8,3 @@ import base64
		'local': 'a-z0-9!#$%&\'*+\\-/=?^_`{\|}~',
		'domain': 'a-z0-9\-',
		'domain': 'a-z0-9-',
		'tlds': '\|'.join(tld_set)
		@@ -16,3 +16,3 @@ }
		HIDDEN_REGEX = [
		'(\w+({0})\w+({1})\w+)'.format(
		r'(\w+({0})\w+({1})\w+)'.format(
		at.replace("(", r"$").replace(")", r"$").replace("[", r"\[").replace("]", r"\]"),
		@@ -62,3 +62,3 @@ dot.replace("(", r"$").replace(")", r"$").replace("[", r"\[").replace("]", r"\]"),
		html = unescape(html)
		html = re.sub('atob\$[\'"]([A-Za-z0-9+/]+)[\'"]\$', replace_atob, html, 0, re.IGNORECASE)
		html = re.sub('atob\$[\'"]([A-Za-z0-9+/]+)[\'"]\$', replace_atob, html, flags=re.IGNORECASE)
		return html
		@@ -65,0 +65,0 @@

+40

-35

PKG-INFO

		@@ -1,41 +0,46 @@
		Metadata-Version: 1.1
		Metadata-Version: 2.4
		Name: email-scraper
		Version: 0.5
		Version: 0.6
		Summary: Simple utility to extract email addresses from HTML, including obfuscated email addresses
		Home-page: https://github.com/kichik/email-scraper
		Author: Amir Szekely
		Author-email: kichik@gmail.com
		License: MIT
		Description: ####################################################
		Python Module for Scraping Email Addresses from HTML
		####################################################

		The `email_scraper` module provides a simple method that extracts email addresses from HTML. It is able to find emails
		in plain text, links, `atob()` obfuscation and HTML entities obfuscation.

		Available on PyPI_.

		.. _PyPI: https://pypi.org/pypi/email-scraper/

		.. image:: https://github.com/kichik/email-scraper/workflows/Build/badge.svg
		:target: https://github.com/kichik/email-scraper/actions

		.. image:: https://badge.fury.io/py/email-scraper.svg
		:target: https://badge.fury.io/py/email-scraper

		Usage
		-----

		>>> from email_scraper import scrape_emails
		>>> scrape_emails('<html><body><a href="mailto:hello@world.com">email me</a></body></html>')
		{'hello@world.com'}
		>>> scrape_emails('<a href="javascript:window.location.href=atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')">E-Mail</a>')
		{'email@example.com'}

		Keywords: email scraping web obfuscate
		Platform: UNKNOWN
		Author-email: Amir Szekely <kichik@gmail.com>
		License-Expression: MIT
		Project-URL: Homepage, https://github.com/kichik/email-scraper
		Keywords: email,scraping,web,obfuscate
		Classifier: Development Status :: 4 - Beta
		Classifier: Programming Language :: Python
		Classifier: License :: OSI Approved :: MIT License
		Classifier: Programming Language :: Python :: 3
		Classifier: Topic :: Communications :: Email
		Classifier: Topic :: Text Processing :: Markup :: HTML
		Requires-Python: >=3.9
		Description-Content-Type: text/x-rst
		License-File: LICENSE
		Requires-Dist: tlds
		Provides-Extra: dev
		Requires-Dist: rstcheck; extra == "dev"
		Dynamic: license-file

		####################################################
		Python Module for Scraping Email Addresses from HTML
		####################################################

		The `email_scraper` module provides a simple method that extracts email addresses from HTML. It is able to find emails
		in plain text, links, `atob()` obfuscation and HTML entities obfuscation.

		Available on PyPI_.

		.. _PyPI: https://pypi.org/pypi/email-scraper/

		.. image:: https://github.com/kichik/email-scraper/workflows/Build/badge.svg
		:target: https://github.com/kichik/email-scraper/actions

		.. image:: https://badge.fury.io/py/email-scraper.svg
		:target: https://badge.fury.io/py/email-scraper

		Usage
		-----

		>>> from email_scraper import scrape_emails
		>>> scrape_emails('<html><body><a href="mailto:hello@world.com">email me</a></body></html>')
		{'hello@world.com'}
		>>> scrape_emails('<a href="javascript:window.location.href=atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')">E-Mail</a>')
		{'email@example.com'}

+0

-3

setup.cfg

		@@ -1,4 +0,1 @@
		[bdist_wheel]
		universal = 1

		[egg_info]
		@@ -5,0 +2,0 @@ tag_build =

email_scraper.egg-info/zip-safe

Sorry, the diff of this file is not supported yet

-36

setup.py

		# Packaging script: registers the email-scraper distribution with setuptools.
		import os

		import sys
		from setuptools import setup, find_packages

		# Use the tag name from GITHUB_REF when it names a tag; otherwise fall back to 0.0.
		version = os.getenv('GITHUB_REF')
		if version and version.startswith('refs/tags/'):
		    version = version.replace('refs/tags/', '')
		else:
		    version = '0.0'

		if sys.version_info < (2, 7):
		    sys.exit('Sorry, Python < 2.7 is not supported')

		# Read the long description inside a context manager so the file handle is
		# closed as soon as the text has been read, instead of being left open.
		with open('README.rst') as readme_file:
		    long_description = readme_file.read()

		setup(name='email-scraper',
		      version=version,
		      description='Simple utility to extract email addresses from HTML, including obfuscated email addresses',
		      long_description=long_description,
		      classifiers=[
		          'Development Status :: 4 - Beta',
		          'Programming Language :: Python',
		          'License :: OSI Approved :: MIT License',
		          'Topic :: Communications :: Email',
		          'Topic :: Text Processing :: Markup :: HTML'
		      ],
		      keywords='email scraping web obfuscate',
		      author='Amir Szekely',
		      author_email='kichik@gmail.com',
		      url='https://github.com/kichik/email-scraper',
		      license='MIT',
		      # The examples and tests packages are left out of the distribution.
		      packages=find_packages(exclude=['examples', 'tests']),
		      install_requires=['tlds'],
		      include_package_data=True,
		      zip_safe=True,
		      test_suite='tests',
		      )

email-scraper - pypi Package Compare versions

Improved metrics