email-scraper
Advanced tools
| name: Build | ||
| on: push | ||
| jobs: | ||
| build-n-publish: | ||
| name: Test, build and publish | ||
| runs-on: ubuntu-latest | ||
| strategy: | ||
| matrix: | ||
| python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] | ||
| permissions: | ||
| id-token: write # required for trusted publishing | ||
| steps: | ||
| - uses: actions/checkout@v5 | ||
| - name: Set up Python ${{ matrix.python-version }} | ||
| uses: actions/setup-python@v6 | ||
| with: | ||
| python-version: "${{ matrix.python-version }}" | ||
| - name: Dependencies | ||
| run: | | ||
| pip install build | ||
| pip install -e .[dev] | ||
| - name: Test | ||
| run: | | ||
| python -m unittest discover -s tests | ||
| - name: Test readme | ||
| run: | | ||
| rstcheck README.rst | ||
| - name: Build a binary wheel and a source tarball | ||
| run: | | ||
| python -m build | ||
| - name: Publish to PyPI | ||
| if: startsWith(github.event.ref, 'refs/tags') && matrix.python-version == '3.13' | ||
| uses: pypa/gh-action-pypi-publish@release/v1 |
| .idea | ||
| *.egg-info | ||
| build | ||
| dist | ||
| *.pyc | ||
| .eggs |
+7
| Copyright 2017 Amir Szekely | ||
| Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: | ||
| The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. | ||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| [build-system] | ||
| requires = ["setuptools>=80", "setuptools-scm[simple]>=8", "wheel"] | ||
| build-backend = "setuptools.build_meta" | ||
| [project] | ||
| name = "email-scraper" | ||
| dynamic = ["version"] | ||
| description = "Simple utility to extract email addresses from HTML, including obfuscated email addresses" | ||
| readme = "README.rst" | ||
| authors = [ | ||
| { name = "Amir Szekely", email = "kichik@gmail.com" } | ||
| ] | ||
| license = "MIT" | ||
| requires-python = ">=3.9" | ||
| keywords = ["email", "scraping", "web", "obfuscate"] | ||
| classifiers = [ | ||
| "Development Status :: 4 - Beta", | ||
| "Programming Language :: Python", | ||
| "Programming Language :: Python :: 3", | ||
| "Topic :: Communications :: Email", | ||
| "Topic :: Text Processing :: Markup :: HTML" | ||
| ] | ||
| dependencies = [ | ||
| "tlds" | ||
| ] | ||
| [project.optional-dependencies] | ||
| dev = [ | ||
| "rstcheck" | ||
| ] | ||
| [project.urls] | ||
| Homepage = "https://github.com/kichik/email-scraper" |
| import unittest | ||
| from email_scraper.scrape import extract_emails, deobfuscate_html, scrape_emails | ||
| class TestExtractor(unittest.TestCase): | ||
| def test_basic(self): | ||
| self.assertEqual(extract_emails('hello world'), []) | ||
| self.assertEqual(extract_emails('hello test@test.com world'), ['test@test.com']) | ||
| self.assertEqual(extract_emails('test@test.com test@test.com'), ['test@test.com', 'test@test.com']) | ||
| self.assertEqual(extract_emails('test@test.com test@example.com'), ['test@test.com', 'test@example.com']) | ||
| self.assertEqual(extract_emails('test@test.com,test@example.com'), ['test@test.com', 'test@example.com']) | ||
| self.assertEqual(extract_emails('hello test@test.com. i have been waiting for you.'), ['test@test.com']) | ||
| def test_basic_html(self): | ||
| self.assertEqual(extract_emails('<a href="mailto:test@test.com">boo</a>'), ['test@test.com']) | ||
| self.assertEqual(extract_emails('<a href=\'mailto:test@test.com\'>boo</a>'), ['test@test.com']) | ||
| self.assertEqual(extract_emails('<a href="mailto:test@test.com?subject=meh">boo</a>'), ['test@test.com']) | ||
| def test_tlds(self): | ||
| self.assertEqual(extract_emails('hello@something.com'), ['hello@something.com']) | ||
| self.assertEqual(extract_emails('hello@something.pizza'), ['hello@something.pizza']) | ||
| self.assertEqual(extract_emails('hello@something.notarealtld'), []) | ||
| def test_uppercase(self): | ||
| self.assertEqual(extract_emails('HELLO@something.com'), ['HELLO@something.com']) | ||
| self.assertEqual(extract_emails('HELLO@SOMETHING.com'), ['HELLO@SOMETHING.com']) | ||
| self.assertEqual(extract_emails('HELLO@SOMETHING.COM'), ['HELLO@SOMETHING.COM']) | ||
| self.assertEqual(extract_emails('HELLO@SOMETHING.pizza'), ['HELLO@SOMETHING.pizza']) | ||
| self.assertEqual(extract_emails('HELLO@SOMETHING.PIZZA'), ['HELLO@SOMETHING.PIZZA']) | ||
| class TestDeobfuscate(unittest.TestCase): | ||
| def test_entities(self): | ||
| self.assertEqual(deobfuscate_html('yourname@dom' | ||
| 'ain.com'), 'yourname@domain.com') | ||
| def test_atob(self): | ||
| atob = 'atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')' | ||
| self.assertEqual(deobfuscate_html(atob), 'mailto:email@example.com') | ||
| class TestHidden(unittest.TestCase): | ||
| def test_hidden(self): | ||
| self.assertEqual( | ||
| extract_emails("foo johnsmith (at) yahoo (dot) com bar"), | ||
| ["johnsmith@yahoo.com"] | ||
| ) | ||
| class TestScraping(unittest.TestCase): | ||
| def test_basic(self): | ||
| html = """<html> | ||
| <body> | ||
| <a href="mailto:hello@test.com">something@test.com</a> | ||
| </body> | ||
| </html>""" | ||
| self.assertEqual(scrape_emails(html), {'hello@test.com', 'something@test.com'}) | ||
| def test_atob(self): | ||
| atob = '<a href="javascript:window.location.href=atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')">E-Mail</a>' | ||
| self.assertEqual(scrape_emails(atob), {'email@example.com'}) | ||
| def test_entities(self): | ||
| html = """<p>For more information, send email to <A HREF="mailto: | ||
| yourname@domain.com"> | ||
| yourname@domain.com | ||
| </A></p>""" | ||
| self.assertEqual(scrape_emails(html), {'yourname@domain.com'}) |
@@ -1,41 +0,46 @@ | ||
| Metadata-Version: 1.1 | ||
| Metadata-Version: 2.4 | ||
| Name: email-scraper | ||
| Version: 0.5 | ||
| Version: 0.6 | ||
| Summary: Simple utility to extract email addresses from HTML, including obfuscated email addresses | ||
| Home-page: https://github.com/kichik/email-scraper | ||
| Author: Amir Szekely | ||
| Author-email: kichik@gmail.com | ||
| License: MIT | ||
| Description: #################################################### | ||
| Python Module for Scraping Email Addresses from HTML | ||
| #################################################### | ||
| The `email_scraper` module provides a simple method that extracts email addresses from HTML. It is able to find emails | ||
| in plain text, links, `atob()` obfuscation and HTML entities obfuscation. | ||
| Available on PyPI_. | ||
| .. _PyPI: https://pypi.org/pypi/email-scraper/ | ||
| .. image:: https://github.com/kichik/email-scraper/workflows/Build/badge.svg | ||
| :target: https://github.com/kichik/email-scraper/actions | ||
| .. image:: https://badge.fury.io/py/email-scraper.svg | ||
| :target: https://badge.fury.io/py/email-scraper | ||
| Usage | ||
| ----- | ||
| >>> from email_scraper import scrape_emails | ||
| >>> scrape_emails('<html><body><a href="mailto:hello@world.com">email me</a></body></html>') | ||
| {'hello@world.com'} | ||
| >>> scrape_emails('<a href="javascript:window.location.href=atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')">E-Mail</a>') | ||
| {'email@example.com'} | ||
| Keywords: email scraping web obfuscate | ||
| Platform: UNKNOWN | ||
| Author-email: Amir Szekely <kichik@gmail.com> | ||
| License-Expression: MIT | ||
| Project-URL: Homepage, https://github.com/kichik/email-scraper | ||
| Keywords: email,scraping,web,obfuscate | ||
| Classifier: Development Status :: 4 - Beta | ||
| Classifier: Programming Language :: Python | ||
| Classifier: License :: OSI Approved :: MIT License | ||
| Classifier: Programming Language :: Python :: 3 | ||
| Classifier: Topic :: Communications :: Email | ||
| Classifier: Topic :: Text Processing :: Markup :: HTML | ||
| Requires-Python: >=3.9 | ||
| Description-Content-Type: text/x-rst | ||
| License-File: LICENSE | ||
| Requires-Dist: tlds | ||
| Provides-Extra: dev | ||
| Requires-Dist: rstcheck; extra == "dev" | ||
| Dynamic: license-file | ||
| #################################################### | ||
| Python Module for Scraping Email Addresses from HTML | ||
| #################################################### | ||
| The `email_scraper` module provides a simple method that extracts email addresses from HTML. It is able to find emails | ||
| in plain text, links, `atob()` obfuscation and HTML entities obfuscation. | ||
| Available on PyPI_. | ||
| .. _PyPI: https://pypi.org/pypi/email-scraper/ | ||
| .. image:: https://github.com/kichik/email-scraper/workflows/Build/badge.svg | ||
| :target: https://github.com/kichik/email-scraper/actions | ||
| .. image:: https://badge.fury.io/py/email-scraper.svg | ||
| :target: https://badge.fury.io/py/email-scraper | ||
| Usage | ||
| ----- | ||
| >>> from email_scraper import scrape_emails | ||
| >>> scrape_emails('<html><body><a href="mailto:hello@world.com">email me</a></body></html>') | ||
| {'hello@world.com'} | ||
| >>> scrape_emails('<a href="javascript:window.location.href=atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')">E-Mail</a>') | ||
| {'email@example.com'} |
| tlds | ||
| [dev] | ||
| rstcheck |
@@ -0,4 +1,6 @@ | ||
| .gitignore | ||
| LICENSE | ||
| README.rst | ||
| setup.cfg | ||
| setup.py | ||
| pyproject.toml | ||
| .github/workflows/test-and-publish.yml | ||
| email_scraper/__init__.py | ||
@@ -11,2 +13,3 @@ email_scraper/scrape.py | ||
| email_scraper.egg-info/top_level.txt | ||
| email_scraper.egg-info/zip-safe | ||
| tests/__init__.py | ||
| tests/test_scraping.py |
@@ -8,3 +8,3 @@ import base64 | ||
| 'local': 'a-z0-9!#$%&\'*+\\-/=?^_`{|}~', | ||
| 'domain': 'a-z0-9\-', | ||
| 'domain': 'a-z0-9-', | ||
| 'tlds': '|'.join(tld_set) | ||
@@ -16,3 +16,3 @@ } | ||
| HIDDEN_REGEX = [ | ||
| '(\w+({0})\w+({1})\w+)'.format( | ||
| r'(\w+({0})\w+({1})\w+)'.format( | ||
| at.replace("(", r"\(").replace(")", r"\)").replace("[", r"\[").replace("]", r"\]"), | ||
@@ -62,3 +62,3 @@ dot.replace("(", r"\(").replace(")", r"\)").replace("[", r"\[").replace("]", r"\]"), | ||
| html = unescape(html) | ||
| html = re.sub('atob\\([\'"]([A-Za-z0-9+/]+)[\'"]\\)', replace_atob, html, 0, re.IGNORECASE) | ||
| html = re.sub('atob\\([\'"]([A-Za-z0-9+/]+)[\'"]\\)', replace_atob, html, flags=re.IGNORECASE) | ||
| return html | ||
@@ -65,0 +65,0 @@ |
+40
-35
@@ -1,41 +0,46 @@ | ||
| Metadata-Version: 1.1 | ||
| Metadata-Version: 2.4 | ||
| Name: email-scraper | ||
| Version: 0.5 | ||
| Version: 0.6 | ||
| Summary: Simple utility to extract email addresses from HTML, including obfuscated email addresses | ||
| Home-page: https://github.com/kichik/email-scraper | ||
| Author: Amir Szekely | ||
| Author-email: kichik@gmail.com | ||
| License: MIT | ||
| Description: #################################################### | ||
| Python Module for Scraping Email Addresses from HTML | ||
| #################################################### | ||
| The `email_scraper` module provides a simple method that extracts email addresses from HTML. It is able to find emails | ||
| in plain text, links, `atob()` obfuscation and HTML entities obfuscation. | ||
| Available on PyPI_. | ||
| .. _PyPI: https://pypi.org/pypi/email-scraper/ | ||
| .. image:: https://github.com/kichik/email-scraper/workflows/Build/badge.svg | ||
| :target: https://github.com/kichik/email-scraper/actions | ||
| .. image:: https://badge.fury.io/py/email-scraper.svg | ||
| :target: https://badge.fury.io/py/email-scraper | ||
| Usage | ||
| ----- | ||
| >>> from email_scraper import scrape_emails | ||
| >>> scrape_emails('<html><body><a href="mailto:hello@world.com">email me</a></body></html>') | ||
| {'hello@world.com'} | ||
| >>> scrape_emails('<a href="javascript:window.location.href=atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')">E-Mail</a>') | ||
| {'email@example.com'} | ||
| Keywords: email scraping web obfuscate | ||
| Platform: UNKNOWN | ||
| Author-email: Amir Szekely <kichik@gmail.com> | ||
| License-Expression: MIT | ||
| Project-URL: Homepage, https://github.com/kichik/email-scraper | ||
| Keywords: email,scraping,web,obfuscate | ||
| Classifier: Development Status :: 4 - Beta | ||
| Classifier: Programming Language :: Python | ||
| Classifier: License :: OSI Approved :: MIT License | ||
| Classifier: Programming Language :: Python :: 3 | ||
| Classifier: Topic :: Communications :: Email | ||
| Classifier: Topic :: Text Processing :: Markup :: HTML | ||
| Requires-Python: >=3.9 | ||
| Description-Content-Type: text/x-rst | ||
| License-File: LICENSE | ||
| Requires-Dist: tlds | ||
| Provides-Extra: dev | ||
| Requires-Dist: rstcheck; extra == "dev" | ||
| Dynamic: license-file | ||
| #################################################### | ||
| Python Module for Scraping Email Addresses from HTML | ||
| #################################################### | ||
| The `email_scraper` module provides a simple method that extracts email addresses from HTML. It is able to find emails | ||
| in plain text, links, `atob()` obfuscation and HTML entities obfuscation. | ||
| Available on PyPI_. | ||
| .. _PyPI: https://pypi.org/pypi/email-scraper/ | ||
| .. image:: https://github.com/kichik/email-scraper/workflows/Build/badge.svg | ||
| :target: https://github.com/kichik/email-scraper/actions | ||
| .. image:: https://badge.fury.io/py/email-scraper.svg | ||
| :target: https://badge.fury.io/py/email-scraper | ||
| Usage | ||
| ----- | ||
| >>> from email_scraper import scrape_emails | ||
| >>> scrape_emails('<html><body><a href="mailto:hello@world.com">email me</a></body></html>') | ||
| {'hello@world.com'} | ||
| >>> scrape_emails('<a href="javascript:window.location.href=atob(\'bWFpbHRvOmVtYWlsQGV4YW1wbGUuY29t\')">E-Mail</a>') | ||
| {'email@example.com'} |
+0
-3
@@ -1,4 +0,1 @@ | ||
| [bdist_wheel] | ||
| universal = 1 | ||
| [egg_info] | ||
@@ -5,0 +2,0 @@ tag_build = |
Sorry, the diff of this file is not supported yet
-36
| import os | ||
| import sys | ||
| from setuptools import setup, find_packages | ||
| version = os.getenv('GITHUB_REF') | ||
| if version and version.startswith('refs/tags/'): | ||
| version = version.replace('refs/tags/', '') | ||
| else: | ||
| version = '0.0' | ||
| if sys.version_info < (2, 7): | ||
| sys.exit('Sorry, Python < 2.7 is not supported') | ||
| setup(name='email-scraper', | ||
| version=version, | ||
| description='Simple utility to extract email addresses from HTML, including obfuscated email addresses', | ||
| long_description=open('README.rst').read(), | ||
| classifiers=[ | ||
| 'Development Status :: 4 - Beta', | ||
| 'Programming Language :: Python', | ||
| 'License :: OSI Approved :: MIT License', | ||
| 'Topic :: Communications :: Email', | ||
| 'Topic :: Text Processing :: Markup :: HTML' | ||
| ], | ||
| keywords='email scraping web obfuscate', | ||
| author='Amir Szekely', | ||
| author_email='kichik@gmail.com', | ||
| url='https://github.com/kichik/email-scraper', | ||
| license='MIT', | ||
| packages=find_packages(exclude=['examples', 'tests']), | ||
| install_requires=['tlds'], | ||
| include_package_data=True, | ||
| zip_safe=True, | ||
| test_suite='tests', | ||
| ) |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
13927
61.96%16
33.33%114
23.91%