New Research: Supply Chain Attack on Axios Pulls Malicious Dependency from npm.Details
Socket
Book a DemoSign in
Socket

tldextract

Package Overview
Dependencies
Maintainers
1
Versions
65
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

tldextract - pypi Package Compare versions

Comparing version
5.2.0
to
5.3.0
+17
-0
CHANGELOG.md

@@ -6,2 +6,19 @@ # tldextract Changelog

## 5.3.0 (2025-04-21)
* Features
* Add result field `registry_suffix` ([#344](https://github.com/john-kurkowski/tldextract/issues/344))
* To complement the existing public suffix field `suffix`
* Add result property `top_domain_under_public_suffix` ([#344](https://github.com/john-kurkowski/tldextract/issues/344))
* Add result property `top_domain_under_registry_suffix` ([#344](https://github.com/john-kurkowski/tldextract/issues/344))
* Deprecate `registered_domain` property
* Use `top_domain_under_public_suffix` instead, which has the same behavior
but a more accurate name
* Bugfixes
* Fix missing `reverse_domain_name` property in CLI `--json` output ([`a545c67`](https://github.com/john-kurkowski/tldextract/commit/a545c67d87223616fc13e90692886b3ca9af18bb))
* Misc.
* Expand internal `suffix_index` return type to be richer than bools, and
include the registry suffix during trie traversal
([#344](https://github.com/john-kurkowski/tldextract/issues/344))
## 5.2.0 (2025-04-07)

@@ -8,0 +25,0 @@

+3
-3
Metadata-Version: 2.4
Name: tldextract
Version: 5.2.0
Version: 5.3.0
Summary: Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.

@@ -93,3 +93,3 @@ Author-email: John Kurkowski <john.kurkowski@gmail.com>

>>> ext = tldextract.extract('http://forums.bbc.co.uk')
>>> ext.registered_domain
>>> ext.top_domain_under_public_suffix
'bbc.co.uk'

@@ -291,3 +291,3 @@ >>> ext.fqdn

split_suffix = extractor.extract_urllib(split_url)
url_to_crawl = f"{split_url.scheme}://{split_suffix.registered_domain}:{split_url.port}"
url_to_crawl = f"{split_url.scheme}://{split_suffix.top_domain_under_public_suffix}:{split_url.port}"
```

@@ -294,0 +294,0 @@

@@ -92,2 +92,5 @@ [project]

addopts = "--doctest-modules"
filterwarnings = [
"ignore:The 'registered_domain' property is deprecated:DeprecationWarning:tldextract.*:"
]

@@ -94,0 +97,0 @@ [tool.ruff.format]

@@ -52,3 +52,3 @@ # tldextract [![PyPI version](https://badge.fury.io/py/tldextract.svg)](https://badge.fury.io/py/tldextract) [![Build Status](https://github.com/john-kurkowski/tldextract/actions/workflows/ci.yml/badge.svg)](https://github.com/john-kurkowski/tldextract/actions/workflows/ci.yml)

>>> ext = tldextract.extract('http://forums.bbc.co.uk')
>>> ext.registered_domain
>>> ext.top_domain_under_public_suffix
'bbc.co.uk'

@@ -250,3 +250,3 @@ >>> ext.fqdn

split_suffix = extractor.extract_urllib(split_url)
url_to_crawl = f"{split_url.scheme}://{split_suffix.registered_domain}:{split_url.port}"
url_to_crawl = f"{split_url.scheme}://{split_suffix.top_domain_under_public_suffix}:{split_url.port}"
```

@@ -253,0 +253,0 @@

@@ -80,5 +80,3 @@ """tldextract integration tests."""

assert json.loads(stdout) == {
"subdomain": "www",
"domain": "bbc",
"suffix": "co.uk",
"fqdn": "www.bbc.co.uk",

@@ -89,2 +87,8 @@ "ipv4": "",

"registered_domain": "bbc.co.uk",
"registry_suffix": "co.uk",
"reverse_domain_name": "co.uk.bbc.www",
"subdomain": "www",
"suffix": "co.uk",
"top_domain_under_public_suffix": "bbc.co.uk",
"top_domain_under_registry_suffix": "bbc.co.uk",
}

@@ -35,8 +35,15 @@ """tldextract unit tests with a custom suffix list."""

assert tld("foo.blogspot.com") == ExtractResult("foo", "blogspot", "com", False)
assert tld("foo.blogspot.com") == ExtractResult(
subdomain="foo",
domain="blogspot",
suffix="com",
is_private=False,
registry_suffix="com",
)
assert tld("foo.blogspot.com", include_psl_private_domains=True) == ExtractResult(
"",
"foo",
"blogspot.com",
True,
subdomain="",
domain="foo",
suffix="blogspot.com",
is_private=True,
registry_suffix="com",
)

@@ -43,0 +50,0 @@

@@ -377,2 +377,38 @@ """Main tldextract unit tests."""

def test_top_domain_under_public_suffix() -> None:
    """Check `top_domain_under_public_suffix` with PSL private domains off and on."""
    url = "http://www.example.auth.us-east-1.amazoncognito.com"

    # Private domains excluded: only the plain "com" suffix is recognized.
    public_only = tldextract.extract(url, include_psl_private_domains=False)
    assert public_only.top_domain_under_public_suffix == "amazoncognito.com"

    # Private domains included: the expected result keeps one label in front
    # of the longer private suffix.
    with_private = tldextract.extract(url, include_psl_private_domains=True)
    assert (
        with_private.top_domain_under_public_suffix
        == "example.auth.us-east-1.amazoncognito.com"
    )
def test_top_domain_under_registry_suffix() -> None:
    """Check `top_domain_under_registry_suffix` yields the same value for both settings."""
    url = "http://www.example.auth.us-east-1.amazoncognito.com"

    # The expected value is identical whether or not PSL private domains are
    # taken into account.
    public_only = tldextract.extract(url, include_psl_private_domains=False)
    assert public_only.top_domain_under_registry_suffix == "amazoncognito.com"

    with_private = tldextract.extract(url, include_psl_private_domains=True)
    assert with_private.top_domain_under_registry_suffix == "amazoncognito.com"
def test_ipv4() -> None:

@@ -530,3 +566,7 @@ """Test IPv4 addresses."""

assert extract_private("foo.uk.com") == ExtractResult(
subdomain="", domain="foo", suffix="uk.com", is_private=True
subdomain="",
domain="foo",
suffix="uk.com",
is_private=True,
registry_suffix="com",
)

@@ -536,3 +576,9 @@ assert (

== extract_public2("foo.uk.com")
== ExtractResult(subdomain="foo", domain="uk", suffix="com", is_private=False)
== ExtractResult(
subdomain="foo",
domain="uk",
suffix="com",
is_private=False,
registry_suffix="com",
)
)

@@ -560,7 +606,17 @@

"blogspot.com", include_psl_private_domains=True
) == ExtractResult(subdomain="", domain="", suffix="blogspot.com", is_private=True)
) == ExtractResult(
subdomain="",
domain="",
suffix="blogspot.com",
is_private=True,
registry_suffix="com",
)
assert tldextract.extract(
"foo.blogspot.com", include_psl_private_domains=True
) == ExtractResult(
subdomain="", domain="foo", suffix="blogspot.com", is_private=True
subdomain="",
domain="foo",
suffix="blogspot.com",
is_private=True,
registry_suffix="com",
)

@@ -581,2 +637,3 @@

is_private=False,
registry_suffix="com",
)

@@ -586,7 +643,17 @@ assert tldextract.extract(

) == ExtractResult(
subdomain="ap-south-1", domain="amazonaws", suffix="com", is_private=False
subdomain="ap-south-1",
domain="amazonaws",
suffix="com",
is_private=False,
registry_suffix="com",
)
assert tldextract.extract(
"amazonaws.com", include_psl_private_domains=True
) == ExtractResult(subdomain="", domain="amazonaws", suffix="com", is_private=False)
) == ExtractResult(
subdomain="",
domain="amazonaws",
suffix="com",
is_private=False,
registry_suffix="com",
)
assert tldextract.extract(

@@ -600,2 +667,3 @@ "the-quick-brown-fox.cn-north-1.amazonaws.com.cn",

is_private=False,
registry_suffix="com.cn",
)

@@ -605,3 +673,7 @@ assert tldextract.extract(

) == ExtractResult(
subdomain="cn-north-1", domain="amazonaws", suffix="com.cn", is_private=False
subdomain="cn-north-1",
domain="amazonaws",
suffix="com.cn",
is_private=False,
registry_suffix="com.cn",
)

@@ -611,3 +683,7 @@ assert tldextract.extract(

) == ExtractResult(
subdomain="", domain="amazonaws", suffix="com.cn", is_private=False
subdomain="",
domain="amazonaws",
suffix="com.cn",
is_private=False,
registry_suffix="com.cn",
)

@@ -621,2 +697,3 @@ assert tldextract.extract(

is_private=True,
registry_suffix="com",
)

@@ -630,2 +707,3 @@ assert tldextract.extract(

is_private=True,
registry_suffix="com",
)

@@ -636,3 +714,7 @@

) == ExtractResult(
subdomain="", domain="", suffix="s3.ap-south-1.amazonaws.com", is_private=True
subdomain="",
domain="",
suffix="s3.ap-south-1.amazonaws.com",
is_private=True,
registry_suffix="com",
)

@@ -646,2 +728,3 @@ assert tldextract.extract(

is_private=True,
registry_suffix="com.cn",
)

@@ -651,3 +734,7 @@ assert tldextract.extract(

) == ExtractResult(
subdomain="", domain="", suffix="icann.compute.amazonaws.com", is_private=True
subdomain="",
domain="",
suffix="icann.compute.amazonaws.com",
is_private=True,
registry_suffix="com",
)

@@ -664,2 +751,3 @@

is_private=True,
registry_suffix="com",
)
Metadata-Version: 2.4
Name: tldextract
Version: 5.2.0
Version: 5.3.0
Summary: Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.

@@ -93,3 +93,3 @@ Author-email: John Kurkowski <john.kurkowski@gmail.com>

>>> ext = tldextract.extract('http://forums.bbc.co.uk')
>>> ext.registered_domain
>>> ext.top_domain_under_public_suffix
'bbc.co.uk'

@@ -291,3 +291,3 @@ >>> ext.fqdn

split_suffix = extractor.extract_urllib(split_url)
url_to_crawl = f"{split_url.scheme}://{split_suffix.registered_domain}:{split_url.port}"
url_to_crawl = f"{split_url.scheme}://{split_suffix.top_domain_under_public_suffix}:{split_url.port}"
```

@@ -294,0 +294,0 @@

@@ -20,3 +20,3 @@ # file generated by setuptools-scm

__version__ = version = '5.2.0'
__version_tuple__ = version_tuple = (5, 2, 0)
__version__ = version = '5.3.0'
__version_tuple__ = version_tuple = (5, 3, 0)

@@ -101,3 +101,11 @@ """tldextract CLI."""

if args.json:
properties = ("fqdn", "ipv4", "ipv6", "registered_domain")
properties = (
"fqdn",
"ipv4",
"ipv6",
"registered_domain",
"reverse_domain_name",
"top_domain_under_public_suffix",
"top_domain_under_registry_suffix",
)
print(

@@ -104,0 +112,0 @@ json.dumps(

@@ -31,3 +31,3 @@ """`tldextract` accurately separates a URL's subdomain, domain, and public suffix.

>>> ext = tldextract.extract("http://forums.bbc.co.uk")
>>> ext.registered_domain
>>> ext.top_domain_under_public_suffix
'bbc.co.uk'

@@ -42,4 +42,5 @@ >>> ext.fqdn

import urllib.parse
import warnings
from collections.abc import Collection, Sequence
from dataclasses import dataclass
from dataclasses import dataclass, field
from functools import wraps

@@ -101,15 +102,13 @@

@property
def registered_domain(self) -> str:
"""The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.
registry_suffix: str = field(repr=False)
"""The registry suffix of the input URL, if it contained one, or else the empty string.
>>> extract("http://forums.bbc.co.uk").registered_domain
'bbc.co.uk'
>>> extract("http://localhost:8080").registered_domain
''
"""
if self.suffix and self.domain:
return f"{self.domain}.{self.suffix}"
return ""
This field is a domain under which people can register subdomains through a
registrar.
This field is unaffected by the `include_psl_private_domains` setting. If
`include_psl_private_domains` was set to `False`, this field is always the
same as `suffix`.
"""
@property

@@ -175,2 +174,52 @@ def fqdn(self) -> str:

@property
def registered_domain(self) -> str:
    """The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.

    >>> extract("http://forums.bbc.co.uk").registered_domain
    'bbc.co.uk'
    >>> extract("http://localhost:8080").registered_domain
    ''

    .. deprecated:: 6.0.0
       This property is deprecated and will be removed in the next major
       version. Use `top_domain_under_public_suffix` instead, which has the
       same behavior but a more accurate name.

    This is an alias for the `top_domain_under_public_suffix` property.
    `registered_domain` is so called because it is roughly the domain the
    owner paid to register with a registrar or, in the case of a private
    domain, "registered" with the domain owner. If the input was not
    something one could register, this property returns the empty string.

    To distinguish the case of private domains, consider Blogspot, which is
    in the PSL's private domains. If `include_psl_private_domains` was set
    to `False`, the `registered_domain` property of a Blogspot URL
    represents the domain the owner of Blogspot registered with a
    registrar, i.e. Google registered "blogspot.com". If
    `include_psl_private_domains=True`, the `registered_domain` property
    represents the "blogspot.com" _subdomain_ the owner of a blog
    "registered" with Blogspot.

    >>> extract(
    ...     "http://waiterrant.blogspot.com", include_psl_private_domains=False
    ... ).registered_domain
    'blogspot.com'
    >>> extract(
    ...     "http://waiterrant.blogspot.com", include_psl_private_domains=True
    ... ).registered_domain
    'waiterrant.blogspot.com'

    To always get the same joined string, regardless of the
    `include_psl_private_domains` setting, consider the
    `top_domain_under_registry_suffix` property.
    """
    deprecation_message = (
        "The 'registered_domain' property is deprecated and will be removed in the next major version. "
        "Use 'top_domain_under_public_suffix' instead, which has the same behavior but a more accurate name."
    )
    # stacklevel=2 attributes the warning to the code that read the property
    # rather than to this getter.
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return self.top_domain_under_public_suffix
@property
def reverse_domain_name(self) -> str:

@@ -200,3 +249,45 @@ """The domain name in Reverse Domain Name Notation.

@property
def top_domain_under_registry_suffix(self) -> str:
    """The rightmost domain label and `registry_suffix` joined with a dot, if such a domain is available and `registry_suffix` is set, or else the empty string.

    The rightmost domain label might be in the `domain` field, or, if the
    input URL's suffix is a PSL private domain, in the public suffix
    `suffix` field.

    If the input was not in the PSL's private domains, this property is
    equivalent to `top_domain_under_public_suffix`.

    >>> extract(
    ...     "http://waiterrant.blogspot.com", include_psl_private_domains=False
    ... ).top_domain_under_registry_suffix
    'blogspot.com'
    >>> extract(
    ...     "http://waiterrant.blogspot.com", include_psl_private_domains=True
    ... ).top_domain_under_registry_suffix
    'blogspot.com'
    >>> extract("http://localhost:8080").top_domain_under_registry_suffix
    ''
    """
    under_public = self.top_domain_under_public_suffix
    if under_public and self.is_private:
        # Keep every label of the registry suffix (its dot count + 1) plus
        # the single label immediately to its left.
        labels_to_keep = self.registry_suffix.count(".") + 2
        trailing_labels = under_public.split(".")[-labels_to_keep:]
        return ".".join(trailing_labels)
    # Nothing to trim: either there is no such domain, or the suffix is not private.
    return under_public
@property
def top_domain_under_public_suffix(self) -> str:
    """Return `domain` and `suffix` joined by a dot, or the empty string unless both are set.

    >>> extract("http://forums.bbc.co.uk").top_domain_under_public_suffix
    'bbc.co.uk'
    >>> extract("http://localhost:8080").top_domain_under_public_suffix
    ''
    """
    # Guard clause: either part missing means there is nothing to join.
    if not (self.suffix and self.domain):
        return ""
    return f"{self.domain}.{self.suffix}"
class TLDExtract:

@@ -365,21 +456,55 @@ """A callable for extracting, subdomain, domain, and suffix components from a URL."""

):
return ExtractResult("", netloc_with_ascii_dots, "", is_private=False)
return ExtractResult(
"", netloc_with_ascii_dots, "", is_private=False, registry_suffix=""
)
labels = netloc_with_ascii_dots.split(".")
suffix_index, is_private = self._get_tld_extractor(
session=session
).suffix_index(labels, include_psl_private_domains=include_psl_private_domains)
maybe_indexes = self._get_tld_extractor(session).suffix_index(
labels, include_psl_private_domains=include_psl_private_domains
)
num_ipv4_labels = 4
if suffix_index == len(labels) == num_ipv4_labels and looks_like_ip(
netloc_with_ascii_dots
if (
not maybe_indexes
and len(labels) == num_ipv4_labels
and looks_like_ip(netloc_with_ascii_dots)
):
return ExtractResult("", netloc_with_ascii_dots, "", is_private)
return ExtractResult(
"", netloc_with_ascii_dots, "", is_private=False, registry_suffix=""
)
elif not maybe_indexes:
return ExtractResult(
subdomain=".".join(labels[:-1]),
domain=labels[-1],
suffix="",
is_private=False,
registry_suffix="",
)
suffix = ".".join(labels[suffix_index:]) if suffix_index != len(labels) else ""
subdomain = ".".join(labels[: suffix_index - 1]) if suffix_index >= 2 else ""
domain = labels[suffix_index - 1] if suffix_index else ""
return ExtractResult(subdomain, domain, suffix, is_private)
(
(public_suffix_index, public_suffix_node),
(registry_suffix_index, registry_suffix_node),
) = maybe_indexes
subdomain = (
".".join(labels[: public_suffix_index - 1])
if public_suffix_index >= 2
else ""
)
domain = labels[public_suffix_index - 1] if public_suffix_index > 0 else ""
public_suffix = ".".join(labels[public_suffix_index:])
registry_suffix = (
".".join(labels[registry_suffix_index:])
if public_suffix_node.is_private
else public_suffix
)
return ExtractResult(
subdomain=subdomain,
domain=domain,
suffix=public_suffix,
is_private=public_suffix_node.is_private,
registry_suffix=registry_suffix,
)
def update(

@@ -540,6 +665,6 @@ self, fetch_now: bool = False, session: requests.Session | None = None

self, spl: list[str], include_psl_private_domains: bool | None = None
) -> tuple[int, bool]:
"""Return the index of the first suffix label, and whether it is private.
) -> tuple[tuple[int, Trie], tuple[int, Trie]] | None:
"""Return the index of the first public suffix label, the index of the first registry suffix label, and their corresponding trie nodes.
Returns len(spl) if no suffix is found.
Returns `None` if no suffix is found.
"""

@@ -549,3 +674,3 @@ if include_psl_private_domains is None:

node = (
node = reg_node = (
self.tlds_incl_private_trie

@@ -555,11 +680,13 @@ if include_psl_private_domains

)
i = len(spl)
j = i
suffix_idx = reg_idx = label_idx = len(spl)
for label in reversed(spl):
decoded_label = _decode_punycode(label)
if decoded_label in node.matches:
j -= 1
label_idx -= 1
node = node.matches[decoded_label]
if node.end:
i = j
suffix_idx = label_idx
if not node.is_private:
reg_node = node
reg_idx = label_idx
continue

@@ -570,11 +697,18 @@

is_wildcard_exception = "!" + decoded_label in node.matches
if is_wildcard_exception:
return j, node.matches["*"].is_private
return j - 1, node.matches["*"].is_private
return (
label_idx if is_wildcard_exception else label_idx - 1,
node.matches["*"],
), (
reg_idx,
reg_node,
)
break
return i, node.is_private
if suffix_idx == len(spl):
return None
return ((suffix_idx, node), (reg_idx, reg_node))
def _decode_punycode(label: str) -> str:

@@ -581,0 +715,0 @@ lowered = label.lower()