tldextract - PyPI Package Compare versions

+17

-0

CHANGELOG.md

		@@ -6,2 +6,19 @@ # tldextract Changelog

		## 5.3.0 (2025-04-21)

		* Features
		* Add result field `registry_suffix` ([#344](https://github.com/john-kurkowski/tldextract/issues/344))
		* To complement the existing public suffix field `suffix`
		* Add result property `top_domain_under_public_suffix` ([#344](https://github.com/john-kurkowski/tldextract/issues/344))
		* Add result property `top_domain_under_registry_suffix` ([#344](https://github.com/john-kurkowski/tldextract/issues/344))
		* Deprecate `registered_domain` property
		* Use `top_domain_under_public_suffix` instead, which has the same behavior
		but a more accurate name
		* Bugfixes
		* Fix missing `reverse_domain_name` property in CLI `--json` output ([`a545c67`](https://github.com/john-kurkowski/tldextract/commit/a545c67d87223616fc13e90692886b3ca9af18bb))
		* Misc.
		* Expand internal `suffix_index` return type to be richer than bools, and
		include the registry suffix during trie traversal
		([#344](https://github.com/john-kurkowski/tldextract/issues/344))

		## 5.2.0 (2025-04-07)
		@@ -8,0 +25,0 @@

+3

-3

PKG-INFO

		Metadata-Version: 2.4
		Name: tldextract
		Version: 5.2.0
		Version: 5.3.0
		Summary: Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.
		@@ -93,3 +93,3 @@ Author-email: John Kurkowski <john.kurkowski@gmail.com>
		>>> ext = tldextract.extract('http://forums.bbc.co.uk')
		>>> ext.registered_domain
		>>> ext.top_domain_under_public_suffix
		'bbc.co.uk'
		@@ -291,3 +291,3 @@ >>> ext.fqdn
		split_suffix = extractor.extract_urllib(split_url)
		url_to_crawl = f"{split_url.scheme}://{split_suffix.registered_domain}:{split_url.port}"
		url_to_crawl = f"{split_url.scheme}://{split_suffix.top_domain_under_public_suffix}:{split_url.port}"
		```
		@@ -294,0 +294,0 @@

+3

-0

pyproject.toml

		@@ -92,2 +92,5 @@ [project]
		addopts = "--doctest-modules"
		filterwarnings = [
		"ignore:The 'registered_domain' property is deprecated:DeprecationWarning:tldextract.*:"
		]

		@@ -94,0 +97,0 @@ [tool.ruff.format]

+2

-2

README.md

		@@ -52,3 +52,3 @@ # tldextract [![PyPI version](https://badge.fury.io/py/tldextract.svg)](https://badge.fury.io/py/tldextract) [![Build Status](https://github.com/john-kurkowski/tldextract/actions/workflows/ci.yml/badge.svg)](https://github.com/john-kurkowski/tldextract/actions/workflows/ci.yml)
		>>> ext = tldextract.extract('http://forums.bbc.co.uk')
		>>> ext.registered_domain
		>>> ext.top_domain_under_public_suffix
		'bbc.co.uk'
		@@ -250,3 +250,3 @@ >>> ext.fqdn
		split_suffix = extractor.extract_urllib(split_url)
		url_to_crawl = f"{split_url.scheme}://{split_suffix.registered_domain}:{split_url.port}"
		url_to_crawl = f"{split_url.scheme}://{split_suffix.top_domain_under_public_suffix}:{split_url.port}"
		```
		@@ -253,0 +253,0 @@

+6

-2

tests/cli_test.py

		@@ -80,5 +80,3 @@ """tldextract integration tests."""
		assert json.loads(stdout) == {
		"subdomain": "www",
		"domain": "bbc",
		"suffix": "co.uk",
		"fqdn": "www.bbc.co.uk",
		@@ -89,2 +87,8 @@ "ipv4": "",
		"registered_domain": "bbc.co.uk",
		"registry_suffix": "co.uk",
		"reverse_domain_name": "co.uk.bbc.www",
		"subdomain": "www",
		"suffix": "co.uk",
		"top_domain_under_public_suffix": "bbc.co.uk",
		"top_domain_under_registry_suffix": "bbc.co.uk",
		}

+12

-5

tests/custom_suffix_test.py

		@@ -35,8 +35,15 @@ """tldextract unit tests with a custom suffix list."""

		assert tld("foo.blogspot.com") == ExtractResult("foo", "blogspot", "com", False)
		assert tld("foo.blogspot.com") == ExtractResult(
		subdomain="foo",
		domain="blogspot",
		suffix="com",
		is_private=False,
		registry_suffix="com",
		)
		assert tld("foo.blogspot.com", include_psl_private_domains=True) == ExtractResult(
		"",
		"foo",
		"blogspot.com",
		True,
		subdomain="",
		domain="foo",
		suffix="blogspot.com",
		is_private=True,
		registry_suffix="com",
		)
		@@ -43,0 +50,0 @@

+98

-10

tests/main_test.py

		@@ -377,2 +377,38 @@ """Main tldextract unit tests."""

		def test_top_domain_under_public_suffix() -> None:
		"""Test property `top_domain_under_public_suffix`."""
		assert (
		tldextract.extract(
		"http://www.example.auth.us-east-1.amazoncognito.com",
		include_psl_private_domains=False,
		).top_domain_under_public_suffix
		== "amazoncognito.com"
		)
		assert (
		tldextract.extract(
		"http://www.example.auth.us-east-1.amazoncognito.com",
		include_psl_private_domains=True,
		).top_domain_under_public_suffix
		== "example.auth.us-east-1.amazoncognito.com"
		)


		def test_top_domain_under_registry_suffix() -> None:
		"""Test property `top_domain_under_registry_suffix`."""
		assert (
		tldextract.extract(
		"http://www.example.auth.us-east-1.amazoncognito.com",
		include_psl_private_domains=False,
		).top_domain_under_registry_suffix
		== "amazoncognito.com"
		)
		assert (
		tldextract.extract(
		"http://www.example.auth.us-east-1.amazoncognito.com",
		include_psl_private_domains=True,
		).top_domain_under_registry_suffix
		== "amazoncognito.com"
		)


		def test_ipv4() -> None:
		@@ -530,3 +566,7 @@ """Test IPv4 addresses."""
		assert extract_private("foo.uk.com") == ExtractResult(
		subdomain="", domain="foo", suffix="uk.com", is_private=True
		subdomain="",
		domain="foo",
		suffix="uk.com",
		is_private=True,
		registry_suffix="com",
		)
		@@ -536,3 +576,9 @@ assert (
		== extract_public2("foo.uk.com")
		== ExtractResult(subdomain="foo", domain="uk", suffix="com", is_private=False)
		== ExtractResult(
		subdomain="foo",
		domain="uk",
		suffix="com",
		is_private=False,
		registry_suffix="com",
		)
		)
		@@ -560,7 +606,17 @@
		"blogspot.com", include_psl_private_domains=True
		) == ExtractResult(subdomain="", domain="", suffix="blogspot.com", is_private=True)
		) == ExtractResult(
		subdomain="",
		domain="",
		suffix="blogspot.com",
		is_private=True,
		registry_suffix="com",
		)
		assert tldextract.extract(
		"foo.blogspot.com", include_psl_private_domains=True
		) == ExtractResult(
		subdomain="", domain="foo", suffix="blogspot.com", is_private=True
		subdomain="",
		domain="foo",
		suffix="blogspot.com",
		is_private=True,
		registry_suffix="com",
		)
		@@ -581,2 +637,3 @@
		is_private=False,
		registry_suffix="com",
		)
		@@ -586,7 +643,17 @@ assert tldextract.extract(
		) == ExtractResult(
		subdomain="ap-south-1", domain="amazonaws", suffix="com", is_private=False
		subdomain="ap-south-1",
		domain="amazonaws",
		suffix="com",
		is_private=False,
		registry_suffix="com",
		)
		assert tldextract.extract(
		"amazonaws.com", include_psl_private_domains=True
		) == ExtractResult(subdomain="", domain="amazonaws", suffix="com", is_private=False)
		) == ExtractResult(
		subdomain="",
		domain="amazonaws",
		suffix="com",
		is_private=False,
		registry_suffix="com",
		)
		assert tldextract.extract(
		@@ -600,2 +667,3 @@ "the-quick-brown-fox.cn-north-1.amazonaws.com.cn",
		is_private=False,
		registry_suffix="com.cn",
		)
		@@ -605,3 +673,7 @@ assert tldextract.extract(
		) == ExtractResult(
		subdomain="cn-north-1", domain="amazonaws", suffix="com.cn", is_private=False
		subdomain="cn-north-1",
		domain="amazonaws",
		suffix="com.cn",
		is_private=False,
		registry_suffix="com.cn",
		)
		@@ -611,3 +683,7 @@ assert tldextract.extract(
		) == ExtractResult(
		subdomain="", domain="amazonaws", suffix="com.cn", is_private=False
		subdomain="",
		domain="amazonaws",
		suffix="com.cn",
		is_private=False,
		registry_suffix="com.cn",
		)
		@@ -621,2 +697,3 @@ assert tldextract.extract(
		is_private=True,
		registry_suffix="com",
		)
		@@ -630,2 +707,3 @@ assert tldextract.extract(
		is_private=True,
		registry_suffix="com",
		)
		@@ -636,3 +714,7 @@
		) == ExtractResult(
		subdomain="", domain="", suffix="s3.ap-south-1.amazonaws.com", is_private=True
		subdomain="",
		domain="",
		suffix="s3.ap-south-1.amazonaws.com",
		is_private=True,
		registry_suffix="com",
		)
		@@ -646,2 +728,3 @@ assert tldextract.extract(
		is_private=True,
		registry_suffix="com.cn",
		)
		@@ -651,3 +734,7 @@ assert tldextract.extract(
		) == ExtractResult(
		subdomain="", domain="", suffix="icann.compute.amazonaws.com", is_private=True
		subdomain="",
		domain="",
		suffix="icann.compute.amazonaws.com",
		is_private=True,
		registry_suffix="com",
		)
		@@ -664,2 +751,3 @@
		is_private=True,
		registry_suffix="com",
		)

+3

-3

tldextract.egg-info/PKG-INFO

		Metadata-Version: 2.4
		Name: tldextract
		Version: 5.2.0
		Version: 5.3.0
		Summary: Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.
		@@ -93,3 +93,3 @@ Author-email: John Kurkowski <john.kurkowski@gmail.com>
		>>> ext = tldextract.extract('http://forums.bbc.co.uk')
		>>> ext.registered_domain
		>>> ext.top_domain_under_public_suffix
		'bbc.co.uk'
		@@ -291,3 +291,3 @@ >>> ext.fqdn
		split_suffix = extractor.extract_urllib(split_url)
		url_to_crawl = f"{split_url.scheme}://{split_suffix.registered_domain}:{split_url.port}"
		url_to_crawl = f"{split_url.scheme}://{split_suffix.top_domain_under_public_suffix}:{split_url.port}"
		```
		@@ -294,0 +294,0 @@

+2

-2

tldextract/_version.py

		@@ -20,3 +20,3 @@ # file generated by setuptools-scm

		__version__ = version = '5.2.0'
		__version_tuple__ = version_tuple = (5, 2, 0)
		__version__ = version = '5.3.0'
		__version_tuple__ = version_tuple = (5, 3, 0)

+9

-1

tldextract/cli.py

		@@ -101,3 +101,11 @@ """tldextract CLI."""
		if args.json:
		properties = ("fqdn", "ipv4", "ipv6", "registered_domain")
		properties = (
		"fqdn",
		"ipv4",
		"ipv6",
		"registered_domain",
		"reverse_domain_name",
		"top_domain_under_public_suffix",
		"top_domain_under_registry_suffix",
		)
		print(
		@@ -104,0 +112,0 @@ json.dumps(

+170

-36

tldextract/tldextract.py

		@@ -31,3 +31,3 @@ """`tldextract` accurately separates a URL's subdomain, domain, and public suffix.
		>>> ext = tldextract.extract("http://forums.bbc.co.uk")
		>>> ext.registered_domain
		>>> ext.top_domain_under_public_suffix
		'bbc.co.uk'
		@@ -42,4 +42,5 @@ >>> ext.fqdn
		import urllib.parse
		import warnings
		from collections.abc import Collection, Sequence
		from dataclasses import dataclass
		from dataclasses import dataclass, field
		from functools import wraps
		@@ -101,15 +102,13 @@

		@property
		def registered_domain(self) -> str:
		"""The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.
		registry_suffix: str = field(repr=False)
		"""The registry suffix of the input URL, if it contained one, or else the empty string.

		>>> extract("http://forums.bbc.co.uk").registered_domain
		'bbc.co.uk'
		>>> extract("http://localhost:8080").registered_domain
		''
		"""
		if self.suffix and self.domain:
		return f"{self.domain}.{self.suffix}"
		return ""
		This field is a domain under which people can register subdomains through a
		registrar.

		This field is unaffected by the `include_psl_private_domains` setting. If
		`include_psl_private_domains` was set to `False`, this field is always the
		same as `suffix`.
		"""

		@property
		@@ -175,2 +174,52 @@ def fqdn(self) -> str:
		@property
		def registered_domain(self) -> str:
		"""The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.

		>>> extract("http://forums.bbc.co.uk").registered_domain
		'bbc.co.uk'
		>>> extract("http://localhost:8080").registered_domain
		''

		.. deprecated:: 5.3.0
		This property is deprecated and will be removed in the next major
		version. Use `top_domain_under_public_suffix` instead, which has the
		same behavior but a more accurate name.

		This is an alias for the `top_domain_under_public_suffix` property.
		`registered_domain` is so called because it is roughly the domain the
		owner paid to register with a registrar or, in the case of a private
		domain, "registered" with the domain owner. If the input was not
		something one could register, this property returns the empty string.

		To distinguish the case of private domains, consider Blogspot, which is
		in the PSL's private domains. If `include_psl_private_domains` was set
		to `False`, the `registered_domain` property of a Blogspot URL
		represents the domain the owner of Blogspot registered with a
		registrar, i.e. Google registered "blogspot.com". If
		`include_psl_private_domains=True`, the `registered_domain` property
		represents the "blogspot.com" _subdomain_ the owner of a blog
		"registered" with Blogspot.

		>>> extract(
		... "http://waiterrant.blogspot.com", include_psl_private_domains=False
		... ).registered_domain
		'blogspot.com'
		>>> extract(
		... "http://waiterrant.blogspot.com", include_psl_private_domains=True
		... ).registered_domain
		'waiterrant.blogspot.com'

		To always get the same joined string, regardless of the
		`include_psl_private_domains` setting, consider the
		`top_domain_under_registry_suffix` property.
		"""
		warnings.warn(
		"The 'registered_domain' property is deprecated and will be removed in the next major version. "
		"Use 'top_domain_under_public_suffix' instead, which has the same behavior but a more accurate name.",
		DeprecationWarning,
		stacklevel=2,
		)
		return self.top_domain_under_public_suffix

		@property
		def reverse_domain_name(self) -> str:
		@@ -200,3 +249,45 @@ """The domain name in Reverse Domain Name Notation.

		@property
		def top_domain_under_registry_suffix(self) -> str:
		"""The rightmost domain label and `registry_suffix` joined with a dot, if such a domain is available and `registry_suffix` is set, or else the empty string.

		The rightmost domain label might be in the `domain` field, or, if the
		input URL's suffix is a PSL private domain, in the public suffix
		`suffix` field.

		If the input was not in the PSL's private domains, this property is
		equivalent to `top_domain_under_public_suffix`.

		>>> extract(
		... "http://waiterrant.blogspot.com", include_psl_private_domains=False
		... ).top_domain_under_registry_suffix
		'blogspot.com'
		>>> extract(
		... "http://waiterrant.blogspot.com", include_psl_private_domains=True
		... ).top_domain_under_registry_suffix
		'blogspot.com'
		>>> extract("http://localhost:8080").top_domain_under_registry_suffix
		''
		"""
		top_domain_under_public_suffix = self.top_domain_under_public_suffix
		if not top_domain_under_public_suffix or not self.is_private:
		return top_domain_under_public_suffix

		num_labels = self.registry_suffix.count(".") + 2
		return ".".join(top_domain_under_public_suffix.split(".")[-num_labels:])

		@property
		def top_domain_under_public_suffix(self) -> str:
		"""The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.

		>>> extract("http://forums.bbc.co.uk").top_domain_under_public_suffix
		'bbc.co.uk'
		>>> extract("http://localhost:8080").top_domain_under_public_suffix
		''
		"""
		if self.suffix and self.domain:
		return f"{self.domain}.{self.suffix}"
		return ""


		class TLDExtract:
		@@ -365,21 +456,55 @@ """A callable for extracting subdomain, domain, and suffix components from a URL."""
		):
		return ExtractResult("", netloc_with_ascii_dots, "", is_private=False)
		return ExtractResult(
		"", netloc_with_ascii_dots, "", is_private=False, registry_suffix=""
		)

		labels = netloc_with_ascii_dots.split(".")

		suffix_index, is_private = self._get_tld_extractor(
		session=session
		).suffix_index(labels, include_psl_private_domains=include_psl_private_domains)
		maybe_indexes = self._get_tld_extractor(session).suffix_index(
		labels, include_psl_private_domains=include_psl_private_domains
		)

		num_ipv4_labels = 4
		if suffix_index == len(labels) == num_ipv4_labels and looks_like_ip(
		netloc_with_ascii_dots
		if (
		not maybe_indexes
		and len(labels) == num_ipv4_labels
		and looks_like_ip(netloc_with_ascii_dots)
		):
		return ExtractResult("", netloc_with_ascii_dots, "", is_private)
		return ExtractResult(
		"", netloc_with_ascii_dots, "", is_private=False, registry_suffix=""
		)
		elif not maybe_indexes:
		return ExtractResult(
		subdomain=".".join(labels[:-1]),
		domain=labels[-1],
		suffix="",
		is_private=False,
		registry_suffix="",
		)

		suffix = ".".join(labels[suffix_index:]) if suffix_index != len(labels) else ""
		subdomain = ".".join(labels[: suffix_index - 1]) if suffix_index >= 2 else ""
		domain = labels[suffix_index - 1] if suffix_index else ""
		return ExtractResult(subdomain, domain, suffix, is_private)
		(
		(public_suffix_index, public_suffix_node),
		(registry_suffix_index, registry_suffix_node),
		) = maybe_indexes

		subdomain = (
		".".join(labels[: public_suffix_index - 1])
		if public_suffix_index >= 2
		else ""
		)
		domain = labels[public_suffix_index - 1] if public_suffix_index > 0 else ""
		public_suffix = ".".join(labels[public_suffix_index:])
		registry_suffix = (
		".".join(labels[registry_suffix_index:])
		if public_suffix_node.is_private
		else public_suffix
		)
		return ExtractResult(
		subdomain=subdomain,
		domain=domain,
		suffix=public_suffix,
		is_private=public_suffix_node.is_private,
		registry_suffix=registry_suffix,
		)

		def update(
		@@ -540,6 +665,6 @@ self, fetch_now: bool = False, session: requests.Session | None = None
		self, spl: list[str], include_psl_private_domains: bool | None = None
		) -> tuple[int, bool]:
		"""Return the index of the first suffix label, and whether it is private.
		) -> tuple[tuple[int, Trie], tuple[int, Trie]] | None:
		"""Return the index of the first public suffix label, the index of the first registry suffix label, and their corresponding trie nodes.

		Returns len(spl) if no suffix is found.
		Returns `None` if no suffix is found.
		"""
		@@ -549,3 +674,3 @@ if include_psl_private_domains is None:

		node = (
		node = reg_node = (
		self.tlds_incl_private_trie
		@@ -555,11 +680,13 @@ if include_psl_private_domains
		)
		i = len(spl)
		j = i
		suffix_idx = reg_idx = label_idx = len(spl)
		for label in reversed(spl):
		decoded_label = _decode_punycode(label)
		if decoded_label in node.matches:
		j -= 1
		label_idx -= 1
		node = node.matches[decoded_label]
		if node.end:
		i = j
		suffix_idx = label_idx
		if not node.is_private:
		reg_node = node
		reg_idx = label_idx
		continue
		@@ -570,11 +697,18 @@
		is_wildcard_exception = "!" + decoded_label in node.matches
		if is_wildcard_exception:
		return j, node.matches["*"].is_private
		return j - 1, node.matches["*"].is_private
		return (
		label_idx if is_wildcard_exception else label_idx - 1,
		node.matches["*"],
		), (
		reg_idx,
		reg_node,
		)

		break

		return i, node.is_private
		if suffix_idx == len(spl):
		return None

		return ((suffix_idx, node), (reg_idx, reg_node))


		def _decode_punycode(label: str) -> str:
		@@ -581,0 +715,0 @@ lowered = label.lower()

tldextract - PyPI Package Compare versions

Improved metrics