tldextract
Advanced tools
+17
-0
@@ -6,2 +6,19 @@ # tldextract Changelog | ||
| ## 5.3.0 (2025-04-21) | ||
| * Features | ||
| * Add result field `registry_suffix` ([#344](https://github.com/john-kurkowski/tldextract/issues/344)) | ||
| * To complement the existing public suffix field `suffix` | ||
| * Add result property `top_domain_under_public_suffix` ([#344](https://github.com/john-kurkowski/tldextract/issues/344)) | ||
| * Add result property `top_domain_under_registry_suffix` ([#344](https://github.com/john-kurkowski/tldextract/issues/344)) | ||
| * Deprecate `registered_domain` property | ||
| * Use `top_domain_under_public_suffix` instead, which has the same behavior | ||
| but a more accurate name | ||
| * Bugfixes | ||
| * Fix missing `reverse_domain_name` property in CLI `--json` output ([`a545c67`](https://github.com/john-kurkowski/tldextract/commit/a545c67d87223616fc13e90692886b3ca9af18bb)) | ||
| * Misc. | ||
| * Expand internal `suffix_index` return type to be richer than bools, and | ||
| include the registry suffix during trie traversal | ||
| ([#344](https://github.com/john-kurkowski/tldextract/issues/344)) | ||
| ## 5.2.0 (2025-04-07) | ||
@@ -8,0 +25,0 @@ |
+3
-3
| Metadata-Version: 2.4 | ||
| Name: tldextract | ||
| Version: 5.2.0 | ||
| Version: 5.3.0 | ||
| Summary: Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well. | ||
@@ -93,3 +93,3 @@ Author-email: John Kurkowski <john.kurkowski@gmail.com> | ||
| >>> ext = tldextract.extract('http://forums.bbc.co.uk') | ||
| >>> ext.registered_domain | ||
| >>> ext.top_domain_under_public_suffix | ||
| 'bbc.co.uk' | ||
@@ -291,3 +291,3 @@ >>> ext.fqdn | ||
| split_suffix = extractor.extract_urllib(split_url) | ||
| url_to_crawl = f"{split_url.scheme}://{split_suffix.registered_domain}:{split_url.port}" | ||
| url_to_crawl = f"{split_url.scheme}://{split_suffix.top_domain_under_public_suffix}:{split_url.port}" | ||
| ``` | ||
@@ -294,0 +294,0 @@ |
+3
-0
@@ -92,2 +92,5 @@ [project] | ||
| addopts = "--doctest-modules" | ||
| filterwarnings = [ | ||
| "ignore:The 'registered_domain' property is deprecated:DeprecationWarning:tldextract.*:" | ||
| ] | ||
@@ -94,0 +97,0 @@ [tool.ruff.format] |
+2
-2
@@ -52,3 +52,3 @@ # tldextract [](https://badge.fury.io/py/tldextract) [](https://github.com/john-kurkowski/tldextract/actions/workflows/ci.yml) | ||
| >>> ext = tldextract.extract('http://forums.bbc.co.uk') | ||
| >>> ext.registered_domain | ||
| >>> ext.top_domain_under_public_suffix | ||
| 'bbc.co.uk' | ||
@@ -250,3 +250,3 @@ >>> ext.fqdn | ||
| split_suffix = extractor.extract_urllib(split_url) | ||
| url_to_crawl = f"{split_url.scheme}://{split_suffix.registered_domain}:{split_url.port}" | ||
| url_to_crawl = f"{split_url.scheme}://{split_suffix.top_domain_under_public_suffix}:{split_url.port}" | ||
| ``` | ||
@@ -253,0 +253,0 @@ |
@@ -80,5 +80,3 @@ """tldextract integration tests.""" | ||
| assert json.loads(stdout) == { | ||
| "subdomain": "www", | ||
| "domain": "bbc", | ||
| "suffix": "co.uk", | ||
| "fqdn": "www.bbc.co.uk", | ||
@@ -89,2 +87,8 @@ "ipv4": "", | ||
| "registered_domain": "bbc.co.uk", | ||
| "registry_suffix": "co.uk", | ||
| "reverse_domain_name": "co.uk.bbc.www", | ||
| "subdomain": "www", | ||
| "suffix": "co.uk", | ||
| "top_domain_under_public_suffix": "bbc.co.uk", | ||
| "top_domain_under_registry_suffix": "bbc.co.uk", | ||
| } |
@@ -35,8 +35,15 @@ """tldextract unit tests with a custom suffix list.""" | ||
| assert tld("foo.blogspot.com") == ExtractResult("foo", "blogspot", "com", False) | ||
| assert tld("foo.blogspot.com") == ExtractResult( | ||
| subdomain="foo", | ||
| domain="blogspot", | ||
| suffix="com", | ||
| is_private=False, | ||
| registry_suffix="com", | ||
| ) | ||
| assert tld("foo.blogspot.com", include_psl_private_domains=True) == ExtractResult( | ||
| "", | ||
| "foo", | ||
| "blogspot.com", | ||
| True, | ||
| subdomain="", | ||
| domain="foo", | ||
| suffix="blogspot.com", | ||
| is_private=True, | ||
| registry_suffix="com", | ||
| ) | ||
@@ -43,0 +50,0 @@ |
+98
-10
@@ -377,2 +377,38 @@ """Main tldextract unit tests.""" | ||
| def test_top_domain_under_public_suffix() -> None: | ||
| """Test property `top_domain_under_public_suffix`.""" | ||
| assert ( | ||
| tldextract.extract( | ||
| "http://www.example.auth.us-east-1.amazoncognito.com", | ||
| include_psl_private_domains=False, | ||
| ).top_domain_under_public_suffix | ||
| == "amazoncognito.com" | ||
| ) | ||
| assert ( | ||
| tldextract.extract( | ||
| "http://www.example.auth.us-east-1.amazoncognito.com", | ||
| include_psl_private_domains=True, | ||
| ).top_domain_under_public_suffix | ||
| == "example.auth.us-east-1.amazoncognito.com" | ||
| ) | ||
| def test_top_domain_under_registry_suffix() -> None: | ||
| """Test property `top_domain_under_registry_suffix`.""" | ||
| assert ( | ||
| tldextract.extract( | ||
| "http://www.example.auth.us-east-1.amazoncognito.com", | ||
| include_psl_private_domains=False, | ||
| ).top_domain_under_registry_suffix | ||
| == "amazoncognito.com" | ||
| ) | ||
| assert ( | ||
| tldextract.extract( | ||
| "http://www.example.auth.us-east-1.amazoncognito.com", | ||
| include_psl_private_domains=True, | ||
| ).top_domain_under_registry_suffix | ||
| == "amazoncognito.com" | ||
| ) | ||
| def test_ipv4() -> None: | ||
@@ -530,3 +566,7 @@ """Test IPv4 addresses.""" | ||
| assert extract_private("foo.uk.com") == ExtractResult( | ||
| subdomain="", domain="foo", suffix="uk.com", is_private=True | ||
| subdomain="", | ||
| domain="foo", | ||
| suffix="uk.com", | ||
| is_private=True, | ||
| registry_suffix="com", | ||
| ) | ||
@@ -536,3 +576,9 @@ assert ( | ||
| == extract_public2("foo.uk.com") | ||
| == ExtractResult(subdomain="foo", domain="uk", suffix="com", is_private=False) | ||
| == ExtractResult( | ||
| subdomain="foo", | ||
| domain="uk", | ||
| suffix="com", | ||
| is_private=False, | ||
| registry_suffix="com", | ||
| ) | ||
| ) | ||
@@ -560,7 +606,17 @@ | ||
| "blogspot.com", include_psl_private_domains=True | ||
| ) == ExtractResult(subdomain="", domain="", suffix="blogspot.com", is_private=True) | ||
| ) == ExtractResult( | ||
| subdomain="", | ||
| domain="", | ||
| suffix="blogspot.com", | ||
| is_private=True, | ||
| registry_suffix="com", | ||
| ) | ||
| assert tldextract.extract( | ||
| "foo.blogspot.com", include_psl_private_domains=True | ||
| ) == ExtractResult( | ||
| subdomain="", domain="foo", suffix="blogspot.com", is_private=True | ||
| subdomain="", | ||
| domain="foo", | ||
| suffix="blogspot.com", | ||
| is_private=True, | ||
| registry_suffix="com", | ||
| ) | ||
@@ -581,2 +637,3 @@ | ||
| is_private=False, | ||
| registry_suffix="com", | ||
| ) | ||
@@ -586,7 +643,17 @@ assert tldextract.extract( | ||
| ) == ExtractResult( | ||
| subdomain="ap-south-1", domain="amazonaws", suffix="com", is_private=False | ||
| subdomain="ap-south-1", | ||
| domain="amazonaws", | ||
| suffix="com", | ||
| is_private=False, | ||
| registry_suffix="com", | ||
| ) | ||
| assert tldextract.extract( | ||
| "amazonaws.com", include_psl_private_domains=True | ||
| ) == ExtractResult(subdomain="", domain="amazonaws", suffix="com", is_private=False) | ||
| ) == ExtractResult( | ||
| subdomain="", | ||
| domain="amazonaws", | ||
| suffix="com", | ||
| is_private=False, | ||
| registry_suffix="com", | ||
| ) | ||
| assert tldextract.extract( | ||
@@ -600,2 +667,3 @@ "the-quick-brown-fox.cn-north-1.amazonaws.com.cn", | ||
| is_private=False, | ||
| registry_suffix="com.cn", | ||
| ) | ||
@@ -605,3 +673,7 @@ assert tldextract.extract( | ||
| ) == ExtractResult( | ||
| subdomain="cn-north-1", domain="amazonaws", suffix="com.cn", is_private=False | ||
| subdomain="cn-north-1", | ||
| domain="amazonaws", | ||
| suffix="com.cn", | ||
| is_private=False, | ||
| registry_suffix="com.cn", | ||
| ) | ||
@@ -611,3 +683,7 @@ assert tldextract.extract( | ||
| ) == ExtractResult( | ||
| subdomain="", domain="amazonaws", suffix="com.cn", is_private=False | ||
| subdomain="", | ||
| domain="amazonaws", | ||
| suffix="com.cn", | ||
| is_private=False, | ||
| registry_suffix="com.cn", | ||
| ) | ||
@@ -621,2 +697,3 @@ assert tldextract.extract( | ||
| is_private=True, | ||
| registry_suffix="com", | ||
| ) | ||
@@ -630,2 +707,3 @@ assert tldextract.extract( | ||
| is_private=True, | ||
| registry_suffix="com", | ||
| ) | ||
@@ -636,3 +714,7 @@ | ||
| ) == ExtractResult( | ||
| subdomain="", domain="", suffix="s3.ap-south-1.amazonaws.com", is_private=True | ||
| subdomain="", | ||
| domain="", | ||
| suffix="s3.ap-south-1.amazonaws.com", | ||
| is_private=True, | ||
| registry_suffix="com", | ||
| ) | ||
@@ -646,2 +728,3 @@ assert tldextract.extract( | ||
| is_private=True, | ||
| registry_suffix="com.cn", | ||
| ) | ||
@@ -651,3 +734,7 @@ assert tldextract.extract( | ||
| ) == ExtractResult( | ||
| subdomain="", domain="", suffix="icann.compute.amazonaws.com", is_private=True | ||
| subdomain="", | ||
| domain="", | ||
| suffix="icann.compute.amazonaws.com", | ||
| is_private=True, | ||
| registry_suffix="com", | ||
| ) | ||
@@ -664,2 +751,3 @@ | ||
| is_private=True, | ||
| registry_suffix="com", | ||
| ) |
| Metadata-Version: 2.4 | ||
| Name: tldextract | ||
| Version: 5.2.0 | ||
| Version: 5.3.0 | ||
| Summary: Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well. | ||
@@ -93,3 +93,3 @@ Author-email: John Kurkowski <john.kurkowski@gmail.com> | ||
| >>> ext = tldextract.extract('http://forums.bbc.co.uk') | ||
| >>> ext.registered_domain | ||
| >>> ext.top_domain_under_public_suffix | ||
| 'bbc.co.uk' | ||
@@ -291,3 +291,3 @@ >>> ext.fqdn | ||
| split_suffix = extractor.extract_urllib(split_url) | ||
| url_to_crawl = f"{split_url.scheme}://{split_suffix.registered_domain}:{split_url.port}" | ||
| url_to_crawl = f"{split_url.scheme}://{split_suffix.top_domain_under_public_suffix}:{split_url.port}" | ||
| ``` | ||
@@ -294,0 +294,0 @@ |
@@ -20,3 +20,3 @@ # file generated by setuptools-scm | ||
| __version__ = version = '5.2.0' | ||
| __version_tuple__ = version_tuple = (5, 2, 0) | ||
| __version__ = version = '5.3.0' | ||
| __version_tuple__ = version_tuple = (5, 3, 0) |
@@ -101,3 +101,11 @@ """tldextract CLI.""" | ||
| if args.json: | ||
| properties = ("fqdn", "ipv4", "ipv6", "registered_domain") | ||
| properties = ( | ||
| "fqdn", | ||
| "ipv4", | ||
| "ipv6", | ||
| "registered_domain", | ||
| "reverse_domain_name", | ||
| "top_domain_under_public_suffix", | ||
| "top_domain_under_registry_suffix", | ||
| ) | ||
| print( | ||
@@ -104,0 +112,0 @@ json.dumps( |
+170
-36
@@ -31,3 +31,3 @@ """`tldextract` accurately separates a URL's subdomain, domain, and public suffix. | ||
| >>> ext = tldextract.extract("http://forums.bbc.co.uk") | ||
| >>> ext.registered_domain | ||
| >>> ext.top_domain_under_public_suffix | ||
| 'bbc.co.uk' | ||
@@ -42,4 +42,5 @@ >>> ext.fqdn | ||
| import urllib.parse | ||
| import warnings | ||
| from collections.abc import Collection, Sequence | ||
| from dataclasses import dataclass | ||
| from dataclasses import dataclass, field | ||
| from functools import wraps | ||
@@ -101,15 +102,13 @@ | ||
| @property | ||
| def registered_domain(self) -> str: | ||
| """The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string. | ||
| registry_suffix: str = field(repr=False) | ||
| """The registry suffix of the input URL, if it contained one, or else the empty string. | ||
| >>> extract("http://forums.bbc.co.uk").registered_domain | ||
| 'bbc.co.uk' | ||
| >>> extract("http://localhost:8080").registered_domain | ||
| '' | ||
| """ | ||
| if self.suffix and self.domain: | ||
| return f"{self.domain}.{self.suffix}" | ||
| return "" | ||
| This field is a domain under which people can register subdomains through a | ||
| registrar. | ||
| This field is unaffected by the `include_psl_private_domains` setting. If | ||
| `include_psl_private_domains` was set to `False`, this field is always the | ||
| same as `suffix`. | ||
| """ | ||
| @property | ||
@@ -175,2 +174,52 @@ def fqdn(self) -> str: | ||
| @property | ||
| def registered_domain(self) -> str: | ||
| """The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string. | ||
| >>> extract("http://forums.bbc.co.uk").registered_domain | ||
| 'bbc.co.uk' | ||
| >>> extract("http://localhost:8080").registered_domain | ||
| '' | ||
| .. deprecated:: 5.3.0 | ||
| This property is deprecated and will be removed in the next major | ||
| version. Use `top_domain_under_public_suffix` instead, which has the | ||
| same behavior but a more accurate name. | ||
| This is an alias for the `top_domain_under_public_suffix` property. | ||
| `registered_domain` is so called because it is roughly the domain the | ||
| owner paid to register with a registrar or, in the case of a private | ||
| domain, "registered" with the domain owner. If the input was not | ||
| something one could register, this property returns the empty string. | ||
| To distinguish the case of private domains, consider Blogspot, which is | ||
| in the PSL's private domains. If `include_psl_private_domains` was set | ||
| to `False`, the `registered_domain` property of a Blogspot URL | ||
| represents the domain the owner of Blogspot registered with a | ||
| registrar, i.e. Google registered "blogspot.com". If | ||
| `include_psl_private_domains=True`, the `registered_domain` property | ||
| represents the "blogspot.com" _subdomain_ the owner of a blog | ||
| "registered" with Blogspot. | ||
| >>> extract( | ||
| ... "http://waiterrant.blogspot.com", include_psl_private_domains=False | ||
| ... ).registered_domain | ||
| 'blogspot.com' | ||
| >>> extract( | ||
| ... "http://waiterrant.blogspot.com", include_psl_private_domains=True | ||
| ... ).registered_domain | ||
| 'waiterrant.blogspot.com' | ||
| To always get the same joined string, regardless of the | ||
| `include_psl_private_domains` setting, consider the | ||
| `top_domain_under_registry_suffix` property. | ||
| """ | ||
| warnings.warn( | ||
| "The 'registered_domain' property is deprecated and will be removed in the next major version. " | ||
| "Use 'top_domain_under_public_suffix' instead, which has the same behavior but a more accurate name.", | ||
| DeprecationWarning, | ||
| stacklevel=2, | ||
| ) | ||
| return self.top_domain_under_public_suffix | ||
| @property | ||
| def reverse_domain_name(self) -> str: | ||
@@ -200,3 +249,45 @@ """The domain name in Reverse Domain Name Notation. | ||
| @property | ||
| def top_domain_under_registry_suffix(self) -> str: | ||
| """The rightmost domain label and `registry_suffix` joined with a dot, if such a domain is available and `registry_suffix` is set, or else the empty string. | ||
| The rightmost domain label might be in the `domain` field, or, if the | ||
| input URL's suffix is a PSL private domain, in the public suffix | ||
| `suffix` field. | ||
| If the input was not in the PSL's private domains, this property is | ||
| equivalent to `top_domain_under_public_suffix`. | ||
| >>> extract( | ||
| ... "http://waiterrant.blogspot.com", include_psl_private_domains=False | ||
| ... ).top_domain_under_registry_suffix | ||
| 'blogspot.com' | ||
| >>> extract( | ||
| ... "http://waiterrant.blogspot.com", include_psl_private_domains=True | ||
| ... ).top_domain_under_registry_suffix | ||
| 'blogspot.com' | ||
| >>> extract("http://localhost:8080").top_domain_under_registry_suffix | ||
| '' | ||
| """ | ||
| top_domain_under_public_suffix = self.top_domain_under_public_suffix | ||
| if not top_domain_under_public_suffix or not self.is_private: | ||
| return top_domain_under_public_suffix | ||
| num_labels = self.registry_suffix.count(".") + 2 | ||
| return ".".join(top_domain_under_public_suffix.split(".")[-num_labels:]) | ||
| @property | ||
| def top_domain_under_public_suffix(self) -> str: | ||
| """The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string. | ||
| >>> extract("http://forums.bbc.co.uk").top_domain_under_public_suffix | ||
| 'bbc.co.uk' | ||
| >>> extract("http://localhost:8080").top_domain_under_public_suffix | ||
| '' | ||
| """ | ||
| if self.suffix and self.domain: | ||
| return f"{self.domain}.{self.suffix}" | ||
| return "" | ||
| class TLDExtract: | ||
@@ -365,21 +456,55 @@ """A callable for extracting subdomain, domain, and suffix components from a URL.""" | ||
| ): | ||
| return ExtractResult("", netloc_with_ascii_dots, "", is_private=False) | ||
| return ExtractResult( | ||
| "", netloc_with_ascii_dots, "", is_private=False, registry_suffix="" | ||
| ) | ||
| labels = netloc_with_ascii_dots.split(".") | ||
| suffix_index, is_private = self._get_tld_extractor( | ||
| session=session | ||
| ).suffix_index(labels, include_psl_private_domains=include_psl_private_domains) | ||
| maybe_indexes = self._get_tld_extractor(session).suffix_index( | ||
| labels, include_psl_private_domains=include_psl_private_domains | ||
| ) | ||
| num_ipv4_labels = 4 | ||
| if suffix_index == len(labels) == num_ipv4_labels and looks_like_ip( | ||
| netloc_with_ascii_dots | ||
| if ( | ||
| not maybe_indexes | ||
| and len(labels) == num_ipv4_labels | ||
| and looks_like_ip(netloc_with_ascii_dots) | ||
| ): | ||
| return ExtractResult("", netloc_with_ascii_dots, "", is_private) | ||
| return ExtractResult( | ||
| "", netloc_with_ascii_dots, "", is_private=False, registry_suffix="" | ||
| ) | ||
| elif not maybe_indexes: | ||
| return ExtractResult( | ||
| subdomain=".".join(labels[:-1]), | ||
| domain=labels[-1], | ||
| suffix="", | ||
| is_private=False, | ||
| registry_suffix="", | ||
| ) | ||
| suffix = ".".join(labels[suffix_index:]) if suffix_index != len(labels) else "" | ||
| subdomain = ".".join(labels[: suffix_index - 1]) if suffix_index >= 2 else "" | ||
| domain = labels[suffix_index - 1] if suffix_index else "" | ||
| return ExtractResult(subdomain, domain, suffix, is_private) | ||
| ( | ||
| (public_suffix_index, public_suffix_node), | ||
| (registry_suffix_index, registry_suffix_node), | ||
| ) = maybe_indexes | ||
| subdomain = ( | ||
| ".".join(labels[: public_suffix_index - 1]) | ||
| if public_suffix_index >= 2 | ||
| else "" | ||
| ) | ||
| domain = labels[public_suffix_index - 1] if public_suffix_index > 0 else "" | ||
| public_suffix = ".".join(labels[public_suffix_index:]) | ||
| registry_suffix = ( | ||
| ".".join(labels[registry_suffix_index:]) | ||
| if public_suffix_node.is_private | ||
| else public_suffix | ||
| ) | ||
| return ExtractResult( | ||
| subdomain=subdomain, | ||
| domain=domain, | ||
| suffix=public_suffix, | ||
| is_private=public_suffix_node.is_private, | ||
| registry_suffix=registry_suffix, | ||
| ) | ||
| def update( | ||
@@ -540,6 +665,6 @@ self, fetch_now: bool = False, session: requests.Session | None = None | ||
| self, spl: list[str], include_psl_private_domains: bool | None = None | ||
| ) -> tuple[int, bool]: | ||
| """Return the index of the first suffix label, and whether it is private. | ||
| ) -> tuple[tuple[int, Trie], tuple[int, Trie]] | None: | ||
| """Return the index of the first public suffix label, the index of the first registry suffix label, and their corresponding trie nodes. | ||
| Returns len(spl) if no suffix is found. | ||
| Returns `None` if no suffix is found. | ||
| """ | ||
@@ -549,3 +674,3 @@ if include_psl_private_domains is None: | ||
| node = ( | ||
| node = reg_node = ( | ||
| self.tlds_incl_private_trie | ||
@@ -555,11 +680,13 @@ if include_psl_private_domains | ||
| ) | ||
| i = len(spl) | ||
| j = i | ||
| suffix_idx = reg_idx = label_idx = len(spl) | ||
| for label in reversed(spl): | ||
| decoded_label = _decode_punycode(label) | ||
| if decoded_label in node.matches: | ||
| j -= 1 | ||
| label_idx -= 1 | ||
| node = node.matches[decoded_label] | ||
| if node.end: | ||
| i = j | ||
| suffix_idx = label_idx | ||
| if not node.is_private: | ||
| reg_node = node | ||
| reg_idx = label_idx | ||
| continue | ||
@@ -570,11 +697,18 @@ | ||
| is_wildcard_exception = "!" + decoded_label in node.matches | ||
| if is_wildcard_exception: | ||
| return j, node.matches["*"].is_private | ||
| return j - 1, node.matches["*"].is_private | ||
| return ( | ||
| label_idx if is_wildcard_exception else label_idx - 1, | ||
| node.matches["*"], | ||
| ), ( | ||
| reg_idx, | ||
| reg_node, | ||
| ) | ||
| break | ||
| return i, node.is_private | ||
| if suffix_idx == len(spl): | ||
| return None | ||
| return ((suffix_idx, node), (reg_idx, reg_node)) | ||
| def _decode_punycode(label: str) -> str: | ||
@@ -581,0 +715,0 @@ lowered = label.lower() |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
477519
2%2302
10.57%