
Security News
Deno 2.4 Brings Back deno bundle, Improves Dependency Management and Observability
Deno 2.4 brings back bundling, improves dependency updates and telemetry, and makes the runtime more practical for real-world JavaScript projects.
Writing complex regular expressions can prove to be difficult and error-prone. This library aims to provide a simple interface for constructing patterns, finding matches and performing substitutions.
Pattern instances do not modify internal state, but instead return copies with the modifications.
from regularize import pattern, finder
# Begin with an empty pattern.
logfile_pattern = pattern()
# Shared prefix of every logfile name: the literal "application." followed by
# one or more digits, matched case-insensitively.
logfile_pattern = (
    logfile_pattern
    .literal('application.')
    .any_number()
    .quantify(minimum=1)
    .case_insensitive()
)
# Each variant extends the shared prefix; the base pattern itself is not
# modified, so it can be reused for both.
uncompressed_logfile = (
    logfile_pattern
    .literal('.log')
    .end_anchor()
)
compressed_logfile = (
    logfile_pattern
    .literal('.log.gz')
    .end_anchor()
)
print(uncompressed_logfile)
# Expression: /application\.[0-9]+\.log$/
print(compressed_logfile)
# Expression: /application\.[0-9]+\.log\.gz$/
print(
    finder(uncompressed_logfile).match('application.1.log')
)
# <re.Match object; span=(0, 17), match='application.1.log'>
print(
    finder(compressed_logfile).match('application.1.log.gz')
)
# <re.Match object; span=(0, 20), match='application.1.log.gz'>
from urllib.parse import urlparse
from regularize import pattern
# Characters allowed in DNS names: ASCII letters (both cases) and digits.
ascii_alphanumeric = (
    pattern()
    .lowercase_ascii_letters()
    .uppercase_ascii_letters()
    .any_number()
)
# A label starts with an alphanumeric character, followed by 1 to 61
# characters that may also include a dash.
domain_pattern = (
    ascii_alphanumeric.close_bracket()
    + ascii_alphanumeric.literal('-').quantify(1, 61)
)
# The label must also end with an alphanumeric character, i.e. there is
# always one before the dot and after any dash.
domain_pattern += ascii_alphanumeric.close_bracket()
# Top-level domain: at least two ASCII letters.
tld_pattern = (
    pattern()
    .lowercase_ascii_letters(closed=False)
    .uppercase_ascii_letters()
    .quantify(minimum=2)
)
# Optional subdomain: a named, optional label group followed by a dot,
# all wrapped in another optional group.
subdomain_pattern = (
    domain_pattern
    .group(name='subdomain', optional=True)
    .literal('.')
    .group(optional=True)
)
# Full domain: [subdomain.]label.tld
domain_pattern = subdomain_pattern + domain_pattern.literal('.') + tld_pattern
# Scheme: "http" with an optional "s", captured as `scheme`, then "://".
scheme_pattern = (
    pattern()
    .literal('http')
    .any_of('s')
    .quantify(minimum=0, maximum=1)
    .group('scheme')
    .literal('://')
)
# URL path, when present.
path_pattern = (
    pattern()
    .literal('/')
    .any_number()
    .lowercase_ascii_letters()
    .literal('%-_')
    .quantify(minimum=1)
    .match_all()
)
# Assemble scheme + domain + optional path into one case-insensitive pattern.
url_pattern = (
    scheme_pattern
    + domain_pattern.group('domain')
    + path_pattern.group(name='path', optional=True)
).case_insensitive()
url = 'https://www.example.com/p/1'
compiled_url_pattern = url_pattern.compile()
url_regex_matches = compiled_url_pattern.match(url).groupdict()
parsed_url = urlparse(url)
print(url_regex_matches)
# {'scheme': 'https', 'domain': 'www.example.com', 'subdomain': 'www', 'path': '/p/1'}
print(parsed_url)
# ParseResult(scheme='https', netloc='www.example.com', path='/p/1', params='', query='', fragment='')
# The named groups must agree with what the standard library parser extracts.
assert parsed_url.scheme == url_regex_matches['scheme']
assert parsed_url.hostname == url_regex_matches['domain']
assert parsed_url.path == url_regex_matches['path']
assert url_regex_matches['subdomain'] == 'www'
The following example is taken from the common format sample of the Apache web server combined log.
from regularize.expression import Pattern, pattern
# Sample line in the Apache "combined" log format.
apache_webserver_combined_log = (
    '127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] '
    '"GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" '
    '"Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:84.0) Gecko/20100101 Firefox/84.0"'
)
# One sub-pattern per log field, in the order the fields appear on the line.
ip = (
    pattern()
    .any_of('.', Pattern.ANY_NUMBER)
    .quantify(minimum=7)
    .group('ip')
)
identd_client_id = pattern().literal('-')
http_auth_user = (
    pattern()
    .any_of(Pattern.ANY_ASCII_CHARACTER, '_', '.')
    .at_least_one()
    .group('http_auth_user')
)
# Bracketed timestamp; it is matched but not captured in a named group.
time = (
    pattern()
    .literal('[')
    .none_of(']')
    .quantify(minimum=26)
    .literal(']')
)
# The request is quoted: the opening quote precedes the verb and the closing
# quote follows the HTTP version.
http_verb = pattern().literal('"').group(
    'http_verb',
    pattern=pattern().uppercase_ascii_letters().at_least_one(),
)
url = pattern().group(
    name='url',
    pattern=pattern().none_of(Pattern.ANY_WHITESPACE).at_least_one(),
)
http_version = (
    pattern()
    .literal('HTTP/')
    .any_of('1', '2')
    .literal('.')
    .any_of('0', '1')
    .group('http_version')
    .literal('"')
)
http_status_code = pattern().group(
    name='http_status_code',
    pattern=pattern().any_of(Pattern.ANY_NUMBER).exactly(3),
)
response_bytes = pattern().group(
    name='response_bytes_without_headers',
    pattern=pattern().any_of(Pattern.ANY_NUMBER).at_least_one(),
)
# The two quoted fields below repeat the same expression. When several groups
# share an expression, a small factory avoids the duplication, e.g.:
# lambda name: pattern().literal('"').group(name=name, pattern=pattern().none_of('"').at_least_one()).literal('"')
referer = (
    pattern()
    .literal('"')
    .group(name='referer', pattern=pattern().none_of('"').at_least_one())
    .literal('"')
)
user_agent = (
    pattern()
    .literal('"')
    .group(name='user_agent', pattern=pattern().none_of('"').at_least_one())
)
# Join all field patterns with a whitespace pattern as the separator.
p = Pattern.join(
    pattern().whitespace(),
    [
        ip, identd_client_id, http_auth_user, time,
        http_verb, url, http_version, http_status_code,
        response_bytes, referer, user_agent,
    ],
)
# Every named group defined above must show up in the match. The original
# expectation dropped the `referer` group and assigned the referer URL to
# `user_agent`; the sample line's last quoted field is the Mozilla string.
assert p.compile().match(apache_webserver_combined_log).groupdict() == {
    'ip': '127.0.0.1',
    'http_auth_user': 'frank',
    'http_verb': 'GET',
    'url': '/apache_pb.gif',
    'http_version': 'HTTP/1.0',
    'http_status_code': '200',
    'response_bytes_without_headers': '2326',
    'referer': 'http://www.example.com/start.html',
    'user_agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:84.0) Gecko/20100101 Firefox/84.0',
}
from regularize import pattern
from regularize.replace import substitution
# Small HTML fragment whose tags will be stripped.
html = '''<h1>Article Title</h1>
<p>This is a <b>blog post</b></p>'''
# Matches an opening or closing tag: "<", an optional "/", one or more
# ASCII letters or digits, then ">".
p = (
    pattern()
    .literal('<')
    .any_of('/')
    .quantify(minimum=0)
    .ascii_letters()
    .any_number()
    .at_least_one()
    .literal('>')
)
s = substitution(p)
text = s.replace(html)
print(text)
'''
Article Title
This is a blog post
'''
Commonly used patterns can be easily added either by creating a sub-class of the Pattern class, or by using the extension registry.
There are two prerequisites for new pattern builder methods:
Pattern instance.
from regularize.expression import Pattern
class MyPattern(Pattern):
    """Pattern sub-class that adds an HTML tag builder method."""

    def html_tag(self, opening=True):
        """Append an HTML tag matcher to this pattern.

        Args:
            opening: When truthy the tag starts with ``<``; otherwise it
                starts with ``</`` (a closing tag).

        Returns:
            The pattern produced by chaining the tag prefix, one or more
            characters from ``Pattern.ANY_ASCII_CHARACTER``, and ``>``.
        """
        prefix = '<' if opening else '</'
        return (
            self.literal(prefix)
            .any_of(Pattern.ANY_ASCII_CHARACTER)
            .at_least_one()
            .literal('>')
        )
from regularize.expression import Pattern
class HTMLTag(Pattern):
    """Callable pattern extension that matches an HTML tag.

    Intended to be registered in the extension registry under an alias and
    then invoked through that alias.
    """

    def __call__(self, opening=True):
        """Append an HTML tag matcher to this pattern.

        Args:
            opening: When truthy the tag starts with ``<``; otherwise it
                starts with ``</`` (a closing tag).

        Returns:
            The pattern produced by chaining the tag prefix, at least one
            character from ``Pattern.ANY_ASCII_CHARACTER``, and ``>``.
        """
        prefix = '<' if opening else '</'
        return (
            self.literal(prefix)
            .any_of(Pattern.ANY_ASCII_CHARACTER)
            .quantify(minimum=1)
            .literal('>')
        )
# Start from an empty pattern to demonstrate the extension registry.
p = Pattern()
# The registry is attached to the Pattern class, so an extension can be
# registered there under an alias:
Pattern.registry.add('html_tag', HTMLTag)
# The same registry is also reachable through an instance for convenience:
p.extensions.registry.add('html_tag', HTMLTag)
# Once registered, the extension is called by its alias through the `ext`
# object; the result replaces `p`:
p = p.ext.html_tag()
print(p.build())
# <[a-z]+>
FAQs
Regular Expression Builder
We found that regularize demonstrated a healthy version release cadence and project activity because the last version was released less than a year ago. It has 1 open source maintainer collaborating on the project.
Did you know?
Socket for GitHub automatically highlights issues in each pull request and monitors the health of all your open source dependencies. Discover the contents of your packages and block harmful activity before you install or update your dependencies.
Security News
Deno 2.4 brings back bundling, improves dependency updates and telemetry, and makes the runtime more practical for real-world JavaScript projects.
Security News
CVEForecast.org uses machine learning to project a record-breaking surge in vulnerability disclosures in 2025.
Security News
Browserslist-rs now uses static data to reduce binary size by over 1MB, improving memory use and performance for Rust-based frontend tools.