Search files using the fastest regex engine ever - ripgrep. Replacement is also supported!
Uses ripgrep (https://github.com/BurntSushi/ripgrep) and parses its search output into a pandas DataFrame.
It is much faster than the first version (https://github.com/hansalemaos/PYRipGREP) thanks to more efficient parsing, and substitution is
also supported (BACK UP YOUR DATA FIRST!). Check out the examples below.
Tested against Windows 10 / Python 3.10 / Anaconda
pip install rushex
import random
from rushex import FullBore
monsterregex = FullBore(ripgrepexe=r"rg.exe")
files = [
r"C:\testxt\xab.txt",
r"C:\testxt\ö.txt",
r"C:\testxt\xaa.txt",
r"C:\testxt\xaa2.txt",
r"C:\testxt\pypigitupload.py",
r"C:\testxt\pyripbro.py",
r"C:\testxt\pip2dfxxxxxxxxx.py",
]
dfa = monsterregex.find_all_in_files(
regular_expressions=[
r"\b\w{15}\b",
r"\b\d{4}\b",
],
files=files,
ignore_case=True,
allowed_extensions=(),
binary=True,
dfa_size="1G",
multiline=False,
multiline_dotall=False,
field_match_separator="ÇÇÇÇÇ",
)
df3 = monsterregex.find_all_in_folders(
folders=[r"C:\ProgramData\anaconda3\envs\adda"],
regular_expressions=[r"\bnumexpr|pandas\b", r"\bnp\b."],
allowed_extensions=(".py",),
ignore_case=True,
maxsubfolders=-1,
)
df4 = monsterregex.find_all_in_folders(
folders=[r"C:\ProgramData\anaconda3\envs\adda"],
regular_expressions=[r"\bnumexpr|pandas\b", r"\bnp\.\b"],
allowed_extensions=(),
ignore_case=True,
maxsubfolders=-1,
)
df5 = monsterregex.find_all_in_folders(
folders=[r"C:\ProgramData\anaconda3\envs\adda"],
regular_expressions=[r"\bnumexpr|pandas\b", r"\bnp\b."],
allowed_extensions=(".py",),
ignore_case=True,
maxsubfolders=2,
binary=False,
dfa_size="100M",
multiline=False,
multiline_dotall=False,
field_match_separator="ÇÇÇÇÇ",
)
df6 = monsterregex.find_all_in_folders(
folders=[r"C:\ProgramData\anaconda3\envs\adda"],
regular_expressions=[r"\bnp\b.*?\bpd\b"],
allowed_extensions=(".py",),
ignore_case=True,
maxsubfolders=-1,
binary=False,
dfa_size="100M",
multiline=True,
multiline_dotall=True,
field_match_separator="ÇÇÇÇÇ",
)
df7 = monsterregex.find_all_in_folders(
folders=[r"C:\grepte\homepagesavetest13", r"C:\grepte\homepagesavetest14"],
regular_expressions=[r"der|die|das"],
allowed_extensions=(".txt", ".html"),
ignore_case=True,
maxsubfolders=-1,
binary=True,
dfa_size="100M",
multiline=True,
multiline_dotall=False,
field_match_separator="ÇÇÇÇÇ",
)
df7.aa_replacement = df7.apply(
lambda x: FullBore.cb(100)
+ FullBore.cb(random.choice(["Ü", "Ä", "Ö"]))
+ FullBore.cb("-------------")
+ FullBore.cb(x.aa_string_bytes[0]).upper()
+ FullBore.cb(x.aa_string_bytes[1]).lower()
+ FullBore.cb(x.aa_string_bytes[-1]).upper()
+ b"xxxx"
+ FullBore.cb(
FullBore.cb(str(random.randint(1, 2000000))),
),
axis=1,
)
monsterregex.sub(df7)
monsterregex.sub(df7, dryrun=False)
with open(r"C:\all_corpora_filtered_maryfied.txt", mode="rb") as f:
    data = f.read()
datau = data.decode("utf-8", "ignore")
results1 = monsterregex.find_all_in_variable(
regular_expressions=[r"\bHaus\w+\b"], variable=data
)
class FullBore(builtins.object)
| FullBore(ripgrepexe: Optional[str] = None, msvc_or_gnu='msvc')
|
| Methods defined here:
|
| __init__(self, ripgrepexe: Optional[str] = None, msvc_or_gnu='msvc')
| Initializes the FullBore class with the path to the ripgrep executable.
|
| Args:
| ripgrepexe (Union[str, None], optional): Path to the ripgrep executable. Defaults to None.
| msvc_or_gnu (str, optional): Compiler to use for installing ripgrep. Defaults to 'msvc'.
|
| find_all_in_files(self, regular_expressions: Union[list, str], files: Union[list, str], ignore_case: bool = True, allowed_extensions: tuple = (), binary: bool = True, dfa_size: str = '1G', multiline: bool = False, multiline_dotall: bool = False, field_match_separator: str = 'ÇÇÇÇÇ') -> pandas.core.frame.DataFrame
| Searches for all regular expressions in the input files.
|
| Args:
| regular_expressions (Union[list, str]): Regular expressions to be searched.
| files (Union[list, str]): Input files to be searched.
| ignore_case (bool, optional): Ignore case while searching. Defaults to True.
| allowed_extensions (tuple, optional): Allowed file extensions. Defaults to ().
| binary (bool, optional): Search in binary mode. Defaults to True.
| dfa_size (str, optional): DFA size for ripgrep. Defaults to "1G".
| multiline (bool, optional): Search in multiline mode. Defaults to False.
| multiline_dotall (bool, optional): Search in multiline dotall mode. Defaults to False.
| field_match_separator (str, optional): Field match separator. Defaults to "ÇÇÇÇÇ".
|
| Returns:
| pd.DataFrame: Dataframe containing the search results.
|
| find_all_in_folders(self, regular_expressions: Union[list, str], folders: Union[list, str], ignore_case: bool = True, allowed_extensions: tuple = (), maxsubfolders: int = -1, binary: bool = True, dfa_size: str = '1G', multiline: bool = False, multiline_dotall: bool = False, field_match_separator: str = 'ÇÇÇÇÇ') -> pandas.core.frame.DataFrame
| Searches for all regular expressions in the input folders.
|
| Args:
| regular_expressions (Union[list, str]): Regular expressions to be searched.
| folders (Union[list, str]): Input folders to be searched.
| ignore_case (bool, optional): Ignore case while searching. Defaults to True.
| allowed_extensions (tuple, optional): Allowed file extensions. Defaults to ().
| maxsubfolders (int, optional): Maximum number of subfolders to be searched. Defaults to -1.
| binary (bool, optional): Search in binary mode. Defaults to True.
| dfa_size (str, optional): DFA size for ripgrep. Defaults to "1G".
| multiline (bool, optional): Search in multiline mode. Defaults to False.
| multiline_dotall (bool, optional): Search in multiline dotall mode. Defaults to False.
| field_match_separator (str, optional): Field match separator. Defaults to "ÇÇÇÇÇ".
|
| Returns:
| pd.DataFrame: Dataframe containing the search results.
|
| find_all_in_variable(self, regular_expressions: Union[list, str], variable: Union[bytes, str], ignore_case: bool = True, binary: bool = True, dfa_size: str = '1G', multiline: bool = False, multiline_dotall: bool = False, outputencoding: str = 'utf-8', field_match_separator: str = 'ÇÇÇÇÇ') -> pandas.core.frame.DataFrame
| Searches for all regular expressions in the input variable.
|
| Args:
| regular_expressions (Union[list, str]): Regular expressions to be searched.
| variable (Union[bytes, str]): Input variable to be searched.
| ignore_case (bool, optional): Ignore case while searching. Defaults to True.
| binary (bool, optional): Search in binary mode. Defaults to True.
| dfa_size (str, optional): DFA size for ripgrep. Defaults to "1G".
| multiline (bool, optional): Search in multiline mode. Defaults to False.
| multiline_dotall (bool, optional): Search in multiline dotall mode. Defaults to False.
| outputencoding (str, optional): Output encoding. Defaults to "utf-8".
| field_match_separator (str, optional): Field match separator. Defaults to "ÇÇÇÇÇ".
|
| Returns:
| pd.DataFrame: Dataframe containing the search results.
|
| sub(self, df: pandas.core.frame.DataFrame, dryrun: bool = True) -> list
| Substitutes the matched regular expressions in the input files.
|
| Args:
| df (pd.DataFrame): Dataframe containing the search results.
| dryrun (bool, optional): If True, performs a dry run. Defaults to True.
|
| Returns:
| list: List of files where the substitutions were made.
|
| sub_in_variable(self, df: pandas.core.frame.DataFrame, variable: Union[bytes, str]) -> Union[bytes, str]
| Substitutes the matched regular expressions in the input variable.
|
| Args:
| df (pd.DataFrame): Dataframe containing the search/replace results.
| variable (Union[bytes, str]): Input variable to be changed.
|
| Returns:
| Union[bytes, str]: Substituted variable.
|
| ----------------------------------------------------------------------
| Static methods defined here:
|
| cb(variable: Union[str, int, bytes]) -> bytes
| Converts the input variable to bytes.
|
| Args:
| variable (Union[str, int, bytes]): Input variable to be converted.
|
| Returns:
| bytes: Converted variable in bytes.
|