Latest Threat Research:SANDWORM_MODE: Shai-Hulud-Style npm Worm Hijacks CI Workflows and Poisons AI Toolchains.Details
Socket
Book a DemoInstallSign in
Socket

abutils

Package Overview
Dependencies
Maintainers
1
Versions
41
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

abutils - npm Package Compare versions

Comparing version
0.4.16
to
0.4.17
+205
abutils/bin.py
#!/usr/bin/env python
# filename: bin.py
#
#
# Copyright (c) 2023 Bryan Briney
# License: The MIT license (http://opensource.org/licenses/MIT)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software
# and associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
import os
import platform
from typing import Optional
from urllib.request import urlretrieve
__all__ = ["get_path", "get_binary_directory"]
# previously exported per-binary path constants, kept for reference:
# __all__ = [
#     "cdhit",
#     "fastp",
#     "fasttree",
#     "mmseqs",
#     "mafft",
#     "muscle",
#     "muscle_v3",
#     "vsearch",
# ]

# directory (alongside this module) where pre-packaged binaries live
BIN_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "bin")
# platform identifiers used to select the correct binary build;
# platform.machine() reports 'x86_64' on most Linux/macOS systems,
# normalized here to 'amd64' to match the packaged binary names
SYSTEM = platform.system().lower()
MACHINE = platform.machine().lower().replace("x86_64", "amd64")

# old module-level binary paths, superseded by get_path():
# cdhit = os.path.join(BIN_DIR, f"cd-hit_{SYSTEM}_{MACHINE}")
# fastp = os.path.join(BIN_DIR, f"fastp_{SYSTEM}")
# fasttree = os.path.join(BIN_DIR, f"fasttree_{SYSTEM}_{MACHINE}")
# mafft = os.path.join(BIN_DIR, f"mafft_{SYSTEM}_{MACHINE}/mafft.bat")
# mmseqs = os.path.join(BIN_DIR, f"mmseqs_{SYSTEM}_{MACHINE}")
# muscle = os.path.join(BIN_DIR, f"muscle_{SYSTEM}_{MACHINE}")
# muscle_v3 = os.path.join(BIN_DIR, f"muscle3_{SYSTEM}_{MACHINE}")
# vsearch = os.path.join(BIN_DIR, f"vsearch_{SYSTEM}_{MACHINE}")
def get_binary_directory() -> str:
    """
    Gets the abutils binary directory.

    Returns
    -------
    str
        Path to the abutils binary directory (the ``bin/`` folder that lives
        alongside this module).
    """
    return BIN_DIR
def copy_to_binary_directory(source: str, name: Optional[str] = None) -> None:
    """
    Copies a binary to the abutils binary directory.

    If a file of the same name already exists in the binary directory,
    nothing is copied.

    Parameters
    ----------
    source : str
        Path to the binary to be copied.

    name : str, optional
        Name of the binary in the binary directory. If not provided, the
        basename of the source binary will be used.
    """
    import shutil  # local import; keeps the module's import block unchanged

    binary_dir = get_binary_directory()
    name = name if name is not None else os.path.basename(source)
    # fix: the target must be built from ``name`` (the destination filename),
    # not ``source`` (a full path) -- joining the full source path silently
    # ignored ``name`` and produced a bogus target location
    target = os.path.join(binary_dir, name)
    if not os.path.exists(target):
        # shutil.copy2 (copies data + metadata) replaces the shell 'cp' call:
        # portable and immune to quoting/injection issues in file names
        shutil.copy2(source, target)
def get_path(binary: str) -> str:
    """
    Gets the path to a pre-packaged binary.

    .. note::
        If the binary is not found in the binary directory, it will be downloaded from
        the brineylab S3 bucket. This is because some binaries are too large
        to be included in the ``abutils`` PyPI package.

    Parameters
    ----------
    binary : str
        Name of the binary. Available binaries are:
          - blastn
          - cd-hit
          - fastp
          - fasttree
          - makeblastdb
          - mafft
          - mmseqs
          - muscle
          - muscle_v3
          - vsearch

    Returns
    -------
    str
        Path to the binary.

    Raises
    ------
    ValueError
        If ``binary`` is not one of the available binaries.
    """
    # normalize the binary name, accepting common aliases
    binary = binary.lower()
    aliases = {"mmseqs2": "mmseqs", "muscle_v3": "muscle3", "cd-hit": "cdhit"}
    binary = aliases.get(binary, binary)
    available = [
        # blast
        "blastn",
        "makeblastdb",
        # clustering/alignment
        "cdhit",
        "mafft",
        "mmseqs",
        "muscle",
        "muscle3",
        "vsearch",
        # qc
        "fastp",
        # phylogeny
        "fasttree",
    ]
    if binary not in available:
        raise ValueError(
            f"Binary '{binary}' not available. Available binaries: \n"
            + "\n".join(available)
        )
    # build the packaged binary name; some binaries ship per-OS only, and
    # mafft ships as a directory containing a launcher script
    if binary in ["fastp", "muscle3", "blastn", "makeblastdb"]:
        bin_name = f"{binary}_{SYSTEM}"
    elif binary == "mafft":
        bin_name = f"{binary}_{SYSTEM}_{MACHINE}/mafft.bat"
    else:
        bin_name = f"{binary}_{SYSTEM}_{MACHINE}"
    bin_path = os.path.join(BIN_DIR, bin_name)
    # download binary if missing (fix: 'intial' typo in user-facing message)
    if not os.path.exists(bin_path):
        print(
            f"Downloading missing binary: {binary}. This will only occur once on initial use of the binary."
        )
        download_missing_binary(bin_name, bin_path)
    return bin_path
# download missing binary (some are too large to included in the PyPI package)
def download_missing_binary(bin_name: str, bin_path: str) -> None:
    """
    Downloads a missing binary from the brineylab S3 bucket.

    Parameters
    ----------
    bin_name : str
        Name of the binary file in the S3 bucket.

    bin_path : str
        Local path at which the downloaded binary will be saved.
    """
    url = f"https://brineylab.s3.amazonaws.com/tools/abutils_binaries/{bin_name}"
    urlretrieve(url, bin_path)
    # fix: files written by urlretrieve are not executable by default, so the
    # downloaded tool could never actually be run without this chmod
    os.chmod(bin_path, 0o755)
# # download any missing binaries (some are too large to included in the PyPI package)
# def download_missing_binaries():
# to_download = []
# for b in [
# "cd-hit",
# "fastp",
# "fasttree",
# "mafft",
# "mmseqs",
# "muscle",
# "muscle3",
# "vsearch",
# ]:
# if b in ["fastp", "muscle3"]:
# bin_name = f"{b}_{SYSTEM}"
# bin_path = os.path.join(BIN_DIR, bin_name)
# else:
# bin_name = f"{b}_{SYSTEM}_{MACHINE}"
# bin_path = os.path.join(BIN_DIR, bin_name)
# if not os.path.exists(bin_path):
# to_download.append((bin_name, bin_path))
# if to_download:
# print(
# "Downloading missing binaries. This will only occur once on initial use of abutils."
# )
# url = f"https://brineylab.s3.amazonaws.com/tools/abutils_binaries/{bin_name}"
# urlretrieve(url, bin_path)
# download_missing_binaries()

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is too big to display

import pytest
from abutils import Sequence
from abutils.tools.cloning import build_synthesis_constructs
# ------------------------------------
# build_synthesis_constructs
# ------------------------------------
def test_build_synthesis_constructs_single_sequence():
    """A single IGH sequence yields one construct flanked by the default Gibson overhangs."""
    sequence = Sequence("RINEYLA")
    sequence["locus"] = "IGH"
    result = build_synthesis_constructs(sequence)
    assert len(result) == 1
    construct = str(result[0].sequence)
    expected_5prime = "catcctttttctagtagcaactgcaaccggtgtacac".upper()
    expected_3prime = "gcgtcgaccaagggcccatcggtcttcc".upper()
    assert construct.startswith(expected_5prime)
    assert construct.endswith(expected_3prime)
def test_build_synthesis_constructs_multiple_sequences():
    """Two input sequences produce two constructs."""
    sequences = [Sequence("RINEYLA"), Sequence("ALYENIR")]
    for sequence in sequences:
        sequence["locus"] = "IGH"
    assert len(build_synthesis_constructs(sequences)) == 2
def test_build_synthesis_constructs_with_overhangs():
    """User-supplied overhang dicts override the default Gibson overhangs."""
    sequence = Sequence("RINEYLA")
    sequence["locus"] = "IGH"
    result = build_synthesis_constructs(
        sequence, overhang_5={"IGH": "gggg"}, overhang_3={"IGH": "cccc"}
    )
    construct = str(result[0].sequence)
    assert construct.startswith("GGGG")
    assert construct.endswith("CCCC")
def test_build_synthesis_constructs_no_overhangs():
    """Empty overhang dicts yield a construct no longer than the input."""
    nt_sequence = "ATGCATGCTTATGAG"
    result = build_synthesis_constructs(
        Sequence(nt_sequence), overhang_5={}, overhang_3={}
    )
    assert len(result[0].sequence) == len(nt_sequence)
def test_build_synthesis_constructs_group_by_chain():
    """With group_by_chain, heavy-chain constructs come before light-chain constructs."""
    sequences = [Sequence("RINEYLA"), Sequence("ALYENIR")]
    for sequence, locus in zip(sequences, ("IGH", "IGK")):
        sequence["locus"] = locus
    result = build_synthesis_constructs(sequences, group_by_chain=True)
    assert result[0].id.endswith("IGH")
    assert result[1].id.endswith("IGK")
def test_build_synthesis_constructs_sort_func():
    """A sort_func reorders the output constructs."""
    sequences = [Sequence("RINEYLA", id="B"), Sequence("ALYENIR", id="A")]
    for sequence in sequences:
        sequence["locus"] = "IGH"
    result = build_synthesis_constructs(
        sequences, sort_func=lambda seq: seq.id, add_locus_to_name=False
    )
    assert [construct.id for construct in result] == ["A", "B"]
#!/usr/bin/env python
# filename: cloning.py
#
#
# Copyright (c) 2023 Bryan Briney
# License: The MIT license (http://opensource.org/licenses/MIT)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software
# and associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
from typing import Callable, Iterable, Optional, Union
from tqdm.auto import tqdm
from ..core.sequence import Sequence, codon_optimize
__all__ = ["build_synthesis_constructs"]
def build_synthesis_constructs(
    sequences: Union[Sequence, Iterable[Sequence], str],
    overhang_5: Optional[dict] = None,
    overhang_3: Optional[dict] = None,
    sequence_key: Optional[str] = None,
    id_key: Optional[str] = None,
    locus_key: Optional[str] = None,
    frame: Optional[int] = None,
    add_locus_to_name: bool = True,
    group_by_chain: bool = False,
    show_progress: bool = True,
    sort_func: Optional[Callable] = None,
):
    """
    Builds codon-optimized synthesis constructs, including Gibson overhangs suitable
    for cloning IGH, IGK and IGL variable region constructs into antibody expression
    vectors.

    .. seealso::
        | Thomas Tiller, Eric Meffre, Sergey Yurasov, Makoto Tsuiji, Michel C Nussenzweig, Hedda Wardemann
        | Efficient generation of monoclonal antibodies from single human B cells by single cell RT-PCR and expression vector cloning
        | *Journal of Immunological Methods* 2008, doi: 10.1016/j.jim.2007.09.017

    Parameters
    ----------
    sequences : ``Sequence``, str, or ``Iterable[Sequence]``
        A ``Sequence`` object, a sequence as a ``str``, or an ``Iterable`` of ``Sequence`` objects.

        .. note::
            If the provided sequences are nucleotide sequences, ``frame`` must also be provided
            so that codon optimization can be performed on the correct reading frame.

    overhang_5 : dict, optional
        A ``dict`` mapping the locus name to 5' Gibson overhangs. By default, Gibson
        overhangs corresponding to the expression vectors in Tiller et al, 2008:

        | **IGH:** ``catcctttttctagtagcaactgcaaccggtgtacac``
        | **IGK:** ``atcctttttctagtagcaactgcaaccggtgtacac``
        | **IGL:** ``atcctttttctagtagcaactgcaaccggtgtacac``

        To produce constructs without 5' Gibson overhangs, provide an empty dictionary.

    overhang_3 : dict, optional
        A ``dict`` mapping the locus name to 3' Gibson overhangs. By default, Gibson
        overhangs corresponding to the expression vectors in Tiller et al, 2008:

        | **IGH:** ``gcgtcgaccaagggcccatcggtcttcc``
        | **IGK:** ``cgtacggtggctgcaccatctgtcttcatc``
        | **IGL:** ``ggtcagcccaaggctgccccctcggtcactctgttcccgccctcgagtgaggagcttcaagccaacaaggcc``

        To produce constructs without 3' Gibson overhangs, provide an empty dictionary.

    sequence_key : str, optional
        Field containing the sequence to be codon optimized. If not provided,
        ``sequence.sequence`` is used. Either nucleotide or amino acid sequences
        are acceptable.

    id_key : str, optional
        Field containing the name of the sequence. If not provided, ``sequence.id`` is used.

    locus_key : str, optional
        Field containing the name of the locus. If not provided, ``sequence["locus"]`` is used, which
        conforms with AIRR naming conventions.

        .. note::
            The data in ``locus_key`` should return one of the following:
              * IGH, IGK, or IGL (which are the names of the loci in AIRR naming conventions)
              * heavy, kappa, or lambda

    frame : int, default=1
        Reading frame to translate. Default is ``1``.

        .. note::
            This parameter is ignored if the input sequences are amino acid sequences.

    add_locus_to_name : bool, default=True
        If ``True``, the name of the sequence will be appended with the name of the locus, separated by an underscore.
        If ``False``, the name of the sequence will not be modified.

    group_by_chain : bool, default=False
        If ``True``, the output ``list`` of ``Sequence`` objects will be separated by chain (heavy first, light second).
        Within each chain group, sequence ordering will be retained (either input order, or sorted by ``sort_func``).
        If ``False``, the output ``list`` of ``Sequence`` objects will not be grouped.

    sort_func : Callable, optional
        A function to sort the output ``list`` of ``Sequence`` objects. By default,
        the output ``list`` is in the same order as the input.

    Returns
    -------
    sequences : ``list`` of ``Sequence`` objects
        A ``list`` of ``Sequence`` objects. Each ``Sequence`` object has the following properties:

        | *id*: The sequence ID (either ``sequence.id`` or ``id_key``), to which the locus (``locus_key``) is appended if ``add_locus_to_name`` is ``True``.
        | *sequence*: The codon-optimized sequence, including Gibson overhangs.

        If ``sort_func`` is provided, the output ``list`` will be sorted by ``sort_func``, otherwise the output is in the same order as the input.
    """
    # get overhangs (GIBSON3/GIBSON5 are module-level defaults defined below;
    # resolved at call time, so the definition order is fine)
    overhang_3 = overhang_3 if overhang_3 is not None else GIBSON3
    overhang_5 = overhang_5 if overhang_5 is not None else GIBSON5
    # process input: normalize to a list of Sequence objects
    if isinstance(sequences, (Sequence, str)):
        sequences = [sequences]
    sequences = [Sequence(s) for s in sequences]
    # sort
    if sort_func is not None:
        sequences = sorted(sequences, key=sort_func)
    if group_by_chain:
        locus_key = locus_key if locus_key is not None else "locus"
        # heavy chains first, everything else (light chains) after
        heavies = [s for s in sequences if s[locus_key] in ["heavy", "IGH"]]
        lights = [s for s in sequences if s[locus_key] not in ["heavy", "IGH"]]
        sequences = heavies + lights
    # make codon-optimized synthesis constructs
    if show_progress:
        sequences = tqdm(sequences)
    optimized = []
    for sequence in sequences:
        seq = sequence[sequence_key] if sequence_key is not None else sequence.sequence
        name = sequence[id_key] if id_key is not None else sequence.id
        locus = sequence[locus_key] if locus_key is not None else sequence["locus"]
        if add_locus_to_name and locus is not None:
            name += f"_{locus}"
        opt_seq = codon_optimize(seq, frame=frame, as_string=True)
        # unknown loci get no overhangs rather than raising
        synthesis_construct = (
            overhang_5.get(locus, "") + opt_seq + overhang_3.get(locus, "")
        )
        optimized.append(Sequence(synthesis_construct, id=name))
    return optimized
# default 5' Gibson overhangs (expression vectors from Tiller et al, 2008),
# keyed by both AIRR locus names (IGH/IGK/IGL) and common chain names
GIBSON5 = {
    "IGH": "catcctttttctagtagcaactgcaaccggtgtacac",
    "IGK": "atcctttttctagtagcaactgcaaccggtgtacac",
    "IGL": "atcctttttctagtagcaactgcaaccggtgtacac",
    "heavy": "catcctttttctagtagcaactgcaaccggtgtacac",
    "kappa": "atcctttttctagtagcaactgcaaccggtgtacac",
    "lambda": "atcctttttctagtagcaactgcaaccggtgtacac",
}

# default 3' Gibson overhangs, keyed the same way as GIBSON5
GIBSON3 = {
    "IGH": "gcgtcgaccaagggcccatcggtcttcc",
    "IGK": "cgtacggtggctgcaccatctgtcttcatc",
    "IGL": "ggtcagcccaaggctgccccctcggtcactctgttcccgccctcgagtgaggagcttcaagccaacaaggcc",
    "heavy": "gcgtcgaccaagggcccatcggtcttcc",
    "kappa": "cgtacggtggctgcaccatctgtcttcatc",
    "lambda": "ggtcagcccaaggctgccccctcggtcactctgttcccgccctcgagtgaggagcttcaagccaacaaggcc",
}

# AIRR locus name -> common chain name
LOCUS_MAP = {
    "IGH": "heavy",
    "IGK": "kappa",
    "IGL": "lambda",
    "TRA": "alpha",
    "TRB": "beta",
    "TRD": "delta",
    "TRG": "gamma",
}
#!/usr/bin/python
# filename: preprocessing.py
#
# Copyright (c) 2024 Bryan Briney
# License: The MIT license (http://opensource.org/licenses/MIT)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software
# and associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
import os
import subprocess as sp
from typing import Iterable, Optional, Union
from natsort import natsorted
from tqdm.auto import tqdm
from ..bin import get_path as get_binary_path
from ..io import concatenate_files, delete_files, list_files, make_dir, rename_file
class FASTQFile:
    """
    Thin wrapper around a FASTQ file path that pre-computes common path components.
    """

    def __init__(self, file: str):
        # path exactly as supplied by the caller
        self.file = file
        # absolute path
        self.path = os.path.abspath(file)
        # filename only
        self.basename = os.path.basename(self.path)
        # containing directory
        self.dir = os.path.dirname(self.path)

    def __eq__(self, other):
        # NOTE(review): ``name`` is defined only on subclasses (e.g. IlluminaFile);
        # comparing two bare FASTQFile instances would raise AttributeError — confirm
        # this class is only ever used via its subclasses
        return self.name == other.name
class IlluminaFile(FASTQFile):
    """
    ``FASTQFile`` subclass that parses Illumina-style file names.

    Assumes the basename ends with four underscore-separated fields, i.e.
    ``<name>_<sample>_<lane>_<read>_<number>.<ext>`` (for example
    ``subjectA_S1_L001_R1_001.fastq.gz``) -- TODO confirm against actual
    sequencer output naming.
    """

    def __init__(self, file: str):
        super().__init__(file)

    @property
    def name(self):
        # everything before the final four underscore-separated fields
        return "_".join(self.basename.split("_")[:-4])

    @property
    def number(self):
        # trailing chunk number (extension stripped at the first '.')
        return self.basename.split(".")[0].split("_")[-1]

    @property
    def read(self):
        # read-direction field (e.g. 'R1' or 'R2')
        return self.basename.split("_")[-2]

    @property
    def lane(self):
        # lane field (e.g. 'L001')
        return self.basename.split("_")[-3]

    @property
    def sample(self):
        # sample field (e.g. 'S1')
        return self.basename.split("_")[-4]
class MergeGroup:
    """
    A group of paired-end FASTQ files belonging to a single sample
    (potentially spanning multiple lanes) that should be merged together.
    """

    def __init__(self, name: str, files: Iterable[FASTQFile], verbose: bool = False):
        # NOTE: ``verbose`` is accepted (and stored) because group_fastq_pairs()
        # constructs MergeGroup with a verbose keyword; previously this raised
        # a TypeError
        self.name = name
        self.files = files
        self.verbose = verbose
        self.merged_file = None  # assigned by merge()

    def merge(
        self,
        merged_directory: str,
        format: str = "fastq",
        algo: str = "vsearch",
        binary_path: Optional[str] = None,
        merge_args: Optional[str] = None,
        verbose: bool = False,
        debug: bool = False,
    ) -> str:
        """
        Merges this group's read pairs (per lane), concatenating multi-lane
        results into a single output file.

        Parameters
        ----------
        merged_directory : str
            Directory in which the merged file is written.
        format : str, optional
            Output format ('fastq' or 'fasta'). Default is 'fastq'.
        algo : str, optional
            Merge algorithm. Must be 'vsearch'. Default is 'vsearch'.
        binary_path : str, optional
            Path to the merge binary. If not provided, the packaged binary is used.
        merge_args : str, optional
            Additional arguments (as a string) passed to the merge function.
        verbose : bool, optional
            If True, print progress information. Default is False.
        debug : bool, optional
            If True, print merge tool output. Default is False.

        Returns
        -------
        str
            Path to the merged file.
        """
        groups = self._group_by_lane()
        n_groups = len(groups)
        if verbose:
            print(f"{self.name}")
            print("-" * len(self.name))
        if n_groups > 1:
            groups = tqdm(groups, desc=" - merging lanes")
        # select the merge function
        if algo.lower() == "vsearch":
            merge_func = merge_fastqs_vsearch
        else:
            raise ValueError(f"Invalid merge algorithm: {algo}. Must be 'vsearch'.")
        # fix: ``format.lower`` was never called (it is a bound method), which
        # produced file names containing the method's repr
        extension = format.lower()
        self.merged_file = os.path.join(merged_directory, f"{self.name}.{extension}")
        # merge each lane's read pair
        merged_files = []
        for group in groups:
            r1, r2 = natsorted(group, key=lambda x: x.read)
            # fix: each lane gets its own intermediate file; previously every
            # lane wrote to the same path, so later lanes clobbered earlier ones
            lane_file = os.path.join(
                merged_directory, f"{self.name}_{r1.lane}.{extension}"
            )
            merged_file = merge_func(
                r1.path,
                r2.path,
                lane_file,
                binary_path=binary_path,
                additional_args=merge_args,
                output_format=format,
                debug=debug,
            )
            merged_files.append(merged_file)
        if len(merged_files) > 1:
            if verbose:
                print(" - concatenating merged files")
            concatenate_files(merged_files, self.merged_file)
            delete_files(merged_files)
        else:
            # fix: rename_file takes a single path, not a list
            rename_file(merged_files[0], self.merged_file)
        return self.merged_file

    def _group_by_lane(self):
        """Groups this sample's files by lane, returning lists of files in lane order."""
        lane_dict = {}
        for f in self.files:
            lane_dict.setdefault(f.lane, []).append(f)
        # fix: return the file lists (dict values), not the lane names (keys);
        # items() tuples are (lane, files), so index 1 holds the files
        return [files for _, files in natsorted(lane_dict.items(), key=lambda x: x[0])]
def merge_fastqs(
    files: Union[str, Iterable],
    output_directory: str,
    output_format: str = "fastq",
    schema: str = "illumina",
    algo: str = "vsearch",
    binary_path: Optional[str] = None,
    merge_args: Optional[str] = None,
    debug: bool = False,
    verbose: bool = False,
) -> Iterable[str]:
    """
    Merge paired-end fastq files.

    Parameters
    ----------
    files : Union[str, Iterable]
        Path to a directory containing paired-end fastq files, or an iterable of
        paired-end fastq files.
    output_directory : str
        Path to the directory in which to save the merged files.
    output_format : str, optional
        Output format. Must be 'fastq' or 'fasta'. Default is 'fastq'.
    schema : str, optional
        Schema of the file names. Must be 'illumina'. Default is 'illumina'.
    algo : str, optional
        Algorithm to use for merging. Must be 'vsearch'. Default is 'vsearch'.
    binary_path : str, optional
        Path to the merge algorithm binary. If not provided, the binary packaged
        with abutils will be used.
    merge_args : str, optional
        Additional arguments (as a string) to pass to the merge function.
    debug : bool, optional
        If True, print debug output. Default is False.
    verbose : bool, optional
        If True, print verbose output. Default is False.

    Returns
    -------
    list
        A list of paths to the merged FASTA/Q files.
    """
    # a string input must be an existing directory of FASTQ files
    if isinstance(files, str):
        if not os.path.isdir(files):
            raise ValueError(
                f"The supplied file path ({files}) does not exist or is not a directory."
            )
        files = list_files(files, extension=[".fastq", ".fq", ".fastq.gz", ".fq.gz"])
    # wrap raw paths in schema-aware file objects
    if schema.lower() != "illumina":
        raise ValueError(f"Invalid schema: {schema}. Must be 'illumina'.")
    files = [IlluminaFile(f) for f in files]
    # group by sample, then merge each group
    pairs = group_fastq_pairs(files, verbose=verbose)
    make_dir(output_directory)
    return [
        pair.merge(
            merged_directory=output_directory,
            format=output_format,
            algo=algo,
            binary_path=binary_path,
            merge_args=merge_args,
            debug=debug,
            verbose=verbose,
        )
        for pair in pairs
    ]
def group_fastq_pairs(
    files: Iterable[FASTQFile], verbose: bool = False
) -> Iterable[MergeGroup]:
    """
    Group paired-end fastq files by sample name. If a sample has multiple lanes,
    the files will be combined.

    Parameters
    ----------
    files : Iterable[FASTQFile]
        An iterable of ``FASTQFile`` objects to be grouped by sample name.

    verbose : bool, optional
        If True, print verbose output. Default is False.

    Returns
    -------
    list
        A list of MergeGroup objects.
    """
    group_dict = {}
    for f in files:
        group_dict.setdefault(f.name, []).append(f)
    # fix: previously passed ``verbose=verbose`` to MergeGroup, whose __init__
    # does not accept that keyword, raising TypeError on every call
    return [
        MergeGroup(name, group_files) for name, group_files in group_dict.items()
    ]
def merge_fastqs_vsearch(
    forward: str,
    reverse: str,
    merged_file: str,
    output_format: str = "fasta",
    binary_path: Optional[str] = None,
    additional_args: Optional[str] = None,
    debug: bool = False,
) -> str:
    """
    Merge paired-end reads using vsearch.

    Parameters
    ----------
    forward : str
        Path to the forward read file.

    reverse : str
        Path to the reverse read file.

    merged_file : str
        Path to the merged read file.

    output_format : str, optional
        Output format. Must be 'fasta' or 'fastq'. Default is 'fasta'.

    binary_path : str, optional
        Path to a vsearch binary. If not provided, the vsearch binary packaged
        with abutils will be used.

    additional_args : str, optional
        Additional arguments (as a string) to pass to vsearch.

    debug : bool, optional
        If True, print vsearch stdout and stderr. Default is False.

    Returns
    -------
    str
        Path to the merged read file.

    Raises
    ------
    ValueError
        If an input file is missing, the output format is invalid, or vsearch
        exits with a non-zero status.
    """
    # validate input files
    if not os.path.isfile(forward):
        err = f"The supplied forward read file path ({forward}) does not exist or is not a file."
        raise ValueError(err)
    if not os.path.isfile(reverse):
        err = f"The supplied reverse read file path ({reverse}) does not exist or is not a file."
        raise ValueError(err)
    # make output directory
    out_dir = os.path.dirname(merged_file)
    make_dir(out_dir)
    # get the vsearch binary
    if binary_path is None:
        binary_path = get_binary_path("vsearch")
    # compile the vsearch command
    cmd = f"{binary_path} --fastq_mergepairs {forward} --reverse {reverse}"
    # fix: the output options were plain strings missing the f-prefix, so the
    # literal text '{merged_file}' was passed to vsearch instead of the path
    if output_format.lower() == "fasta":
        cmd += f" --fastaout {merged_file}"
    elif output_format.lower() == "fastq":
        cmd += f" --fastqout {merged_file}"
    else:
        err = f"Invalid output format: {output_format}. Must be 'fasta' or 'fastq'."
        raise ValueError(err)
    if additional_args is not None:
        cmd += f" {additional_args}"
    # merge reads
    p = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
    stdout, stderr = p.communicate()
    if debug:
        print(stdout.decode())
        print(stderr.decode())
    if p.returncode != 0:
        err = f"Error merging reads with vsearch: {stderr.decode()}"
        raise ValueError(err)
    return merged_file
+2
-1
Metadata-Version: 2.1
Name: abutils
Version: 0.4.16
Version: 0.4.17
Summary: Utilities for analysis of adaptive immune receptor repertoire (AIRR) data

@@ -25,2 +25,3 @@ Home-page: https://github.com/briney/abutils

Requires-Dist: celery
Requires-Dist: dnachisel
Requires-Dist: ete3

@@ -27,0 +28,0 @@ Requires-Dist: fastcluster

@@ -5,2 +5,3 @@ abstar>=0.6.3

celery
dnachisel
ete3

@@ -7,0 +8,0 @@ fastcluster

@@ -8,2 +8,3 @@ LICENSE

abutils/__init__.py
abutils/bin.py
abutils/cl.py

@@ -22,2 +23,4 @@ abutils/io.py

abutils/bin/cdhit_linux_amd64
abutils/bin/fastp_darwin
abutils/bin/fastp_linux
abutils/bin/fasttree_darwin_amd64

@@ -185,2 +188,3 @@ abutils/bin/fasttree_darwin_arm64

abutils/tests/test_clonify.py
abutils/tests/test_cloning.py
abutils/tests/test_cluster.py

@@ -195,4 +199,6 @@ abutils/tests/test_color.py

abutils/tools/clonify.py
abutils/tools/cloning.py
abutils/tools/cluster.py
abutils/tools/phylo.py
abutils/tools/preprocessing.py
abutils/tools/similarity.py

@@ -199,0 +205,0 @@ abutils/utils/__init__.py

@@ -6,3 +6,3 @@ # from .core import *

from . import cl, io, pl, tl
# from . import bin, cl, io, pl, tl
from . import cl as color

@@ -9,0 +9,0 @@ from .core import lineage, pair, sequence

@@ -36,2 +36,3 @@ #!/usr/bin/env python

import dnachisel as dc
import pandas as pd

@@ -145,2 +146,3 @@ from Bio import SeqIO

self._reverse_complement = None
self._codon_optimized = None
self._strand = None

@@ -321,2 +323,12 @@ self.id_key = id_key

@property
def codon_optimized(self):
if self._codon_optimized is None:
self._codon_optimized = self.codon_optimize(as_string=True)
return self._codon_optimized
@codon_optimized.setter
def codon_optimized(self, val):
self._codon_optimized = val
@property
def annotations(self):

@@ -369,2 +381,44 @@ """

def codon_optimize(
    self,
    sequence_key: str = "sequence_aa",
    id_key: str = "sequence_id",
    frame: Optional[int] = None,
    as_string: bool = True,
) -> Union[str, "Sequence"]:
    """
    Codon optimize a sequence.

    Parameters
    ----------
    sequence_key : str, default="sequence_aa"
        Name of the annotation field containing the sequence to be translated.
        If not provided, ``Sequence.sequence`` is used.

    id_key : str, default="sequence_id"
        Name of the annotation field containing the sequence id.
        If not provided, ``Sequence.id`` is used.

    frame : int, default=1
        Reading frame to translate. Default is ``1``.

    as_string : bool, default=True
        If ``True``, the optimized sequence will be returned as a ``str``.
        If ``False``, the optimized sequence will be returned as a ``Sequence`` object.

    Returns
    -------
    Union[str, Sequence]
        Translated sequence as a ``str`` (if ``as_string`` is ``True``) or
        ``Sequence`` object (if ``as_string`` is ``False``).
    """
    # delegates to the module-level codon_optimize(), using this Sequence as input
    return codon_optimize(
        sequence=self,
        sequence_key=sequence_key,
        id_key=id_key,
        frame=frame,
        as_string=as_string,
    )
def as_fasta(

@@ -465,3 +519,3 @@ self, name_field: Optional[str] = None, seq_field: Optional[str] = None

if id is None:
id = uuid.uuid4()
id = str(uuid.uuid4())
self.sequence = str(seq).upper()

@@ -586,2 +640,3 @@ self.id = id

translated : str
Translated sequence.

@@ -597,3 +652,6 @@ """

return None
start = (frame % 3) - 1
# start = (frame % 3 or 3) - 1 # too clever by half
if frame not in range(1, 4):
raise ValueError(f"Invalid frame: {frame}. Must be 1, 2 or 3.")
start = frame - 1
end = len(seq) - (len(seq[start:]) % 3)

@@ -612,2 +670,98 @@ seq = seq[start:end]

def codon_optimize(
    sequence,
    sequence_key: Optional[str] = None,
    id_key: Optional[str] = None,
    frame: Optional[int] = None,
    as_string: bool = False,
    in_place: bool = False,
) -> Union[str, "Sequence"]:
    """
    Optimizes the codons of a nucleotide or amino acid sequence.

    Parameters
    ----------
    sequence : Union[str, Sequence]
        Nucleotide or amino acid sequence to be optimized.

    sequence_key : str, default=None
        Name of the annotation field containing the sequence to be optimized.
        If not provided, ``sequence.sequence`` is used.

    id_key : str, default=None
        Name of the annotation field containing the sequence ID.
        If not provided, ``sequence.id`` is used.

    frame : int, default=1
        Reading frame to translate. Default is ``1``.

        .. note::
            ``frame`` is ignored if the input sequence is an amino acid sequence.

    as_string : bool, default=False
        If ``True``, the optimized sequence will be returned as a ``str``.
        If ``False``, the optimized sequence will be returned as a ``Sequence`` object.

    in_place : bool, default=False
        If ``True``, the input ``Sequence`` object will be returned with the optimized
        sequence populating the ``sequence.sequence`` property. If ``False``, the
        optimized sequence will be returned as a ``str`` (if ``as_string`` is ``True``)
        or a new ``Sequence`` object (if ``as_string`` is ``False``).

        .. note::
            ``as_string`` takes precedence over ``in_place`` when both are ``True``.

    Returns
    -------
    Union[str, Sequence]
        If ``as_string`` is ``True``, the optimized sequence as a ``str``.
        Otherwise, if ``in_place`` is ``True``, the input ``Sequence`` object with its
        ``sequence`` property replaced by the optimized sequence.
        Otherwise, a new ``Sequence`` object containing the optimized sequence.

    Raises
    ------
    ValueError
        If ``frame`` is not 1, 2 or 3 for a nucleotide input.
    """
    # process input
    if not isinstance(sequence, Sequence):
        sequence = Sequence(sequence)
    if sequence_key is not None:
        seq = (
            sequence[sequence_key]
            if sequence[sequence_key] is not None
            else sequence.sequence
        )
    else:
        seq = sequence.sequence
    name = sequence[id_key] if id_key is not None else sequence.id
    # get DNA sequence
    # fix: test the residues of ``seq`` (the sequence actually being optimized),
    # not ``sequence`` -- they differ whenever ``sequence_key`` selects an
    # amino acid annotation, which previously mis-detected the input type
    if not all(res.upper() in ["A", "C", "G", "T", "N", "-"] for res in seq):
        # amino acids: reverse-translate to a DNA template first
        dna_seq = dc.reverse_translate(seq)
    else:
        frame = frame if frame is not None else 1
        if frame not in range(1, 4):
            raise ValueError(f"Invalid frame: {frame}. Must be 1, 2 or 3.")
        dna_seq = seq[frame - 1 :]
    # optimize codons for human expression while keeping the translation fixed
    problem = dc.DnaOptimizationProblem(
        sequence=dna_seq,
        constraints=[
            dc.EnforceTranslation(),
            dc.EnforceGCContent(maxi=0.56),
            dc.EnforceGCContent(maxi=0.64, window=100),
            dc.UniquifyAllKmers(10),
        ],
        objectives=[dc.CodonOptimize(species="h_sapiens")],
        logger=None,
    )
    problem.resolve_constraints(final_check=True)
    problem.optimize()
    if as_string:
        return str(problem.sequence)
    if in_place:
        sequence.sequence = str(problem.sequence)
        return sequence
    return Sequence(problem.sequence, id=name)
def read_json(

@@ -1044,2 +1198,7 @@ json_file: str,

index: bool = False,
properties: Optional[Iterable[str]] = None,
drop_na_columns: bool = True,
order: Optional[Iterable[str]] = None,
exclude: Optional[Union[str, Iterable[str]]] = None,
leading: Optional[Union[str, Iterable[str]]] = None,
) -> None:

@@ -1070,10 +1229,55 @@ """

properties : list, default=None
A list of properties to be included in the CSV file.
drop_na_columns : bool, default=True
If ``True``, columns with all ``NaN`` values will be dropped from the CSV file.
Default is ``True``.
order : list, default=None
A list of fields in the order they should appear in the CSV file.
exclude : str or list, default=None
Field or list of fields to be excluded from the CSV file.
leading : str or list, default=None
Field or list of fields to appear first in the CSV file. Supersedes ``order``, so
if both are provided, fields in ``leading`` will appear first in the CSV file and
remaining fields will appear in the order provided in ``order``.
"""
d = []
data = []
for s in sequences:
if not s.annotations:
d.append({"sequence_id": s.id, "sequence": s.sequence})
d = {"sequence_id": s.id, "sequence": s.sequence}
else:
d.append(s.annotations)
df = pd.DataFrame(d)
d = s.annotations
if properties is not None:
for prop in properties:
try:
d[prop] = getattr(s, prop)
except AttributeError:
continue
data.append(d)
df = pd.DataFrame(data)
# drop NaN
if drop_na_columns:
df.dropna(axis=1, how="all", inplace=True)
# excluded columns
if exclude is not None:
if isinstance(exclude, str):
exclude = []
cols = [c for c in df.columns.values if c not in exclude]
df = df[cols]
# reorder
if order is not None:
cols = [o for o in order if o in df.columns.values]
df = df[cols]
if leading is not None:
if isinstance(leading, str):
leading = [leading]
leading = [l for l in leading if l in df.columns.values]
cols = leading + [c for c in df.columns.values if c not in leading]
df = df[cols]
df.to_csv(csv_file, sep=sep, index=index, columns=columns, header=header)

@@ -25,3 +25,6 @@ #!/usr/bin/env python

import glob
import os
import sys
from typing import Iterable, Union

@@ -40,5 +43,123 @@ from .core.sequence import (

from .utils.convert import abi_to_fasta
from .utils.pipeline import list_files, make_dir
# from .utils.pipeline import list_files, make_dir
def make_dir(directory: str) -> None:
    """
    Create ``directory`` (including any missing parent directories) unless
    it already exists.

    Parameters
    ----------
    directory : str
        Path to the directory to create.
    """
    target = os.path.abspath(directory)
    if not os.path.exists(directory):
        os.makedirs(target)
def list_files(
    directory: str, extension: Union[str, Iterable, None] = None
) -> Iterable[str]:
    """
    Lists files in a given directory.

    Parameters
    ----------
    directory : str
        Path to a directory. User shortcuts (``~``) are expanded before the
        directory check. If the path is not a directory, it is assumed to be
        a single file and is returned as a one-element list.
    extension : str or iterable of str, optional
        If supplied, only files that end with the specified extension(s) will be
        returned. Can be either a string or a list of strings. Extension
        evaluation is case-insensitive and can match complex extensions
        (e.g. ``'.fastq.gz'``). Default is ``None``, which returns all files
        in the directory, regardless of extension.

    Returns
    -------
    Iterable[str]
        Sorted list of matching file paths.
    """
    # Expand "~" BEFORE the isdir check; previously the raw path was tested,
    # so "~/some_dir" was never recognized as a directory and was returned
    # as if it were a single file.
    expanded_dir = os.path.expanduser(directory)
    if os.path.isdir(expanded_dir):
        files = sorted(glob.glob(os.path.join(expanded_dir, "*")))
    else:
        files = [directory]
    if extension is not None:
        if isinstance(extension, str):
            extension = [extension]
        # A single case-insensitive suffix check subsumes the original's
        # three separate endswith comparisons (lower/upper/exact).
        suffixes = tuple(e.lower() for e in extension)
        files = [f for f in files if f.lower().endswith(suffixes)]
    return files
def rename_file(file: str, new_name: str) -> None:
    """
    Rename (or move) a file.

    Parameters
    ----------
    file : str
        Path to the file to be renamed.
    new_name : str
        New name (path) for the file.
    """
    os.rename(file, new_name)
def delete_files(files: Union[str, Iterable]) -> None:
    """
    Delete one or more files, silently skipping paths that do not exist.

    Parameters
    ----------
    files : str or iterable of str
        Path to a single file or an iterable of file paths.
    """
    targets = [files] if isinstance(files, str) else files
    for path in targets:
        if os.path.exists(path):
            os.remove(path)
def concatenate_files(files: Iterable[str], output_file: str) -> None:
    """
    Concatenate multiple text files into a single output file.

    Parameters
    ----------
    files : Iterable[str]
        Iterable of input file paths; contents are written in the order given.
    output_file : str
        Path to the output file. Overwritten if it already exists.
    """
    with open(output_file, "w") as out_handle:
        for path in files:
            with open(path) as in_handle:
                # a file handle iterates line-by-line, so writelines()
                # streams the input without loading it all into memory
                out_handle.writelines(in_handle)
def read_sequences(

@@ -45,0 +166,0 @@ file=None,

@@ -5,8 +5,8 @@ #!usr/env/python

import os
import pytest
import tempfile
import pytest
from Bio.SeqRecord import SeqRecord
from ..core.sequence import Sequence, read_fasta, read_airr, to_fasta
from ..core.sequence import Sequence, read_airr, read_fasta, to_fasta

@@ -275,1 +275,27 @@

assert ">seq1\nATCG\n>seq2\nGCTA\n>seq3\nTTTT" in fasta
# -------------------------------------------
# codon optimization
# -------------------------------------------
def test_codon_optimize_returns_string():
    # with as_string=True, codon_optimize should yield a plain str of
    # nucleotides, three bases per input amino acid
    protein = Sequence("RINEYLA")
    nt = protein.codon_optimize(as_string=True)
    assert isinstance(nt, str)
    assert len(nt) == 3 * len(protein)
def test_codon_optimize_returns_sequence_object():
    # with as_string=False, codon_optimize should return a Sequence whose
    # translation round-trips back to the input protein
    protein = Sequence("RINEYLA")
    optimized = protein.codon_optimize(as_string=False)
    assert isinstance(optimized, Sequence)
    assert len(optimized.sequence) == 3 * len(protein)
    assert optimized.translate() == protein.sequence
def test_codon_optimize_with_frame():
    # an out-of-range frame (valid frames are presumably 1-3) must raise
    nt_seq = Sequence("ATGCATGCATGCATGC")
    with pytest.raises(ValueError):
        nt_seq.codon_optimize(frame=4)

@@ -27,5 +27,6 @@ #!/usr/bin/env python

from .core.pair import assign_pairs
from .core.sequence import reverse_complement, translate
from .core.sequence import codon_optimize, reverse_complement, translate
from .tools.alignment import *
from .tools.clonify import clonify
from .tools.cloning import *
from .tools.cluster import *

@@ -32,0 +33,0 @@ from .tools.phylo import *

@@ -1,16 +0,37 @@

from collections import Counter
#!/usr/bin/env python
# filename: clonify.py
#
#
# Copyright (c) 2023 Bryan Briney
# License: The MIT license (http://opensource.org/licenses/MIT)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software
# and associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
import itertools
import sys
from collections import Counter
from typing import Iterable, Union
import fastcluster as fc
import numpy as np
import pandas as pd
import numpy as np
from Levenshtein import distance
import fastcluster as fc
from mnemonic import Mnemonic
from scipy.cluster.hierarchy import fcluster
from mnemonic import Mnemonic
from ..core.sequence import Sequence

@@ -17,0 +38,0 @@ from ..tools import cluster

@@ -37,2 +37,3 @@ #!/usr/bin/env python

from ..bin import get_path as get_binary_path
from ..core.sequence import Sequence, read_fasta, to_fasta

@@ -226,2 +227,7 @@ from ..utils.decorators import lazy_property

iddef: int = 0,
cluster_mode: str = "2",
cov_mode: str = "0",
coverage: float = 0.8,
alignment_mode: str = "3",
seq_id_mode: str = "1",
vsearch_bin: str = None,

@@ -232,2 +238,3 @@ mmseqs_bin: str = None,

seq_key: Optional[str] = None,
threads: Optional[int] = None,
strand: str = "plus",

@@ -285,2 +292,32 @@ as_dict: bool = False,

cluster_mode : str, default="2"
Clustering mode, as implemented by MMseqs2. Options are:
1. greedy set cover
2. connected component
3. greedy incremental (CD-HIT-like)
cov_mode : str, default="0"
Coverage mode, as implemented by MMseqs2. Options are:
0. bidirectional
1. target coverage
2. query coverage
3. target-in-query length coverage
coverage : float, default=0.8
Coverage threshold for clustering with MMseqs2. Must be between 0 and 1.
alignment_mode : str, default="3"
Alignment mode, as implemented by MMseqs2. Options are:
0. automatic
1. only score and end_pos
2. also start_pos and cov
3. also seq.id
4. only ungapped alignment
seq_id_mode : str, default="1"
Sequence ID mode, as implemented by MMseqs2. Options are:
0. alignment length
1. shorter sequence length
2. longer sequence length
vsearch_bin : str, optional

@@ -359,2 +396,8 @@ Path to a VSEARCH executable. If not provided, the VSEARCH binary bundled

threshold=threshold,
cluster_mode=cluster_mode,
cov_mode=cov_mode,
coverage=coverage,
alignment_mode=alignment_mode,
seq_id_mode=seq_id_mode,
threads=threads,
temp_dir=temp_dir,

@@ -371,2 +414,3 @@ mmseqs_bin=mmseqs_bin,

cdhit_bin=cdhit_bin,
threads=0 if threads is None else threads,
as_dict=True,

@@ -383,2 +427,3 @@ debug=debug,

strand=strand,
threads=threads,
as_dict=True,

@@ -409,2 +454,3 @@ debug=debug,

strand: str = "plus",
threads: Optional[int] = None,
as_dict: bool = False,

@@ -478,6 +524,7 @@ debug: bool = False,

if vsearch_bin is None:
mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
system = platform.system().lower()
machine = platform.machine().lower().replace("x86_64", "amd64")
vsearch_bin = os.path.join(mod_dir, f"bin/vsearch_{system}_{machine}")
vsearch_bin = get_binary_path("vsearch")
# mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# system = platform.system().lower()
# machine = platform.machine().lower().replace("x86_64", "amd64")
# vsearch_bin = os.path.join(mod_dir, f"bin/vsearch_{system}_{machine}")
# do clustering

@@ -492,2 +539,4 @@ vsearch_cmd = f"{vsearch_bin} --cluster_fast {fasta_file}"

vsearch_cmd += f" --strand {strand}"
if threads is not None:
vsearch_cmd += f" --threads {threads}"
p = sp.Popen(vsearch_cmd, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)

@@ -531,2 +580,8 @@ stdout, stderr = p.communicate()

threshold: float = 0.975,
cluster_mode: str = "2",
cov_mode: str = "0",
coverage: float = 0.8,
alignment_mode: str = "3",
seq_id_mode: str = "1",
threads: Optional[int] = None,
temp_dir: str = "/tmp",

@@ -549,2 +604,42 @@ mmseqs_bin: str = None,

cluster_mode : str, default="2"
Clustering mode. Options are ``"1"``, ``"2"``, or ``"3"``.
- ``"1"``: greedy set cover
- ``"2"``: connected component
- ``"3"``: greedy incremental (CD-HIT-like)
See the `MMseqs2 documentation <https://mmseqs.com/latest/userguide.html#clustering>`_
for more information.
cov_mode : str, default="0"
Coverage mode. Options are ``"0"``, ``"1"``, ``"2"``, or ``"3"``.
- ``"0"``: bidirectional
- ``"1"``: target coverage
- ``"2"``: query coverage
- ``"3"``: target-in-query length coverage
See the `MMseqs2 documentation <https://mmseqs.com/latest/userguide.html#clustering>`_
for more information.
coverage : float, default=0.8
Coverage threshold for clustering. Must be between 0 and 1.
alignment_mode : str, default="3"
Alignment mode. Options are ``"0"``, ``"1"``, ``"2"``, ``"3"``, or ``"4"``.
- ``"0"``: automatic
- ``"1"``: only score and end_pos
- ``"2"``: also start_pos and cov
- ``"3"``: also seq.id
- ``"4"``: only ungapped alignment
See the `MMseqs2 documentation <https://mmseqs.com/latest/userguide.html#clustering>`_
for more information.
seq_id_mode : str, default="1"
Sequence ID mode. Options are ``"0"``, ``"1"``, or ``"2"``.
- ``"0"``: alignment length
- ``"1"``: shorter sequence length
- ``"2"``: longer sequence length
threads : int, default=None
Number of threads to use for clustering. If not provided, the number of threads
will be determined by MMseqs2.
temp_dir : str, default="/tmp"

@@ -592,6 +687,7 @@ Path to a directory for temporary storage of clustering files.

if mmseqs_bin is None:
mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
system = platform.system().lower()
machine = platform.machine().lower().replace("x86_64", "amd64")
mmseqs_bin = os.path.join(mod_dir, f"bin/mmseqs_{system}_{machine}")
mmseqs_bin = get_binary_path("mmseqs")
# mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# system = platform.system().lower()
# machine = platform.machine().lower().replace("x86_64", "amd64")
# mmseqs_bin = os.path.join(mod_dir, f"bin/mmseqs_{system}_{machine}")
# build the mmseqs DB

@@ -609,2 +705,9 @@ db_cmd = f"{mmseqs_bin} createdb {fasta_file} {db_file}"

cluster_cmd += f" --min-seq-id {threshold}"
cluster_cmd += f" --cluster-mode {cluster_mode}"
cluster_cmd += f" --cov-mode {cov_mode}"
cluster_cmd += f" -c {coverage}"
cluster_cmd += f" --alignment-mode {alignment_mode}"
cluster_cmd += f" --seq-id-mode {seq_id_mode}"
if threads is not None:
cluster_cmd += f" --threads {threads}"
p = sp.Popen(cluster_cmd, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)

@@ -657,2 +760,3 @@ stdout, stderr = p.communicate()

cdhit_bin: str = None,
threads: int = 0,
as_dict: bool = False,

@@ -680,2 +784,6 @@ debug: bool = False,

threads : int, default=0
Number of threads to use for clustering. If not provided, all available CPU cores
will be used.
id_key : str, default=None

@@ -715,6 +823,7 @@ Key to retrieve the sequence ID. If not provided or missing, ``Sequence.id`` is used.

if cdhit_bin is None:
mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
system = platform.system().lower()
machine = platform.machine().lower().replace("x86_64", "amd64")
cdhit_bin = os.path.join(mod_dir, f"bin/cdhit_{system}_{machine}")
cdhit_bin = get_binary_path("cdhit")
# mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# system = platform.system().lower()
# machine = platform.machine().lower().replace("x86_64", "amd64")
# cdhit_bin = os.path.join(mod_dir, f"bin/cdhit_{system}_{machine}")
# run CD-HIT

@@ -729,4 +838,4 @@ # clustering options are as follows:

cluster_cmd = f"{cdhit_bin} -i {fasta_file} -o {output_file}"
cluster_cmd += f" -c {threshold} -n {wordsize}"
cluster_cmd += " -d 0 -l 4 -T 0 -M 0"
cluster_cmd += f" -c {threshold} -n {wordsize} -T {threads}"
cluster_cmd += " -d 0 -l 4 -M 0"
p = sp.Popen(cluster_cmd, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)

@@ -733,0 +842,0 @@ stdout, stderr = p.communicate()

@@ -44,2 +44,3 @@ #!/usr/bin/env python

from ..bin import get_path as get_binary_path
from ..core.sequence import Sequence, read_fasta, to_fasta

@@ -113,10 +114,11 @@ from ..tools.cluster import Cluster, cluster

if fasttree_bin is None:
mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# fasttree_bin = os.path.join(
# mod_dir, f"bin/fasttree_{platform.system().lower()}"
# )
system = platform.system().lower()
machine = platform.machine().lower()
fasttree_bin = os.path.join(mod_dir, f"bin/fasttree_{system}_{machine}")
fasttree_bin = fasttree_bin.replace("x86_64", "amd64")
fasttree_bin = get_binary_path("fasttree")
# mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# # fasttree_bin = os.path.join(
# # mod_dir, f"bin/fasttree_{platform.system().lower()}"
# # )
# system = platform.system().lower()
# machine = platform.machine().lower()
# fasttree_bin = os.path.join(mod_dir, f"bin/fasttree_{system}_{machine}")
# fasttree_bin = fasttree_bin.replace("x86_64", "amd64")
# make output directory if necessary

@@ -123,0 +125,0 @@ if tree_file is None:

@@ -33,6 +33,5 @@ #!/usr/bin/env python

from ..core.pair import Pair
from ..core.sequence import Sequence
from ..core.pair import Pair
__all__ = [

@@ -153,2 +152,28 @@ "RepertoireSimilarity",

def __iter__(self):
    """Yield each stored similarity result, in the order they were added."""
    yield from self.similarities
def __len__(self):
    # number of pairwise similarity results currently stored
    return len(self.similarities)
def __repr__(self):
    """Unambiguous summary showing the number of stored comparisons."""
    return f"<RepertoireSimilarities: {len(self)} comparisons>"
def __str__(self):
    """Human-readable summary; identical to ``__repr__``."""
    return f"<RepertoireSimilarities: {len(self)} comparisons>"
def __add__(self, other):
    """
    Support ``self + other`` where ``other`` is either a single
    ``RepertoireSimilarity`` or another ``RepertoireSimilarities``
    collection. Mutates and returns ``self``; raises ``TypeError``
    for any other operand type.
    """
    if isinstance(other, RepertoireSimilarity):
        self.add(other)
        return self
    if isinstance(other, RepertoireSimilarities):
        for sim in other:
            self.add(sim)
        return self
    raise TypeError(
        "RepertoireSimilarities can only be added to RepertoireSimilarity or RepertoireSimilarities objects."
    )
@property

@@ -166,3 +191,3 @@ def df(self):

@property
def means(self):
def means(self) -> np.array:
"""

@@ -174,3 +199,3 @@ Returns an array of mean similarity values, one for each pairwise comparison.

@property
def medians(self):
def medians(self) -> np.array:
"""

@@ -182,3 +207,3 @@ Returns an array of median similarity values, one for each pairwise comparison.

@property
def mins(self):
def mins(self) -> np.array:
"""

@@ -190,3 +215,3 @@ Returns an array of minimum similarity values, one for each pairwise comparison.

@property
def maxs(self):
def maxs(self) -> np.array:
"""

@@ -198,3 +223,3 @@ Returns an array of maximum similarity values, one for each pairwise comparison.

@property
def stds(self):
def stds(self) -> np.array:
"""

@@ -206,3 +231,3 @@ Returns an array of standard deviations, one for each pairwise comparison.

@property
def sems(self):
def sems(self) -> np.array:
"""

@@ -213,3 +238,7 @@ Returns an array of standard errors of the mean, one for each pairwise comparison.

def squareform(self, method=None, agg=np.mean):
def squareform(
self,
# method=None,
agg=np.mean,
) -> pd.DataFrame:
"""

@@ -233,4 +262,4 @@ Returns a squareform representation of the similarity data.

_df = self.df[(self.df["sample1"] == s1) & (self.df["sample2"] == s2)]
if method is not None:
_df = _df[_df["method"] == method]
# if method is not None:
# _df = _df[_df["method"] == method]
if _df.shape[0] == 0:

@@ -278,5 +307,10 @@ continue

A list of repertoires. Each repertoire can be a list of ``abutils.Sequence``
objects or ``abutils.Pair`` objects. Individual repertoires cannot mix ``Sequence``
and ``Pair`` objects, but the list of repertoires can contain both.
objects or a list of ``abutils.Pair`` objects.
.. note::
Individual repertoires cannot mix ``Sequence`` and ``Pair`` objects,
but the list of repertoires can contain both. For example, comparing a
repertoire of ``Sequence`` objects to a repertoire of ``Pair`` objects is
allowed.
names : list, optional

@@ -287,3 +321,11 @@ A list of names for the repertoires. If ``None``, names will be generated as

method : str, optional
The similarity method to use. Default is ``"morisita-horn"``.
The similarity method to use. Options are:
- ``"morisita-horn"``
- ``"kullback-leibler"``
- ``"jensen-shannon"``
- ``"jaccard"``
- ``"bray-curtis"``
- ``"renkonen"``
- ``"cosine"``
Default is ``"morisita-horn"``.

@@ -304,3 +346,4 @@ features : str, list, optional

pairs_only : bool, optional
Whether to use only paired sequences. Default is ``False``.
Whether to use only paired sequences. Requires that all repertoires contain ``Pair`` objects,
rather than ``Sequence`` objects. Default is ``False``.

@@ -310,2 +353,7 @@ chain : str, optional

.. note::
This applies to both ``Sequence`` and ``Pair`` objects. If a repertoire contains
``Sequence`` objects from multiple chains, only sequences of the specified chain will
be used for similarity calculation, and the rest will be ignored.
force_self_comparisons : bool, optional

@@ -355,3 +403,3 @@ Whether to force self-comparisons. Only used if exactly two repertoires are provided.

elif len(names) != len(repertoires):
err = f"\nThe number of names must match the number of repertoires.\n"
err = "\nThe number of names must match the number of repertoires.\n"
err += f"You provided {len(names)} names and {len(repertoires)} repertoires.\n"

@@ -358,0 +406,0 @@ raise RuntimeError(err)

@@ -31,14 +31,20 @@ #!/usr/bin/env python

import sys
from typing import Iterable, Optional
from typing import Iterable, Optional, Union
from . import log
if sys.version_info[0] > 2:
STR_TYPES = [
str,
]
# else:
# STR_TYPES = [str, unicode]
# for backward compatibility
def list_files(*args, **kwargs):
    """
    Backward-compatibility shim: the implementation now lives in ``..io``.

    The import is deferred to call time to avoid a circular import when
    this module is loaded.
    """
    from ..io import list_files as _list_files

    return _list_files(*args, **kwargs)
def make_dir(*args, **kwargs):
    """
    Backward-compatibility shim: the implementation now lives in ``..io``.

    The import is deferred to call time to avoid a circular import when
    this module is loaded.
    """
    from ..io import make_dir as _make_dir

    return _make_dir(*args, **kwargs)
def initialize(log_file, project_dir=None, debug=False):

@@ -82,58 +88,62 @@ """

def make_dir(directory: str) -> None:
"""
Makes a directory, if it doesn't already exist.
# def make_dir(directory: str) -> None:
# """
# Makes a directory, if it doesn't already exist.
Parameters
----------
directory : str
Path to a directory.
# Parameters
# ----------
# directory : str
# Path to a directory.
"""
if not os.path.exists(directory):
os.makedirs(os.path.abspath(directory))
# """
# if not os.path.exists(directory):
# os.makedirs(os.path.abspath(directory))
def list_files(directory: str, extension: Optional[str] = None) -> Iterable[str]:
"""
Lists files in a given directory.
# def list_files(
# directory: str, extension: Union[str, Iterable, None] = None
# ) -> Iterable[str]:
# """
# Lists files in a given directory.
Parameters
----------
directory : str
Path to a directory.
# Parameters
# ----------
# directory : str
# Path to a directory.
extension : str
If supplied, only files that contain the specificied extension will be returned.
Default is ``None``, which returns all files in the directory, regardless of extension.
# extension : str
# If supplied, only files that end with the specificied extension(s) will be returned. Can be either
# a string or a list of strings. Extension evaluation is case-insensitive and can match complex
# extensions (e.g. '.fastq.gz'). Default is ``None``, which returns all files in the directory,
# regardless of extension.
Returns
-------
Iterable[str]
# Returns
# -------
# Iterable[str]
"""
if os.path.isdir(directory):
expanded_dir = os.path.expanduser(directory)
files = sorted(glob.glob(expanded_dir + "/*"))
else:
files = [
directory,
]
if extension is not None:
if type(extension) in STR_TYPES:
extension = [
extension,
]
files = [
f
for f in files
if any(
[
f.split(".")[-1] in extension,
f.split(".")[-1].upper() in extension,
f.split(".")[-1].lower() in extension,
]
)
]
return files
# """
# if os.path.isdir(directory):
# expanded_dir = os.path.expanduser(directory)
# files = sorted(glob.glob(expanded_dir + "/*"))
# else:
# files = [
# directory,
# ]
# if extension is not None:
# if isinstance(extension, str):
# extension = [
# extension,
# ]
# files = [
# f
# for f in files
# if any(
# [
# any([f.lower().endswith(e.lower()) for e in extension]),
# any([f.endswith(e.upper()) for e in extension]),
# any([f.endswith(e.lower()) for e in extension]),
# ]
# )
# ]
# return files

@@ -140,0 +150,0 @@

@@ -6,2 +6,2 @@ # Store the version here so:

__version__ = "0.4.16"
__version__ = "0.4.17"
Metadata-Version: 2.1
Name: abutils
Version: 0.4.16
Version: 0.4.17
Summary: Utilities for analysis of adaptive immune receptor repertoire (AIRR) data

@@ -25,2 +25,3 @@ Home-page: https://github.com/briney/abutils

Requires-Dist: celery
Requires-Dist: dnachisel
Requires-Dist: ete3

@@ -27,0 +28,0 @@ Requires-Dist: fastcluster

@@ -5,2 +5,3 @@ abstar >= 0.6.3

celery
dnachisel
ete3

@@ -7,0 +8,0 @@ fastcluster

Sorry, the diff of this file is too big to display