Latest Threat Research:SANDWORM_MODE: Shai-Hulud-Style npm Worm Hijacks CI Workflows and Poisons AI Toolchains.Details
Socket
Book a DemoInstallSign in
Socket

abutils

Package Overview
Dependencies
Maintainers
1
Versions
41
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

abutils - npm Package Compare versions

Comparing version
0.4.16
to
0.4.17
+205
abutils/bin.py
#!/usr/bin/env python
# filename: bin.py
#
#
# Copyright (c) 2023 Bryan Briney
# License: The MIT license (http://opensource.org/licenses/MIT)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software
# and associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
import os
import platform
from typing import Optional
from urllib.request import urlretrieve
__all__ = ["get_path", "get_binary_directory"]
# previously exported per-binary path constants, kept for reference:
# __all__ = [
#     "cdhit",
#     "fastp",
#     "fasttree",
#     "mmseqs",
#     "mafft",
#     "muscle",
#     "muscle_v3",
#     "vsearch",
# ]

# directory (alongside this module) where pre-packaged binaries live
BIN_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "bin")
# platform identifiers used to select the correct binary build;
# platform.machine() reports 'x86_64' on most Linux/macOS systems,
# normalized here to 'amd64' to match the packaged binary names
SYSTEM = platform.system().lower()
MACHINE = platform.machine().lower().replace("x86_64", "amd64")

# old module-level binary paths, superseded by get_path():
# cdhit = os.path.join(BIN_DIR, f"cd-hit_{SYSTEM}_{MACHINE}")
# fastp = os.path.join(BIN_DIR, f"fastp_{SYSTEM}")
# fasttree = os.path.join(BIN_DIR, f"fasttree_{SYSTEM}_{MACHINE}")
# mafft = os.path.join(BIN_DIR, f"mafft_{SYSTEM}_{MACHINE}/mafft.bat")
# mmseqs = os.path.join(BIN_DIR, f"mmseqs_{SYSTEM}_{MACHINE}")
# muscle = os.path.join(BIN_DIR, f"muscle_{SYSTEM}_{MACHINE}")
# muscle_v3 = os.path.join(BIN_DIR, f"muscle3_{SYSTEM}_{MACHINE}")
# vsearch = os.path.join(BIN_DIR, f"vsearch_{SYSTEM}_{MACHINE}")
def get_binary_directory() -> str:
    """
    Gets the abutils binary directory.

    Returns
    -------
    str
        Path to the abutils binary directory (the ``bin/`` folder that lives
        alongside this module).
    """
    return BIN_DIR
def copy_to_binary_directory(source: str, name: Optional[str] = None) -> None:
    """
    Copies a binary to the abutils binary directory.

    If a file of the same name already exists in the binary directory,
    nothing is copied.

    Parameters
    ----------
    source : str
        Path to the binary to be copied.

    name : str, optional
        Name of the binary in the binary directory. If not provided, the
        basename of the source binary will be used.
    """
    import shutil  # local import; keeps the module's import block unchanged

    binary_dir = get_binary_directory()
    name = name if name is not None else os.path.basename(source)
    # fix: the target must be built from ``name`` (the destination filename),
    # not ``source`` (a full path) -- joining the full source path silently
    # ignored ``name`` and produced a bogus target location
    target = os.path.join(binary_dir, name)
    if not os.path.exists(target):
        # shutil.copy2 (copies data + metadata) replaces the shell 'cp' call:
        # portable and immune to quoting/injection issues in file names
        shutil.copy2(source, target)
def get_path(binary: str) -> str:
    """
    Gets the path to a pre-packaged binary.

    .. note::
        If the binary is not found in the binary directory, it will be downloaded from
        the brineylab S3 bucket. This is because some binaries are too large
        to be included in the ``abutils`` PyPI package.

    Parameters
    ----------
    binary : str
        Name of the binary. Available binaries are:
          - blastn
          - cd-hit
          - fastp
          - fasttree
          - makeblastdb
          - mafft
          - mmseqs
          - muscle
          - muscle_v3
          - vsearch

    Returns
    -------
    str
        Path to the binary.

    Raises
    ------
    ValueError
        If ``binary`` is not one of the available binaries.
    """
    # normalize the binary name, accepting common aliases
    binary = binary.lower()
    aliases = {"mmseqs2": "mmseqs", "muscle_v3": "muscle3", "cd-hit": "cdhit"}
    binary = aliases.get(binary, binary)
    available = [
        # blast
        "blastn",
        "makeblastdb",
        # clustering/alignment
        "cdhit",
        "mafft",
        "mmseqs",
        "muscle",
        "muscle3",
        "vsearch",
        # qc
        "fastp",
        # phylogeny
        "fasttree",
    ]
    if binary not in available:
        raise ValueError(
            f"Binary '{binary}' not available. Available binaries: \n"
            + "\n".join(available)
        )
    # build the packaged binary name; some binaries ship per-OS only, and
    # mafft ships as a directory containing a launcher script
    if binary in ["fastp", "muscle3", "blastn", "makeblastdb"]:
        bin_name = f"{binary}_{SYSTEM}"
    elif binary == "mafft":
        bin_name = f"{binary}_{SYSTEM}_{MACHINE}/mafft.bat"
    else:
        bin_name = f"{binary}_{SYSTEM}_{MACHINE}"
    bin_path = os.path.join(BIN_DIR, bin_name)
    # download binary if missing (fix: 'intial' typo in user-facing message)
    if not os.path.exists(bin_path):
        print(
            f"Downloading missing binary: {binary}. This will only occur once on initial use of the binary."
        )
        download_missing_binary(bin_name, bin_path)
    return bin_path
# download missing binary (some are too large to included in the PyPI package)
def download_missing_binary(bin_name: str, bin_path: str) -> None:
    """
    Downloads a missing binary from the brineylab S3 bucket.

    Parameters
    ----------
    bin_name : str
        Name of the binary file in the S3 bucket.

    bin_path : str
        Local path at which the downloaded binary will be saved.
    """
    url = f"https://brineylab.s3.amazonaws.com/tools/abutils_binaries/{bin_name}"
    urlretrieve(url, bin_path)
    # fix: files written by urlretrieve are not executable by default, so the
    # downloaded tool could never actually be run without this chmod
    os.chmod(bin_path, 0o755)
# # download any missing binaries (some are too large to included in the PyPI package)
# def download_missing_binaries():
# to_download = []
# for b in [
# "cd-hit",
# "fastp",
# "fasttree",
# "mafft",
# "mmseqs",
# "muscle",
# "muscle3",
# "vsearch",
# ]:
# if b in ["fastp", "muscle3"]:
# bin_name = f"{b}_{SYSTEM}"
# bin_path = os.path.join(BIN_DIR, bin_name)
# else:
# bin_name = f"{b}_{SYSTEM}_{MACHINE}"
# bin_path = os.path.join(BIN_DIR, bin_name)
# if not os.path.exists(bin_path):
# to_download.append((bin_name, bin_path))
# if to_download:
# print(
# "Downloading missing binaries. This will only occur once on initial use of abutils."
# )
# url = f"https://brineylab.s3.amazonaws.com/tools/abutils_binaries/{bin_name}"
# urlretrieve(url, bin_path)
# download_missing_binaries()

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is too big to display

import pytest
from abutils import Sequence
from abutils.tools.cloning import build_synthesis_constructs
# ------------------------------------
# build_synthesis_constructs
# ------------------------------------
def test_build_synthesis_constructs_single_sequence():
    """A single IGH sequence yields one construct flanked by the default Gibson overhangs."""
    sequence = Sequence("RINEYLA")
    sequence["locus"] = "IGH"
    result = build_synthesis_constructs(sequence)
    assert len(result) == 1
    construct = str(result[0].sequence)
    expected_5prime = "catcctttttctagtagcaactgcaaccggtgtacac".upper()
    expected_3prime = "gcgtcgaccaagggcccatcggtcttcc".upper()
    assert construct.startswith(expected_5prime)
    assert construct.endswith(expected_3prime)
def test_build_synthesis_constructs_multiple_sequences():
    """Two input sequences produce two constructs."""
    sequences = [Sequence("RINEYLA"), Sequence("ALYENIR")]
    for sequence in sequences:
        sequence["locus"] = "IGH"
    assert len(build_synthesis_constructs(sequences)) == 2
def test_build_synthesis_constructs_with_overhangs():
    """User-supplied overhang dicts override the default Gibson overhangs."""
    sequence = Sequence("RINEYLA")
    sequence["locus"] = "IGH"
    result = build_synthesis_constructs(
        sequence, overhang_5={"IGH": "gggg"}, overhang_3={"IGH": "cccc"}
    )
    construct = str(result[0].sequence)
    assert construct.startswith("GGGG")
    assert construct.endswith("CCCC")
def test_build_synthesis_constructs_no_overhangs():
    """Empty overhang dicts yield a construct no longer than the input."""
    nt_sequence = "ATGCATGCTTATGAG"
    result = build_synthesis_constructs(
        Sequence(nt_sequence), overhang_5={}, overhang_3={}
    )
    assert len(result[0].sequence) == len(nt_sequence)
def test_build_synthesis_constructs_group_by_chain():
    """With group_by_chain, heavy-chain constructs come before light-chain constructs."""
    sequences = [Sequence("RINEYLA"), Sequence("ALYENIR")]
    for sequence, locus in zip(sequences, ("IGH", "IGK")):
        sequence["locus"] = locus
    result = build_synthesis_constructs(sequences, group_by_chain=True)
    assert result[0].id.endswith("IGH")
    assert result[1].id.endswith("IGK")
def test_build_synthesis_constructs_sort_func():
    """A sort_func reorders the output constructs."""
    sequences = [Sequence("RINEYLA", id="B"), Sequence("ALYENIR", id="A")]
    for sequence in sequences:
        sequence["locus"] = "IGH"
    result = build_synthesis_constructs(
        sequences, sort_func=lambda seq: seq.id, add_locus_to_name=False
    )
    assert [construct.id for construct in result] == ["A", "B"]
#!/usr/bin/env python
# filename: cloning.py
#
#
# Copyright (c) 2023 Bryan Briney
# License: The MIT license (http://opensource.org/licenses/MIT)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software
# and associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
from typing import Callable, Iterable, Optional, Union
from tqdm.auto import tqdm
from ..core.sequence import Sequence, codon_optimize
__all__ = ["build_synthesis_constructs"]
def build_synthesis_constructs(
    sequences: Union[Sequence, Iterable[Sequence], str],
    overhang_5: Optional[dict] = None,
    overhang_3: Optional[dict] = None,
    sequence_key: Optional[str] = None,
    id_key: Optional[str] = None,
    locus_key: Optional[str] = None,
    frame: Optional[int] = None,
    add_locus_to_name: bool = True,
    group_by_chain: bool = False,
    show_progress: bool = True,
    sort_func: Optional[Callable] = None,
):
    """
    Builds codon-optimized synthesis constructs, including Gibson overhangs suitable
    for cloning IGH, IGK and IGL variable region constructs into antibody expression
    vectors.

    .. seealso::
        | Thomas Tiller, Eric Meffre, Sergey Yurasov, Makoto Tsuiji, Michel C Nussenzweig, Hedda Wardemann
        | Efficient generation of monoclonal antibodies from single human B cells by single cell RT-PCR and expression vector cloning
        | *Journal of Immunological Methods* 2008, doi: 10.1016/j.jim.2007.09.017

    Parameters
    ----------
    sequences : ``Sequence``, str, or ``Iterable[Sequence]``
        A ``Sequence`` object, a sequence as a ``str``, or an ``Iterable`` of ``Sequence`` objects.

        .. note::
            If the provided sequences are nucleotide sequences, ``frame`` must also be provided
            so that codon optimization can be performed on the correct reading frame.

    overhang_5 : dict, optional
        A ``dict`` mapping the locus name to 5' Gibson overhangs. By default, Gibson
        overhangs corresponding to the expression vectors in Tiller et al, 2008:

        | **IGH:** ``catcctttttctagtagcaactgcaaccggtgtacac``
        | **IGK:** ``atcctttttctagtagcaactgcaaccggtgtacac``
        | **IGL:** ``atcctttttctagtagcaactgcaaccggtgtacac``

        To produce constructs without 5' Gibson overhangs, provide an empty dictionary.

    overhang_3 : dict, optional
        A ``dict`` mapping the locus name to 3' Gibson overhangs. By default, Gibson
        overhangs corresponding to the expression vectors in Tiller et al, 2008:

        | **IGH:** ``gcgtcgaccaagggcccatcggtcttcc``
        | **IGK:** ``cgtacggtggctgcaccatctgtcttcatc``
        | **IGL:** ``ggtcagcccaaggctgccccctcggtcactctgttcccgccctcgagtgaggagcttcaagccaacaaggcc``

        To produce constructs without 3' Gibson overhangs, provide an empty dictionary.

    sequence_key : str, optional
        Field containing the sequence to be codon optimized. If not provided,
        ``sequence.sequence`` is used. Either nucleotide or amino acid sequences
        are acceptable.

    id_key : str, optional
        Field containing the name of the sequence. If not provided, ``sequence.id`` is used.

    locus_key : str, optional
        Field containing the name of the locus. If not provided, ``sequence["locus"]`` is used, which
        conforms with AIRR naming conventions.

        .. note::
            The data in ``locus_key`` should return one of the following:
              * IGH, IGK, or IGL (which are the names of the loci in AIRR naming conventions)
              * heavy, kappa, or lambda

    frame : int, default=1
        Reading frame to translate. Default is ``1``.

        .. note::
            This parameter is ignored if the input sequences are amino acid sequences.

    add_locus_to_name : bool, default=True
        If ``True``, the name of the sequence will be appended with the name of the locus, separated by an underscore.
        If ``False``, the name of the sequence will not be modified.

    group_by_chain : bool, default=False
        If ``True``, the output ``list`` of ``Sequence`` objects will be separated by chain (heavy first, light second).
        Within each chain group, sequence ordering will be retained (either input order, or sorted by ``sort_func``).
        If ``False``, the output ``list`` of ``Sequence`` objects will not be grouped.

    sort_func : Callable, optional
        A function to sort the output ``list`` of ``Sequence`` objects. By default,
        the output ``list`` is in the same order as the input.

    Returns
    -------
    sequences : ``list`` of ``Sequence`` objects
        A ``list`` of ``Sequence`` objects. Each ``Sequence`` object has the following properties:

        | *id*: The sequence ID (either ``sequence.id`` or ``id_key``), to which the locus (``locus_key``) is appended if ``add_locus_to_name`` is ``True``.
        | *sequence*: The codon-optimized sequence, including Gibson overhangs.

        If ``sort_func`` is provided, the output ``list`` will be sorted by ``sort_func``, otherwise the output is in the same order as the input.
    """
    # get overhangs (GIBSON3/GIBSON5 are module-level defaults defined below;
    # resolved at call time, so the definition order is fine)
    overhang_3 = overhang_3 if overhang_3 is not None else GIBSON3
    overhang_5 = overhang_5 if overhang_5 is not None else GIBSON5
    # process input: normalize to a list of Sequence objects
    if isinstance(sequences, (Sequence, str)):
        sequences = [sequences]
    sequences = [Sequence(s) for s in sequences]
    # sort
    if sort_func is not None:
        sequences = sorted(sequences, key=sort_func)
    if group_by_chain:
        locus_key = locus_key if locus_key is not None else "locus"
        # heavy chains first, everything else (light chains) after
        heavies = [s for s in sequences if s[locus_key] in ["heavy", "IGH"]]
        lights = [s for s in sequences if s[locus_key] not in ["heavy", "IGH"]]
        sequences = heavies + lights
    # make codon-optimized synthesis constructs
    if show_progress:
        sequences = tqdm(sequences)
    optimized = []
    for sequence in sequences:
        seq = sequence[sequence_key] if sequence_key is not None else sequence.sequence
        name = sequence[id_key] if id_key is not None else sequence.id
        locus = sequence[locus_key] if locus_key is not None else sequence["locus"]
        if add_locus_to_name and locus is not None:
            name += f"_{locus}"
        opt_seq = codon_optimize(seq, frame=frame, as_string=True)
        # unknown loci get no overhangs rather than raising
        synthesis_construct = (
            overhang_5.get(locus, "") + opt_seq + overhang_3.get(locus, "")
        )
        optimized.append(Sequence(synthesis_construct, id=name))
    return optimized
# default 5' Gibson overhangs (expression vectors from Tiller et al, 2008),
# keyed by both AIRR locus names (IGH/IGK/IGL) and common chain names
GIBSON5 = {
    "IGH": "catcctttttctagtagcaactgcaaccggtgtacac",
    "IGK": "atcctttttctagtagcaactgcaaccggtgtacac",
    "IGL": "atcctttttctagtagcaactgcaaccggtgtacac",
    "heavy": "catcctttttctagtagcaactgcaaccggtgtacac",
    "kappa": "atcctttttctagtagcaactgcaaccggtgtacac",
    "lambda": "atcctttttctagtagcaactgcaaccggtgtacac",
}

# default 3' Gibson overhangs, keyed the same way as GIBSON5
GIBSON3 = {
    "IGH": "gcgtcgaccaagggcccatcggtcttcc",
    "IGK": "cgtacggtggctgcaccatctgtcttcatc",
    "IGL": "ggtcagcccaaggctgccccctcggtcactctgttcccgccctcgagtgaggagcttcaagccaacaaggcc",
    "heavy": "gcgtcgaccaagggcccatcggtcttcc",
    "kappa": "cgtacggtggctgcaccatctgtcttcatc",
    "lambda": "ggtcagcccaaggctgccccctcggtcactctgttcccgccctcgagtgaggagcttcaagccaacaaggcc",
}

# AIRR locus name -> common chain name
LOCUS_MAP = {
    "IGH": "heavy",
    "IGK": "kappa",
    "IGL": "lambda",
    "TRA": "alpha",
    "TRB": "beta",
    "TRD": "delta",
    "TRG": "gamma",
}
#!/usr/bin/python
# filename: preprocessing.py
#
# Copyright (c) 2024 Bryan Briney
# License: The MIT license (http://opensource.org/licenses/MIT)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software
# and associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
import os
import subprocess as sp
from typing import Iterable, Optional, Union
from natsort import natsorted
from tqdm.auto import tqdm
from ..bin import get_path as get_binary_path
from ..io import concatenate_files, delete_files, list_files, make_dir, rename_file
class FASTQFile:
    """
    Thin wrapper around a FASTQ file path that pre-computes common path components.
    """

    def __init__(self, file: str):
        # path exactly as supplied by the caller
        self.file = file
        # absolute path
        self.path = os.path.abspath(file)
        # filename only
        self.basename = os.path.basename(self.path)
        # containing directory
        self.dir = os.path.dirname(self.path)

    def __eq__(self, other):
        # NOTE(review): ``name`` is defined only on subclasses (e.g. IlluminaFile);
        # comparing two bare FASTQFile instances would raise AttributeError — confirm
        # this class is only ever used via its subclasses
        return self.name == other.name
class IlluminaFile(FASTQFile):
    """
    ``FASTQFile`` subclass that parses Illumina-style file names.

    Assumes the basename ends with four underscore-separated fields, i.e.
    ``<name>_<sample>_<lane>_<read>_<number>.<ext>`` (for example
    ``subjectA_S1_L001_R1_001.fastq.gz``) -- TODO confirm against actual
    sequencer output naming.
    """

    def __init__(self, file: str):
        super().__init__(file)

    @property
    def name(self):
        # everything before the final four underscore-separated fields
        return "_".join(self.basename.split("_")[:-4])

    @property
    def number(self):
        # trailing chunk number (extension stripped at the first '.')
        return self.basename.split(".")[0].split("_")[-1]

    @property
    def read(self):
        # read-direction field (e.g. 'R1' or 'R2')
        return self.basename.split("_")[-2]

    @property
    def lane(self):
        # lane field (e.g. 'L001')
        return self.basename.split("_")[-3]

    @property
    def sample(self):
        # sample field (e.g. 'S1')
        return self.basename.split("_")[-4]
class MergeGroup:
    """
    A group of paired-end FASTQ files belonging to a single sample
    (potentially spanning multiple lanes) that should be merged together.
    """

    def __init__(self, name: str, files: Iterable[FASTQFile], verbose: bool = False):
        # NOTE: ``verbose`` is accepted (and stored) because group_fastq_pairs()
        # constructs MergeGroup with a verbose keyword; previously this raised
        # a TypeError
        self.name = name
        self.files = files
        self.verbose = verbose
        self.merged_file = None  # assigned by merge()

    def merge(
        self,
        merged_directory: str,
        format: str = "fastq",
        algo: str = "vsearch",
        binary_path: Optional[str] = None,
        merge_args: Optional[str] = None,
        verbose: bool = False,
        debug: bool = False,
    ) -> str:
        """
        Merges this group's read pairs (per lane), concatenating multi-lane
        results into a single output file.

        Parameters
        ----------
        merged_directory : str
            Directory in which the merged file is written.
        format : str, optional
            Output format ('fastq' or 'fasta'). Default is 'fastq'.
        algo : str, optional
            Merge algorithm. Must be 'vsearch'. Default is 'vsearch'.
        binary_path : str, optional
            Path to the merge binary. If not provided, the packaged binary is used.
        merge_args : str, optional
            Additional arguments (as a string) passed to the merge function.
        verbose : bool, optional
            If True, print progress information. Default is False.
        debug : bool, optional
            If True, print merge tool output. Default is False.

        Returns
        -------
        str
            Path to the merged file.
        """
        groups = self._group_by_lane()
        n_groups = len(groups)
        if verbose:
            print(f"{self.name}")
            print("-" * len(self.name))
        if n_groups > 1:
            groups = tqdm(groups, desc=" - merging lanes")
        # select the merge function
        if algo.lower() == "vsearch":
            merge_func = merge_fastqs_vsearch
        else:
            raise ValueError(f"Invalid merge algorithm: {algo}. Must be 'vsearch'.")
        # fix: ``format.lower`` was never called (it is a bound method), which
        # produced file names containing the method's repr
        extension = format.lower()
        self.merged_file = os.path.join(merged_directory, f"{self.name}.{extension}")
        # merge each lane's read pair
        merged_files = []
        for group in groups:
            r1, r2 = natsorted(group, key=lambda x: x.read)
            # fix: each lane gets its own intermediate file; previously every
            # lane wrote to the same path, so later lanes clobbered earlier ones
            lane_file = os.path.join(
                merged_directory, f"{self.name}_{r1.lane}.{extension}"
            )
            merged_file = merge_func(
                r1.path,
                r2.path,
                lane_file,
                binary_path=binary_path,
                additional_args=merge_args,
                output_format=format,
                debug=debug,
            )
            merged_files.append(merged_file)
        if len(merged_files) > 1:
            if verbose:
                print(" - concatenating merged files")
            concatenate_files(merged_files, self.merged_file)
            delete_files(merged_files)
        else:
            # fix: rename_file takes a single path, not a list
            rename_file(merged_files[0], self.merged_file)
        return self.merged_file

    def _group_by_lane(self):
        """Groups this sample's files by lane, returning lists of files in lane order."""
        lane_dict = {}
        for f in self.files:
            lane_dict.setdefault(f.lane, []).append(f)
        # fix: return the file lists (dict values), not the lane names (keys);
        # items() tuples are (lane, files), so index 1 holds the files
        return [files for _, files in natsorted(lane_dict.items(), key=lambda x: x[0])]
def merge_fastqs(
    files: Union[str, Iterable],
    output_directory: str,
    output_format: str = "fastq",
    schema: str = "illumina",
    algo: str = "vsearch",
    binary_path: Optional[str] = None,
    merge_args: Optional[str] = None,
    debug: bool = False,
    verbose: bool = False,
) -> Iterable[str]:
    """
    Merge paired-end fastq files.

    Parameters
    ----------
    files : Union[str, Iterable]
        Path to a directory containing paired-end fastq files, or an iterable of
        paired-end fastq files.
    output_directory : str
        Path to the directory in which to save the merged files.
    output_format : str, optional
        Output format. Must be 'fastq' or 'fasta'. Default is 'fastq'.
    schema : str, optional
        Schema of the file names. Must be 'illumina'. Default is 'illumina'.
    algo : str, optional
        Algorithm to use for merging. Must be 'vsearch'. Default is 'vsearch'.
    binary_path : str, optional
        Path to the merge algorithm binary. If not provided, the binary packaged
        with abutils will be used.
    merge_args : str, optional
        Additional arguments (as a string) to pass to the merge function.
    debug : bool, optional
        If True, print debug output. Default is False.
    verbose : bool, optional
        If True, print verbose output. Default is False.

    Returns
    -------
    list
        A list of paths to the merged FASTA/Q files.
    """
    # a string input must be an existing directory of FASTQ files
    if isinstance(files, str):
        if not os.path.isdir(files):
            raise ValueError(
                f"The supplied file path ({files}) does not exist or is not a directory."
            )
        files = list_files(files, extension=[".fastq", ".fq", ".fastq.gz", ".fq.gz"])
    # wrap raw paths in schema-aware file objects
    if schema.lower() != "illumina":
        raise ValueError(f"Invalid schema: {schema}. Must be 'illumina'.")
    files = [IlluminaFile(f) for f in files]
    # group by sample, then merge each group
    pairs = group_fastq_pairs(files, verbose=verbose)
    make_dir(output_directory)
    return [
        pair.merge(
            merged_directory=output_directory,
            format=output_format,
            algo=algo,
            binary_path=binary_path,
            merge_args=merge_args,
            debug=debug,
            verbose=verbose,
        )
        for pair in pairs
    ]
def group_fastq_pairs(
    files: Iterable[FASTQFile], verbose: bool = False
) -> Iterable[MergeGroup]:
    """
    Group paired-end fastq files by sample name. If a sample has multiple lanes,
    the files will be combined.

    Parameters
    ----------
    files : Iterable[FASTQFile]
        An iterable of ``FASTQFile`` objects to be grouped by sample name.

    verbose : bool, optional
        If True, print verbose output. Default is False.

    Returns
    -------
    list
        A list of MergeGroup objects.
    """
    group_dict = {}
    for f in files:
        group_dict.setdefault(f.name, []).append(f)
    # fix: previously passed ``verbose=verbose`` to MergeGroup, whose __init__
    # does not accept that keyword, raising TypeError on every call
    return [
        MergeGroup(name, group_files) for name, group_files in group_dict.items()
    ]
def merge_fastqs_vsearch(
    forward: str,
    reverse: str,
    merged_file: str,
    output_format: str = "fasta",
    binary_path: Optional[str] = None,
    additional_args: Optional[str] = None,
    debug: bool = False,
) -> str:
    """
    Merge paired-end reads using vsearch.

    Parameters
    ----------
    forward : str
        Path to the forward read file.

    reverse : str
        Path to the reverse read file.

    merged_file : str
        Path to the merged read file.

    output_format : str, optional
        Output format. Must be 'fasta' or 'fastq'. Default is 'fasta'.

    binary_path : str, optional
        Path to a vsearch binary. If not provided, the vsearch binary packaged
        with abutils will be used.

    additional_args : str, optional
        Additional arguments (as a string) to pass to vsearch.

    debug : bool, optional
        If True, print vsearch stdout and stderr. Default is False.

    Returns
    -------
    str
        Path to the merged read file.

    Raises
    ------
    ValueError
        If an input file is missing, the output format is invalid, or vsearch
        exits with a non-zero status.
    """
    # validate input files
    if not os.path.isfile(forward):
        err = f"The supplied forward read file path ({forward}) does not exist or is not a file."
        raise ValueError(err)
    if not os.path.isfile(reverse):
        err = f"The supplied reverse read file path ({reverse}) does not exist or is not a file."
        raise ValueError(err)
    # make output directory
    out_dir = os.path.dirname(merged_file)
    make_dir(out_dir)
    # get the vsearch binary
    if binary_path is None:
        binary_path = get_binary_path("vsearch")
    # compile the vsearch command
    cmd = f"{binary_path} --fastq_mergepairs {forward} --reverse {reverse}"
    # fix: the output options were plain strings missing the f-prefix, so the
    # literal text '{merged_file}' was passed to vsearch instead of the path
    if output_format.lower() == "fasta":
        cmd += f" --fastaout {merged_file}"
    elif output_format.lower() == "fastq":
        cmd += f" --fastqout {merged_file}"
    else:
        err = f"Invalid output format: {output_format}. Must be 'fasta' or 'fastq'."
        raise ValueError(err)
    if additional_args is not None:
        cmd += f" {additional_args}"
    # merge reads
    p = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
    stdout, stderr = p.communicate()
    if debug:
        print(stdout.decode())
        print(stderr.decode())
    if p.returncode != 0:
        err = f"Error merging reads with vsearch: {stderr.decode()}"
        raise ValueError(err)
    return merged_file
+2
-1
Metadata-Version: 2.1
Name: abutils
Version: 0.4.16
Version: 0.4.17
Summary: Utilities for analysis of adaptive immune receptor repertoire (AIRR) data

@@ -25,2 +25,3 @@ Home-page: https://github.com/briney/abutils

Requires-Dist: celery
Requires-Dist: dnachisel
Requires-Dist: ete3

@@ -27,0 +28,0 @@ Requires-Dist: fastcluster

@@ -5,2 +5,3 @@ abstar>=0.6.3

celery
dnachisel
ete3

@@ -7,0 +8,0 @@ fastcluster

@@ -8,2 +8,3 @@ LICENSE

abutils/__init__.py
abutils/bin.py
abutils/cl.py

@@ -22,2 +23,4 @@ abutils/io.py

abutils/bin/cdhit_linux_amd64
abutils/bin/fastp_darwin
abutils/bin/fastp_linux
abutils/bin/fasttree_darwin_amd64

@@ -185,2 +188,3 @@ abutils/bin/fasttree_darwin_arm64

abutils/tests/test_clonify.py
abutils/tests/test_cloning.py
abutils/tests/test_cluster.py

@@ -195,4 +199,6 @@ abutils/tests/test_color.py

abutils/tools/clonify.py
abutils/tools/cloning.py
abutils/tools/cluster.py
abutils/tools/phylo.py
abutils/tools/preprocessing.py
abutils/tools/similarity.py

@@ -199,0 +205,0 @@ abutils/utils/__init__.py

@@ -6,3 +6,3 @@ # from .core import *

from . import cl, io, pl, tl
# from . import bin, cl, io, pl, tl
from . import cl as color

@@ -9,0 +9,0 @@ from .core import lineage, pair, sequence

@@ -36,2 +36,3 @@ #!/usr/bin/env python

import dnachisel as dc
import pandas as pd

@@ -145,2 +146,3 @@ from Bio import SeqIO

self._reverse_complement = None
self._codon_optimized = None
self._strand = None

@@ -321,2 +323,12 @@ self.id_key = id_key

@property
def codon_optimized(self):
if self._codon_optimized is None:
self._codon_optimized = self.codon_optimize(as_string=True)
return self._codon_optimized
@codon_optimized.setter
def codon_optimized(self, val):
self._codon_optimized = val
@property
def annotations(self):

@@ -369,2 +381,44 @@ """

def codon_optimize(
    self,
    sequence_key: str = "sequence_aa",
    id_key: str = "sequence_id",
    frame: Optional[int] = None,
    as_string: bool = True,
) -> Union[str, "Sequence"]:
    """
    Codon optimize a sequence.

    Parameters
    ----------
    sequence_key : str, default="sequence_aa"
        Name of the annotation field containing the sequence to be translated.
        If not provided, ``Sequence.sequence`` is used.

    id_key : str, default="sequence_id"
        Name of the annotation field containing the sequence id.
        If not provided, ``Sequence.id`` is used.

    frame : int, default=1
        Reading frame to translate. Default is ``1``.

    as_string : bool, default=True
        If ``True``, the optimized sequence will be returned as a ``str``.
        If ``False``, the optimized sequence will be returned as a ``Sequence`` object.

    Returns
    -------
    Union[str, Sequence]
        Translated sequence as a ``str`` (if ``as_string`` is ``True``) or
        ``Sequence`` object (if ``as_string`` is ``False``).
    """
    # delegates to the module-level codon_optimize(), using this Sequence as input
    return codon_optimize(
        sequence=self,
        sequence_key=sequence_key,
        id_key=id_key,
        frame=frame,
        as_string=as_string,
    )
def as_fasta(

@@ -465,3 +519,3 @@ self, name_field: Optional[str] = None, seq_field: Optional[str] = None

if id is None:
id = uuid.uuid4()
id = str(uuid.uuid4())
self.sequence = str(seq).upper()

@@ -586,2 +640,3 @@ self.id = id

translated : str
Translated sequence.

@@ -597,3 +652,6 @@ """

return None
start = (frame % 3) - 1
# start = (frame % 3 or 3) - 1 # too clever by half
if frame not in range(1, 4):
raise ValueError(f"Invalid frame: {frame}. Must be 1, 2 or 3.")
start = frame - 1
end = len(seq) - (len(seq[start:]) % 3)

@@ -612,2 +670,98 @@ seq = seq[start:end]

def codon_optimize(
    sequence,
    sequence_key: Optional[str] = None,
    id_key: Optional[str] = None,
    frame: Optional[int] = None,
    as_string: bool = False,
    in_place: bool = False,
) -> Union[str, "Sequence"]:
    """
    Optimizes the codons of a nucleotide or amino acid sequence.

    Parameters
    ----------
    sequence : Union[str, Sequence]
        Nucleotide or amino acid sequence to be optimized.

    sequence_key : str, default=None
        Name of the annotation field containing the sequence to be optimized.
        If not provided, ``sequence.sequence`` is used.

    id_key : str, default=None
        Name of the annotation field containing the sequence ID.
        If not provided, ``sequence.id`` is used.

    frame : int, default=1
        Reading frame to translate. Default is ``1``.

        .. note::
            ``frame`` is ignored if the input sequence is an amino acid sequence.

    as_string : bool, default=False
        If ``True``, the optimized sequence will be returned as a ``str``.
        If ``False``, the optimized sequence will be returned as a ``Sequence`` object.

    in_place : bool, default=False
        If ``True``, the input ``Sequence`` object will be returned with the optimized
        sequence populating the ``sequence.sequence`` property. If ``False``, the
        optimized sequence will be returned as a ``str`` (if ``as_string`` is ``True``)
        or a new ``Sequence`` object (if ``as_string`` is ``False``).

        .. note::
            ``as_string`` takes precedence over ``in_place`` when both are ``True``.

    Returns
    -------
    Union[str, Sequence]
        If ``as_string`` is ``True``, the optimized sequence as a ``str``.
        Otherwise, if ``in_place`` is ``True``, the input ``Sequence`` object with its
        ``sequence`` property replaced by the optimized sequence.
        Otherwise, a new ``Sequence`` object containing the optimized sequence.

    Raises
    ------
    ValueError
        If ``frame`` is not 1, 2 or 3 for a nucleotide input.
    """
    # process input
    if not isinstance(sequence, Sequence):
        sequence = Sequence(sequence)
    if sequence_key is not None:
        seq = (
            sequence[sequence_key]
            if sequence[sequence_key] is not None
            else sequence.sequence
        )
    else:
        seq = sequence.sequence
    name = sequence[id_key] if id_key is not None else sequence.id
    # get DNA sequence
    # fix: test the residues of ``seq`` (the sequence actually being optimized),
    # not ``sequence`` -- they differ whenever ``sequence_key`` selects an
    # amino acid annotation, which previously mis-detected the input type
    if not all(res.upper() in ["A", "C", "G", "T", "N", "-"] for res in seq):
        # amino acids: reverse-translate to a DNA template first
        dna_seq = dc.reverse_translate(seq)
    else:
        frame = frame if frame is not None else 1
        if frame not in range(1, 4):
            raise ValueError(f"Invalid frame: {frame}. Must be 1, 2 or 3.")
        dna_seq = seq[frame - 1 :]
    # optimize codons for human expression while keeping the translation fixed
    problem = dc.DnaOptimizationProblem(
        sequence=dna_seq,
        constraints=[
            dc.EnforceTranslation(),
            dc.EnforceGCContent(maxi=0.56),
            dc.EnforceGCContent(maxi=0.64, window=100),
            dc.UniquifyAllKmers(10),
        ],
        objectives=[dc.CodonOptimize(species="h_sapiens")],
        logger=None,
    )
    problem.resolve_constraints(final_check=True)
    problem.optimize()
    if as_string:
        return str(problem.sequence)
    if in_place:
        sequence.sequence = str(problem.sequence)
        return sequence
    return Sequence(problem.sequence, id=name)
def read_json(

@@ -1044,2 +1198,7 @@ json_file: str,

index: bool = False,
properties: Optional[Iterable[str]] = None,
drop_na_columns: bool = True,
order: Optional[Iterable[str]] = None,
exclude: Optional[Union[str, Iterable[str]]] = None,
leading: Optional[Union[str, Iterable[str]]] = None,
) -> None:

@@ -1070,10 +1229,55 @@ """

properties : list, default=None
A list of properties to be included in the CSV file.
drop_na_columns : bool, default=True
If ``True``, columns with all ``NaN`` values will be dropped from the CSV file.
Default is ``True``.
order : list, default=None
A list of fields in the order they should appear in the CSV file.
exclude : str or list, default=None
Field or list of fields to be excluded from the CSV file.
leading : str or list, default=None
Field or list of fields to appear first in the CSV file. Supersedes ``order``, so
if both are provided, fields in ``leading`` will appear first in the CSV file and
remaining fields will appear in the order provided in ``order``.
"""
d = []
data = []
for s in sequences:
if not s.annotations:
d.append({"sequence_id": s.id, "sequence": s.sequence})
d = {"sequence_id": s.id, "sequence": s.sequence}
else:
d.append(s.annotations)
df = pd.DataFrame(d)
d = s.annotations
if properties is not None:
for prop in properties:
try:
d[prop] = getattr(s, prop)
except AttributeError:
continue
data.append(d)
df = pd.DataFrame(data)
# drop NaN
if drop_na_columns:
df.dropna(axis=1, how="all", inplace=True)
# excluded columns
if exclude is not None:
if isinstance(exclude, str):
exclude = []
cols = [c for c in df.columns.values if c not in exclude]
df = df[cols]
# reorder
if order is not None:
cols = [o for o in order if o in df.columns.values]
df = df[cols]
if leading is not None:
if isinstance(leading, str):
leading = [leading]
leading = [l for l in leading if l in df.columns.values]
cols = leading + [c for c in df.columns.values if c not in leading]
df = df[cols]
df.to_csv(csv_file, sep=sep, index=index, columns=columns, header=header)

@@ -25,3 +25,6 @@ #!/usr/bin/env python

import glob
import os
import sys
from typing import Iterable, Union

@@ -40,5 +43,123 @@ from .core.sequence import (

from .utils.convert import abi_to_fasta
from .utils.pipeline import list_files, make_dir
# from .utils.pipeline import list_files, make_dir
def make_dir(directory: str) -> None:
    """
    Create ``directory`` (including any missing parent directories) unless
    it already exists.

    Parameters
    ----------
    directory : str
        Path to the directory to create.
    """
    target = os.path.abspath(directory)
    if not os.path.exists(directory):
        os.makedirs(target)
def list_files(
    directory: str, extension: Union[str, Iterable, None] = None
) -> Iterable[str]:
    """
    Lists files in a given directory.

    Parameters
    ----------
    directory : str
        Path to a directory. User shortcuts (``~``) are expanded before the
        directory check. If the path is not a directory, it is assumed to be
        a single file and is returned as a one-element list.
    extension : str or iterable of str, optional
        If supplied, only files that end with the specified extension(s) will be
        returned. Can be either a string or a list of strings. Extension
        evaluation is case-insensitive and can match complex extensions
        (e.g. ``'.fastq.gz'``). Default is ``None``, which returns all files
        in the directory, regardless of extension.

    Returns
    -------
    Iterable[str]
        Sorted list of matching file paths.
    """
    # Expand "~" BEFORE the isdir check; previously the raw path was tested,
    # so "~/some_dir" was never recognized as a directory and was returned
    # as if it were a single file.
    expanded_dir = os.path.expanduser(directory)
    if os.path.isdir(expanded_dir):
        files = sorted(glob.glob(os.path.join(expanded_dir, "*")))
    else:
        files = [directory]
    if extension is not None:
        if isinstance(extension, str):
            extension = [extension]
        # A single case-insensitive suffix check subsumes the original's
        # three separate endswith comparisons (lower/upper/exact).
        suffixes = tuple(e.lower() for e in extension)
        files = [f for f in files if f.lower().endswith(suffixes)]
    return files
def rename_file(file: str, new_name: str) -> None:
    """
    Rename (or move) a file.

    Parameters
    ----------
    file : str
        Path to the file to be renamed.
    new_name : str
        New name (path) for the file.
    """
    os.rename(file, new_name)
def delete_files(files: Union[str, Iterable]) -> None:
    """
    Delete one or more files, silently skipping paths that do not exist.

    Parameters
    ----------
    files : str or iterable of str
        Path to a single file or an iterable of file paths.
    """
    targets = [files] if isinstance(files, str) else files
    for path in targets:
        if os.path.exists(path):
            os.remove(path)
def concatenate_files(files: Iterable[str], output_file: str) -> None:
    """
    Concatenate multiple text files into a single output file.

    Parameters
    ----------
    files : Iterable[str]
        Iterable of input file paths; contents are written in the order given.
    output_file : str
        Path to the output file. Overwritten if it already exists.
    """
    with open(output_file, "w") as out_handle:
        for path in files:
            with open(path) as in_handle:
                # a file handle iterates line-by-line, so writelines()
                # streams the input without loading it all into memory
                out_handle.writelines(in_handle)
def read_sequences(

@@ -45,0 +166,0 @@ file=None,

@@ -5,8 +5,8 @@ #!usr/env/python

import os
import pytest
import tempfile
import pytest
from Bio.SeqRecord import SeqRecord
from ..core.sequence import Sequence, read_fasta, read_airr, to_fasta
from ..core.sequence import Sequence, read_airr, read_fasta, to_fasta

@@ -275,1 +275,27 @@

assert ">seq1\nATCG\n>seq2\nGCTA\n>seq3\nTTTT" in fasta
# -------------------------------------------
# codon optimization
# -------------------------------------------
def test_codon_optimize_returns_string():
    # with as_string=True, codon_optimize should yield a plain str of
    # nucleotides, three bases per input amino acid
    protein = Sequence("RINEYLA")
    nt = protein.codon_optimize(as_string=True)
    assert isinstance(nt, str)
    assert len(nt) == 3 * len(protein)
def test_codon_optimize_returns_sequence_object():
    # with as_string=False, codon_optimize should return a Sequence whose
    # translation round-trips back to the input protein
    protein = Sequence("RINEYLA")
    optimized = protein.codon_optimize(as_string=False)
    assert isinstance(optimized, Sequence)
    assert len(optimized.sequence) == 3 * len(protein)
    assert optimized.translate() == protein.sequence
def test_codon_optimize_with_frame():
    # an out-of-range frame (valid frames are presumably 1-3) must raise
    nt_seq = Sequence("ATGCATGCATGCATGC")
    with pytest.raises(ValueError):
        nt_seq.codon_optimize(frame=4)

@@ -27,5 +27,6 @@ #!/usr/bin/env python

from .core.pair import assign_pairs
from .core.sequence import reverse_complement, translate
from .core.sequence import codon_optimize, reverse_complement, translate
from .tools.alignment import *
from .tools.clonify import clonify
from .tools.cloning import *
from .tools.cluster import *

@@ -32,0 +33,0 @@ from .tools.phylo import *

@@ -1,16 +0,37 @@

from collections import Counter
#!/usr/bin/env python
# filename: clonify.py
#
#
# Copyright (c) 2023 Bryan Briney
# License: The MIT license (http://opensource.org/licenses/MIT)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software
# and associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
import itertools
import sys
from collections import Counter
from typing import Iterable, Union
import fastcluster as fc
import numpy as np
import pandas as pd
import numpy as np
from Levenshtein import distance
import fastcluster as fc
from mnemonic import Mnemonic
from scipy.cluster.hierarchy import fcluster
from mnemonic import Mnemonic
from ..core.sequence import Sequence

@@ -17,0 +38,0 @@ from ..tools import cluster

@@ -37,2 +37,3 @@ #!/usr/bin/env python

from ..bin import get_path as get_binary_path
from ..core.sequence import Sequence, read_fasta, to_fasta

@@ -226,2 +227,7 @@ from ..utils.decorators import lazy_property

iddef: int = 0,
cluster_mode: str = "2",
cov_mode: str = "0",
coverage: float = 0.8,
alignment_mode: str = "3",
seq_id_mode: str = "1",
vsearch_bin: str = None,

@@ -232,2 +238,3 @@ mmseqs_bin: str = None,

seq_key: Optional[str] = None,
threads: Optional[int] = None,
strand: str = "plus",

@@ -285,2 +292,32 @@ as_dict: bool = False,

cluster_mode : str, default="2"
Clustering mode, as implemented by MMseqs2. Options are:
1. greedy set cover
2. connected component
3. greedy incremental (CD-HIT-like)
cov_mode : str, default="0"
Coverage mode, as implemented by MMseqs2. Options are:
0. bidirectional
1. target coverage
2. query coverage
3. target-in-query length coverage
coverage : float, default=0.8
Coverage threshold for clustering with MMseqs2. Must be between 0 and 1.
alignment_mode : str, default="3"
Alignment mode, as implemented by MMseqs2. Options are:
0. automatic
1. only score and end_pos
2. also start_pos and cov
3. also seq.id
4. only ungapped alignment
seq_id_mode : str, default="1"
Sequence ID mode, as implemented by MMseqs2. Options are:
0. alignment length
1. shorter sequence length
2. longer sequence length
vsearch_bin : str, optional

@@ -359,2 +396,8 @@ Path to a VSEARCH executable. If not provided, the VSEARCH binary bundled

threshold=threshold,
cluster_mode=cluster_mode,
cov_mode=cov_mode,
coverage=coverage,
alignment_mode=alignment_mode,
seq_id_mode=seq_id_mode,
threads=threads,
temp_dir=temp_dir,

@@ -371,2 +414,3 @@ mmseqs_bin=mmseqs_bin,

cdhit_bin=cdhit_bin,
threads=0 if threads is None else threads,
as_dict=True,

@@ -383,2 +427,3 @@ debug=debug,

strand=strand,
threads=threads,
as_dict=True,

@@ -409,2 +454,3 @@ debug=debug,

strand: str = "plus",
threads: Optional[int] = None,
as_dict: bool = False,

@@ -478,6 +524,7 @@ debug: bool = False,

if vsearch_bin is None:
mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
system = platform.system().lower()
machine = platform.machine().lower().replace("x86_64", "amd64")
vsearch_bin = os.path.join(mod_dir, f"bin/vsearch_{system}_{machine}")
vsearch_bin = get_binary_path("vsearch")
# mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# system = platform.system().lower()
# machine = platform.machine().lower().replace("x86_64", "amd64")
# vsearch_bin = os.path.join(mod_dir, f"bin/vsearch_{system}_{machine}")
# do clustering

@@ -492,2 +539,4 @@ vsearch_cmd = f"{vsearch_bin} --cluster_fast {fasta_file}"

vsearch_cmd += f" --strand {strand}"
if threads is not None:
vsearch_cmd += f" --threads {threads}"
p = sp.Popen(vsearch_cmd, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)

@@ -531,2 +580,8 @@ stdout, stderr = p.communicate()

threshold: float = 0.975,
cluster_mode: str = "2",
cov_mode: str = "0",
coverage: float = 0.8,
alignment_mode: str = "3",
seq_id_mode: str = "1",
threads: Optional[int] = None,
temp_dir: str = "/tmp",

@@ -549,2 +604,42 @@ mmseqs_bin: str = None,

cluster_mode : str, default="2"
Clustering mode. Options are ``"1"``, ``"2"``, or ``"3"``.
- ``"1"``: greedy set cover
- ``"2"``: connected component
- ``"3"``: greedy incremental (CD-HIT-like)
See the `MMseqs2 documentation <https://mmseqs.com/latest/userguide.html#clustering>`_
for more information.
cov_mode : str, default="0"
Coverage mode. Options are ``"0"``, ``"1"``, ``"2"``, or ``"3"``.
- ``"0"``: bidirectional
- ``"1"``: target coverage
- ``"2"``: query coverage
- ``"3"``: target-in-query length coverage
See the `MMseqs2 documentation <https://mmseqs.com/latest/userguide.html#clustering>`_
for more information.
coverage : float, default=0.8
Coverage threshold for clustering. Must be between 0 and 1.
alignment_mode : str, default="3"
Alignment mode. Options are ``"0"``, ``"1"``, ``"2"``, ``"3"``, or ``"4"``.
- ``"0"``: automatic
- ``"1"``: only score and end_pos
- ``"2"``: also start_pos and cov
- ``"3"``: also seq.id
- ``"4"``: only ungapped alignment
See the `MMseqs2 documentation <https://mmseqs.com/latest/userguide.html#clustering>`_
for more information.
seq_id_mode : str, default="1"
Sequence ID mode. Options are ``"0"``, ``"1"``, or ``"2"``.
- ``"0"``: alignment length
- ``"1"``: shorter sequence length
- ``"2"``: longer sequence length
threads : int, default=None
Number of threads to use for clustering. If not provided, the number of threads
will be determined by MMseqs2.
temp_dir : str, default="/tmp"

@@ -592,6 +687,7 @@ Path to a directory for temporary storage of clustering files.

if mmseqs_bin is None:
mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
system = platform.system().lower()
machine = platform.machine().lower().replace("x86_64", "amd64")
mmseqs_bin = os.path.join(mod_dir, f"bin/mmseqs_{system}_{machine}")
mmseqs_bin = get_binary_path("mmseqs")
# mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# system = platform.system().lower()
# machine = platform.machine().lower().replace("x86_64", "amd64")
# mmseqs_bin = os.path.join(mod_dir, f"bin/mmseqs_{system}_{machine}")
# build the mmseqs DB

@@ -609,2 +705,9 @@ db_cmd = f"{mmseqs_bin} createdb {fasta_file} {db_file}"

cluster_cmd += f" --min-seq-id {threshold}"
cluster_cmd += f" --cluster-mode {cluster_mode}"
cluster_cmd += f" --cov-mode {cov_mode}"
cluster_cmd += f" -c {coverage}"
cluster_cmd += f" --alignment-mode {alignment_mode}"
cluster_cmd += f" --seq-id-mode {seq_id_mode}"
if threads is not None:
cluster_cmd += f" --threads {threads}"
p = sp.Popen(cluster_cmd, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)

@@ -657,2 +760,3 @@ stdout, stderr = p.communicate()

cdhit_bin: str = None,
threads: int = 0,
as_dict: bool = False,

@@ -680,2 +784,6 @@ debug: bool = False,

threads : int, default=0
Number of threads to use for clustering. If not provided, all available CPU cores
will be used.
id_key : str, default=None

@@ -715,6 +823,7 @@ Key to retrieve the sequence ID. If not provided or missing, ``Sequence.id`` is used.

if cdhit_bin is None:
mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
system = platform.system().lower()
machine = platform.machine().lower().replace("x86_64", "amd64")
cdhit_bin = os.path.join(mod_dir, f"bin/cdhit_{system}_{machine}")
cdhit_bin = get_binary_path("cdhit")
# mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# system = platform.system().lower()
# machine = platform.machine().lower().replace("x86_64", "amd64")
# cdhit_bin = os.path.join(mod_dir, f"bin/cdhit_{system}_{machine}")
# run CD-HIT

@@ -729,4 +838,4 @@ # clustering options are as follows:

cluster_cmd = f"{cdhit_bin} -i {fasta_file} -o {output_file}"
cluster_cmd += f" -c {threshold} -n {wordsize}"
cluster_cmd += " -d 0 -l 4 -T 0 -M 0"
cluster_cmd += f" -c {threshold} -n {wordsize} -T {threads}"
cluster_cmd += " -d 0 -l 4 -M 0"
p = sp.Popen(cluster_cmd, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)

@@ -733,0 +842,0 @@ stdout, stderr = p.communicate()

@@ -44,2 +44,3 @@ #!/usr/bin/env python

from ..bin import get_path as get_binary_path
from ..core.sequence import Sequence, read_fasta, to_fasta

@@ -113,10 +114,11 @@ from ..tools.cluster import Cluster, cluster

if fasttree_bin is None:
mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# fasttree_bin = os.path.join(
# mod_dir, f"bin/fasttree_{platform.system().lower()}"
# )
system = platform.system().lower()
machine = platform.machine().lower()
fasttree_bin = os.path.join(mod_dir, f"bin/fasttree_{system}_{machine}")
fasttree_bin = fasttree_bin.replace("x86_64", "amd64")
fasttree_bin = get_binary_path("fasttree")
# mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# # fasttree_bin = os.path.join(
# # mod_dir, f"bin/fasttree_{platform.system().lower()}"
# # )
# system = platform.system().lower()
# machine = platform.machine().lower()
# fasttree_bin = os.path.join(mod_dir, f"bin/fasttree_{system}_{machine}")
# fasttree_bin = fasttree_bin.replace("x86_64", "amd64")
# make output directory if necessary

@@ -123,0 +125,0 @@ if tree_file is None:

@@ -33,6 +33,5 @@ #!/usr/bin/env python

from ..core.pair import Pair
from ..core.sequence import Sequence
from ..core.pair import Pair
__all__ = [

@@ -153,2 +152,28 @@ "RepertoireSimilarity",

def __iter__(self):
    """Yield each stored similarity result, in the order they were added."""
    yield from self.similarities
def __len__(self):
    # number of pairwise similarity results currently stored
    return len(self.similarities)
def __repr__(self):
    """Unambiguous summary showing the number of stored comparisons."""
    return f"<RepertoireSimilarities: {len(self)} comparisons>"
def __str__(self):
    """Human-readable summary; identical to ``__repr__``."""
    return f"<RepertoireSimilarities: {len(self)} comparisons>"
def __add__(self, other):
    """
    Support ``self + other`` where ``other`` is either a single
    ``RepertoireSimilarity`` or another ``RepertoireSimilarities``
    collection. Mutates and returns ``self``; raises ``TypeError``
    for any other operand type.
    """
    if isinstance(other, RepertoireSimilarity):
        self.add(other)
        return self
    if isinstance(other, RepertoireSimilarities):
        for sim in other:
            self.add(sim)
        return self
    raise TypeError(
        "RepertoireSimilarities can only be added to RepertoireSimilarity or RepertoireSimilarities objects."
    )
@property

@@ -166,3 +191,3 @@ def df(self):

@property
def means(self):
def means(self) -> np.array:
"""

@@ -174,3 +199,3 @@ Returns an array of mean similarity values, one for each pairwise comparison.

@property
def medians(self):
def medians(self) -> np.array:
"""

@@ -182,3 +207,3 @@ Returns an array of median similarity values, one for each pairwise comparison.

@property
def mins(self):
def mins(self) -> np.array:
"""

@@ -190,3 +215,3 @@ Returns an array of minimum similarity values, one for each pairwise comparison.

@property
def maxs(self):
def maxs(self) -> np.array:
"""

@@ -198,3 +223,3 @@ Returns an array of maximum similarity values, one for each pairwise comparison.

@property
def stds(self):
def stds(self) -> np.array:
"""

@@ -206,3 +231,3 @@ Returns an array of standard deviations, one for each pairwise comparison.

@property
def sems(self):
def sems(self) -> np.array:
"""

@@ -213,3 +238,7 @@ Returns an array of standard errors of the mean, one for each pairwise comparison.

def squareform(self, method=None, agg=np.mean):
def squareform(
self,
# method=None,
agg=np.mean,
) -> pd.DataFrame:
"""

@@ -233,4 +262,4 @@ Returns a squareform representation of the similarity data.

_df = self.df[(self.df["sample1"] == s1) & (self.df["sample2"] == s2)]
if method is not None:
_df = _df[_df["method"] == method]
# if method is not None:
# _df = _df[_df["method"] == method]
if _df.shape[0] == 0:

@@ -278,5 +307,10 @@ continue

A list of repertoires. Each repertoire can be a list of ``abutils.Sequence``
objects or ``abutils.Pair`` objects. Individual repertoires cannot mix ``Sequence``
and ``Pair`` objects, but the list of repertoires can contain both.
objects or a list of ``abutils.Pair`` objects.
.. note::
Individual repertoires cannot mix ``Sequence`` and ``Pair`` objects,
but the list of repertoires can contain both. For example, comparing a
repertoire of ``Sequence`` objects to a repertoire of ``Pair`` objects is
allowed.
names : list, optional

@@ -287,3 +321,11 @@ A list of names for the repertoires. If ``None``, names will be generated as

method : str, optional
The similarity method to use. Default is ``"morisita-horn"``.
The similarity method to use. Options are:
- ``"morisita-horn"``
- ``"kullback-leibler"``
- ``"jensen-shannon"``
- ``"jaccard"``
- ``"bray-curtis"``
- ``"renkonen"``
- ``"cosine"``
Default is ``"morisita-horn"``.

@@ -304,3 +346,4 @@ features : str, list, optional

pairs_only : bool, optional
Whether to use only paired sequences. Default is ``False``.
Whether to use only paired sequences. Requires that all repertoires contain ``Pair`` objects,
rather than ``Sequence`` objects. Default is ``False``.

@@ -310,2 +353,7 @@ chain : str, optional

.. note::
This applies to both ``Sequence`` and ``Pair`` objects. If a repertoire contains
``Sequence`` objects from multiple chains, only sequences of the specified chain will
be used for similarity calculation, and the rest will be ignored.
force_self_comparisons : bool, optional

@@ -355,3 +403,3 @@ Whether to force self-comparisons. Only used if exactly two repertoires are provided.

elif len(names) != len(repertoires):
err = f"\nThe number of names must match the number of repertoires.\n"
err = "\nThe number of names must match the number of repertoires.\n"
err += f"You provided {len(names)} names and {len(repertoires)} repertoires.\n"

@@ -358,0 +406,0 @@ raise RuntimeError(err)

@@ -31,14 +31,20 @@ #!/usr/bin/env python

import sys
from typing import Iterable, Optional
from typing import Iterable, Optional, Union
from . import log
if sys.version_info[0] > 2:
STR_TYPES = [
str,
]
# else:
# STR_TYPES = [str, unicode]
# for backward compatibility
def list_files(*args, **kwargs):
    """
    Backward-compatibility shim: the implementation now lives in ``..io``.

    The import is deferred to call time to avoid a circular import when
    this module is loaded.
    """
    from ..io import list_files as _list_files

    return _list_files(*args, **kwargs)
def make_dir(*args, **kwargs):
    """
    Backward-compatibility shim: the implementation now lives in ``..io``.

    The import is deferred to call time to avoid a circular import when
    this module is loaded.
    """
    from ..io import make_dir as _make_dir

    return _make_dir(*args, **kwargs)
def initialize(log_file, project_dir=None, debug=False):

@@ -82,58 +88,62 @@ """

def make_dir(directory: str) -> None:
"""
Makes a directory, if it doesn't already exist.
# def make_dir(directory: str) -> None:
# """
# Makes a directory, if it doesn't already exist.
Parameters
----------
directory : str
Path to a directory.
# Parameters
# ----------
# directory : str
# Path to a directory.
"""
if not os.path.exists(directory):
os.makedirs(os.path.abspath(directory))
# """
# if not os.path.exists(directory):
# os.makedirs(os.path.abspath(directory))
def list_files(directory: str, extension: Optional[str] = None) -> Iterable[str]:
"""
Lists files in a given directory.
# def list_files(
# directory: str, extension: Union[str, Iterable, None] = None
# ) -> Iterable[str]:
# """
# Lists files in a given directory.
Parameters
----------
directory : str
Path to a directory.
# Parameters
# ----------
# directory : str
# Path to a directory.
extension : str
If supplied, only files that contain the specificied extension will be returned.
Default is ``None``, which returns all files in the directory, regardless of extension.
# extension : str
# If supplied, only files that end with the specificied extension(s) will be returned. Can be either
# a string or a list of strings. Extension evaluation is case-insensitive and can match complex
# extensions (e.g. '.fastq.gz'). Default is ``None``, which returns all files in the directory,
# regardless of extension.
Returns
-------
Iterable[str]
# Returns
# -------
# Iterable[str]
"""
if os.path.isdir(directory):
expanded_dir = os.path.expanduser(directory)
files = sorted(glob.glob(expanded_dir + "/*"))
else:
files = [
directory,
]
if extension is not None:
if type(extension) in STR_TYPES:
extension = [
extension,
]
files = [
f
for f in files
if any(
[
f.split(".")[-1] in extension,
f.split(".")[-1].upper() in extension,
f.split(".")[-1].lower() in extension,
]
)
]
return files
# """
# if os.path.isdir(directory):
# expanded_dir = os.path.expanduser(directory)
# files = sorted(glob.glob(expanded_dir + "/*"))
# else:
# files = [
# directory,
# ]
# if extension is not None:
# if isinstance(extension, str):
# extension = [
# extension,
# ]
# files = [
# f
# for f in files
# if any(
# [
# any([f.lower().endswith(e.lower()) for e in extension]),
# any([f.endswith(e.upper()) for e in extension]),
# any([f.endswith(e.lower()) for e in extension]),
# ]
# )
# ]
# return files

@@ -140,0 +150,0 @@

@@ -6,2 +6,2 @@ # Store the version here so:

__version__ = "0.4.16"
__version__ = "0.4.17"
Metadata-Version: 2.1
Name: abutils
Version: 0.4.16
Version: 0.4.17
Summary: Utilities for analysis of adaptive immune receptor repertoire (AIRR) data

@@ -25,2 +25,3 @@ Home-page: https://github.com/briney/abutils

Requires-Dist: celery
Requires-Dist: dnachisel
Requires-Dist: ete3

@@ -27,0 +28,0 @@ Requires-Dist: fastcluster

@@ -5,2 +5,3 @@ abstar >= 0.6.3

celery
dnachisel
ete3

@@ -7,0 +8,0 @@ fastcluster

Sorry, the diff of this file is too big to display