bionumpy
Advanced tools
| from typing import List | ||
| import numpy as np | ||
| from bionumpy.encoded_array import Encoding, EncodedArray | ||
| from bionumpy.encodings.string_encodings import StringEncoding | ||
class BoolStringEncoding(Encoding):
    '''
    Encoding between boolean values and their string labels.

    Encoding produces a plain numpy bool array (``returns_raw``); decoding
    maps booleans back to the configured label strings.

    >>> from bionumpy.encodings.bool_encoding import bool_string
    >>> bool_string.encode(['True', 'False', 'True'])
    array([ True, False,  True])
    >>> bool_string.decode([False, False])
    encoded_ragged_array(['False',
                          'False'])
    '''
    # encode() returns a raw numpy array rather than an EncodedArray
    returns_raw = True

    def __init__(self, true_string: str = 'True', false_string: str = 'False'):
        '''
        Parameters
        ----------
        true_string: str
            Label that encodes to True.
        false_string: str
            Label that encodes to False.
        '''
        self._true_string = true_string
        self._false_string = false_string
        # Order matters: index 0 -> false label, index 1 -> true label, so the
        # raw string-encoded value doubles as the boolean value.
        # (Removed an unused `self._lookup` array that duplicated this table.)
        self._string_encoding = StringEncoding([false_string, true_string])

    def get_labels(self) -> List[str]:
        '''Return the two labels, false label first (index equals bool value).'''
        return [self._false_string, self._true_string]

    def encode(self, encoded_ragged_array) -> np.ndarray:
        '''Encode label strings into a plain boolean numpy array.'''
        s = self._string_encoding.encode(encoded_ragged_array)
        return s.raw().astype(bool)

    def decode(self, encoded_array):
        '''Decode booleans back into an encoded ragged array of label strings.'''
        a = EncodedArray(np.asanyarray(encoded_array).astype(int), self._string_encoding)
        return self._string_encoding.decode(a)


bool_string = BoolStringEncoding()
| import dataclasses | ||
| import os | ||
| import warnings | ||
| from pathlib import Path | ||
| from typing import Iterable, Union, List | ||
| import numpy as np | ||
| from npstructures import RaggedArray | ||
| from ..datatypes import BamEntry, Interval | ||
| from ..encoded_array import EncodedRaggedArray, EncodedArray, BaseEncoding | ||
| from ..arithmetics.intervals import fast_sort_intervals | ||
| from ..encodings import CigarOpEncoding | ||
def alignments_to_bam_entries(alignments: Iterable['pysam.AlignedSegment'], none_on_empty: bool = False) -> Union[
        BamEntry, None]:
    '''
    Convert an iterable of pysam alignments into a single BamEntry.

    Parameters
    ----------
    alignments: Iterable['pysam.AlignedSegment']
        The alignments to convert.
    none_on_empty: bool
        If True, return None instead of an empty BamEntry when no alignments
        are given. (Previously this flag was accepted but silently ignored.)

    Returns
    -------
    BamEntry or None
    '''
    cols = alignments_to_cols(alignments)
    # The accumulator holds one list entry per read; an empty reference_name
    # list means no alignments were accumulated.
    if none_on_empty and not len(cols.reference_name):
        return None
    return cols.to_entry()
@dataclasses.dataclass
class BamAccumulator:
    '''Collects per-read BAM fields into growing python lists.

    Each ``add_alignment`` call appends one read's fields; ``to_entry``
    converts the accumulated columns into a single BamEntry.
    '''
    reference_name: List[str]
    query_name: List[str]
    flag: List[int]
    reference_start: List[int]
    mapping_quality: List[int]
    cigar_ops_data: List[int]
    cigar_ops_lengths: List[int]
    cigar_len_data: List[int]
    cigar_len_lengths: List[int]
    seq_data: List[str]
    seq_lengths: List[int]
    qual_data: List[int]
    qual_lengths: List[int]

    def add_alignment(self, read: 'pysam.AlignedSegment'):
        '''Append all fields of one aligned read to the accumulator.'''
        self.reference_name.append(read.reference_name)
        self.query_name.append(read.query_name)
        self.flag.append(read.flag)
        self.reference_start.append(read.reference_start)
        self.mapping_quality.append(read.mapping_quality)
        # Split cigar (op, length) pairs into two flat ragged columns.
        ops, op_lengths = zip(*read.cigartuples)
        self.cigar_ops_data.extend(ops)
        self.cigar_ops_lengths.append(len(ops))
        self.cigar_len_data.extend(op_lengths)
        self.cigar_len_lengths.append(len(op_lengths))
        sequence = read.seq
        self.seq_data.append(sequence)
        self.seq_lengths.append(len(sequence))
        qualities = read.qual
        self.qual_data.append(qualities)
        self.qual_lengths.append(len(qualities))

    def to_entry(self):
        '''Convert the accumulated columns into a BamEntry (empty if nothing added).'''
        if not len(self.reference_name):
            return BamEntry.empty()
        # Sequences and qualities are stored as strings; view their utf-8
        # bytes as uint8 for the encoded/ragged representations.
        seq_data = np.frombuffer(''.join(self.seq_data).encode('utf-8'), dtype=np.uint8)
        qual_data = np.frombuffer(''.join(self.qual_data).encode('utf-8'), dtype=np.uint8)
        cigar_ops = EncodedRaggedArray(
            EncodedArray(np.array(self.cigar_ops_data), CigarOpEncoding),
            np.array(self.cigar_ops_lengths))
        cigar_lengths = RaggedArray(np.array(self.cigar_len_data),
                                    np.array(self.cigar_len_lengths))
        sequences = EncodedRaggedArray(EncodedArray(seq_data, BaseEncoding),
                                       np.array(self.seq_lengths))
        qualities = RaggedArray(qual_data, np.array(self.qual_lengths))
        return BamEntry(
            np.array(self.reference_name),
            np.array(self.query_name),
            np.array(self.flag),
            np.array(self.reference_start),
            np.array(self.mapping_quality),
            cigar_ops,
            cigar_lengths,
            sequences,
            qualities)
def alignments_to_cols(alignments, min_start=0, accumulator=None):
    '''Accumulate alignments into a BamAccumulator, skipping reads that
    start before ``min_start``.

    A fresh accumulator is created when none is supplied; the (possibly
    shared) accumulator is returned so calls can be chained.
    '''
    if accumulator is None:
        accumulator = BamAccumulator(*(list() for _ in range(13)))
    for read in alignments:
        if read.reference_start >= min_start:
            accumulator.add_alignment(read)
    return accumulator
class IndexedBamFile:
    '''
    A wrapper class around pysam to extract all reads that overlap with any interval in a set of intervals.
    Examples
    --------
    >>> import bionumpy as bnp
    >>> bn = bnp.open_indexed('example_data/ctcf_chr21-22.bam')
    >>> intervals = bnp.open('example_data/ctcf.bed.gz').read()
    >>> bn[intervals]
    BamEntry with 12649 entries
    chromosome name flag position mapq cigar_op cigar_length sequence quality
    chr21 SOLEXA-1GA-2:2:39:563:1 0 10403175 37 M [36] AGGCGGAGCCCTAGGGACAGGAG [96 97 96 97 97 96 96 9
    chr21 SOLEXA-1GA-2:2:51:1257: 0 10403191 37 M [36] ACAGGAGGAGGGGAGTTGCGCAC [96 97 97 96 96 96 97 9
    chr21 SOLEXA-1GA-2:2:90:233:6 16 13980514 37 M [36] ACACCCTCCCCTCGCCGCTGCAG [66 92 90 90 94 92 79 7
    chr21 SOLEXA-1GA-2:2:62:293:1 16 13980528 37 M [36] CCGCTGCAGTGTAGAAACCCAAT [89 95 93 93 96 94 97 9
    chr21 SOLEXA-1GA-1:1:49:718:1 0 13980531 37 M [36] CTGCAGTGTAGAAACCCAATAGC [97 97 97 98 97 97 96 9
    chr21 SOLEXA-1GA-2:2:57:1221: 16 13980533 37 M [36] GCAGTGTAGAAACCCAATAGCGT [97 97 97 95 93 96 93 9
    chr21 SOLEXA-1GA-1:1:57:1445: 16 13980536 37 M [36] GTGTAGAAACCCAATAGCGTCCC [96 92 94 96 93 97 93 9
    chr21 SOLEXA-1GA-2:2:64:1358: 0 14120164 37 M [36] ACCCTTAAAAGACCCAGATGTTG [97 98 96 97 97 98 97 9
    chr21 SOLEXA-1GA-1:1:63:383:1 0 14120199 37 M [36] ATGGAAGCAGCTTCATATCCAAG [97 97 95 97 98 97 97 9
    chr21 SOLEXA-1GA-1:1:111:87:1 0 14120203 37 M [36] AAGCAGCTTCATATCCAAGGGTG [97 97 95 97 98 97 98 9
    '''
    def __init__(self, filename: str, create_index=False):
        '''
        Open a BAM file with pysam, optionally building a .bai index first.

        Parameters
        ----------
        filename: str
            Path to the BAM file.
        create_index: bool
            If True, create 'filename.bai' with pysam.index unless it
            already exists on disk.
        '''
        # pysam is an optional dependency; fail early with a clear message.
        try:
            import pysam
        except ImportError:
            raise ImportError('Please install pysam to use IndexedBamFile')
        warnings.warn('Indexed bam files are experimental and may not work as expected, use at your own risk')
        if create_index:
            # 'x.bam' -> 'x.bam.bai'; only build the index if it is missing.
            index_filename= Path(filename).with_suffix('.bam.bai')
            if not os.path.isfile(index_filename):
                pysam.index(str(filename))
        self._samfile = pysam.AlignmentFile(filename, 'rb')

    def _fetch_from_sorted_intervals(self, intervals):
        '''Fetch reads for each interval, assuming intervals are sorted by
        (chromosome, start), and accumulate them into one BamEntry.

        Reads starting before the previous interval's stop are skipped via
        min_start — presumably to avoid returning the same read twice for
        overlapping intervals (NOTE(review): this also drops reads that start
        before the previous stop but overlap only the current interval —
        confirm this is intended).
        '''
        cur_chromosome, last_stop, last_start = None, None, None
        accumulator = BamAccumulator(*(list() for _ in range(13)))
        for interval in intervals:
            if interval.chromosome == cur_chromosome:
                # Within a chromosome the input must be sorted by start.
                assert interval.start >= last_start, f'Intervals must be sorted {(interval.start, last_start)}'
            else:
                # New chromosome: no previous interval to deduplicate against.
                last_stop = 0
            alignments_to_cols(self._samfile.fetch(str(interval.chromosome),
                                                   start=int(interval.start),
                                                   stop=int(interval.stop)),
                               min_start=last_stop, accumulator=accumulator)
            cur_chromosome, last_stop, last_start = interval.chromosome, interval.stop, interval.start
            continue  # no-op: end of loop body
        return accumulator.to_entry()

    def get_all_overlapping(self, intervals: Interval) -> BamEntry:
        '''
        Extract all reads that overlap with any interval in the input item.
        Parameters
        ----------
        intervals
            Intervals to query; sorted internally before fetching.
        Returns
        -------
        BamEntry with all overlapping reads.
        '''
        item = fast_sort_intervals(intervals)
        return self._fetch_from_sorted_intervals(item)

    def __getitem__(self, item: Interval):
        '''
        Extract all reads that overlap with any interval in the input item.
        Parameters
        ----------
        item
            Intervals to query.
        Returns
        -------
        BamEntry with all overlapping reads (same as get_all_overlapping).
        '''
        return self.get_all_overlapping(item)
| import pickle | ||
| import warnings | ||
| from typing import Iterator, Callable | ||
| import numpy as np | ||
| from bionumpy import EncodedRaggedArray, EncodedArray | ||
class MemMapEncodedRaggedArray:
    '''Create and load EncodedRaggedArrays backed by memory-mapped files.'''

    @classmethod
    def load(cls, basename: str) -> EncodedRaggedArray:
        '''
        Load a read only memory-mapped encoded ragged array.

        Parameters
        ----------
        basename: str
            The base name of the memory-mapped files.

        Returns
        -------
        EncodedRaggedArray
        '''
        # Note: first parameter renamed self -> cls (this is a classmethod).
        # dtypes must match exactly what create() wrote.
        data = np.memmap(f'{basename}_data.dat', dtype=np.uint8, mode='r')
        lengths = np.memmap(f'{basename}_lengths.dat', dtype=np.int32, mode='r')
        with open(f'{basename}_encoding.pkl', 'rb') as f:
            encoding = pickle.load(f)
        return EncodedRaggedArray(EncodedArray(data, encoding), lengths)

    @classmethod
    def create(cls, loader_creator: Callable[[], Iterator[EncodedRaggedArray]], basename) -> EncodedRaggedArray:
        '''
        Create a memory-mapped encoded ragged array.

        Takes in a callable that returns an iterator of EncodedRaggedArray objects.
        It goes through the iterator twice, first to calculate the total size of the data and lengths arrays,
        and then to write the data to disk.
        Returns an EncodedRaggedArray object where the data and lengths are memory-mapped.
        The basename provided is used to create the following files:
        - basename_data.dat
        - basename_lengths.dat
        - basename_encoding.pkl
        The same basename should be used to load the memory-mapped files later.

        Parameters
        ----------
        loader_creator: Callable[[], Iterator[EncodedRaggedArray]]
            A callable that returns an iterator of EncodedRaggedArray objects.
        basename: str
            Where to store the memory-mapped files.

        Returns
        -------
        EncodedRaggedArray

        Raises
        ------
        ValueError
            If loader_creator yields no chunks (nothing to write).
        AssertionError
            If chunks carry inconsistent encodings.
        '''
        warnings.warn(
            f"{cls.__name__} is in an experimental stage and may change in the future.",
            category=FutureWarning,
            stacklevel=2
        )
        # First pass: total sizes and the (single, shared) encoding.
        total_sequence_length = 0
        n_sequences = 0
        encoding = None
        for sequences in loader_creator():
            n_sequences += len(sequences)
            total_sequence_length += sequences.size
            if encoding is None:
                encoding = sequences.encoding
            else:
                assert encoding == sequences.encoding, f'Expected {encoding} but got {sequences.encoding}'
        if encoding is None:
            # Previously this pickled None and later failed with an obscure
            # "cannot mmap an empty file" error; fail fast instead.
            raise ValueError('loader_creator produced no chunks; nothing to memory-map')
        with open(f'{basename}_encoding.pkl', 'wb') as f:
            pickle.dump(encoding, f)
        data = np.memmap(f'{basename}_data.dat', dtype=np.uint8, mode='w+', shape=total_sequence_length)
        lengths = np.memmap(f'{basename}_lengths.dat', dtype=np.int32, mode='w+', shape=n_sequences)
        # Second pass: stream the raw bytes and per-sequence lengths to disk.
        data_offset = 0
        length_offset = 0
        for sequences in loader_creator():
            data[data_offset:data_offset + sequences.size] = sequences.raw().ravel()
            data_offset += sequences.size
            lengths[length_offset:length_offset + len(sequences)] = sequences.lengths
            length_offset += len(sequences)
        data.flush()
        lengths.flush()
        return EncodedRaggedArray(EncodedArray(data, encoding), lengths)
| import pytest | ||
| import bionumpy as bnp | ||
@pytest.fixture
def pysam_install():
    '''Skip the requesting test when the optional pysam dependency is missing.'''
    # importorskip is the idiomatic pytest replacement for try/except + skip.
    pytest.importorskip('pysam')
def test_indexed_bam(pysam_install, data_path):
    '''Reads overlapping the CTCF intervals are fetched from the indexed BAM.'''
    from bionumpy.io import open_indexed
    indexed_bam = open_indexed(data_path / 'ctcf_chr21-22.bam')
    intervals = bnp.open(data_path / 'ctcf.bed.gz').read()
    overlapping = indexed_bam[intervals]
    assert len(overlapping) == 12649
| import numpy as np | ||
| import pytest | ||
| import bionumpy as bnp | ||
| from bionumpy.streams.memory_mapping import MemMapEncodedRaggedArray as MemMap | ||
| from bionumpy.util.testing import assert_encoded_raggedarray_equal | ||
@pytest.fixture
def filename(data_path):
    # Path to the bundled big.fq.gz test file.
    return data_path / 'big.fq.gz'
@pytest.fixture
def basename(tmp_path, filename):
    # Base path (no extension) for the memory-mapped files, inside a tmp dir.
    return tmp_path / filename.stem
@pytest.fixture
def loader(filename):
    '''Factory that yields a fresh iterator of sequence chunks per call.'''
    def _make_iterator():
        return (chunk.sequence for chunk in bnp.open(filename))
    return _make_iterator
def test_create(basename, loader):
    '''MemMap.create must round-trip the concatenated input sequences.'''
    expected = np.concatenate(list(loader()))
    created = MemMap.create(loader, basename)
    assert_encoded_raggedarray_equal(expected, created)
def test_load(basename, loader):
    '''Sequences written by MemMap.create must be recovered by MemMap.load.'''
    expected = np.concatenate(list(loader()))
    MemMap.create(loader, basename)
    reloaded = MemMap.load(basename)
    assert_encoded_raggedarray_equal(expected, reloaded)
@pytest.mark.skip('Needs pooch to download the file')
def test_big_file(tmp_path):
    '''End-to-end check of MemMap.create on a large downloaded fastq file.'''
    import pooch
    url = 'https://github.com/bionumpy/bionumpy-example-data/raw/refs/heads/master/big.fq.gz'
    filename = pooch.retrieve(url, known_hash=None, path=tmp_path)
    basename = tmp_path / 'big'
    n_entries = bnp.count_entries(filename)

    def loader():
        return (chunk.sequence for chunk in bnp.open(filename))

    total_size = sum(chunk.size for chunk in loader())
    mem_mapped = MemMap.create(loader, basename)
    n_sequences = len(mem_mapped)
    assert n_sequences == n_entries
    assert total_size == mem_mapped.size
    # Slice through the whole array in fixed-size windows; every window
    # except possibly the last must be full.
    chunk_size = 32
    for start in range(0, n_sequences, chunk_size):
        window = mem_mapped[start:start + chunk_size]
        assert len(window) == min(chunk_size, n_sequences - start)
| Metadata-Version: 2.1 | ||
| Name: bionumpy | ||
| Version: 1.0.12 | ||
| Version: 1.0.13 | ||
| Summary: Library for working with biological sequence data as numpy arrays. | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/bionumpy/bionumpy |
@@ -42,2 +42,3 @@ HISTORY.rst | ||
| bionumpy/encodings/base_encoding.py | ||
| bionumpy/encodings/bool_encoding.py | ||
| bionumpy/encodings/exceptions.py | ||
@@ -74,2 +75,3 @@ bionumpy/encodings/integer_encoding.py | ||
| bionumpy/io/headers.py | ||
| bionumpy/io/indexed_bam.py | ||
| bionumpy/io/indexed_fasta.py | ||
@@ -123,2 +125,3 @@ bionumpy/io/indexed_files.py | ||
| bionumpy/streams/left_join.py | ||
| bionumpy/streams/memory_mapping.py | ||
| bionumpy/streams/multistream.py | ||
@@ -186,2 +189,3 @@ bionumpy/streams/reductions.py | ||
| tests/test_lazybnpdataclass.py | ||
| tests/test_memory_mapping.py | ||
| tests/test_minimizers.py | ||
@@ -223,2 +227,3 @@ tests/test_msa.py | ||
| tests/io_tests/test_file_extensions.py | ||
| tests/io_tests/test_indexed_bam.py | ||
| tests/io_tests/test_sam.py | ||
@@ -225,0 +230,0 @@ tests/io_tests/test_sub_delimiters.py |
@@ -5,3 +5,3 @@ """Top-level package for bionumpy.""" | ||
| __email__ = "knutdrand@gmail.com" | ||
| __version__ = '1.0.12' | ||
| __version__ = '1.0.13' | ||
@@ -8,0 +8,0 @@ import npstructures as nps |
@@ -10,6 +10,7 @@ from typing import List | ||
| from .bedgraph import BedGraph | ||
| from .. import streamable | ||
| from ..streams.decorators import streamable | ||
| from ..streams.grouped import chromosome_map | ||
| from ..datatypes import Interval | ||
| from ..bnpdataclass import bnpdataclass | ||
| from ..string_array import StringArray | ||
| from ..util import interleave | ||
@@ -252,3 +253,2 @@ from ..bnpdataclass import replace | ||
| return intervals[args] | ||
| if sort_order is not None: | ||
@@ -261,3 +261,13 @@ chromosome_key_function = {name: i for i, name in enumerate(sort_order)}.__getitem__ | ||
| def fast_sort_intervals(intervals: Interval) -> Interval: | ||
| if hasattr(intervals.chromosome, 'encoding') and isinstance(intervals.chromosome.encoding, StringEncoding): | ||
| args = np.lexsort((intervals.start, intervals.chromosome)) | ||
| return intervals[args] | ||
| if isinstance(intervals.chromosome, StringArray): | ||
| args = np.lexsort((intervals.start, intervals.chromosome.raw())) | ||
| return intervals[args] | ||
| assert False, 'Fast sort intervals only works with StringEncoding and StringArray' | ||
| @chromosome_map() | ||
@@ -264,0 +274,0 @@ def merge_intervals(intervals: Interval, distance: int = 0) -> Interval: |
@@ -344,55 +344,68 @@ import dataclasses | ||
| pre_val = getattr(obj, field.name) | ||
| numeric_types = (int, float, bool) | ||
| optional_numeric_types = tuple(Optional[t] for t in numeric_types) | ||
| if field.type == Union[BNPDataClass, str]: | ||
| if isinstance(pre_val, | ||
| (str, list, EncodedArray, EncodedRaggedArray, RaggedArray, np.ndarray)) or \ | ||
| hasattr(pre_val, 'to_numpy'): | ||
| val = as_encoded_array(pre_val) | ||
| elif True or isinstance(pre_val, BNPDataClass): | ||
| val = pre_val | ||
| else: | ||
| assert False, (field.type, type(pre_val)) | ||
| try: | ||
| val = cls.__convert_single_field(field, pre_val) | ||
| except Exception as e: | ||
| raise ValueError(f"Error when converting {field.name} to {field.type} with value {pre_val}") from e | ||
| elif field.type in numeric_types + optional_numeric_types: | ||
| val = np.asanyarray(pre_val) | ||
| elif field.type == str: | ||
| assert isinstance(pre_val, ( | ||
| setattr(obj, field.name, val) | ||
| @classmethod | ||
| def __convert_single_field(cls, field, pre_val): | ||
| numeric_types = (int, float, bool) | ||
| optional_numeric_types = tuple(Optional[t] for t in numeric_types) | ||
| if field.type == Union[BNPDataClass, str]: | ||
| if isinstance(pre_val, | ||
| (str, list, EncodedArray, EncodedRaggedArray, RaggedArray, np.ndarray)) or \ | ||
| hasattr(pre_val, 'to_numpy'): | ||
| val = as_encoded_array(pre_val) | ||
| elif True or isinstance(pre_val, BNPDataClass): | ||
| val = pre_val | ||
| else: | ||
| assert False, (field.type, type(pre_val)) | ||
| elif field.type in numeric_types + optional_numeric_types: | ||
| val = np.asanyarray(pre_val) | ||
| elif field.type == str: | ||
| assert isinstance(pre_val, ( | ||
| str, list, EncodedArray, EncodedRaggedArray, RaggedArray, np.ndarray)) or hasattr(pre_val, | ||
| 'to_numpy'), ( | ||
| field, pre_val, type(pre_val)) | ||
| val = as_encoded_array(pre_val) | ||
| elif field.type == SequenceID or field.type == List[str]: | ||
| if isinstance(pre_val, EncodedArray): | ||
| val = pre_val | ||
| else: | ||
| val = as_string_array(pre_val) | ||
| elif is_subclass_or_instance(field.type, Encoding): | ||
| if is_subclass_or_instance(field.type, NumericEncoding): | ||
| assert isinstance(pre_val, | ||
| (str, list, EncodedArray, EncodedRaggedArray, RaggedArray, np.ndarray)), \ | ||
| (field, pre_val, type(pre_val)) | ||
| else: | ||
| assert isinstance(pre_val, (str, list, EncodedArray, EncodedRaggedArray)) or hasattr(pre_val, 'to_numpy'), (field, pre_val) | ||
| # must do as_encoded and not explicit encode as pre_val might already | ||
| # be encoded | ||
| val = as_encoded_array(pre_val) | ||
| elif field.type == SequenceID or field.type == List[str]: | ||
| if isinstance(pre_val, EncodedArray): | ||
| val = pre_val | ||
| else: | ||
| val = as_string_array(pre_val) | ||
| elif is_subclass_or_instance(field.type, Encoding): | ||
| if is_subclass_or_instance(field.type, NumericEncoding): | ||
| assert isinstance(pre_val, | ||
| (str, list, EncodedArray, EncodedRaggedArray, RaggedArray, np.ndarray)), \ | ||
| (field, pre_val, type(pre_val)) | ||
| val = as_encoded_array(pre_val, field.type) | ||
| if isinstance(field.type, FlatAlphabetEncoding): | ||
| val = val.ravel() | ||
| elif field.type == List[int] or field.type == List[bool] or field.type == List[float]: | ||
| if not isinstance(pre_val, RaggedArray): | ||
| try: | ||
| val = RaggedArray(pre_val) | ||
| except TypeError as e: | ||
| val = np.asanyarray(pre_val) | ||
| else: | ||
| val = pre_val | ||
| elif inspect.isclass(field.type) and issubclass(field.type, BNPDataClass): | ||
| # assert isinstance(pre_val, (field.type, field.type._single_entry)), (field.type, type(pre_val)) | ||
| elif getattr(field.type, 'returns_raw', False) and isinstance(pre_val, (np.ndarray, np.generic)): | ||
| val = pre_val | ||
| else: | ||
| assert False, field.type | ||
| assert isinstance(pre_val, (str, list, EncodedArray, EncodedRaggedArray, bool)) or hasattr(pre_val, | ||
| 'to_numpy'), ( | ||
| field, pre_val, type(pre_val), isinstance(pre_val, np.generic)) | ||
| val = as_encoded_array(pre_val, field.type) | ||
| # must do as_encoded and not explicit encode as pre_val might already | ||
| # be encoded | ||
| if isinstance(field.type, FlatAlphabetEncoding): | ||
| val = val.ravel() | ||
| elif field.type == List[int] or field.type == List[bool] or field.type == List[float]: | ||
| if not isinstance(pre_val, RaggedArray): | ||
| try: | ||
| val = RaggedArray(pre_val) | ||
| except TypeError as e: | ||
| val = np.asanyarray(pre_val) | ||
| else: | ||
| val = pre_val | ||
| elif inspect.isclass(field.type) and issubclass(field.type, BNPDataClass): | ||
| # assert isinstance(pre_val, (field.type, field.type._single_entry)), (field.type, type(pre_val)) | ||
| val = pre_val | ||
| else: | ||
| assert False, field.type | ||
| return val | ||
| setattr(obj, field.name, val) | ||
| NewClass.__name__ = base_class.__name__ | ||
@@ -399,0 +412,0 @@ NewClass.__qualname__ = base_class.__qualname__ |
@@ -28,3 +28,4 @@ from functools import lru_cache | ||
| self._ends = ends | ||
| self._chromosome_names = as_encoded_array([h[0] for h in header_data]) | ||
| self._chromosome_names = np.array([h[0] for h in header_data]) | ||
| # self._chromosome_names = as_encoded_array([h[0] for h in header_data]) | ||
| self._header_data = header_data | ||
@@ -31,0 +32,0 @@ self._functions = [self._get_chromosome, |
@@ -7,2 +7,3 @@ import numpy as np | ||
| change_encoding, as_encoded_array | ||
| from ..encodings.bool_encoding import bool_string | ||
| from ..encodings.string_encodings import StringEncoding | ||
@@ -52,2 +53,3 @@ from npstructures import RaggedArray | ||
| str: str_func, # lambda x: x, | ||
| bool_string: lambda x: bool_string.decode(x), | ||
| SequenceID: seq_id_func, | ||
@@ -60,3 +62,3 @@ List[int]: int_lists_to_strings, | ||
| } | ||
| if is_subclass_or_instance(datatype, Encoding): | ||
| if is_subclass_or_instance(datatype, Encoding) and not datatype==bool_string: | ||
| encoding = datatype | ||
@@ -63,0 +65,0 @@ |
@@ -214,4 +214,4 @@ from pathlib import PurePath | ||
| is_gzip = suffix in (".gz", ".bam") | ||
| if suffix == '.bam': | ||
| reader = NumpyBamReader | ||
| #if suffix == '.bam': | ||
| # reader = NumpyBamReader | ||
| if suffix == ".gz": | ||
@@ -218,0 +218,0 @@ suffix = path.suffixes[-2] |
| from pathlib import PurePath | ||
| import os | ||
| from .indexed_bam import IndexedBamFile | ||
| from .indexed_fasta import IndexedFasta, create_index | ||
@@ -48,6 +50,11 @@ from .files import bnp_open | ||
| index_file_name = path.with_suffix(path.suffix + ".fai") | ||
| assert suffix in (".fa", ".fasta"), "Only fasta supported for indexed read" | ||
| if not os.path.isfile(index_file_name): | ||
| index = create_index(path) | ||
| bnp_open(index_file_name, "w", buffer_type=IndexBuffer).write(index) | ||
| return IndexedFasta(filename) | ||
| if suffix in (".fa", ".fasta"): | ||
| if not os.path.isfile(index_file_name): | ||
| index = create_index(path) | ||
| bnp_open(index_file_name, "w", buffer_type=IndexBuffer).write(index) | ||
| return IndexedFasta(filename) | ||
| elif suffix == '.bam': | ||
| return IndexedBamFile(filename, create_index=True) | ||
| else: | ||
| raise ValueError(f"Unknown file type {suffix} for indexed read. Only .fa, .fasta and .bam are supported.") |
@@ -25,3 +25,15 @@ import numpy as np | ||
| def read_csv_motif(filename): | ||
| def read_csv_motif(filename: str) -> PWM: | ||
| ''' | ||
| Read a PWM from a CSV file. The first line should be the alphabet, and the rest should be the matrix with probabilities. | ||
| Parameters | ||
| ---------- | ||
| filename | ||
| Returns | ||
| ------- | ||
| PWM | ||
| ''' | ||
| f = open(filename) | ||
@@ -31,5 +43,5 @@ alphabet = f.readline().strip().split(",") | ||
| for line in f: | ||
| line = line.strip().split(",") | ||
| parts = line.strip().split(",") | ||
| for i, letter in enumerate(alphabet): | ||
| pwm[letter].append(float(line[i])) | ||
| return PWM.from_dict(pwm) | ||
| pwm[letter].append(float(parts[i])) | ||
| return PWM.from_dict(pwm) |
@@ -84,3 +84,4 @@ import numpy as np | ||
| # use the faster _get_dna_kmers | ||
| return _get_dna_kmers(sequence, k) | ||
| result = _get_dna_kmers(sequence, k) | ||
| return result | ||
@@ -98,3 +99,4 @@ return KmerEncoder(k, sequence.encoding).rolling_window(sequence) | ||
| out = EncodedRaggedArray(convoluted, shape, safe_mode=False) # FIXME: Should not use unsafe here | ||
| if window_size == 1: | ||
| return out | ||
| return out[..., : (-window_size + 1)] | ||
@@ -101,0 +103,0 @@ |
@@ -103,3 +103,3 @@ import numpy as np | ||
| @classmethod | ||
| def from_dict(cls, dictionary: Dict[str, ArrayLike], background: Dict[str, float] = None) -> "PWM": | ||
| def from_dict(cls, dictionary: Dict[str, typing.Union[ArrayLike, typing.List]], background: Dict[str, float] = None) -> "PWM": | ||
| """Create a PWM object from a dict of letters to position probabilities | ||
@@ -106,0 +106,0 @@ |
@@ -63,2 +63,4 @@ from abc import abstractmethod | ||
| out = as_strided(convoluted, shape) | ||
| if window_size == 1: | ||
| return out | ||
| if mode == "valid": | ||
@@ -65,0 +67,0 @@ return out[..., : (-window_size + 1)] |
@@ -101,3 +101,3 @@ import numpy as np | ||
| return value.raw() | ||
| elif isinstance(value, EncodedArray): | ||
| elif isinstance(value, (EncodedArray, EncodedRaggedArray)): | ||
| print(value) | ||
@@ -104,0 +104,0 @@ return string_array(value) |
+1
-1
| Metadata-Version: 2.1 | ||
| Name: bionumpy | ||
| Version: 1.0.12 | ||
| Version: 1.0.13 | ||
| Summary: Library for working with biological sequence data as numpy arrays. | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/bionumpy/bionumpy |
+1
-1
@@ -49,3 +49,3 @@ #!/usr/bin/env python | ||
| url='https://github.com/bionumpy/bionumpy', | ||
| version='1.0.12', | ||
| version='1.0.13', | ||
| zip_safe=False, | ||
@@ -52,0 +52,0 @@ extras_require={'full': ['isal']} |
| import dataclasses | ||
| import pytest | ||
| import numpy as np | ||
| from bionumpy import AminoAcidEncoding, DNAEncoding | ||
| from bionumpy import AminoAcidEncoding, DNAEncoding, EncodedArray, BaseEncoding | ||
| from bionumpy.bnpdataclass import bnpdataclass | ||
@@ -9,2 +9,5 @@ from bionumpy.bnpdataclass.bnpdataclass import make_dataclass, BNPDataClass, dynamic_concatenate | ||
| from numpy.testing import assert_equal | ||
| from bionumpy.datatypes import SequenceID | ||
| from bionumpy.encodings.bool_encoding import bool_string | ||
| from bionumpy.util.testing import assert_bnpdataclass_equal | ||
@@ -145,1 +148,18 @@ # import pandas as pd | ||
| @pytest.fixture() | ||
| def bool_class(): | ||
| @bnpdataclass | ||
| class BNPDC: | ||
| sequence_id: SequenceID | ||
| test_field: bool_string | ||
| return BNPDC | ||
| def test_bool_class(bool_class): | ||
| obj = bool_class(sequence_id=['hei', 'ja'], | ||
| test_field=['True', 'False']) | ||
| from bionumpy.io.delimited_buffers import DelimitedBuffer | ||
| buffer = DelimitedBuffer.from_data(obj) | ||
@@ -13,7 +13,8 @@ import numpy as np | ||
| from bionumpy.datatypes import SequenceEntryWithQuality | ||
| from bionumpy.encodings import DigitEncoding, QualityEncoding, CigarEncoding, DigitEncodingFactory, DNAEncoding, ACGTnEncoding | ||
| from bionumpy.encodings import DigitEncoding, QualityEncoding, CigarEncoding, DigitEncodingFactory, DNAEncoding, \ | ||
| ACGTnEncoding | ||
| from bionumpy.encoded_array import NumericEncoding, OneToOneEncoding, BaseEncoding | ||
| #from bionumpy.encoded_array import OneToOneEncoding | ||
| # from bionumpy.encoded_array import OneToOneEncoding | ||
@@ -84,5 +85,5 @@ @pytest.fixture | ||
| ["1234", | ||
| ["1234", "5678"], | ||
| np.array([1, 2, 3, 4]), | ||
| RaggedArray([[1, 2, 3], [4]])]) | ||
| ["1234", "5678"], | ||
| np.array([1, 2, 3, 4]), | ||
| RaggedArray([[1, 2, 3], [4]])]) | ||
| def test_digit_encoding(data): | ||
@@ -166,1 +167,11 @@ encoding = DigitEncoding | ||
| assert_raggedarray_equal(encoded, encoded2) | ||
| def test_bool_encoding(): | ||
| from bionumpy.encodings.bool_encoding import BoolStringEncoding, bool_string | ||
| encoding = bool_string | ||
| data = ["True", "False", "True"] | ||
| encoded = encoding.encode(data) | ||
| decoded = encoding.decode(encoded) | ||
| assert np.all(decoded == data) | ||
| print(as_encoded_array(data, encoding)) |
@@ -59,2 +59,7 @@ import pytest | ||
| @pytest.mark.parametrize("encoding", [bnp.DNAEncoding, bnp.AminoAcidEncoding]) | ||
| def test_get_kmers_one(encoding): | ||
| sequence = bionumpy.encoded_array.as_encoded_array(["ACTG"], encoding) | ||
| kmers = bnp.sequence.get_kmers(sequence, 1) | ||
| assert len(kmers[0]) == 4, kmers[0] | ||
@@ -61,0 +66,0 @@ |
@@ -117,3 +117,3 @@ import pytest | ||
| bam2 = bnp.open(data_path / "test.bam").read() | ||
| assert np.all(str_equal(bam.chromosome, bam2.chromosome)) | ||
| assert np.all(bam.chromosome==bam2.chromosome) | ||
@@ -120,0 +120,0 @@ |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
723013
2.6%233
2.19%16358
2.31%