Latest Threat Research:SANDWORM_MODE: Shai-Hulud-Style npm Worm Hijacks CI Workflows and Poisons AI Toolchains.Details
Socket
Book a DemoInstallSign in
Socket

bionumpy

Package Overview
Dependencies
Maintainers
1
Versions
48
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

bionumpy - npm Package Compare versions

Comparing version
1.0.12
to
1.0.13
+36
bionumpy/encodings/bool_encoding.py
from typing import List
import numpy as np
from bionumpy.encoded_array import Encoding, EncodedArray
from bionumpy.encodings.string_encodings import StringEncoding
class BoolStringEncoding(Encoding):
    '''
    >>> from bionumpy.encodings.bool_encoding import bool_string
    >>> bool_string.encode(['True', 'False', 'True'])
    array([ True, False, True])
    >>> bool_string.decode([False, False])
    encoded_ragged_array(['False',
    'False'])
    '''
    # Encoded values are plain boolean numpy arrays, not EncodedArrays.
    returns_raw = True

    def __init__(self, true_string: str = 'True', false_string: str = 'False'):
        # Label at index 0 is the false string, index 1 the true string, so the
        # raw string-encoding codes coincide with the boolean values.
        self._true_string = true_string
        self._false_string = false_string
        self._string_encoding = StringEncoding([false_string, true_string])
        self._lookup = np.array([false_string, true_string])

    def get_labels(self) -> List[str]:
        """Return the two string labels, false label first."""
        return [self._false_string, self._true_string]

    def encode(self, encoded_ragged_array):
        """Map label strings to a boolean numpy array."""
        return self._string_encoding.encode(encoded_ragged_array).raw().astype(bool)

    def decode(self, encoded_array):
        """Map booleans (or 0/1 values) back to their string labels."""
        codes = np.asanyarray(encoded_array).astype(int)
        return self._string_encoding.decode(EncodedArray(codes, self._string_encoding))


# Module-level default instance using the literal labels 'True'/'False'.
bool_string = BoolStringEncoding()
import dataclasses
import os
import warnings
from pathlib import Path
from typing import Iterable, Union, List
import numpy as np
from npstructures import RaggedArray
from ..datatypes import BamEntry, Interval
from ..encoded_array import EncodedRaggedArray, EncodedArray, BaseEncoding
from ..arithmetics.intervals import fast_sort_intervals
from ..encodings import CigarOpEncoding
def alignments_to_bam_entries(alignments: Iterable['pysam.AlignedSegment'], none_on_empty=False) -> Union[
    BamEntry, None]:
    """Convert an iterable of pysam alignments into a single BamEntry.

    Parameters
    ----------
    alignments : Iterable[pysam.AlignedSegment]
        The alignments to accumulate.
    none_on_empty : bool
        If True, return None when no alignments were accumulated instead of an
        empty BamEntry.

    Returns
    -------
    BamEntry or None
    """
    cols = alignments_to_cols(alignments)
    # Fix: this flag was previously accepted but ignored; honor it here.
    if none_on_empty and not len(cols.reference_name):
        return None
    return cols.to_entry()
@dataclasses.dataclass
class BamAccumulator:
    """Column-wise accumulator used to build a BamEntry from pysam alignments.

    Each field is a growing list holding one column of the final entry.  The
    ``*_data`` / ``*_lengths`` pairs hold flattened ragged data together with
    the per-read lengths needed to rebuild the ragged structure.
    """
    reference_name: List[str]
    query_name: List[str]
    flag: List[int]
    reference_start: List[int]
    mapping_quality: List[int]
    cigar_ops_data: List[int]      # flattened cigar operation codes
    cigar_ops_lengths: List[int]   # number of cigar ops per read
    cigar_len_data: List[int]      # flattened cigar operation lengths
    cigar_len_lengths: List[int]   # number of cigar lengths per read
    seq_data: List[str]            # one sequence string per read
    seq_lengths: List[int]         # length of each sequence
    qual_data: List[str]           # one quality string per read (read.qual is a str)
    qual_lengths: List[int]        # length of each quality string

    def add_alignment(self, read: 'pysam.AlignedSegment') -> None:
        """Append the columns of a single pysam alignment to the accumulator."""
        accumulator = self
        accumulator.reference_name.append(read.reference_name)
        accumulator.query_name.append(read.query_name)
        accumulator.flag.append(read.flag)
        accumulator.reference_start.append(read.reference_start)
        accumulator.mapping_quality.append(read.mapping_quality)
        # NOTE(review): zip(*...) assumes read.cigartuples is non-empty — this
        # raises for unmapped reads with no cigar; confirm they are filtered upstream.
        cigar_ops, cigar_lengths = zip(*read.cigartuples)
        accumulator.cigar_ops_data.extend(cigar_ops)
        accumulator.cigar_ops_lengths.append(len(cigar_ops))
        accumulator.cigar_len_data.extend(cigar_lengths)
        accumulator.cigar_len_lengths.append(len(cigar_lengths))
        seq = read.seq
        accumulator.seq_data.append(seq)
        accumulator.seq_lengths.append(len(seq))
        qual = read.qual
        accumulator.qual_data.append(qual)
        accumulator.qual_lengths.append(len(qual))

    def to_entry(self):
        """Materialize the accumulated columns as a BamEntry.

        Returns BamEntry.empty() when nothing has been accumulated.
        """
        if not len(self.reference_name):
            return BamEntry.empty()
        # Join all sequence/quality strings and view the bytes as uint8 so they
        # can back ragged arrays without copying per read.
        seq_data = np.frombuffer(bytes(''.join(self.seq_data), 'utf-8'), dtype=np.uint8)
        qual_data = np.frombuffer(bytes(''.join(self.qual_data), 'utf-8'), dtype=np.uint8)
        return BamEntry(
            np.array(self.reference_name),
            np.array(self.query_name),
            np.array(self.flag),
            np.array(self.reference_start),
            np.array(self.mapping_quality),
            EncodedRaggedArray(EncodedArray(
                np.array(self.cigar_ops_data), CigarOpEncoding),
                np.array(self.cigar_ops_lengths)),
            RaggedArray(np.array(self.cigar_len_data), np.array(self.cigar_len_lengths)),
            EncodedRaggedArray(EncodedArray(seq_data, BaseEncoding),
                               np.array(self.seq_lengths)),
            RaggedArray(qual_data, np.array(self.qual_lengths)))
def alignments_to_cols(alignments, min_start=0, accumulator=None):
    """Accumulate pysam alignments into a BamAccumulator.

    Parameters
    ----------
    alignments : iterable of pysam.AlignedSegment
        Alignments to accumulate.
    min_start : int
        Reads with reference_start below this position are skipped (used by
        callers to avoid collecting the same read twice from overlapping fetches).
    accumulator : BamAccumulator, optional
        An existing accumulator to extend; a fresh empty one is created if omitted.

    Returns
    -------
    BamAccumulator
    """
    if accumulator is None:
        # Derive the field count from the dataclass instead of hard-coding 13,
        # and use an explicit None-check rather than relying on truthiness.
        accumulator = BamAccumulator(*(list() for _ in dataclasses.fields(BamAccumulator)))
    for read in alignments:
        if read.reference_start < min_start:
            continue
        accumulator.add_alignment(read)
    return accumulator
class IndexedBamFile:
    '''
    A wrapper class around pysam to extract all reads that overlap with any interval in a set of intervals.
    Examples
    --------
    >>> import bionumpy as bnp
    >>> bn = bnp.open_indexed('example_data/ctcf_chr21-22.bam')
    >>> intervals = bnp.open('example_data/ctcf.bed.gz').read()
    >>> bn[intervals]
    BamEntry with 12649 entries
    chromosome name flag position mapq cigar_op cigar_length sequence quality
    chr21 SOLEXA-1GA-2:2:39:563:1 0 10403175 37 M [36] AGGCGGAGCCCTAGGGACAGGAG [96 97 96 97 97 96 96 9
    chr21 SOLEXA-1GA-2:2:51:1257: 0 10403191 37 M [36] ACAGGAGGAGGGGAGTTGCGCAC [96 97 97 96 96 96 97 9
    chr21 SOLEXA-1GA-2:2:90:233:6 16 13980514 37 M [36] ACACCCTCCCCTCGCCGCTGCAG [66 92 90 90 94 92 79 7
    chr21 SOLEXA-1GA-2:2:62:293:1 16 13980528 37 M [36] CCGCTGCAGTGTAGAAACCCAAT [89 95 93 93 96 94 97 9
    chr21 SOLEXA-1GA-1:1:49:718:1 0 13980531 37 M [36] CTGCAGTGTAGAAACCCAATAGC [97 97 97 98 97 97 96 9
    chr21 SOLEXA-1GA-2:2:57:1221: 16 13980533 37 M [36] GCAGTGTAGAAACCCAATAGCGT [97 97 97 95 93 96 93 9
    chr21 SOLEXA-1GA-1:1:57:1445: 16 13980536 37 M [36] GTGTAGAAACCCAATAGCGTCCC [96 92 94 96 93 97 93 9
    chr21 SOLEXA-1GA-2:2:64:1358: 0 14120164 37 M [36] ACCCTTAAAAGACCCAGATGTTG [97 98 96 97 97 98 97 9
    chr21 SOLEXA-1GA-1:1:63:383:1 0 14120199 37 M [36] ATGGAAGCAGCTTCATATCCAAG [97 97 95 97 98 97 97 9
    chr21 SOLEXA-1GA-1:1:111:87:1 0 14120203 37 M [36] AAGCAGCTTCATATCCAAGGGTG [97 97 95 97 98 97 98 9
    '''

    def __init__(self, filename: str, create_index=False):
        """Open an indexed bam file, optionally creating the .bai index.

        Parameters
        ----------
        filename : str
            Path to a coordinate-sorted bam file.
        create_index : bool
            If True and no index file is present, create one with pysam.index.

        Raises
        ------
        ImportError
            If pysam is not installed.
        """
        try:
            import pysam
        except ImportError:
            raise ImportError('Please install pysam to use IndexedBamFile')
        warnings.warn('Indexed bam files are experimental and may not work as expected, use at your own risk')
        if create_index:
            # 'x.bam' -> 'x.bam.bai', matching the index name pysam.index produces.
            index_filename = Path(filename).with_suffix('.bam.bai')
            if not os.path.isfile(index_filename):
                pysam.index(str(filename))
        self._samfile = pysam.AlignmentFile(filename, 'rb')

    def _fetch_from_sorted_intervals(self, intervals):
        """Collect all reads overlapping the sorted intervals, deduplicated.

        Reads that overlap two fetched intervals on the same chromosome would
        be returned twice by pysam; they are deduplicated by skipping reads
        that start before the previous interval's stop (min_start).
        """
        cur_chromosome, last_stop, last_start = None, None, None
        # Field count derived from the dataclass instead of a hard-coded 13.
        accumulator = BamAccumulator(*(list() for _ in dataclasses.fields(BamAccumulator)))
        for interval in intervals:
            if interval.chromosome == cur_chromosome:
                assert interval.start >= last_start, f'Intervals must be sorted {(interval.start, last_start)}'
            else:
                # New chromosome: no previously-fetched region to deduplicate against.
                last_stop = 0
            alignments_to_cols(self._samfile.fetch(str(interval.chromosome),
                                                   start=int(interval.start),
                                                   stop=int(interval.stop)),
                               min_start=last_stop, accumulator=accumulator)
            cur_chromosome, last_stop, last_start = interval.chromosome, interval.stop, interval.start
            # (removed a dead `continue` that was the last statement of the loop)
        return accumulator.to_entry()

    def get_all_overlapping(self, intervals: Interval) -> BamEntry:
        '''
        Extract all reads that overlap with any interval in the input item.
        Parameters
        ----------
        intervals
        Returns
        -------
        '''
        item = fast_sort_intervals(intervals)
        return self._fetch_from_sorted_intervals(item)

    def __getitem__(self, item: Interval):
        '''
        Extract all reads that overlap with any interval in the input item.
        Parameters
        ----------
        item
        Returns
        -------
        '''
        return self.get_all_overlapping(item)
import pickle
import warnings
from typing import Iterator, Callable
import numpy as np
from bionumpy import EncodedRaggedArray, EncodedArray
class MemMapEncodedRaggedArray:
    """Persist an EncodedRaggedArray as raw memory-mapped files on disk."""

    @classmethod
    def load(cls, basename: str) -> EncodedRaggedArray:
        '''
        Load a read only memory-mapped encoded ragged array.

        Parameters
        ----------
        basename: str
            The base name of the memory-mapped files.

        Returns
        -------
        EncodedRaggedArray
        '''
        # Fix: first parameter of this classmethod was misnamed `self`.
        data = np.memmap(f'{basename}_data.dat', dtype=np.uint8, mode='r')
        lengths = np.memmap(f'{basename}_lengths.dat', dtype=np.int32, mode='r')
        # SECURITY: pickle.load executes arbitrary code on untrusted input;
        # only load encoding files produced by `create` on a trusted machine.
        with open(f'{basename}_encoding.pkl', 'rb') as f:
            encoding = pickle.load(f)
        return EncodedRaggedArray(EncodedArray(data, encoding), lengths)

    @classmethod
    def create(cls, loader_creator: Callable[[], Iterator[EncodedRaggedArray]], basename) -> EncodedRaggedArray:
        '''
        Create a memory-mapped encoded ragged array.

        Takes in a callable that returns an iterator of EncodedRaggedArray objects.
        It goes through the iterator twice, first to calculate the total size of the data and lengths arrays,
        and then to write the data to disk.
        Returns an EncodedRaggedArray object where the data and lengths are memory-mapped.
        The basename provided is used to create the following files:
        - basename_data.dat
        - basename_lengths.dat
        - basename_encoding.pkl
        The same basename should be used to load the memory-mapped files later.

        Parameters
        ----------
        loader_creator: Callable[[], Iterator[EncodedRaggedArray]]
            A callable that returns an iterator of EncodedRaggedArray objects.
        basename: str
            Where to store the memory-mapped files.

        Returns
        -------
        EncodedRaggedArray

        Raises
        ------
        ValueError
            If the loader yields no sequences (an empty memmap cannot be created).
        '''
        warnings.warn(
            f"{cls.__name__} is in an experimental stage and may change in the future.",
            category=FutureWarning,
            stacklevel=2
        )
        # First pass: total sizes and encoding consistency check.
        total_sequence_length = 0
        n_sequences = 0
        encoding = None
        for sequences in loader_creator():
            n_sequences += len(sequences)
            total_sequence_length += sequences.size
            if encoding is None:
                encoding = sequences.encoding
            else:
                assert encoding == sequences.encoding, f'Expected {encoding} but got {sequences.encoding}'
        if n_sequences == 0:
            # np.memmap cannot map a zero-length file; fail with a clear message.
            raise ValueError('loader_creator produced no sequences; cannot create an empty memory map')
        with open(f'{basename}_encoding.pkl', 'wb') as f:
            pickle.dump(encoding, f)
        data = np.memmap(f'{basename}_data.dat', dtype=np.uint8, mode='w+', shape=total_sequence_length)
        lengths = np.memmap(f'{basename}_lengths.dat', dtype=np.int32, mode='w+', shape=n_sequences)
        # Second pass: stream the chunks into the mapped files.
        data_offset = 0
        length_offset = 0
        for sequences in loader_creator():
            data[data_offset:data_offset + sequences.size] = sequences.raw().ravel()
            data_offset += sequences.size
            lengths[length_offset:length_offset + len(sequences)] = sequences.lengths
            length_offset += len(sequences)
        data.flush()
        lengths.flush()
        return EncodedRaggedArray(EncodedArray(data, encoding), lengths)
import pytest
import bionumpy as bnp
@pytest.fixture
def pysam_install():
    """Skip the requesting test unless pysam is importable."""
    pytest.importorskip('pysam')
def test_indexed_bam(pysam_install, data_path):
    """All reads overlapping the ctcf intervals are fetched from the indexed bam."""
    from bionumpy.io import open_indexed
    indexed_bam = open_indexed(data_path / 'ctcf_chr21-22.bam')
    intervals = bnp.open(data_path / 'ctcf.bed.gz').read()
    overlapping = indexed_bam[intervals]
    assert len(overlapping) == 12649
import numpy as np
import pytest
import bionumpy as bnp
from bionumpy.streams.memory_mapping import MemMapEncodedRaggedArray as MemMap
from bionumpy.util.testing import assert_encoded_raggedarray_equal
@pytest.fixture
def filename(data_path):
    """Path to the gzipped fastq test file."""
    return data_path.joinpath('big.fq.gz')
@pytest.fixture
def basename(tmp_path, filename):
    """Temporary base path for the memmap files, named after the input file."""
    return tmp_path.joinpath(filename.stem)
@pytest.fixture
def loader(filename):
    """Callable producing a fresh iterator of sequence chunks on every call."""
    def _iter_sequences():
        for chunk in bnp.open(filename):
            yield chunk.sequence
    return _iter_sequences
def test_create(basename, loader):
    """The created memory-mapped array equals the concatenation of all chunks."""
    expected = np.concatenate(list(loader()))
    created = MemMap.create(loader, basename)
    assert_encoded_raggedarray_equal(expected, created)
def test_load(basename, loader):
    """An array written with create() round-trips through load()."""
    expected = np.concatenate(list(loader()))
    MemMap.create(loader, basename)
    reloaded = MemMap.load(basename)
    assert_encoded_raggedarray_equal(expected, reloaded)
@pytest.mark.skip('Needs pooch to download the file')
def test_big_file(tmp_path):
    """End-to-end check on a downloaded file: counts, sizes and slicing agree."""
    import pooch
    url = 'https://github.com/bionumpy/bionumpy-example-data/raw/refs/heads/master/big.fq.gz'
    filename = pooch.retrieve(url, known_hash=None, path=tmp_path)
    expected_entries = bnp.count_entries(filename)
    loader = lambda: (chunk.sequence for chunk in bnp.open(filename))
    expected_size = sum(chunk.size for chunk in loader())
    mem_mapped = MemMap.create(loader, tmp_path / 'big')
    total = len(mem_mapped)
    assert total == expected_entries
    assert expected_size == mem_mapped.size
    step = 32
    # Slicing anywhere in the mapped array yields chunks of the expected length.
    for start in range(0, total, step):
        piece = mem_mapped[start:start + step]
        assert len(piece) == min(step, total - start)
+1
-1
Metadata-Version: 2.1
Name: bionumpy
Version: 1.0.12
Version: 1.0.13
Summary: Library for working with biological sequence data as numpy arrays.

@@ -5,0 +5,0 @@ Home-page: https://github.com/bionumpy/bionumpy

@@ -42,2 +42,3 @@ HISTORY.rst

bionumpy/encodings/base_encoding.py
bionumpy/encodings/bool_encoding.py
bionumpy/encodings/exceptions.py

@@ -74,2 +75,3 @@ bionumpy/encodings/integer_encoding.py

bionumpy/io/headers.py
bionumpy/io/indexed_bam.py
bionumpy/io/indexed_fasta.py

@@ -123,2 +125,3 @@ bionumpy/io/indexed_files.py

bionumpy/streams/left_join.py
bionumpy/streams/memory_mapping.py
bionumpy/streams/multistream.py

@@ -186,2 +189,3 @@ bionumpy/streams/reductions.py

tests/test_lazybnpdataclass.py
tests/test_memory_mapping.py
tests/test_minimizers.py

@@ -223,2 +227,3 @@ tests/test_msa.py

tests/io_tests/test_file_extensions.py
tests/io_tests/test_indexed_bam.py
tests/io_tests/test_sam.py

@@ -225,0 +230,0 @@ tests/io_tests/test_sub_delimiters.py

@@ -5,3 +5,3 @@ """Top-level package for bionumpy."""

__email__ = "knutdrand@gmail.com"
__version__ = '1.0.12'
__version__ = '1.0.13'

@@ -8,0 +8,0 @@ import npstructures as nps

@@ -10,6 +10,7 @@ from typing import List

from .bedgraph import BedGraph
from .. import streamable
from ..streams.decorators import streamable
from ..streams.grouped import chromosome_map
from ..datatypes import Interval
from ..bnpdataclass import bnpdataclass
from ..string_array import StringArray
from ..util import interleave

@@ -252,3 +253,2 @@ from ..bnpdataclass import replace

return intervals[args]
if sort_order is not None:

@@ -261,3 +261,13 @@ chromosome_key_function = {name: i for i, name in enumerate(sort_order)}.__getitem__

def fast_sort_intervals(intervals: Interval) -> Interval:
    """Sort intervals by (chromosome, start) with a vectorized lexsort.

    Supports chromosome columns that are string-encoded EncodedArrays or
    StringArrays; any other representation is rejected.

    Parameters
    ----------
    intervals : Interval

    Returns
    -------
    Interval
        The intervals reordered so chromosome is the primary sort key and
        start the secondary (np.lexsort sorts by the last key first).

    Raises
    ------
    TypeError
        If the chromosome column is neither string-encoded nor a StringArray.
    """
    chromosome = intervals.chromosome
    if hasattr(chromosome, 'encoding') and isinstance(chromosome.encoding, StringEncoding):
        order = np.lexsort((intervals.start, chromosome))
        return intervals[order]
    if isinstance(chromosome, StringArray):
        order = np.lexsort((intervals.start, chromosome.raw()))
        return intervals[order]
    # Raise instead of `assert False` so the check is not stripped under `python -O`.
    raise TypeError('Fast sort intervals only works with StringEncoding and StringArray')
@chromosome_map()

@@ -264,0 +274,0 @@ def merge_intervals(intervals: Interval, distance: int = 0) -> Interval:

@@ -344,55 +344,68 @@ import dataclasses

pre_val = getattr(obj, field.name)
numeric_types = (int, float, bool)
optional_numeric_types = tuple(Optional[t] for t in numeric_types)
if field.type == Union[BNPDataClass, str]:
if isinstance(pre_val,
(str, list, EncodedArray, EncodedRaggedArray, RaggedArray, np.ndarray)) or \
hasattr(pre_val, 'to_numpy'):
val = as_encoded_array(pre_val)
elif True or isinstance(pre_val, BNPDataClass):
val = pre_val
else:
assert False, (field.type, type(pre_val))
try:
val = cls.__convert_single_field(field, pre_val)
except Exception as e:
raise ValueError(f"Error when converting {field.name} to {field.type} with value {pre_val}") from e
elif field.type in numeric_types + optional_numeric_types:
val = np.asanyarray(pre_val)
elif field.type == str:
assert isinstance(pre_val, (
setattr(obj, field.name, val)
@classmethod
def __convert_single_field(cls, field, pre_val):
numeric_types = (int, float, bool)
optional_numeric_types = tuple(Optional[t] for t in numeric_types)
if field.type == Union[BNPDataClass, str]:
if isinstance(pre_val,
(str, list, EncodedArray, EncodedRaggedArray, RaggedArray, np.ndarray)) or \
hasattr(pre_val, 'to_numpy'):
val = as_encoded_array(pre_val)
elif True or isinstance(pre_val, BNPDataClass):
val = pre_val
else:
assert False, (field.type, type(pre_val))
elif field.type in numeric_types + optional_numeric_types:
val = np.asanyarray(pre_val)
elif field.type == str:
assert isinstance(pre_val, (
str, list, EncodedArray, EncodedRaggedArray, RaggedArray, np.ndarray)) or hasattr(pre_val,
'to_numpy'), (
field, pre_val, type(pre_val))
val = as_encoded_array(pre_val)
elif field.type == SequenceID or field.type == List[str]:
if isinstance(pre_val, EncodedArray):
val = pre_val
else:
val = as_string_array(pre_val)
elif is_subclass_or_instance(field.type, Encoding):
if is_subclass_or_instance(field.type, NumericEncoding):
assert isinstance(pre_val,
(str, list, EncodedArray, EncodedRaggedArray, RaggedArray, np.ndarray)), \
(field, pre_val, type(pre_val))
else:
assert isinstance(pre_val, (str, list, EncodedArray, EncodedRaggedArray)) or hasattr(pre_val, 'to_numpy'), (field, pre_val)
# must do as_encoded and not explicit encode as pre_val might already
# be encoded
val = as_encoded_array(pre_val)
elif field.type == SequenceID or field.type == List[str]:
if isinstance(pre_val, EncodedArray):
val = pre_val
else:
val = as_string_array(pre_val)
elif is_subclass_or_instance(field.type, Encoding):
if is_subclass_or_instance(field.type, NumericEncoding):
assert isinstance(pre_val,
(str, list, EncodedArray, EncodedRaggedArray, RaggedArray, np.ndarray)), \
(field, pre_val, type(pre_val))
val = as_encoded_array(pre_val, field.type)
if isinstance(field.type, FlatAlphabetEncoding):
val = val.ravel()
elif field.type == List[int] or field.type == List[bool] or field.type == List[float]:
if not isinstance(pre_val, RaggedArray):
try:
val = RaggedArray(pre_val)
except TypeError as e:
val = np.asanyarray(pre_val)
else:
val = pre_val
elif inspect.isclass(field.type) and issubclass(field.type, BNPDataClass):
# assert isinstance(pre_val, (field.type, field.type._single_entry)), (field.type, type(pre_val))
elif getattr(field.type, 'returns_raw', False) and isinstance(pre_val, (np.ndarray, np.generic)):
val = pre_val
else:
assert False, field.type
assert isinstance(pre_val, (str, list, EncodedArray, EncodedRaggedArray, bool)) or hasattr(pre_val,
'to_numpy'), (
field, pre_val, type(pre_val), isinstance(pre_val, np.generic))
val = as_encoded_array(pre_val, field.type)
# must do as_encoded and not explicit encode as pre_val might already
# be encoded
if isinstance(field.type, FlatAlphabetEncoding):
val = val.ravel()
elif field.type == List[int] or field.type == List[bool] or field.type == List[float]:
if not isinstance(pre_val, RaggedArray):
try:
val = RaggedArray(pre_val)
except TypeError as e:
val = np.asanyarray(pre_val)
else:
val = pre_val
elif inspect.isclass(field.type) and issubclass(field.type, BNPDataClass):
# assert isinstance(pre_val, (field.type, field.type._single_entry)), (field.type, type(pre_val))
val = pre_val
else:
assert False, field.type
return val
setattr(obj, field.name, val)
NewClass.__name__ = base_class.__name__

@@ -399,0 +412,0 @@ NewClass.__qualname__ = base_class.__qualname__

@@ -28,3 +28,4 @@ from functools import lru_cache

self._ends = ends
self._chromosome_names = as_encoded_array([h[0] for h in header_data])
self._chromosome_names = np.array([h[0] for h in header_data])
# self._chromosome_names = as_encoded_array([h[0] for h in header_data])
self._header_data = header_data

@@ -31,0 +32,0 @@ self._functions = [self._get_chromosome,

@@ -7,2 +7,3 @@ import numpy as np

change_encoding, as_encoded_array
from ..encodings.bool_encoding import bool_string
from ..encodings.string_encodings import StringEncoding

@@ -52,2 +53,3 @@ from npstructures import RaggedArray

str: str_func, # lambda x: x,
bool_string: lambda x: bool_string.decode(x),
SequenceID: seq_id_func,

@@ -60,3 +62,3 @@ List[int]: int_lists_to_strings,

}
if is_subclass_or_instance(datatype, Encoding):
if is_subclass_or_instance(datatype, Encoding) and not datatype==bool_string:
encoding = datatype

@@ -63,0 +65,0 @@

@@ -214,4 +214,4 @@ from pathlib import PurePath

is_gzip = suffix in (".gz", ".bam")
if suffix == '.bam':
reader = NumpyBamReader
#if suffix == '.bam':
# reader = NumpyBamReader
if suffix == ".gz":

@@ -218,0 +218,0 @@ suffix = path.suffixes[-2]

from pathlib import PurePath
import os
from .indexed_bam import IndexedBamFile
from .indexed_fasta import IndexedFasta, create_index

@@ -48,6 +50,11 @@ from .files import bnp_open

index_file_name = path.with_suffix(path.suffix + ".fai")
assert suffix in (".fa", ".fasta"), "Only fasta supported for indexed read"
if not os.path.isfile(index_file_name):
index = create_index(path)
bnp_open(index_file_name, "w", buffer_type=IndexBuffer).write(index)
return IndexedFasta(filename)
if suffix in (".fa", ".fasta"):
if not os.path.isfile(index_file_name):
index = create_index(path)
bnp_open(index_file_name, "w", buffer_type=IndexBuffer).write(index)
return IndexedFasta(filename)
elif suffix == '.bam':
return IndexedBamFile(filename, create_index=True)
else:
raise ValueError(f"Unknown file type {suffix} for indexed read. Only .fa, .fasta and .bam are supported.")

@@ -25,3 +25,15 @@ import numpy as np

def read_csv_motif(filename):
def read_csv_motif(filename: str) -> PWM:
'''
Read a PWM from a CSV file. The first line should be the alphabet, and the rest should be the matrix with probabilities.
Parameters
----------
filename
Returns
-------
PWM
'''
f = open(filename)

@@ -31,5 +43,5 @@ alphabet = f.readline().strip().split(",")

for line in f:
line = line.strip().split(",")
parts = line.strip().split(",")
for i, letter in enumerate(alphabet):
pwm[letter].append(float(line[i]))
return PWM.from_dict(pwm)
pwm[letter].append(float(parts[i]))
return PWM.from_dict(pwm)

@@ -84,3 +84,4 @@ import numpy as np

# use the faster _get_dna_kmers
return _get_dna_kmers(sequence, k)
result = _get_dna_kmers(sequence, k)
return result

@@ -98,3 +99,4 @@ return KmerEncoder(k, sequence.encoding).rolling_window(sequence)

out = EncodedRaggedArray(convoluted, shape, safe_mode=False) # FIXME: Should not use unsafe here
if window_size == 1:
return out
return out[..., : (-window_size + 1)]

@@ -101,0 +103,0 @@

@@ -103,3 +103,3 @@ import numpy as np

@classmethod
def from_dict(cls, dictionary: Dict[str, ArrayLike], background: Dict[str, float] = None) -> "PWM":
def from_dict(cls, dictionary: Dict[str, typing.Union[ArrayLike, typing.List]], background: Dict[str, float] = None) -> "PWM":
"""Create a PWM object from a dict of letters to position probabilities

@@ -106,0 +106,0 @@

@@ -63,2 +63,4 @@ from abc import abstractmethod

out = as_strided(convoluted, shape)
if window_size == 1:
return out
if mode == "valid":

@@ -65,0 +67,0 @@ return out[..., : (-window_size + 1)]

@@ -101,3 +101,3 @@ import numpy as np

return value.raw()
elif isinstance(value, EncodedArray):
elif isinstance(value, (EncodedArray, EncodedRaggedArray)):
print(value)

@@ -104,0 +104,0 @@ return string_array(value)

Metadata-Version: 2.1
Name: bionumpy
Version: 1.0.12
Version: 1.0.13
Summary: Library for working with biological sequence data as numpy arrays.

@@ -5,0 +5,0 @@ Home-page: https://github.com/bionumpy/bionumpy

@@ -49,3 +49,3 @@ #!/usr/bin/env python

url='https://github.com/bionumpy/bionumpy',
version='1.0.12',
version='1.0.13',
zip_safe=False,

@@ -52,0 +52,0 @@ extras_require={'full': ['isal']}

import dataclasses
import pytest
import numpy as np
from bionumpy import AminoAcidEncoding, DNAEncoding
from bionumpy import AminoAcidEncoding, DNAEncoding, EncodedArray, BaseEncoding
from bionumpy.bnpdataclass import bnpdataclass

@@ -9,2 +9,5 @@ from bionumpy.bnpdataclass.bnpdataclass import make_dataclass, BNPDataClass, dynamic_concatenate

from numpy.testing import assert_equal
from bionumpy.datatypes import SequenceID
from bionumpy.encodings.bool_encoding import bool_string
from bionumpy.util.testing import assert_bnpdataclass_equal

@@ -145,1 +148,18 @@ # import pandas as pd

@pytest.fixture()
def bool_class():
@bnpdataclass
class BNPDC:
sequence_id: SequenceID
test_field: bool_string
return BNPDC
def test_bool_class(bool_class):
obj = bool_class(sequence_id=['hei', 'ja'],
test_field=['True', 'False'])
from bionumpy.io.delimited_buffers import DelimitedBuffer
buffer = DelimitedBuffer.from_data(obj)

@@ -13,7 +13,8 @@ import numpy as np

from bionumpy.datatypes import SequenceEntryWithQuality
from bionumpy.encodings import DigitEncoding, QualityEncoding, CigarEncoding, DigitEncodingFactory, DNAEncoding, ACGTnEncoding
from bionumpy.encodings import DigitEncoding, QualityEncoding, CigarEncoding, DigitEncodingFactory, DNAEncoding, \
ACGTnEncoding
from bionumpy.encoded_array import NumericEncoding, OneToOneEncoding, BaseEncoding
#from bionumpy.encoded_array import OneToOneEncoding
# from bionumpy.encoded_array import OneToOneEncoding

@@ -84,5 +85,5 @@ @pytest.fixture

["1234",
["1234", "5678"],
np.array([1, 2, 3, 4]),
RaggedArray([[1, 2, 3], [4]])])
["1234", "5678"],
np.array([1, 2, 3, 4]),
RaggedArray([[1, 2, 3], [4]])])
def test_digit_encoding(data):

@@ -166,1 +167,11 @@ encoding = DigitEncoding

assert_raggedarray_equal(encoded, encoded2)
def test_bool_encoding():
    """bool_string round-trips string labels through boolean encoding."""
    # Fix: dropped the unused BoolStringEncoding import.
    from bionumpy.encodings.bool_encoding import bool_string
    encoding = bool_string
    data = ["True", "False", "True"]
    encoded = encoding.encode(data)
    decoded = encoding.decode(encoded)
    assert np.all(decoded == data)
    print(as_encoded_array(data, encoding))

@@ -59,2 +59,7 @@ import pytest

@pytest.mark.parametrize("encoding", [bnp.DNAEncoding, bnp.AminoAcidEncoding])
def test_get_kmers_one(encoding):
sequence = bionumpy.encoded_array.as_encoded_array(["ACTG"], encoding)
kmers = bnp.sequence.get_kmers(sequence, 1)
assert len(kmers[0]) == 4, kmers[0]

@@ -61,0 +66,0 @@

@@ -117,3 +117,3 @@ import pytest

bam2 = bnp.open(data_path / "test.bam").read()
assert np.all(str_equal(bam.chromosome, bam2.chromosome))
assert np.all(bam.chromosome==bam2.chromosome)

@@ -120,0 +120,0 @@