bionumpy
Advanced tools
| from typing import List | ||
| import numpy as np | ||
| from bionumpy.encoded_array import Encoding, EncodedArray | ||
| from bionumpy.encodings.string_encodings import StringEncoding | ||
class BoolStringEncoding(Encoding):
    '''
    Encoding between boolean values and their string labels.

    Encoding produces a plain numpy bool array (``returns_raw``); decoding
    maps booleans back to the configured label strings.

    >>> from bionumpy.encodings.bool_encoding import bool_string
    >>> bool_string.encode(['True', 'False', 'True'])
    array([ True, False,  True])
    >>> bool_string.decode([False, False])
    encoded_ragged_array(['False',
                          'False'])
    '''
    # encode() returns a raw numpy array rather than an EncodedArray
    returns_raw = True

    def __init__(self, true_string: str = 'True', false_string: str = 'False'):
        '''
        Parameters
        ----------
        true_string: str
            Label that encodes to True.
        false_string: str
            Label that encodes to False.
        '''
        self._true_string = true_string
        self._false_string = false_string
        # Order matters: index 0 -> false label, index 1 -> true label, so the
        # raw string-encoded value doubles as the boolean value.
        # (Removed an unused `self._lookup` array that duplicated this table.)
        self._string_encoding = StringEncoding([false_string, true_string])

    def get_labels(self) -> List[str]:
        '''Return the two labels, false label first (index equals bool value).'''
        return [self._false_string, self._true_string]

    def encode(self, encoded_ragged_array) -> np.ndarray:
        '''Encode label strings into a plain boolean numpy array.'''
        s = self._string_encoding.encode(encoded_ragged_array)
        return s.raw().astype(bool)

    def decode(self, encoded_array):
        '''Decode booleans back into an encoded ragged array of label strings.'''
        a = EncodedArray(np.asanyarray(encoded_array).astype(int), self._string_encoding)
        return self._string_encoding.decode(a)


bool_string = BoolStringEncoding()
| import dataclasses | ||
| import os | ||
| import warnings | ||
| from pathlib import Path | ||
| from typing import Iterable, Union, List | ||
| import numpy as np | ||
| from npstructures import RaggedArray | ||
| from ..datatypes import BamEntry, Interval | ||
| from ..encoded_array import EncodedRaggedArray, EncodedArray, BaseEncoding | ||
| from ..arithmetics.intervals import fast_sort_intervals | ||
| from ..encodings import CigarOpEncoding | ||
def alignments_to_bam_entries(alignments: Iterable['pysam.AlignedSegment'], none_on_empty: bool = False) -> Union[
        BamEntry, None]:
    '''
    Convert an iterable of pysam alignments into a single BamEntry.

    Parameters
    ----------
    alignments: Iterable['pysam.AlignedSegment']
        The alignments to convert.
    none_on_empty: bool
        If True, return None instead of an empty BamEntry when no alignments
        are given. (Previously this flag was accepted but silently ignored.)

    Returns
    -------
    BamEntry or None
    '''
    cols = alignments_to_cols(alignments)
    # The accumulator holds one list entry per read; an empty reference_name
    # list means no alignments were accumulated.
    if none_on_empty and not len(cols.reference_name):
        return None
    return cols.to_entry()
@dataclasses.dataclass
class BamAccumulator:
    '''Collects per-read BAM fields into growing python lists.

    Each ``add_alignment`` call appends one read's fields; ``to_entry``
    converts the accumulated columns into a single BamEntry.
    '''
    reference_name: List[str]
    query_name: List[str]
    flag: List[int]
    reference_start: List[int]
    mapping_quality: List[int]
    cigar_ops_data: List[int]
    cigar_ops_lengths: List[int]
    cigar_len_data: List[int]
    cigar_len_lengths: List[int]
    seq_data: List[str]
    seq_lengths: List[int]
    qual_data: List[int]
    qual_lengths: List[int]

    def add_alignment(self, read: 'pysam.AlignedSegment'):
        '''Append all fields of one aligned read to the accumulator.'''
        self.reference_name.append(read.reference_name)
        self.query_name.append(read.query_name)
        self.flag.append(read.flag)
        self.reference_start.append(read.reference_start)
        self.mapping_quality.append(read.mapping_quality)
        # Split cigar (op, length) pairs into two flat ragged columns.
        ops, op_lengths = zip(*read.cigartuples)
        self.cigar_ops_data.extend(ops)
        self.cigar_ops_lengths.append(len(ops))
        self.cigar_len_data.extend(op_lengths)
        self.cigar_len_lengths.append(len(op_lengths))
        sequence = read.seq
        self.seq_data.append(sequence)
        self.seq_lengths.append(len(sequence))
        qualities = read.qual
        self.qual_data.append(qualities)
        self.qual_lengths.append(len(qualities))

    def to_entry(self):
        '''Convert the accumulated columns into a BamEntry (empty if nothing added).'''
        if not len(self.reference_name):
            return BamEntry.empty()
        # Sequences and qualities are stored as strings; view their utf-8
        # bytes as uint8 for the encoded/ragged representations.
        seq_data = np.frombuffer(''.join(self.seq_data).encode('utf-8'), dtype=np.uint8)
        qual_data = np.frombuffer(''.join(self.qual_data).encode('utf-8'), dtype=np.uint8)
        cigar_ops = EncodedRaggedArray(
            EncodedArray(np.array(self.cigar_ops_data), CigarOpEncoding),
            np.array(self.cigar_ops_lengths))
        cigar_lengths = RaggedArray(np.array(self.cigar_len_data),
                                    np.array(self.cigar_len_lengths))
        sequences = EncodedRaggedArray(EncodedArray(seq_data, BaseEncoding),
                                       np.array(self.seq_lengths))
        qualities = RaggedArray(qual_data, np.array(self.qual_lengths))
        return BamEntry(
            np.array(self.reference_name),
            np.array(self.query_name),
            np.array(self.flag),
            np.array(self.reference_start),
            np.array(self.mapping_quality),
            cigar_ops,
            cigar_lengths,
            sequences,
            qualities)
def alignments_to_cols(alignments, min_start=0, accumulator=None):
    '''Accumulate alignments into a BamAccumulator, skipping reads that
    start before ``min_start``.

    A fresh accumulator is created when none is supplied; the (possibly
    shared) accumulator is returned so calls can be chained.
    '''
    if accumulator is None:
        accumulator = BamAccumulator(*(list() for _ in range(13)))
    for read in alignments:
        if read.reference_start >= min_start:
            accumulator.add_alignment(read)
    return accumulator
class IndexedBamFile:
    '''
    A wrapper class around pysam to extract all reads that overlap with any interval in a set of intervals.
    Examples
    --------
    >>> import bionumpy as bnp
    >>> bn = bnp.open_indexed('example_data/ctcf_chr21-22.bam')
    >>> intervals = bnp.open('example_data/ctcf.bed.gz').read()
    >>> bn[intervals]
    BamEntry with 12649 entries
    chromosome name flag position mapq cigar_op cigar_length sequence quality
    chr21 SOLEXA-1GA-2:2:39:563:1 0 10403175 37 M [36] AGGCGGAGCCCTAGGGACAGGAG [96 97 96 97 97 96 96 9
    chr21 SOLEXA-1GA-2:2:51:1257: 0 10403191 37 M [36] ACAGGAGGAGGGGAGTTGCGCAC [96 97 97 96 96 96 97 9
    chr21 SOLEXA-1GA-2:2:90:233:6 16 13980514 37 M [36] ACACCCTCCCCTCGCCGCTGCAG [66 92 90 90 94 92 79 7
    chr21 SOLEXA-1GA-2:2:62:293:1 16 13980528 37 M [36] CCGCTGCAGTGTAGAAACCCAAT [89 95 93 93 96 94 97 9
    chr21 SOLEXA-1GA-1:1:49:718:1 0 13980531 37 M [36] CTGCAGTGTAGAAACCCAATAGC [97 97 97 98 97 97 96 9
    chr21 SOLEXA-1GA-2:2:57:1221: 16 13980533 37 M [36] GCAGTGTAGAAACCCAATAGCGT [97 97 97 95 93 96 93 9
    chr21 SOLEXA-1GA-1:1:57:1445: 16 13980536 37 M [36] GTGTAGAAACCCAATAGCGTCCC [96 92 94 96 93 97 93 9
    chr21 SOLEXA-1GA-2:2:64:1358: 0 14120164 37 M [36] ACCCTTAAAAGACCCAGATGTTG [97 98 96 97 97 98 97 9
    chr21 SOLEXA-1GA-1:1:63:383:1 0 14120199 37 M [36] ATGGAAGCAGCTTCATATCCAAG [97 97 95 97 98 97 97 9
    chr21 SOLEXA-1GA-1:1:111:87:1 0 14120203 37 M [36] AAGCAGCTTCATATCCAAGGGTG [97 97 95 97 98 97 98 9
    '''
    def __init__(self, filename: str, create_index=False):
        '''
        Open a BAM file with pysam, optionally building a .bai index first.

        Parameters
        ----------
        filename: str
            Path to the BAM file.
        create_index: bool
            If True, create 'filename.bai' with pysam.index unless it
            already exists on disk.
        '''
        # pysam is an optional dependency; fail early with a clear message.
        try:
            import pysam
        except ImportError:
            raise ImportError('Please install pysam to use IndexedBamFile')
        warnings.warn('Indexed bam files are experimental and may not work as expected, use at your own risk')
        if create_index:
            # 'x.bam' -> 'x.bam.bai'; only build the index if it is missing.
            index_filename= Path(filename).with_suffix('.bam.bai')
            if not os.path.isfile(index_filename):
                pysam.index(str(filename))
        self._samfile = pysam.AlignmentFile(filename, 'rb')

    def _fetch_from_sorted_intervals(self, intervals):
        '''Fetch reads for each interval, assuming intervals are sorted by
        (chromosome, start), and accumulate them into one BamEntry.

        Reads starting before the previous interval's stop are skipped via
        min_start — presumably to avoid returning the same read twice for
        overlapping intervals (NOTE(review): this also drops reads that start
        before the previous stop but overlap only the current interval —
        confirm this is intended).
        '''
        cur_chromosome, last_stop, last_start = None, None, None
        accumulator = BamAccumulator(*(list() for _ in range(13)))
        for interval in intervals:
            if interval.chromosome == cur_chromosome:
                # Within a chromosome the input must be sorted by start.
                assert interval.start >= last_start, f'Intervals must be sorted {(interval.start, last_start)}'
            else:
                # New chromosome: no previous interval to deduplicate against.
                last_stop = 0
            alignments_to_cols(self._samfile.fetch(str(interval.chromosome),
                                                   start=int(interval.start),
                                                   stop=int(interval.stop)),
                               min_start=last_stop, accumulator=accumulator)
            cur_chromosome, last_stop, last_start = interval.chromosome, interval.stop, interval.start
            continue  # no-op: end of loop body
        return accumulator.to_entry()

    def get_all_overlapping(self, intervals: Interval) -> BamEntry:
        '''
        Extract all reads that overlap with any interval in the input item.
        Parameters
        ----------
        intervals
            Intervals to query; sorted internally before fetching.
        Returns
        -------
        BamEntry with all overlapping reads.
        '''
        item = fast_sort_intervals(intervals)
        return self._fetch_from_sorted_intervals(item)

    def __getitem__(self, item: Interval):
        '''
        Extract all reads that overlap with any interval in the input item.
        Parameters
        ----------
        item
            Intervals to query.
        Returns
        -------
        BamEntry with all overlapping reads (same as get_all_overlapping).
        '''
        return self.get_all_overlapping(item)
| import pickle | ||
| import warnings | ||
| from typing import Iterator, Callable | ||
| import numpy as np | ||
| from bionumpy import EncodedRaggedArray, EncodedArray | ||
class MemMapEncodedRaggedArray:
    '''Create and load EncodedRaggedArrays backed by memory-mapped files.'''

    @classmethod
    def load(cls, basename: str) -> EncodedRaggedArray:
        '''
        Load a read only memory-mapped encoded ragged array.

        Parameters
        ----------
        basename: str
            The base name of the memory-mapped files.

        Returns
        -------
        EncodedRaggedArray
        '''
        # Note: first parameter renamed self -> cls (this is a classmethod).
        # dtypes must match exactly what create() wrote.
        data = np.memmap(f'{basename}_data.dat', dtype=np.uint8, mode='r')
        lengths = np.memmap(f'{basename}_lengths.dat', dtype=np.int32, mode='r')
        with open(f'{basename}_encoding.pkl', 'rb') as f:
            encoding = pickle.load(f)
        return EncodedRaggedArray(EncodedArray(data, encoding), lengths)

    @classmethod
    def create(cls, loader_creator: Callable[[], Iterator[EncodedRaggedArray]], basename) -> EncodedRaggedArray:
        '''
        Create a memory-mapped encoded ragged array.

        Takes in a callable that returns an iterator of EncodedRaggedArray objects.
        It goes through the iterator twice, first to calculate the total size of the data and lengths arrays,
        and then to write the data to disk.
        Returns an EncodedRaggedArray object where the data and lengths are memory-mapped.
        The basename provided is used to create the following files:
        - basename_data.dat
        - basename_lengths.dat
        - basename_encoding.pkl
        The same basename should be used to load the memory-mapped files later.

        Parameters
        ----------
        loader_creator: Callable[[], Iterator[EncodedRaggedArray]]
            A callable that returns an iterator of EncodedRaggedArray objects.
        basename: str
            Where to store the memory-mapped files.

        Returns
        -------
        EncodedRaggedArray

        Raises
        ------
        ValueError
            If loader_creator yields no chunks (nothing to write).
        AssertionError
            If chunks carry inconsistent encodings.
        '''
        warnings.warn(
            f"{cls.__name__} is in an experimental stage and may change in the future.",
            category=FutureWarning,
            stacklevel=2
        )
        # First pass: total sizes and the (single, shared) encoding.
        total_sequence_length = 0
        n_sequences = 0
        encoding = None
        for sequences in loader_creator():
            n_sequences += len(sequences)
            total_sequence_length += sequences.size
            if encoding is None:
                encoding = sequences.encoding
            else:
                assert encoding == sequences.encoding, f'Expected {encoding} but got {sequences.encoding}'
        if encoding is None:
            # Previously this pickled None and later failed with an obscure
            # "cannot mmap an empty file" error; fail fast instead.
            raise ValueError('loader_creator produced no chunks; nothing to memory-map')
        with open(f'{basename}_encoding.pkl', 'wb') as f:
            pickle.dump(encoding, f)
        data = np.memmap(f'{basename}_data.dat', dtype=np.uint8, mode='w+', shape=total_sequence_length)
        lengths = np.memmap(f'{basename}_lengths.dat', dtype=np.int32, mode='w+', shape=n_sequences)
        # Second pass: stream the raw bytes and per-sequence lengths to disk.
        data_offset = 0
        length_offset = 0
        for sequences in loader_creator():
            data[data_offset:data_offset + sequences.size] = sequences.raw().ravel()
            data_offset += sequences.size
            lengths[length_offset:length_offset + len(sequences)] = sequences.lengths
            length_offset += len(sequences)
        data.flush()
        lengths.flush()
        return EncodedRaggedArray(EncodedArray(data, encoding), lengths)
| import pytest | ||
| import bionumpy as bnp | ||
@pytest.fixture
def pysam_install():
    '''Skip the requesting test when the optional pysam dependency is missing.'''
    # importorskip is the idiomatic pytest replacement for try/except + skip.
    pytest.importorskip('pysam')
def test_indexed_bam(pysam_install, data_path):
    '''Reads overlapping the CTCF intervals are fetched from the indexed BAM.'''
    from bionumpy.io import open_indexed
    indexed_bam = open_indexed(data_path / 'ctcf_chr21-22.bam')
    intervals = bnp.open(data_path / 'ctcf.bed.gz').read()
    overlapping = indexed_bam[intervals]
    assert len(overlapping) == 12649
| import numpy as np | ||
| import pytest | ||
| import bionumpy as bnp | ||
| from bionumpy.streams.memory_mapping import MemMapEncodedRaggedArray as MemMap | ||
| from bionumpy.util.testing import assert_encoded_raggedarray_equal | ||
@pytest.fixture
def filename(data_path):
    # Path to the bundled big.fq.gz test file.
    return data_path / 'big.fq.gz'
@pytest.fixture
def basename(tmp_path, filename):
    # Base path (no extension) for the memory-mapped files, inside a tmp dir.
    return tmp_path / filename.stem
@pytest.fixture
def loader(filename):
    '''Factory that yields a fresh iterator of sequence chunks per call.'''
    def _make_iterator():
        return (chunk.sequence for chunk in bnp.open(filename))
    return _make_iterator
def test_create(basename, loader):
    '''MemMap.create must round-trip the concatenated input sequences.'''
    expected = np.concatenate(list(loader()))
    created = MemMap.create(loader, basename)
    assert_encoded_raggedarray_equal(expected, created)
def test_load(basename, loader):
    '''Sequences written by MemMap.create must be recovered by MemMap.load.'''
    expected = np.concatenate(list(loader()))
    MemMap.create(loader, basename)
    reloaded = MemMap.load(basename)
    assert_encoded_raggedarray_equal(expected, reloaded)
@pytest.mark.skip('Needs pooch to download the file')
def test_big_file(tmp_path):
    '''End-to-end check of MemMap.create on a large downloaded fastq file.'''
    import pooch
    url = 'https://github.com/bionumpy/bionumpy-example-data/raw/refs/heads/master/big.fq.gz'
    filename = pooch.retrieve(url, known_hash=None, path=tmp_path)
    basename = tmp_path / 'big'
    n_entries = bnp.count_entries(filename)

    def loader():
        return (chunk.sequence for chunk in bnp.open(filename))

    total_size = sum(chunk.size for chunk in loader())
    mem_mapped = MemMap.create(loader, basename)
    n_sequences = len(mem_mapped)
    assert n_sequences == n_entries
    assert total_size == mem_mapped.size
    # Slice through the whole array in fixed-size windows; every window
    # except possibly the last must be full.
    chunk_size = 32
    for start in range(0, n_sequences, chunk_size):
        window = mem_mapped[start:start + chunk_size]
        assert len(window) == min(chunk_size, n_sequences - start)
| Metadata-Version: 2.1 | ||
| Name: bionumpy | ||
| Version: 1.0.12 | ||
| Version: 1.0.13 | ||
| Summary: Library for working with biological sequence data as numpy arrays. | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/bionumpy/bionumpy |
@@ -42,2 +42,3 @@ HISTORY.rst | ||
| bionumpy/encodings/base_encoding.py | ||
| bionumpy/encodings/bool_encoding.py | ||
| bionumpy/encodings/exceptions.py | ||
@@ -74,2 +75,3 @@ bionumpy/encodings/integer_encoding.py | ||
| bionumpy/io/headers.py | ||
| bionumpy/io/indexed_bam.py | ||
| bionumpy/io/indexed_fasta.py | ||
@@ -123,2 +125,3 @@ bionumpy/io/indexed_files.py | ||
| bionumpy/streams/left_join.py | ||
| bionumpy/streams/memory_mapping.py | ||
| bionumpy/streams/multistream.py | ||
@@ -186,2 +189,3 @@ bionumpy/streams/reductions.py | ||
| tests/test_lazybnpdataclass.py | ||
| tests/test_memory_mapping.py | ||
| tests/test_minimizers.py | ||
@@ -223,2 +227,3 @@ tests/test_msa.py | ||
| tests/io_tests/test_file_extensions.py | ||
| tests/io_tests/test_indexed_bam.py | ||
| tests/io_tests/test_sam.py | ||
@@ -225,0 +230,0 @@ tests/io_tests/test_sub_delimiters.py |
@@ -5,3 +5,3 @@ """Top-level package for bionumpy.""" | ||
| __email__ = "knutdrand@gmail.com" | ||
| __version__ = '1.0.12' | ||
| __version__ = '1.0.13' | ||
@@ -8,0 +8,0 @@ import npstructures as nps |
@@ -10,6 +10,7 @@ from typing import List | ||
| from .bedgraph import BedGraph | ||
| from .. import streamable | ||
| from ..streams.decorators import streamable | ||
| from ..streams.grouped import chromosome_map | ||
| from ..datatypes import Interval | ||
| from ..bnpdataclass import bnpdataclass | ||
| from ..string_array import StringArray | ||
| from ..util import interleave | ||
@@ -252,3 +253,2 @@ from ..bnpdataclass import replace | ||
| return intervals[args] | ||
| if sort_order is not None: | ||
@@ -261,3 +261,13 @@ chromosome_key_function = {name: i for i, name in enumerate(sort_order)}.__getitem__ | ||
| def fast_sort_intervals(intervals: Interval) -> Interval: | ||
| if hasattr(intervals.chromosome, 'encoding') and isinstance(intervals.chromosome.encoding, StringEncoding): | ||
| args = np.lexsort((intervals.start, intervals.chromosome)) | ||
| return intervals[args] | ||
| if isinstance(intervals.chromosome, StringArray): | ||
| args = np.lexsort((intervals.start, intervals.chromosome.raw())) | ||
| return intervals[args] | ||
| assert False, 'Fast sort intervals only works with StringEncoding and StringArray' | ||
| @chromosome_map() | ||
@@ -264,0 +274,0 @@ def merge_intervals(intervals: Interval, distance: int = 0) -> Interval: |
@@ -344,55 +344,68 @@ import dataclasses | ||
| pre_val = getattr(obj, field.name) | ||
| numeric_types = (int, float, bool) | ||
| optional_numeric_types = tuple(Optional[t] for t in numeric_types) | ||
| if field.type == Union[BNPDataClass, str]: | ||
| if isinstance(pre_val, | ||
| (str, list, EncodedArray, EncodedRaggedArray, RaggedArray, np.ndarray)) or \ | ||
| hasattr(pre_val, 'to_numpy'): | ||
| val = as_encoded_array(pre_val) | ||
| elif True or isinstance(pre_val, BNPDataClass): | ||
| val = pre_val | ||
| else: | ||
| assert False, (field.type, type(pre_val)) | ||
| try: | ||
| val = cls.__convert_single_field(field, pre_val) | ||
| except Exception as e: | ||
| raise ValueError(f"Error when converting {field.name} to {field.type} with value {pre_val}") from e | ||
| elif field.type in numeric_types + optional_numeric_types: | ||
| val = np.asanyarray(pre_val) | ||
| elif field.type == str: | ||
| assert isinstance(pre_val, ( | ||
| setattr(obj, field.name, val) | ||
| @classmethod | ||
| def __convert_single_field(cls, field, pre_val): | ||
| numeric_types = (int, float, bool) | ||
| optional_numeric_types = tuple(Optional[t] for t in numeric_types) | ||
| if field.type == Union[BNPDataClass, str]: | ||
| if isinstance(pre_val, | ||
| (str, list, EncodedArray, EncodedRaggedArray, RaggedArray, np.ndarray)) or \ | ||
| hasattr(pre_val, 'to_numpy'): | ||
| val = as_encoded_array(pre_val) | ||
| elif True or isinstance(pre_val, BNPDataClass): | ||
| val = pre_val | ||
| else: | ||
| assert False, (field.type, type(pre_val)) | ||
| elif field.type in numeric_types + optional_numeric_types: | ||
| val = np.asanyarray(pre_val) | ||
| elif field.type == str: | ||
| assert isinstance(pre_val, ( | ||
| str, list, EncodedArray, EncodedRaggedArray, RaggedArray, np.ndarray)) or hasattr(pre_val, | ||
| 'to_numpy'), ( | ||
| field, pre_val, type(pre_val)) | ||
| val = as_encoded_array(pre_val) | ||
| elif field.type == SequenceID or field.type == List[str]: | ||
| if isinstance(pre_val, EncodedArray): | ||
| val = pre_val | ||
| else: | ||
| val = as_string_array(pre_val) | ||
| elif is_subclass_or_instance(field.type, Encoding): | ||
| if is_subclass_or_instance(field.type, NumericEncoding): | ||
| assert isinstance(pre_val, | ||
| (str, list, EncodedArray, EncodedRaggedArray, RaggedArray, np.ndarray)), \ | ||
| (field, pre_val, type(pre_val)) | ||
| else: | ||
| assert isinstance(pre_val, (str, list, EncodedArray, EncodedRaggedArray)) or hasattr(pre_val, 'to_numpy'), (field, pre_val) | ||
| # must do as_encoded and not explicit encode as pre_val might already | ||
| # be encoded | ||
| val = as_encoded_array(pre_val) | ||
| elif field.type == SequenceID or field.type == List[str]: | ||
| if isinstance(pre_val, EncodedArray): | ||
| val = pre_val | ||
| else: | ||
| val = as_string_array(pre_val) | ||
| elif is_subclass_or_instance(field.type, Encoding): | ||
| if is_subclass_or_instance(field.type, NumericEncoding): | ||
| assert isinstance(pre_val, | ||
| (str, list, EncodedArray, EncodedRaggedArray, RaggedArray, np.ndarray)), \ | ||
| (field, pre_val, type(pre_val)) | ||
| val = as_encoded_array(pre_val, field.type) | ||
| if isinstance(field.type, FlatAlphabetEncoding): | ||
| val = val.ravel() | ||
| elif field.type == List[int] or field.type == List[bool] or field.type == List[float]: | ||
| if not isinstance(pre_val, RaggedArray): | ||
| try: | ||
| val = RaggedArray(pre_val) | ||
| except TypeError as e: | ||
| val = np.asanyarray(pre_val) | ||
| else: | ||
| val = pre_val | ||
| elif inspect.isclass(field.type) and issubclass(field.type, BNPDataClass): | ||
| # assert isinstance(pre_val, (field.type, field.type._single_entry)), (field.type, type(pre_val)) | ||
| elif getattr(field.type, 'returns_raw', False) and isinstance(pre_val, (np.ndarray, np.generic)): | ||
| val = pre_val | ||
| else: | ||
| assert False, field.type | ||
| assert isinstance(pre_val, (str, list, EncodedArray, EncodedRaggedArray, bool)) or hasattr(pre_val, | ||
| 'to_numpy'), ( | ||
| field, pre_val, type(pre_val), isinstance(pre_val, np.generic)) | ||
| val = as_encoded_array(pre_val, field.type) | ||
| # must do as_encoded and not explicit encode as pre_val might already | ||
| # be encoded | ||
| if isinstance(field.type, FlatAlphabetEncoding): | ||
| val = val.ravel() | ||
| elif field.type == List[int] or field.type == List[bool] or field.type == List[float]: | ||
| if not isinstance(pre_val, RaggedArray): | ||
| try: | ||
| val = RaggedArray(pre_val) | ||
| except TypeError as e: | ||
| val = np.asanyarray(pre_val) | ||
| else: | ||
| val = pre_val | ||
| elif inspect.isclass(field.type) and issubclass(field.type, BNPDataClass): | ||
| # assert isinstance(pre_val, (field.type, field.type._single_entry)), (field.type, type(pre_val)) | ||
| val = pre_val | ||
| else: | ||
| assert False, field.type | ||
| return val | ||
| setattr(obj, field.name, val) | ||
| NewClass.__name__ = base_class.__name__ | ||
@@ -399,0 +412,0 @@ NewClass.__qualname__ = base_class.__qualname__ |
@@ -28,3 +28,4 @@ from functools import lru_cache | ||
| self._ends = ends | ||
| self._chromosome_names = as_encoded_array([h[0] for h in header_data]) | ||
| self._chromosome_names = np.array([h[0] for h in header_data]) | ||
| # self._chromosome_names = as_encoded_array([h[0] for h in header_data]) | ||
| self._header_data = header_data | ||
@@ -31,0 +32,0 @@ self._functions = [self._get_chromosome, |
@@ -7,2 +7,3 @@ import numpy as np | ||
| change_encoding, as_encoded_array | ||
| from ..encodings.bool_encoding import bool_string | ||
| from ..encodings.string_encodings import StringEncoding | ||
@@ -52,2 +53,3 @@ from npstructures import RaggedArray | ||
| str: str_func, # lambda x: x, | ||
| bool_string: lambda x: bool_string.decode(x), | ||
| SequenceID: seq_id_func, | ||
@@ -60,3 +62,3 @@ List[int]: int_lists_to_strings, | ||
| } | ||
| if is_subclass_or_instance(datatype, Encoding): | ||
| if is_subclass_or_instance(datatype, Encoding) and not datatype==bool_string: | ||
| encoding = datatype | ||
@@ -63,0 +65,0 @@ |
@@ -214,4 +214,4 @@ from pathlib import PurePath | ||
| is_gzip = suffix in (".gz", ".bam") | ||
| if suffix == '.bam': | ||
| reader = NumpyBamReader | ||
| #if suffix == '.bam': | ||
| # reader = NumpyBamReader | ||
| if suffix == ".gz": | ||
@@ -218,0 +218,0 @@ suffix = path.suffixes[-2] |
| from pathlib import PurePath | ||
| import os | ||
| from .indexed_bam import IndexedBamFile | ||
| from .indexed_fasta import IndexedFasta, create_index | ||
@@ -48,6 +50,11 @@ from .files import bnp_open | ||
| index_file_name = path.with_suffix(path.suffix + ".fai") | ||
| assert suffix in (".fa", ".fasta"), "Only fasta supported for indexed read" | ||
| if not os.path.isfile(index_file_name): | ||
| index = create_index(path) | ||
| bnp_open(index_file_name, "w", buffer_type=IndexBuffer).write(index) | ||
| return IndexedFasta(filename) | ||
| if suffix in (".fa", ".fasta"): | ||
| if not os.path.isfile(index_file_name): | ||
| index = create_index(path) | ||
| bnp_open(index_file_name, "w", buffer_type=IndexBuffer).write(index) | ||
| return IndexedFasta(filename) | ||
| elif suffix == '.bam': | ||
| return IndexedBamFile(filename, create_index=True) | ||
| else: | ||
| raise ValueError(f"Unknown file type {suffix} for indexed read. Only .fa, .fasta and .bam are supported.") |
@@ -25,3 +25,15 @@ import numpy as np | ||
| def read_csv_motif(filename): | ||
| def read_csv_motif(filename: str) -> PWM: | ||
| ''' | ||
| Read a PWM from a CSV file. The first line should be the alphabet, and the rest should be the matrix with probabilities. | ||
| Parameters | ||
| ---------- | ||
| filename | ||
| Returns | ||
| ------- | ||
| PWM | ||
| ''' | ||
| f = open(filename) | ||
@@ -31,5 +43,5 @@ alphabet = f.readline().strip().split(",") | ||
| for line in f: | ||
| line = line.strip().split(",") | ||
| parts = line.strip().split(",") | ||
| for i, letter in enumerate(alphabet): | ||
| pwm[letter].append(float(line[i])) | ||
| return PWM.from_dict(pwm) | ||
| pwm[letter].append(float(parts[i])) | ||
| return PWM.from_dict(pwm) |
@@ -84,3 +84,4 @@ import numpy as np | ||
| # use the faster _get_dna_kmers | ||
| return _get_dna_kmers(sequence, k) | ||
| result = _get_dna_kmers(sequence, k) | ||
| return result | ||
@@ -98,3 +99,4 @@ return KmerEncoder(k, sequence.encoding).rolling_window(sequence) | ||
| out = EncodedRaggedArray(convoluted, shape, safe_mode=False) # FIXME: Should not use unsafe here | ||
| if window_size == 1: | ||
| return out | ||
| return out[..., : (-window_size + 1)] | ||
@@ -101,0 +103,0 @@ |
@@ -103,3 +103,3 @@ import numpy as np | ||
| @classmethod | ||
| def from_dict(cls, dictionary: Dict[str, ArrayLike], background: Dict[str, float] = None) -> "PWM": | ||
| def from_dict(cls, dictionary: Dict[str, typing.Union[ArrayLike, typing.List]], background: Dict[str, float] = None) -> "PWM": | ||
| """Create a PWM object from a dict of letters to position probabilities | ||
@@ -106,0 +106,0 @@ |
@@ -63,2 +63,4 @@ from abc import abstractmethod | ||
| out = as_strided(convoluted, shape) | ||
| if window_size == 1: | ||
| return out | ||
| if mode == "valid": | ||
@@ -65,0 +67,0 @@ return out[..., : (-window_size + 1)] |
@@ -101,3 +101,3 @@ import numpy as np | ||
| return value.raw() | ||
| elif isinstance(value, EncodedArray): | ||
| elif isinstance(value, (EncodedArray, EncodedRaggedArray)): | ||
| print(value) | ||
@@ -104,0 +104,0 @@ return string_array(value) |
+1
-1
| Metadata-Version: 2.1 | ||
| Name: bionumpy | ||
| Version: 1.0.12 | ||
| Version: 1.0.13 | ||
| Summary: Library for working with biological sequence data as numpy arrays. | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/bionumpy/bionumpy |
+1
-1
@@ -49,3 +49,3 @@ #!/usr/bin/env python | ||
| url='https://github.com/bionumpy/bionumpy', | ||
| version='1.0.12', | ||
| version='1.0.13', | ||
| zip_safe=False, | ||
@@ -52,0 +52,0 @@ extras_require={'full': ['isal']} |
| import dataclasses | ||
| import pytest | ||
| import numpy as np | ||
| from bionumpy import AminoAcidEncoding, DNAEncoding | ||
| from bionumpy import AminoAcidEncoding, DNAEncoding, EncodedArray, BaseEncoding | ||
| from bionumpy.bnpdataclass import bnpdataclass | ||
@@ -9,2 +9,5 @@ from bionumpy.bnpdataclass.bnpdataclass import make_dataclass, BNPDataClass, dynamic_concatenate | ||
| from numpy.testing import assert_equal | ||
| from bionumpy.datatypes import SequenceID | ||
| from bionumpy.encodings.bool_encoding import bool_string | ||
| from bionumpy.util.testing import assert_bnpdataclass_equal | ||
@@ -145,1 +148,18 @@ # import pandas as pd | ||
| @pytest.fixture() | ||
| def bool_class(): | ||
| @bnpdataclass | ||
| class BNPDC: | ||
| sequence_id: SequenceID | ||
| test_field: bool_string | ||
| return BNPDC | ||
| def test_bool_class(bool_class): | ||
| obj = bool_class(sequence_id=['hei', 'ja'], | ||
| test_field=['True', 'False']) | ||
| from bionumpy.io.delimited_buffers import DelimitedBuffer | ||
| buffer = DelimitedBuffer.from_data(obj) | ||
@@ -13,7 +13,8 @@ import numpy as np | ||
| from bionumpy.datatypes import SequenceEntryWithQuality | ||
| from bionumpy.encodings import DigitEncoding, QualityEncoding, CigarEncoding, DigitEncodingFactory, DNAEncoding, ACGTnEncoding | ||
| from bionumpy.encodings import DigitEncoding, QualityEncoding, CigarEncoding, DigitEncodingFactory, DNAEncoding, \ | ||
| ACGTnEncoding | ||
| from bionumpy.encoded_array import NumericEncoding, OneToOneEncoding, BaseEncoding | ||
| #from bionumpy.encoded_array import OneToOneEncoding | ||
| # from bionumpy.encoded_array import OneToOneEncoding | ||
@@ -84,5 +85,5 @@ @pytest.fixture | ||
| ["1234", | ||
| ["1234", "5678"], | ||
| np.array([1, 2, 3, 4]), | ||
| RaggedArray([[1, 2, 3], [4]])]) | ||
| ["1234", "5678"], | ||
| np.array([1, 2, 3, 4]), | ||
| RaggedArray([[1, 2, 3], [4]])]) | ||
| def test_digit_encoding(data): | ||
@@ -166,1 +167,11 @@ encoding = DigitEncoding | ||
| assert_raggedarray_equal(encoded, encoded2) | ||
| def test_bool_encoding(): | ||
| from bionumpy.encodings.bool_encoding import BoolStringEncoding, bool_string | ||
| encoding = bool_string | ||
| data = ["True", "False", "True"] | ||
| encoded = encoding.encode(data) | ||
| decoded = encoding.decode(encoded) | ||
| assert np.all(decoded == data) | ||
| print(as_encoded_array(data, encoding)) |
@@ -59,2 +59,7 @@ import pytest | ||
| @pytest.mark.parametrize("encoding", [bnp.DNAEncoding, bnp.AminoAcidEncoding]) | ||
| def test_get_kmers_one(encoding): | ||
| sequence = bionumpy.encoded_array.as_encoded_array(["ACTG"], encoding) | ||
| kmers = bnp.sequence.get_kmers(sequence, 1) | ||
| assert len(kmers[0]) == 4, kmers[0] | ||
@@ -61,0 +66,0 @@ |
@@ -117,3 +117,3 @@ import pytest | ||
| bam2 = bnp.open(data_path / "test.bam").read() | ||
| assert np.all(str_equal(bam.chromosome, bam2.chromosome)) | ||
| assert np.all(bam.chromosome==bam2.chromosome) | ||
@@ -120,0 +120,0 @@ |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
723013
2.6%233
2.19%16358
2.31%