binpickle
Advanced tools
| recursive-include tests *.py |
| import pytest | ||
| import numpy as np | ||
| from hypothesis import given, assume, settings | ||
| import hypothesis.strategies as st | ||
| from hypothesis.extra.numpy import arrays, integer_dtypes, floating_dtypes | ||
| from binpickle.codecs import * | ||
# numcodecs is optional; import its codecs only when the NC wrapper
# reports the package is available.
if NC.AVAILABLE:
    from numcodecs import LZ4, LZMA

KNOWN_CODECS = [c for c in CODECS.values() if c.NAME != 'numcodec']  # exclude numcodec from common tests

# Skip markers for tests that need optional compression backends.
need_blosc = pytest.mark.skipif(not Blosc.AVAILABLE, reason='Blosc not available')
need_numcodecs = pytest.mark.skipif(not NC.AVAILABLE, reason='numcodecs not available')
def test_make_codec_none():
    "None resolves to the Null codec."
    codec = make_codec(None)
    assert isinstance(codec, Null)


def test_make_codec_null_str():
    "The name 'null' resolves to the Null codec."
    codec = make_codec('null')
    assert isinstance(codec, Null)


def test_make_codec_gz_str():
    "The name 'gz' resolves to the GZ codec."
    codec = make_codec('gz')
    assert isinstance(codec, GZ)


def test_make_codec_return():
    "An existing codec instance is passed through unchanged."
    original = GZ()
    assert make_codec(original) is original
@need_numcodecs
def test_make_codec_wrap():
    "A raw numcodecs codec gets wrapped in the NC adapter."
    raw = LZ4()
    wrapped = make_codec(raw)
    assert isinstance(wrapped, NC)
    assert wrapped.codec is raw


def test_make_codec_to_none():
    "Test internal-use none codec"
    for source in (None, Null()):
        assert make_codec(source, null_as_none=True) is None
def test_get_null_with_none():
    "get_codec(None, ...) yields the Null codec."
    assert isinstance(get_codec(None, {}), Null)


def test_get_null():
    "get_codec('null', ...) yields the Null codec."
    assert isinstance(get_codec('null', {}), Null)


def test_get_gz():
    "GZ defaults to maximum compression level."
    gz = get_codec('gz', {})
    assert isinstance(gz, GZ)
    assert gz.level == 9


def test_get_gz_level():
    "GZ honors an explicit compression level."
    gz = get_codec('gz', {'level': 5})
    assert isinstance(gz, GZ)
    assert gz.level == 5
@need_blosc
def test_get_blosc():
    "Blosc defaults to compression level 9."
    blosc = get_codec('blosc', {})
    assert isinstance(blosc, Blosc)
    assert blosc.level == 9


@need_blosc
def test_get_blosc_lvl():
    "Blosc honors an explicit compressor name and level."
    blosc = get_codec('blosc', {'name': 'zstd', 'level': 5})
    assert isinstance(blosc, Blosc)
    assert blosc.name == 'zstd'
    assert blosc.level == 5
@pytest.mark.parametrize('codec', KNOWN_CODECS)
@settings(deadline=500)
@given(st.binary())
def test_codec_roundtrip(codec, data):
    "Round-trip a codec"
    instance = codec()
    decoded = instance.decode(instance.encode(data))
    assert len(decoded) == len(data)
    assert decoded == data
@pytest.mark.parametrize('codec', KNOWN_CODECS)
@settings(deadline=500)
@given(arrays(st.one_of(integer_dtypes(), floating_dtypes()),
              st.integers(10, 10000)))
def test_codec_roundtrip_array(codec, data):
    "Round-trip a codec"
    # NaN never compares equal, so skip inputs containing it
    assume(not any(np.isnan(data)))
    instance = codec()
    decoded = instance.decode(instance.encode(data))
    restored = np.frombuffer(decoded, dtype=data.dtype)
    assert len(restored) == len(data)
    assert all(restored == data)
@pytest.mark.parametrize('codec', KNOWN_CODECS)
def test_codec_decode_oversize(codec):
    "Test decoding data to an oversized bytearray"
    instance = codec()
    data = bytearray(np.random.randn(500))
    # give decode_to twice the space it needs; it is expected to
    # leave `out` sized to the actual decoded length
    out = bytearray(len(data) * 2)
    instance.decode_to(instance.encode(data), out)
    assert len(out) == len(data)
    assert out == data
@need_blosc
def test_large_blosc_encode():
    "Test encoding Blosc data that needs to be split"
    c = Blosc(blocksize=4096)
    data = np.random.randn(10000)
    enc = c.encode(data)
    dec = c.decode(enc)
    assert len(enc) < len(dec)  # we should have compressed
    assert len(dec) == data.nbytes
    assert dec == memoryview(data)
    # BUG FIX: rebuild the array from the *decoded* bytes; the original
    # used `np.frombuffer(data)`, making the comparison a tautology.
    a2 = np.frombuffer(dec)
    assert len(a2) == len(data)
    assert all(a2 == data)
@need_numcodecs
@given(st.binary())
def test_numcodec_roundtrip(data):
    "Round-trip bytes through the numcodecs LZMA wrapper."
    wrapper = NC(LZMA())
    restored = wrapper.decode(wrapper.encode(data))
    assert len(restored) == len(data)
    assert restored == data
@need_numcodecs
@given(st.binary())
def test_chain(data):
    # Useless but a test
    chained = Chain([LZMA(), GZ()])
    restored = chained.decode(chained.encode(data))
    assert len(restored) == len(data)
    assert restored == data
@need_numcodecs
def test_chain_config():
    "A Chain codec's config round-trips through get_codec."
    codec = Chain([LZMA(), GZ()])
    assert len(codec.codecs) == 2
    assert isinstance(codec.codecs[0], NC)
    assert isinstance(codec.codecs[1], GZ)

    cfg = codec.config()
    c2 = get_codec(Chain.NAME, cfg)
    # BUG FIX: check the *reconstructed* codec c2; the original
    # re-asserted on `codec`, so the round-trip was never verified.
    assert len(c2.codecs) == 2
    assert isinstance(c2.codecs[0], NC)
    assert isinstance(c2.codecs[1], GZ)
def test_is_not_numcodec():
    "A native binpickle codec is not detected as a numcodec."
    assert not numcodecs.is_numcodec(GZ())


@need_numcodecs
def test_is_numcodec():
    "A real numcodecs codec is detected as one."
    assert numcodecs.is_numcodec(LZ4())
| from pytest import raises | ||
| from binpickle.format import * | ||
def test_format_sizes():
    "Header and trailer structs are both 16 bytes on disk."
    assert HEADER_FORMAT.size == 16
    assert TRAILER_FORMAT.size == 16


def test_pack_default_header():
    "A default header encodes to exactly 16 bytes."
    assert len(FileHeader().encode()) == 16
def test_default_header_round_trip():
    "A default header decodes back to an equal, distinct object."
    original = FileHeader()
    encoded = original.encode()
    assert len(encoded) == 16
    decoded = FileHeader.decode(encoded)
    assert decoded is not original
    assert decoded == original


def test_size_round_trip():
    "The length field survives an encode/decode round trip."
    original = FileHeader(length=57)
    encoded = original.encode()
    assert len(encoded) == 16
    decoded = FileHeader.decode(encoded)
    assert decoded.length == 57
    assert decoded == original
def test_catch_bad_magic():
    "Decoding rejects a header whose magic bytes are wrong."
    with raises(ValueError) as exc:
        FileHeader.decode(b'BNPQ\x00\x00\x00\x00' + (b'\x00' * 8))
    assert 'magic' in str(exc.value)


def test_catch_bad_version():
    "Decoding rejects an unsupported version number."
    with raises(ValueError) as exc:
        FileHeader.decode(b'BPCK\x00\x02\x00\x00' + (b'\x00' * 8))
    assert 'version' in str(exc.value)


def test_catch_bad_padding():
    "Decoding rejects nonzero padding bytes."
    with raises(ValueError) as exc:
        FileHeader.decode(b'BPCK\x00\x01\x00\xff' + (b'\x00' * 8))
    assert 'padding' in str(exc.value)
+235
| import itertools as it | ||
| from tempfile import TemporaryDirectory | ||
| from pathlib import Path | ||
| import gc | ||
| import numpy as np | ||
| import pandas as pd | ||
| import pytest | ||
| from hypothesis import given, assume, settings | ||
| import hypothesis.strategies as st | ||
| from hypothesis.extra.numpy import arrays, scalar_dtypes | ||
| from binpickle.read import BinPickleFile, load | ||
| from binpickle.write import BinPickler, dump | ||
| from binpickle import codecs | ||
# Writer constructors and codec strategies shared across the R/W tests.
RW_CTORS = [BinPickler, BinPickler.mappable, BinPickler.compressed]
RW_CODECS = [st.just(None), st.builds(codecs.GZ)]

# Add Blosc-based configurations only when that backend is installed.
if codecs.Blosc.AVAILABLE:
    RW_CTORS.append(lambda f: BinPickler.compressed(f, codecs.Blosc('zstd', 5)))
    RW_CODECS.append(st.builds(codecs.Blosc))
    RW_CODECS.append(st.builds(codecs.Blosc, st.just('zstd')))

# Add numcodecs-based configurations only when numcodecs is installed.
if codecs.NC.AVAILABLE:
    import numcodecs
    RW_CTORS.append(lambda f: BinPickler.compressed(f, numcodecs.LZMA()))
    RW_CODECS.append(st.builds(codecs.NC, st.just(numcodecs.LZMA())))
    # also build a chain test
    RW_CTORS.append(lambda f: BinPickler.compressed(f, codecs.Chain([numcodecs.MsgPack(), codecs.GZ()])))

# Every writer constructor crossed with direct=False/True, consumed by
# pytest.mark.parametrize(RW_PARAMS, RW_CONFIGS) below.
RW_CONFIGS = it.product(
    RW_CTORS,
    [False, True]
)
RW_PARAMS = ['writer', 'direct']
@pytest.fixture
def rng():
    "Provide a fresh NumPy random generator for each test."
    generator = np.random.default_rng()
    return generator
def test_empty(tmp_path):
    "Write a file with nothing in it"
    file = tmp_path / 'data.bpk'
    with BinPickler(file) as w:
        w._finish_file()
    # NOTE(review): 33 appears to be the fixed header/trailer overhead
    # of an empty file — confirm against binpickle.format
    assert file.stat().st_size == 33
    with BinPickleFile(file) as bpf:
        assert len(bpf.entries) == 0
def test_write_buf(tmp_path, rng: np.random.Generator):
    "Write a file with a single array"
    file = tmp_path / 'data.bpk'
    original = rng.integers(0, 5000, 1024, dtype='i4')
    with BinPickler(file) as w:
        w._write_buffer(original)
        w._finish_file()

    with BinPickleFile(file, direct=True) as bpf:
        assert len(bpf.entries) == 1
        entry = bpf.entries[0]
        # uncompressed write: encoded and decoded sizes are equal
        assert entry.dec_length == original.nbytes
        assert entry.enc_length == original.nbytes
        buf = bpf._read_buffer(entry)
        assert buf.nbytes == entry.dec_length
        restored = np.frombuffer(buf, dtype='i4')
        assert len(restored) == len(original)
        assert all(restored == original)
        # drop the views before the direct-mode file is closed
        del restored
        del buf
@settings(deadline=None)
@given(st.lists(st.binary()),
       st.one_of(RW_CODECS))
def test_write_encoded_arrays(buffers, codec):
    """Write raw byte buffers through a codec and read them back.

    Fixes two issues: the parameter previously shadowed the ``arrays``
    strategy imported from ``hypothesis.extra.numpy``, and the
    ``finally`` block deleted ``dat`` even when an assertion fired
    before it was bound, masking the real failure with a NameError.
    """
    with TemporaryDirectory('.test', 'binpickle-') as path:
        file = Path(path) / 'data.bpk'
        with BinPickler.compressed(file, codec) as w:
            for buf in buffers:
                w._write_buffer(buf)
            w._finish_file()

        with BinPickleFile(file) as bpf:
            assert not bpf.find_errors()
            assert len(bpf.entries) == len(buffers)
            for e, buf in zip(bpf.entries, buffers):
                dat = None  # ensure bound before the try block
                try:
                    if codec is not None:
                        assert e.codec
                    assert e.dec_length == len(buf)
                    dat = bpf._read_buffer(e)
                    assert dat == buf
                finally:  # delete things to make failures clearer
                    del dat
                    del e
            gc.collect()
def test_pickle_array(tmp_path, rng: np.random.Generator):
    "Pickle a NumPy array"
    file = tmp_path / 'data.bpk'
    original = rng.integers(0, 5000, 1024, dtype='i4')
    with BinPickler(file) as w:
        w.dump(original)

    with BinPickleFile(file) as bpf:
        # presumably one entry for the array buffer plus one for the
        # pickle stream — confirm against binpickle.write
        assert len(bpf.entries) == 2
        restored = bpf.load()
        assert len(restored) == len(original)
        assert all(restored == original)
@pytest.mark.parametrize(RW_PARAMS, RW_CONFIGS)
def test_pickle_frame(tmp_path, rng: np.random.Generator, writer, direct):
    "Pickle a Pandas data frame"
    file = tmp_path / 'data.bpk'
    frame = pd.DataFrame({
        'key': np.arange(0, 5000),
        'count': rng.integers(0, 1000, 5000),
        'score': rng.normal(10, 2, 5000)
    })
    with writer(file) as w:
        w.dump(frame)

    with BinPickleFile(file, direct=direct) as bpf:
        assert not bpf.find_errors()
        result = bpf.load()
        print(result)
        assert all(result.columns == frame.columns)
        for col in result.columns:
            assert all(result[col] == frame[col])
        # release the frame before a direct-mode file closes
        del result
@pytest.mark.skipif(not codecs.NC.AVAILABLE, reason='numcodecs not available')
def test_pickle_frame_dyncodec(tmp_path, rng: np.random.Generator):
    "Pickle a data frame with a codec chosen per buffer."
    file = tmp_path / 'data.bpk'
    frame = pd.DataFrame({
        'key': np.arange(0, 5000, dtype='i4'),
        'count': rng.integers(0, 1000, 5000),
        'score': rng.normal(10, 2, 5000)
    })

    def codec(buf):
        # down-cast float64 buffers to f4 before compressing;
        # everything else gets plain Blosc
        obj = memoryview(buf).obj
        if isinstance(obj, np.ndarray) and obj.dtype == np.float64:
            print('compacting double array')
            return codecs.Chain([numcodecs.AsType('f4', 'f8'), codecs.Blosc('zstd', 9)])
        else:
            return codecs.Blosc('zstd', 9)

    with BinPickler.compressed(file, codec) as w:
        w.dump(frame)

    with BinPickleFile(file) as bpf:
        assert not bpf.find_errors()
        result = bpf.load()
        print(result)
        assert all(result.columns == frame.columns)
        assert all(result['key'] == frame['key'])
        assert all(result['count'] == frame['count'])
        # scores were stored as f4, so compare at f4 precision
        assert all(result['score'].astype('f4') == frame['score'].astype('f4'))
        del result
def test_dump_frame(tmp_path, rng: np.random.Generator):
    "Pickle a Pandas data frame"
    file = tmp_path / 'data.bpk'
    frame = pd.DataFrame({
        'key': np.arange(0, 5000),
        'count': rng.integers(0, 1000, 5000),
        'score': rng.normal(10, 2, 5000)
    })
    dump(frame, file)

    result = load(file)
    assert all(result.columns == frame.columns)
    for col in result.columns:
        assert all(result[col] == frame[col])
@settings(deadline=None)
@given(arrays(scalar_dtypes(), st.integers(500, 10000)))
def test_compress_many_arrays(a):
    """Pickle random NumPy arrays through a compressed writer.

    Fixes: the original also requested the function-scoped ``tmp_path``
    fixture, which it never used (it builds its own TemporaryDirectory)
    and which Hypothesis flags as unsafe under ``@given``; it also
    lacked the ``deadline=None`` setting its mappable twin uses.
    """
    assume(not any(np.isnan(a)))
    with TemporaryDirectory('.test', 'binpickle') as path:
        file = Path(path) / 'data.bpk'
        with BinPickler.compressed(file) as w:
            w.dump(a)

        with BinPickleFile(file) as bpf:
            assert not bpf.find_errors()
            assert len(bpf.entries) in (1, 2)
            a2 = bpf.load()
            assert len(a2) == len(a)
            assert all(a2 == a)
@settings(deadline=None)
@given(arrays(scalar_dtypes(), st.integers(500, 10000)))
def test_map_many_arrays(a):
    "Pickle random NumPy arrays"
    assume(not any(np.isnan(a)))
    with TemporaryDirectory('.test', 'binpickle') as path:
        file = Path(path) / 'data.bpk'
        with BinPickler.mappable(file) as w:
            w.dump(a)

        with BinPickleFile(file, direct=True) as bpf:
            assert not bpf.find_errors()
            assert len(bpf.entries) in (1, 2)
            restored = bpf.load()
            assert len(restored) == len(a)
            assert all(restored == a)
            # drop the view before the mapped file closes
            del restored
| import logging | ||
| import io | ||
| import zlib | ||
| import functools as ft | ||
| import numpy as np | ||
| from hypothesis import given, settings, HealthCheck | ||
| import hypothesis.strategies as st | ||
| import pytest | ||
| from binpickle.write import _align_pos, CKOut | ||
_log = logging.getLogger(__name__)


def _split_blocks(*args):
    # Proxy to the Blosc splitter; imported lazily so tests that use it
    # are skipped (not errored) when the optional backend is missing.
    blosc = pytest.importorskip('binpickle.codecs.blosc')
    return blosc._split_blocks(*args)
@given(st.integers(100, 10000000))
def test_align(n):
    "Aligned positions are >= the input and multiples of 1024."
    aligned = _align_pos(n, 1024)
    assert aligned >= n
    assert aligned % 1024 == 0
@given(st.binary())
def test_checksum_bytes(data):
    "A single write is passed through and checksummed."
    sink = io.BytesIO()
    cko = CKOut(sink)
    cko.write(data)
    assert sink.getbuffer() == data
    assert cko.bytes == len(data)
    assert cko.checksum == zlib.adler32(data)
@given(st.lists(st.binary(), min_size=1, max_size=10))
def test_checksum_multi_bytes(chunks):
    """Multiple writes accumulate bytes and checksum correctly.

    Uses ``b''.join`` (linear) instead of the original quadratic
    ``functools.reduce`` concatenation; also renames the parameter so
    it no longer reads like an array-of-arrays.
    """
    sink = io.BytesIO()
    cko = CKOut(sink)
    for chunk in chunks:
        cko.write(chunk)
    cat = b''.join(chunks)
    assert sink.getbuffer() == cat
    assert cko.bytes == len(cat)
    assert cko.checksum == zlib.adler32(cat)
def test_split_empty_block():
    "An empty buffer still yields a single (empty) block."
    blocks = _split_blocks(memoryview(b''), 10)
    assert len(blocks) == 1
    assert blocks[0] == b''


def test_split_one_block():
    "A buffer smaller than the block size stays whole."
    blocks = _split_blocks(memoryview(b'asdf'), 10)
    assert len(blocks) == 1
    assert blocks[0] == b'asdf'
def test_split_two_blocks():
    "A buffer of exactly two block sizes splits evenly."
    blocks = _split_blocks(memoryview(b'asdf'), 2)
    assert len(blocks) == 2
    assert blocks[0] == b'as'
    assert blocks[1] == b'df'
    assert blocks[0].nbytes == 2
    assert blocks[1].nbytes == 2


def test_split_blocks_mismatch():
    "A trailing partial block keeps the leftover bytes."
    blocks = _split_blocks(memoryview(b'asdfg'), 2)
    assert len(blocks) == 3
    assert blocks[0] == b'as'
    assert blocks[0].nbytes == 2
    assert blocks[1] == b'df'
    assert blocks[1].nbytes == 2
    assert blocks[2] == b'g'
    assert blocks[2].nbytes == 1
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(st.data())
def test_split_blocks(data):
    """Splitting arbitrary bytes preserves size bounds and content.

    Reassembles with ``b''.join`` (linear, accepts memoryviews)
    instead of the original quadratic ``functools.reduce``
    concatenation, and drops the ``len(b) <= bs`` assert that
    duplicated the ``nbytes`` bound for byte views.
    """
    bs = data.draw(st.integers(8, 4096))
    input = data.draw(st.binary(min_size=bs//2, max_size=bs*8))
    _log.info('input size %d, block size %d', len(input), bs)
    blocks = _split_blocks(memoryview(input), bs)
    _log.info('split into %d blocks', len(blocks))
    assert all(b.nbytes <= bs for b in blocks)
    assert sum(b.nbytes for b in blocks) == len(input)
    reconst = b''.join(blocks)
    assert len(reconst) == len(input)
    assert reconst == input
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(st.data())
def test_split_arrays(data):
    """Splitting a float array's buffer preserves bytes and values.

    Reassembles with ``b''.join`` (linear, accepts memoryviews)
    instead of the original quadratic ``functools.reduce``
    concatenation, and drops the weaker ``len(b) <= bs`` assert in
    favor of the ``nbytes`` bound.
    """
    bs = data.draw(st.integers(8, 4096))
    size = data.draw(st.integers(bs//8, bs*4))
    array = np.random.randn(size)
    input = memoryview(array)
    _log.info('input size %d (%d bytes), block size %d', len(input), input.nbytes, bs)
    blocks = _split_blocks(memoryview(input), bs)
    _log.info('split into %d blocks', len(blocks))
    assert all(b.nbytes <= bs for b in blocks)
    assert sum(b.nbytes for b in blocks) == input.nbytes
    reconst = b''.join(blocks)
    assert len(reconst) == input.nbytes
    rcv = memoryview(reconst).cast(input.format)
    assert rcv == input
    a2 = np.frombuffer(reconst, array.dtype)
    assert all(a2 == array)
| Metadata-Version: 2.1 | ||
| Name: binpickle | ||
| Version: 0.3.2 | ||
| Version: 0.3.3 | ||
| Summary: Efficient binary storage of ML models | ||
@@ -5,0 +5,0 @@ Home-page: https://binpickle.lenskit.org |
| LICENSE | ||
| MANIFEST.in | ||
| README.md | ||
@@ -23,2 +24,6 @@ pyproject.toml | ||
| binpickle/codecs/null.py | ||
| binpickle/codecs/numcodecs.py | ||
| binpickle/codecs/numcodecs.py | ||
| tests/test_codecs.py | ||
| tests/test_format.py | ||
| tests/test_rw.py | ||
| tests/test_util.py |
@@ -5,5 +5,5 @@ """ | ||
| __version__ = '0.3.2' | ||
| __version__ = '0.3.3' | ||
| from .write import dump, BinPickler # noqa: F401 | ||
| from .read import load, BinPickleFile # noqa: F401 |
+1
-1
| Metadata-Version: 2.1 | ||
| Name: binpickle | ||
| Version: 0.3.2 | ||
| Version: 0.3.3 | ||
| Summary: Efficient binary storage of ML models | ||
@@ -5,0 +5,0 @@ Home-page: https://binpickle.lenskit.org |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
50124
46.75%29
20.83%1137
64.31%