binpickle
Advanced tools
| recursive-include tests *.py |
| import pytest | ||
| import numpy as np | ||
| from hypothesis import given, assume, settings | ||
| import hypothesis.strategies as st | ||
| from hypothesis.extra.numpy import arrays, integer_dtypes, floating_dtypes | ||
| from binpickle.codecs import * | ||
# numcodecs is optional; import its codecs only when the NC wrapper
# reports the package is available.
if NC.AVAILABLE:
    from numcodecs import LZ4, LZMA

KNOWN_CODECS = [c for c in CODECS.values() if c.NAME != 'numcodec']  # exclude numcodec from common tests

# Skip markers for tests that need optional compression backends.
need_blosc = pytest.mark.skipif(not Blosc.AVAILABLE, reason='Blosc not available')
need_numcodecs = pytest.mark.skipif(not NC.AVAILABLE, reason='numcodecs not available')
def test_make_codec_none():
    "None resolves to the Null codec."
    codec = make_codec(None)
    assert isinstance(codec, Null)


def test_make_codec_null_str():
    "The name 'null' resolves to the Null codec."
    codec = make_codec('null')
    assert isinstance(codec, Null)


def test_make_codec_gz_str():
    "The name 'gz' resolves to the GZ codec."
    codec = make_codec('gz')
    assert isinstance(codec, GZ)


def test_make_codec_return():
    "An existing codec instance is passed through unchanged."
    original = GZ()
    assert make_codec(original) is original
@need_numcodecs
def test_make_codec_wrap():
    "A raw numcodecs codec gets wrapped in the NC adapter."
    raw = LZ4()
    wrapped = make_codec(raw)
    assert isinstance(wrapped, NC)
    assert wrapped.codec is raw


def test_make_codec_to_none():
    "Test internal-use none codec"
    for source in (None, Null()):
        assert make_codec(source, null_as_none=True) is None
def test_get_null_with_none():
    "get_codec(None, ...) yields the Null codec."
    assert isinstance(get_codec(None, {}), Null)


def test_get_null():
    "get_codec('null', ...) yields the Null codec."
    assert isinstance(get_codec('null', {}), Null)


def test_get_gz():
    "GZ defaults to maximum compression level."
    gz = get_codec('gz', {})
    assert isinstance(gz, GZ)
    assert gz.level == 9


def test_get_gz_level():
    "GZ honors an explicit compression level."
    gz = get_codec('gz', {'level': 5})
    assert isinstance(gz, GZ)
    assert gz.level == 5
@need_blosc
def test_get_blosc():
    "Blosc defaults to compression level 9."
    blosc = get_codec('blosc', {})
    assert isinstance(blosc, Blosc)
    assert blosc.level == 9


@need_blosc
def test_get_blosc_lvl():
    "Blosc honors an explicit compressor name and level."
    blosc = get_codec('blosc', {'name': 'zstd', 'level': 5})
    assert isinstance(blosc, Blosc)
    assert blosc.name == 'zstd'
    assert blosc.level == 5
@pytest.mark.parametrize('codec', KNOWN_CODECS)
@settings(deadline=500)
@given(st.binary())
def test_codec_roundtrip(codec, data):
    "Round-trip a codec"
    instance = codec()
    decoded = instance.decode(instance.encode(data))
    assert len(decoded) == len(data)
    assert decoded == data
@pytest.mark.parametrize('codec', KNOWN_CODECS)
@settings(deadline=500)
@given(arrays(st.one_of(integer_dtypes(), floating_dtypes()),
              st.integers(10, 10000)))
def test_codec_roundtrip_array(codec, data):
    "Round-trip a codec"
    # NaN never compares equal, so skip inputs containing it
    assume(not any(np.isnan(data)))
    instance = codec()
    decoded = instance.decode(instance.encode(data))
    restored = np.frombuffer(decoded, dtype=data.dtype)
    assert len(restored) == len(data)
    assert all(restored == data)
@pytest.mark.parametrize('codec', KNOWN_CODECS)
def test_codec_decode_oversize(codec):
    "Test decoding data to an oversized bytearray"
    instance = codec()
    data = bytearray(np.random.randn(500))
    # give decode_to twice the space it needs; it is expected to
    # leave `out` sized to the actual decoded length
    out = bytearray(len(data) * 2)
    instance.decode_to(instance.encode(data), out)
    assert len(out) == len(data)
    assert out == data
@need_blosc
def test_large_blosc_encode():
    "Test encoding Blosc data that needs to be split"
    c = Blosc(blocksize=4096)
    data = np.random.randn(10000)
    enc = c.encode(data)
    dec = c.decode(enc)
    assert len(enc) < len(dec)  # we should have compressed
    assert len(dec) == data.nbytes
    assert dec == memoryview(data)
    # BUG FIX: rebuild the array from the *decoded* bytes; the original
    # used `np.frombuffer(data)`, making the comparison a tautology.
    a2 = np.frombuffer(dec)
    assert len(a2) == len(data)
    assert all(a2 == data)
@need_numcodecs
@given(st.binary())
def test_numcodec_roundtrip(data):
    "Round-trip bytes through the numcodecs LZMA wrapper."
    wrapper = NC(LZMA())
    restored = wrapper.decode(wrapper.encode(data))
    assert len(restored) == len(data)
    assert restored == data
@need_numcodecs
@given(st.binary())
def test_chain(data):
    # Useless but a test
    chained = Chain([LZMA(), GZ()])
    restored = chained.decode(chained.encode(data))
    assert len(restored) == len(data)
    assert restored == data
@need_numcodecs
def test_chain_config():
    "A Chain codec's config round-trips through get_codec."
    codec = Chain([LZMA(), GZ()])
    assert len(codec.codecs) == 2
    assert isinstance(codec.codecs[0], NC)
    assert isinstance(codec.codecs[1], GZ)

    cfg = codec.config()
    c2 = get_codec(Chain.NAME, cfg)
    # BUG FIX: check the *reconstructed* codec c2; the original
    # re-asserted on `codec`, so the round-trip was never verified.
    assert len(c2.codecs) == 2
    assert isinstance(c2.codecs[0], NC)
    assert isinstance(c2.codecs[1], GZ)
def test_is_not_numcodec():
    "A native binpickle codec is not detected as a numcodec."
    assert not numcodecs.is_numcodec(GZ())


@need_numcodecs
def test_is_numcodec():
    "A real numcodecs codec is detected as one."
    assert numcodecs.is_numcodec(LZ4())
| from pytest import raises | ||
| from binpickle.format import * | ||
def test_format_sizes():
    "Header and trailer structs are both 16 bytes on disk."
    assert HEADER_FORMAT.size == 16
    assert TRAILER_FORMAT.size == 16


def test_pack_default_header():
    "A default header encodes to exactly 16 bytes."
    assert len(FileHeader().encode()) == 16
def test_default_header_round_trip():
    "A default header decodes back to an equal, distinct object."
    original = FileHeader()
    encoded = original.encode()
    assert len(encoded) == 16
    decoded = FileHeader.decode(encoded)
    assert decoded is not original
    assert decoded == original


def test_size_round_trip():
    "The length field survives an encode/decode round trip."
    original = FileHeader(length=57)
    encoded = original.encode()
    assert len(encoded) == 16
    decoded = FileHeader.decode(encoded)
    assert decoded.length == 57
    assert decoded == original
def test_catch_bad_magic():
    "Decoding rejects a header whose magic bytes are wrong."
    with raises(ValueError) as exc:
        FileHeader.decode(b'BNPQ\x00\x00\x00\x00' + (b'\x00' * 8))
    assert 'magic' in str(exc.value)


def test_catch_bad_version():
    "Decoding rejects an unsupported version number."
    with raises(ValueError) as exc:
        FileHeader.decode(b'BPCK\x00\x02\x00\x00' + (b'\x00' * 8))
    assert 'version' in str(exc.value)


def test_catch_bad_padding():
    "Decoding rejects nonzero padding bytes."
    with raises(ValueError) as exc:
        FileHeader.decode(b'BPCK\x00\x01\x00\xff' + (b'\x00' * 8))
    assert 'padding' in str(exc.value)
+235
| import itertools as it | ||
| from tempfile import TemporaryDirectory | ||
| from pathlib import Path | ||
| import gc | ||
| import numpy as np | ||
| import pandas as pd | ||
| import pytest | ||
| from hypothesis import given, assume, settings | ||
| import hypothesis.strategies as st | ||
| from hypothesis.extra.numpy import arrays, scalar_dtypes | ||
| from binpickle.read import BinPickleFile, load | ||
| from binpickle.write import BinPickler, dump | ||
| from binpickle import codecs | ||
# Writer constructors and codec strategies shared across the R/W tests.
RW_CTORS = [BinPickler, BinPickler.mappable, BinPickler.compressed]
RW_CODECS = [st.just(None), st.builds(codecs.GZ)]

# Add Blosc-based configurations only when that backend is installed.
if codecs.Blosc.AVAILABLE:
    RW_CTORS.append(lambda f: BinPickler.compressed(f, codecs.Blosc('zstd', 5)))
    RW_CODECS.append(st.builds(codecs.Blosc))
    RW_CODECS.append(st.builds(codecs.Blosc, st.just('zstd')))

# Add numcodecs-based configurations only when numcodecs is installed.
if codecs.NC.AVAILABLE:
    import numcodecs
    RW_CTORS.append(lambda f: BinPickler.compressed(f, numcodecs.LZMA()))
    RW_CODECS.append(st.builds(codecs.NC, st.just(numcodecs.LZMA())))
    # also build a chain test
    RW_CTORS.append(lambda f: BinPickler.compressed(f, codecs.Chain([numcodecs.MsgPack(), codecs.GZ()])))

# Every writer constructor crossed with direct=False/True, consumed by
# pytest.mark.parametrize(RW_PARAMS, RW_CONFIGS) below.
RW_CONFIGS = it.product(
    RW_CTORS,
    [False, True]
)
RW_PARAMS = ['writer', 'direct']
@pytest.fixture
def rng():
    "Provide a fresh NumPy random generator for each test."
    generator = np.random.default_rng()
    return generator
def test_empty(tmp_path):
    "Write a file with nothing in it"
    file = tmp_path / 'data.bpk'
    with BinPickler(file) as w:
        w._finish_file()
    # NOTE(review): 33 appears to be the fixed header/trailer overhead
    # of an empty file — confirm against binpickle.format
    assert file.stat().st_size == 33
    with BinPickleFile(file) as bpf:
        assert len(bpf.entries) == 0
def test_write_buf(tmp_path, rng: np.random.Generator):
    "Write a file with a single array"
    file = tmp_path / 'data.bpk'
    original = rng.integers(0, 5000, 1024, dtype='i4')
    with BinPickler(file) as w:
        w._write_buffer(original)
        w._finish_file()

    with BinPickleFile(file, direct=True) as bpf:
        assert len(bpf.entries) == 1
        entry = bpf.entries[0]
        # uncompressed write: encoded and decoded sizes are equal
        assert entry.dec_length == original.nbytes
        assert entry.enc_length == original.nbytes
        buf = bpf._read_buffer(entry)
        assert buf.nbytes == entry.dec_length
        restored = np.frombuffer(buf, dtype='i4')
        assert len(restored) == len(original)
        assert all(restored == original)
        # drop the views before the direct-mode file is closed
        del restored
        del buf
@settings(deadline=None)
@given(st.lists(st.binary()),
       st.one_of(RW_CODECS))
def test_write_encoded_arrays(buffers, codec):
    """Write raw byte buffers through a codec and read them back.

    Fixes two issues: the parameter previously shadowed the ``arrays``
    strategy imported from ``hypothesis.extra.numpy``, and the
    ``finally`` block deleted ``dat`` even when an assertion fired
    before it was bound, masking the real failure with a NameError.
    """
    with TemporaryDirectory('.test', 'binpickle-') as path:
        file = Path(path) / 'data.bpk'
        with BinPickler.compressed(file, codec) as w:
            for buf in buffers:
                w._write_buffer(buf)
            w._finish_file()

        with BinPickleFile(file) as bpf:
            assert not bpf.find_errors()
            assert len(bpf.entries) == len(buffers)
            for e, buf in zip(bpf.entries, buffers):
                dat = None  # ensure bound before the try block
                try:
                    if codec is not None:
                        assert e.codec
                    assert e.dec_length == len(buf)
                    dat = bpf._read_buffer(e)
                    assert dat == buf
                finally:  # delete things to make failures clearer
                    del dat
                    del e
            gc.collect()
def test_pickle_array(tmp_path, rng: np.random.Generator):
    "Pickle a NumPy array"
    file = tmp_path / 'data.bpk'
    original = rng.integers(0, 5000, 1024, dtype='i4')
    with BinPickler(file) as w:
        w.dump(original)

    with BinPickleFile(file) as bpf:
        # presumably one entry for the array buffer plus one for the
        # pickle stream — confirm against binpickle.write
        assert len(bpf.entries) == 2
        restored = bpf.load()
        assert len(restored) == len(original)
        assert all(restored == original)
@pytest.mark.parametrize(RW_PARAMS, RW_CONFIGS)
def test_pickle_frame(tmp_path, rng: np.random.Generator, writer, direct):
    "Pickle a Pandas data frame"
    file = tmp_path / 'data.bpk'
    frame = pd.DataFrame({
        'key': np.arange(0, 5000),
        'count': rng.integers(0, 1000, 5000),
        'score': rng.normal(10, 2, 5000)
    })
    with writer(file) as w:
        w.dump(frame)

    with BinPickleFile(file, direct=direct) as bpf:
        assert not bpf.find_errors()
        result = bpf.load()
        print(result)
        assert all(result.columns == frame.columns)
        for col in result.columns:
            assert all(result[col] == frame[col])
        # release the frame before a direct-mode file closes
        del result
@pytest.mark.skipif(not codecs.NC.AVAILABLE, reason='numcodecs not available')
def test_pickle_frame_dyncodec(tmp_path, rng: np.random.Generator):
    "Pickle a data frame with a codec chosen per buffer."
    file = tmp_path / 'data.bpk'
    frame = pd.DataFrame({
        'key': np.arange(0, 5000, dtype='i4'),
        'count': rng.integers(0, 1000, 5000),
        'score': rng.normal(10, 2, 5000)
    })

    def codec(buf):
        # down-cast float64 buffers to f4 before compressing;
        # everything else gets plain Blosc
        obj = memoryview(buf).obj
        if isinstance(obj, np.ndarray) and obj.dtype == np.float64:
            print('compacting double array')
            return codecs.Chain([numcodecs.AsType('f4', 'f8'), codecs.Blosc('zstd', 9)])
        else:
            return codecs.Blosc('zstd', 9)

    with BinPickler.compressed(file, codec) as w:
        w.dump(frame)

    with BinPickleFile(file) as bpf:
        assert not bpf.find_errors()
        result = bpf.load()
        print(result)
        assert all(result.columns == frame.columns)
        assert all(result['key'] == frame['key'])
        assert all(result['count'] == frame['count'])
        # scores were stored as f4, so compare at f4 precision
        assert all(result['score'].astype('f4') == frame['score'].astype('f4'))
        del result
def test_dump_frame(tmp_path, rng: np.random.Generator):
    "Pickle a Pandas data frame"
    file = tmp_path / 'data.bpk'
    frame = pd.DataFrame({
        'key': np.arange(0, 5000),
        'count': rng.integers(0, 1000, 5000),
        'score': rng.normal(10, 2, 5000)
    })
    dump(frame, file)

    result = load(file)
    assert all(result.columns == frame.columns)
    for col in result.columns:
        assert all(result[col] == frame[col])
@settings(deadline=None)
@given(arrays(scalar_dtypes(), st.integers(500, 10000)))
def test_compress_many_arrays(a):
    """Pickle random NumPy arrays through a compressed writer.

    Fixes: the original also requested the function-scoped ``tmp_path``
    fixture, which it never used (it builds its own TemporaryDirectory)
    and which Hypothesis flags as unsafe under ``@given``; it also
    lacked the ``deadline=None`` setting its mappable twin uses.
    """
    assume(not any(np.isnan(a)))
    with TemporaryDirectory('.test', 'binpickle') as path:
        file = Path(path) / 'data.bpk'
        with BinPickler.compressed(file) as w:
            w.dump(a)

        with BinPickleFile(file) as bpf:
            assert not bpf.find_errors()
            assert len(bpf.entries) in (1, 2)
            a2 = bpf.load()
            assert len(a2) == len(a)
            assert all(a2 == a)
@settings(deadline=None)
@given(arrays(scalar_dtypes(), st.integers(500, 10000)))
def test_map_many_arrays(a):
    "Pickle random NumPy arrays"
    assume(not any(np.isnan(a)))
    with TemporaryDirectory('.test', 'binpickle') as path:
        file = Path(path) / 'data.bpk'
        with BinPickler.mappable(file) as w:
            w.dump(a)

        with BinPickleFile(file, direct=True) as bpf:
            assert not bpf.find_errors()
            assert len(bpf.entries) in (1, 2)
            restored = bpf.load()
            assert len(restored) == len(a)
            assert all(restored == a)
            # drop the view before the mapped file closes
            del restored
| import logging | ||
| import io | ||
| import zlib | ||
| import functools as ft | ||
| import numpy as np | ||
| from hypothesis import given, settings, HealthCheck | ||
| import hypothesis.strategies as st | ||
| import pytest | ||
| from binpickle.write import _align_pos, CKOut | ||
_log = logging.getLogger(__name__)


def _split_blocks(*args):
    # Proxy to the Blosc splitter; imported lazily so tests that use it
    # are skipped (not errored) when the optional backend is missing.
    blosc = pytest.importorskip('binpickle.codecs.blosc')
    return blosc._split_blocks(*args)
@given(st.integers(100, 10000000))
def test_align(n):
    "Aligned positions are >= the input and multiples of 1024."
    aligned = _align_pos(n, 1024)
    assert aligned >= n
    assert aligned % 1024 == 0
@given(st.binary())
def test_checksum_bytes(data):
    "A single write is passed through and checksummed."
    sink = io.BytesIO()
    cko = CKOut(sink)
    cko.write(data)
    assert sink.getbuffer() == data
    assert cko.bytes == len(data)
    assert cko.checksum == zlib.adler32(data)
@given(st.lists(st.binary(), min_size=1, max_size=10))
def test_checksum_multi_bytes(chunks):
    """Multiple writes accumulate bytes and checksum correctly.

    Uses ``b''.join`` (linear) instead of the original quadratic
    ``functools.reduce`` concatenation; also renames the parameter so
    it no longer reads like an array-of-arrays.
    """
    sink = io.BytesIO()
    cko = CKOut(sink)
    for chunk in chunks:
        cko.write(chunk)
    cat = b''.join(chunks)
    assert sink.getbuffer() == cat
    assert cko.bytes == len(cat)
    assert cko.checksum == zlib.adler32(cat)
def test_split_empty_block():
    "An empty buffer still yields a single (empty) block."
    blocks = _split_blocks(memoryview(b''), 10)
    assert len(blocks) == 1
    assert blocks[0] == b''


def test_split_one_block():
    "A buffer smaller than the block size stays whole."
    blocks = _split_blocks(memoryview(b'asdf'), 10)
    assert len(blocks) == 1
    assert blocks[0] == b'asdf'
def test_split_two_blocks():
    "A buffer of exactly two block sizes splits evenly."
    blocks = _split_blocks(memoryview(b'asdf'), 2)
    assert len(blocks) == 2
    assert blocks[0] == b'as'
    assert blocks[1] == b'df'
    assert blocks[0].nbytes == 2
    assert blocks[1].nbytes == 2


def test_split_blocks_mismatch():
    "A trailing partial block keeps the leftover bytes."
    blocks = _split_blocks(memoryview(b'asdfg'), 2)
    assert len(blocks) == 3
    assert blocks[0] == b'as'
    assert blocks[0].nbytes == 2
    assert blocks[1] == b'df'
    assert blocks[1].nbytes == 2
    assert blocks[2] == b'g'
    assert blocks[2].nbytes == 1
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(st.data())
def test_split_blocks(data):
    """Splitting arbitrary bytes preserves size bounds and content.

    Reassembles with ``b''.join`` (linear, accepts memoryviews)
    instead of the original quadratic ``functools.reduce``
    concatenation, and drops the ``len(b) <= bs`` assert that
    duplicated the ``nbytes`` bound for byte views.
    """
    bs = data.draw(st.integers(8, 4096))
    input = data.draw(st.binary(min_size=bs//2, max_size=bs*8))
    _log.info('input size %d, block size %d', len(input), bs)
    blocks = _split_blocks(memoryview(input), bs)
    _log.info('split into %d blocks', len(blocks))
    assert all(b.nbytes <= bs for b in blocks)
    assert sum(b.nbytes for b in blocks) == len(input)
    reconst = b''.join(blocks)
    assert len(reconst) == len(input)
    assert reconst == input
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(st.data())
def test_split_arrays(data):
    """Splitting a float array's buffer preserves bytes and values.

    Reassembles with ``b''.join`` (linear, accepts memoryviews)
    instead of the original quadratic ``functools.reduce``
    concatenation, and drops the weaker ``len(b) <= bs`` assert in
    favor of the ``nbytes`` bound.
    """
    bs = data.draw(st.integers(8, 4096))
    size = data.draw(st.integers(bs//8, bs*4))
    array = np.random.randn(size)
    input = memoryview(array)
    _log.info('input size %d (%d bytes), block size %d', len(input), input.nbytes, bs)
    blocks = _split_blocks(memoryview(input), bs)
    _log.info('split into %d blocks', len(blocks))
    assert all(b.nbytes <= bs for b in blocks)
    assert sum(b.nbytes for b in blocks) == input.nbytes
    reconst = b''.join(blocks)
    assert len(reconst) == input.nbytes
    rcv = memoryview(reconst).cast(input.format)
    assert rcv == input
    a2 = np.frombuffer(reconst, array.dtype)
    assert all(a2 == array)
| Metadata-Version: 2.1 | ||
| Name: binpickle | ||
| Version: 0.3.2 | ||
| Version: 0.3.3 | ||
| Summary: Efficient binary storage of ML models | ||
@@ -5,0 +5,0 @@ Home-page: https://binpickle.lenskit.org |
| LICENSE | ||
| MANIFEST.in | ||
| README.md | ||
@@ -23,2 +24,6 @@ pyproject.toml | ||
| binpickle/codecs/null.py | ||
| binpickle/codecs/numcodecs.py | ||
| binpickle/codecs/numcodecs.py | ||
| tests/test_codecs.py | ||
| tests/test_format.py | ||
| tests/test_rw.py | ||
| tests/test_util.py |
@@ -5,5 +5,5 @@ """ | ||
| __version__ = '0.3.2' | ||
| __version__ = '0.3.3' | ||
| from .write import dump, BinPickler # noqa: F401 | ||
| from .read import load, BinPickleFile # noqa: F401 |
+1
-1
| Metadata-Version: 2.1 | ||
| Name: binpickle | ||
| Version: 0.3.2 | ||
| Version: 0.3.3 | ||
| Summary: Efficient binary storage of ML models | ||
@@ -5,0 +5,0 @@ Home-page: https://binpickle.lenskit.org |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
50124
46.75%29
20.83%1137
64.31%