Latest Threat Research: SANDWORM_MODE: Shai-Hulud-Style npm Worm Hijacks CI Workflows and Poisons AI Toolchains. Details
Socket
Book a DemoSign in
Socket

binpickle

Package Overview
Dependencies
Maintainers
1
Versions
13
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

binpickle - pypi Package Compare versions

Comparing version
0.3.2
to
0.3.3
+1
MANIFEST.in
recursive-include tests *.py
import pytest
import numpy as np
from hypothesis import given, assume, settings
import hypothesis.strategies as st
from hypothesis.extra.numpy import arrays, integer_dtypes, floating_dtypes
from binpickle.codecs import *
# Pull in the numcodecs-backed codecs only when the optional dependency exists.
if NC.AVAILABLE:
    from numcodecs import LZ4, LZMA

# exclude numcodec from common tests
KNOWN_CODECS = [c for c in CODECS.values() if c.NAME != 'numcodec']

# Skip markers for tests that need optional compression backends.
need_blosc = pytest.mark.skipif(not Blosc.AVAILABLE, reason='Blosc not available')
need_numcodecs = pytest.mark.skipif(not NC.AVAILABLE, reason='numcodecs not available')
def test_make_codec_none():
    "make_codec(None) yields the Null codec."
    assert isinstance(make_codec(None), Null)


def test_make_codec_null_str():
    "The string 'null' names the Null codec."
    assert isinstance(make_codec('null'), Null)


def test_make_codec_gz_str():
    "The string 'gz' names the GZ codec."
    assert isinstance(make_codec('gz'), GZ)


def test_make_codec_return():
    "An existing codec instance is passed through unchanged."
    existing = GZ()
    assert make_codec(existing) is existing


@need_numcodecs
def test_make_codec_wrap():
    "A raw numcodecs codec gets wrapped in an NC adapter."
    raw = LZ4()
    wrapped = make_codec(raw)
    assert isinstance(wrapped, NC)
    assert wrapped.codec is raw


def test_make_codec_to_none():
    "Test internal-use none codec"
    # with null_as_none, both None and Null() collapse to None
    assert make_codec(None, null_as_none=True) is None
    assert make_codec(Null(), null_as_none=True) is None
def test_get_null_with_none():
    "A None name resolves to the Null codec."
    assert isinstance(get_codec(None, {}), Null)


def test_get_null():
    "'null' resolves to the Null codec."
    assert isinstance(get_codec('null', {}), Null)


def test_get_gz():
    "GZ defaults to compression level 9."
    c = get_codec('gz', {})
    assert isinstance(c, GZ)
    assert c.level == 9


def test_get_gz_level():
    "GZ honors an explicit compression level from config."
    c = get_codec('gz', {'level': 5})
    assert isinstance(c, GZ)
    assert c.level == 5


@need_blosc
def test_get_blosc():
    "Blosc defaults to compression level 9."
    c = get_codec('blosc', {})
    assert isinstance(c, Blosc)
    assert c.level == 9


@need_blosc
def test_get_blosc_lvl():
    "Blosc honors explicit compressor name and level from config."
    c = get_codec('blosc', {'name': 'zstd', 'level': 5})
    assert isinstance(c, Blosc)
    assert c.name == 'zstd'
    assert c.level == 5
@pytest.mark.parametrize('codec', KNOWN_CODECS)
@settings(deadline=500)
@given(st.binary())
def test_codec_roundtrip(codec, data):
    "Round-trip a codec"
    instance = codec()
    decoded = instance.decode(instance.encode(data))
    assert len(decoded) == len(data)
    assert decoded == data
@pytest.mark.parametrize('codec', KNOWN_CODECS)
@settings(deadline=500)
@given(arrays(st.one_of(integer_dtypes(), floating_dtypes()),
              st.integers(10, 10000)))
def test_codec_roundtrip_array(codec, data):
    "Round-trip a codec"
    # NaN never compares equal, so skip arrays that contain one
    assume(not any(np.isnan(data)))
    instance = codec()
    decoded = instance.decode(instance.encode(data))
    restored = np.frombuffer(decoded, dtype=data.dtype)
    assert len(restored) == len(data)
    assert all(restored == data)
@pytest.mark.parametrize('codec', KNOWN_CODECS)
def test_codec_decode_oversize(codec):
    "Test decoding data to an oversized bytearray"
    instance = codec()
    data = bytearray(np.random.randn(500))
    out = bytearray(len(data) * 2)  # deliberately larger than needed
    instance.decode_to(instance.encode(data), out)
    # the target is expected to end up exactly the decoded size
    assert len(out) == len(data)
    assert out == data
@need_blosc
def test_large_blosc_encode():
    "Test encoding Blosc data that needs to be split into multiple blocks."
    c = Blosc(blocksize=4096)
    data = np.random.randn(10000)
    enc = c.encode(data)
    dec = c.decode(enc)
    assert len(enc) < len(dec)  # we should have compressed
    assert len(dec) == data.nbytes
    assert dec == memoryview(data)
    # Reconstruct the array from the *decoded* bytes.  The original test
    # called np.frombuffer(data), rebuilding the array from the input and
    # making the remaining asserts compare `data` with itself.
    a2 = np.frombuffer(dec)
    assert len(a2) == len(data)
    assert all(a2 == data)
@need_numcodecs
@given(st.binary())
def test_numcodec_roundtrip(data):
    "Round-trip bytes through a wrapped numcodecs codec."
    wrapper = NC(LZMA())
    restored = wrapper.decode(wrapper.encode(data))
    assert len(restored) == len(data)
    assert restored == data


@need_numcodecs
@given(st.binary())
def test_chain(data):
    # Useless but a test
    chained = Chain([LZMA(), GZ()])
    restored = chained.decode(chained.encode(data))
    assert len(restored) == len(data)
    assert restored == data
@need_numcodecs
def test_chain_config():
    """Round-trip a Chain codec through its config dictionary.

    The original test re-checked ``codec`` after building ``c2``, so the
    reconstructed codec was never actually inspected; the post-round-trip
    assertions now target ``c2``.
    """
    codec = Chain([LZMA(), GZ()])
    assert len(codec.codecs) == 2
    assert isinstance(codec.codecs[0], NC)
    assert isinstance(codec.codecs[1], GZ)

    cfg = codec.config()
    c2 = get_codec(Chain.NAME, cfg)
    # verify the *reconstructed* codec, not the original
    assert len(c2.codecs) == 2
    assert isinstance(c2.codecs[0], NC)
    assert isinstance(c2.codecs[1], GZ)
def test_is_not_numcodec():
    "binpickle's own codecs are not detected as numcodecs."
    assert not numcodecs.is_numcodec(GZ())


@need_numcodecs
def test_is_numcodec():
    "Genuine numcodecs codecs are detected."
    assert numcodecs.is_numcodec(LZ4())
from pytest import raises
from binpickle.format import *
def test_format_sizes():
    "Header and trailer are both 16 bytes on disk."
    assert HEADER_FORMAT.size == 16
    assert TRAILER_FORMAT.size == 16


def test_pack_default_header():
    "A default header encodes to exactly 16 bytes."
    assert len(FileHeader().encode()) == 16


def test_default_header_round_trip():
    "A default header decodes back to an equal but distinct object."
    header = FileHeader()
    packed = header.encode()
    assert len(packed) == 16
    decoded = FileHeader.decode(packed)
    assert decoded is not header
    assert decoded == header


def test_size_round_trip():
    "The length field survives an encode/decode round trip."
    header = FileHeader(length=57)
    packed = header.encode()
    assert len(packed) == 16
    decoded = FileHeader.decode(packed)
    assert decoded.length == 57
    assert decoded == header


def test_catch_bad_magic():
    "Decoding rejects a header with the wrong magic bytes."
    with raises(ValueError) as exc:
        FileHeader.decode(b'BNPQ\x00\x00\x00\x00' + (b'\x00' * 8))
    assert 'magic' in str(exc.value)


def test_catch_bad_version():
    "Decoding rejects a header with an unknown version."
    with raises(ValueError) as exc:
        FileHeader.decode(b'BPCK\x00\x02\x00\x00' + (b'\x00' * 8))
    assert 'version' in str(exc.value)


def test_catch_bad_padding():
    "Decoding rejects a header with nonzero padding bytes."
    with raises(ValueError) as exc:
        FileHeader.decode(b'BPCK\x00\x01\x00\xff' + (b'\x00' * 8))
    assert 'padding' in str(exc.value)
import itertools as it
from tempfile import TemporaryDirectory
from pathlib import Path
import gc
import numpy as np
import pandas as pd
import pytest
from hypothesis import given, assume, settings
import hypothesis.strategies as st
from hypothesis.extra.numpy import arrays, scalar_dtypes
from binpickle.read import BinPickleFile, load
from binpickle.write import BinPickler, dump
from binpickle import codecs
# Writer constructors and codec strategies to exercise, extended with the
# optional backends when they are installed.
RW_CTORS = [BinPickler, BinPickler.mappable, BinPickler.compressed]
RW_CODECS = [st.just(None), st.builds(codecs.GZ)]

if codecs.Blosc.AVAILABLE:
    RW_CTORS.append(lambda f: BinPickler.compressed(f, codecs.Blosc('zstd', 5)))
    RW_CODECS.append(st.builds(codecs.Blosc))
    RW_CODECS.append(st.builds(codecs.Blosc, st.just('zstd')))

if codecs.NC.AVAILABLE:
    import numcodecs
    RW_CTORS.append(lambda f: BinPickler.compressed(f, numcodecs.LZMA()))
    RW_CODECS.append(st.builds(codecs.NC, st.just(numcodecs.LZMA())))
    # also build a chain test
    RW_CTORS.append(lambda f: BinPickler.compressed(f, codecs.Chain([numcodecs.MsgPack(), codecs.GZ()])))

# every writer constructor crossed with direct=False/True
RW_CONFIGS = it.product(RW_CTORS, [False, True])
RW_PARAMS = ['writer', 'direct']


@pytest.fixture
def rng():
    # fresh NumPy generator per test
    return np.random.default_rng()
def test_empty(tmp_path):
    "Write a file with nothing in it"
    file = tmp_path / 'data.bpk'
    with BinPickler(file) as w:
        w._finish_file()
    # an empty binpickle file has a fixed 33-byte size
    assert file.stat().st_size == 33
    with BinPickleFile(file) as bpf:
        assert len(bpf.entries) == 0
def test_write_buf(tmp_path, rng: np.random.Generator):
    "Write a file with a single array"
    file = tmp_path / 'data.bpk'
    original = rng.integers(0, 5000, 1024, dtype='i4')

    with BinPickler(file) as w:
        w._write_buffer(original)
        w._finish_file()

    with BinPickleFile(file, direct=True) as bpf:
        assert len(bpf.entries) == 1
        entry = bpf.entries[0]
        # no codec: encoded and decoded sizes both match the raw array
        assert entry.dec_length == original.nbytes
        assert entry.enc_length == original.nbytes
        buf = bpf._read_buffer(entry)
        assert buf.nbytes == entry.dec_length
        restored = np.frombuffer(buf, dtype='i4')
        assert len(restored) == len(original)
        assert all(restored == original)
        # drop the buffer views before the direct-mapped file closes
        del restored
        del buf
@settings(deadline=None)
@given(st.lists(st.binary()),
       st.one_of(RW_CODECS))
def test_write_encoded_arrays(arrays, codec):
    """Write several encoded buffers and verify each reads back intact."""
    with TemporaryDirectory('.test', 'binpickle-') as path:
        file = Path(path) / 'data.bpk'
        with BinPickler.compressed(file, codec) as w:
            for a in arrays:
                w._write_buffer(a)
            w._finish_file()

        with BinPickleFile(file) as bpf:
            assert not bpf.find_errors()
            assert len(bpf.entries) == len(arrays)
            for e, a in zip(bpf.entries, arrays):
                # Pre-bind `dat` so the finally block cannot raise
                # NameError (and mask the real failure) when an assertion
                # fires before the buffer is read.
                dat = None
                try:
                    if codec is not None:
                        assert e.codec
                    assert e.dec_length == len(a)
                    dat = bpf._read_buffer(e)
                    assert dat == a
                finally:  # delete things to make failures clearer
                    del dat
                    del e
            gc.collect()
def test_pickle_array(tmp_path, rng: np.random.Generator):
    "Pickle a NumPy array"
    file = tmp_path / 'data.bpk'
    original = rng.integers(0, 5000, 1024, dtype='i4')
    with BinPickler(file) as w:
        w.dump(original)
    with BinPickleFile(file) as bpf:
        # presumably one entry for the buffer and one for the pickle stream
        assert len(bpf.entries) == 2
        restored = bpf.load()
        assert len(restored) == len(original)
        assert all(restored == original)
@pytest.mark.parametrize(RW_PARAMS, RW_CONFIGS)
def test_pickle_frame(tmp_path, rng: np.random.Generator, writer, direct):
    "Pickle a Pandas data frame"
    file = tmp_path / 'data.bpk'
    df = pd.DataFrame({
        'key': np.arange(0, 5000),
        'count': rng.integers(0, 1000, 5000),
        'score': rng.normal(10, 2, 5000),
    })
    with writer(file) as w:
        w.dump(df)
    with BinPickleFile(file, direct=direct) as bpf:
        assert not bpf.find_errors()
        df2 = bpf.load()
        print(df2)
        assert all(df2.columns == df.columns)
        for col in df2.columns:
            assert all(df2[col] == df[col])
        # release the loaded frame before a direct-mapped file closes
        del df2
@pytest.mark.skipif(not codecs.NC.AVAILABLE, reason='numcodecs not available')
def test_pickle_frame_dyncodec(tmp_path, rng: np.random.Generator):
    "Pickle a frame using a per-buffer (callable) codec."
    file = tmp_path / 'data.bpk'
    df = pd.DataFrame({
        'key': np.arange(0, 5000, dtype='i4'),
        'count': rng.integers(0, 1000, 5000),
        'score': rng.normal(10, 2, 5000),
    })

    def codec(buf):
        # shrink float64 buffers to float32 before compressing
        obj = memoryview(buf).obj
        if isinstance(obj, np.ndarray) and obj.dtype == np.float64:
            print('compacting double array')
            return codecs.Chain([numcodecs.AsType('f4', 'f8'), codecs.Blosc('zstd', 9)])
        else:
            return codecs.Blosc('zstd', 9)

    with BinPickler.compressed(file, codec) as w:
        w.dump(df)

    with BinPickleFile(file) as bpf:
        assert not bpf.find_errors()
        df2 = bpf.load()
        print(df2)
        assert all(df2.columns == df.columns)
        assert all(df2['key'] == df['key'])
        assert all(df2['count'] == df['count'])
        # scores made an f8 -> f4 round trip, so compare at f4 precision
        assert all(df2['score'].astype('f4') == df['score'].astype('f4'))
        del df2
def test_dump_frame(tmp_path, rng: np.random.Generator):
    "Pickle a Pandas data frame"
    file = tmp_path / 'data.bpk'
    df = pd.DataFrame({
        'key': np.arange(0, 5000),
        'count': rng.integers(0, 1000, 5000),
        'score': rng.normal(10, 2, 5000),
    })
    # round-trip through the convenience dump/load functions
    dump(df, file)
    df2 = load(file)
    assert all(df2.columns == df.columns)
    for col in df2.columns:
        assert all(df2[col] == df[col])
@settings(deadline=None)
@given(arrays(scalar_dtypes(), st.integers(500, 10000)))
def test_compress_many_arrays(a):
    """Pickle random NumPy arrays through a compressed writer.

    The unused ``tmp_path`` fixture was removed: Hypothesis rejects
    function-scoped pytest fixtures under ``@given``, and the test makes
    its own TemporaryDirectory anyway.  ``deadline=None`` matches the
    sibling ``test_map_many_arrays`` to avoid deadline flakiness.
    """
    assume(not any(np.isnan(a)))
    with TemporaryDirectory('.test', 'binpickle') as path:
        file = Path(path) / 'data.bpk'
        with BinPickler.compressed(file) as w:
            w.dump(a)
        with BinPickleFile(file) as bpf:
            assert not bpf.find_errors()
            assert len(bpf.entries) in (1, 2)
            a2 = bpf.load()
            assert len(a2) == len(a)
            assert all(a2 == a)
@settings(deadline=None)
@given(arrays(scalar_dtypes(), st.integers(500, 10000)))
def test_map_many_arrays(a):
    "Pickle random NumPy arrays"
    assume(not any(np.isnan(a)))
    with TemporaryDirectory('.test', 'binpickle') as path:
        file = Path(path) / 'data.bpk'
        with BinPickler.mappable(file) as w:
            w.dump(a)
        with BinPickleFile(file, direct=True) as bpf:
            assert not bpf.find_errors()
            assert len(bpf.entries) in (1, 2)
            restored = bpf.load()
            assert len(restored) == len(a)
            assert all(restored == a)
            # release the mapped view before the file closes
            del restored
import logging
import io
import zlib
import functools as ft
import numpy as np
from hypothesis import given, settings, HealthCheck
import hypothesis.strategies as st
import pytest
from binpickle.write import _align_pos, CKOut
_log = logging.getLogger(__name__)


def _split_blocks(*args):
    # Late-bound proxy for the Blosc block splitter; skips the calling
    # test entirely when the Blosc backend is not installed.
    blosc = pytest.importorskip('binpickle.codecs.blosc')
    return blosc._split_blocks(*args)
@given(st.integers(100, 10000000))
def test_align(n):
    "Aligned positions are >= the input and multiples of the alignment."
    pos = _align_pos(n, 1024)
    assert pos >= n
    assert pos % 1024 == 0
@given(st.binary())
def test_checksum_bytes(data):
    "A single write is passed through, counted, and checksummed."
    sink = io.BytesIO()
    cko = CKOut(sink)
    cko.write(data)
    assert sink.getbuffer() == data
    assert cko.bytes == len(data)
    assert cko.checksum == zlib.adler32(data)


@given(st.lists(st.binary(), min_size=1, max_size=10))
def test_checksum_multi_bytes(arrays):
    "Multiple writes accumulate bytes and checksum as if concatenated."
    sink = io.BytesIO()
    cko = CKOut(sink)
    for chunk in arrays:
        cko.write(chunk)
    cat = ft.reduce(lambda b1, b2: b1 + b2, arrays)
    assert sink.getbuffer() == cat
    assert cko.bytes == len(cat)
    assert cko.checksum == zlib.adler32(cat)
def test_split_empty_block():
    "An empty buffer splits into a single empty block."
    blocks = _split_blocks(memoryview(b''), 10)
    assert len(blocks) == 1
    assert blocks[0] == b''


def test_split_one_block():
    "A buffer smaller than the block size stays one block."
    blocks = _split_blocks(memoryview(b'asdf'), 10)
    assert len(blocks) == 1
    assert blocks[0] == b'asdf'


def test_split_two_blocks():
    "An exact multiple of the block size splits evenly."
    blocks = _split_blocks(memoryview(b'asdf'), 2)
    assert len(blocks) == 2
    assert blocks[0] == b'as'
    assert blocks[1] == b'df'
    assert blocks[0].nbytes == 2
    assert blocks[1].nbytes == 2


def test_split_blocks_mismatch():
    "A trailing remainder becomes a short final block."
    blocks = _split_blocks(memoryview(b'asdfg'), 2)
    assert len(blocks) == 3
    assert blocks[0] == b'as'
    assert blocks[0].nbytes == 2
    assert blocks[1] == b'df'
    assert blocks[1].nbytes == 2
    assert blocks[2] == b'g'
    assert blocks[2].nbytes == 1
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(st.data())
def test_split_blocks(data):
    "Splitting arbitrary bytes preserves size, order, and content."
    bs = data.draw(st.integers(8, 4096))
    payload = data.draw(st.binary(min_size=bs // 2, max_size=bs * 8))
    _log.info('input size %d, block size %d', len(payload), bs)
    blocks = _split_blocks(memoryview(payload), bs)
    _log.info('split into %d blocks', len(blocks))
    # no block exceeds the requested block size, and nothing is lost
    assert all(b.nbytes <= bs for b in blocks)
    assert all(len(b) <= bs for b in blocks)
    assert sum(b.nbytes for b in blocks) == len(payload)
    reconst = ft.reduce(lambda buf, block: buf + block, blocks, bytes())
    assert len(reconst) == len(payload)
    assert reconst == payload
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(st.data())
def test_split_arrays(data):
    "Splitting a float array's buffer preserves its bytes and values."
    bs = data.draw(st.integers(8, 4096))
    size = data.draw(st.integers(bs // 8, bs * 4))
    array = np.random.randn(size)
    view = memoryview(array)
    _log.info('input size %d (%d bytes), block size %d', len(view), view.nbytes, bs)
    blocks = _split_blocks(memoryview(view), bs)
    _log.info('split into %d blocks', len(blocks))
    # no block exceeds the requested block size, and nothing is lost
    assert all(b.nbytes <= bs for b in blocks)
    assert all(len(b) <= bs for b in blocks)
    assert sum(b.nbytes for b in blocks) == view.nbytes
    reconst = ft.reduce(lambda buf, block: buf + block, blocks, bytes())
    assert len(reconst) == view.nbytes
    rcv = memoryview(reconst).cast(view.format)
    assert rcv == view
    a2 = np.frombuffer(reconst, array.dtype)
    assert all(a2 == array)
+1
-1
Metadata-Version: 2.1
Name: binpickle
Version: 0.3.2
Version: 0.3.3
Summary: Efficient binary storage of ML models

@@ -5,0 +5,0 @@ Home-page: https://binpickle.lenskit.org

LICENSE
MANIFEST.in
README.md

@@ -23,2 +24,6 @@ pyproject.toml

binpickle/codecs/null.py
binpickle/codecs/numcodecs.py
binpickle/codecs/numcodecs.py
tests/test_codecs.py
tests/test_format.py
tests/test_rw.py
tests/test_util.py

@@ -5,5 +5,5 @@ """

__version__ = '0.3.2'
__version__ = '0.3.3'
from .write import dump, BinPickler # noqa: F401
from .read import load, BinPickleFile # noqa: F401
Metadata-Version: 2.1
Name: binpickle
Version: 0.3.2
Version: 0.3.3
Summary: Efficient binary storage of ML models

@@ -5,0 +5,0 @@ Home-page: https://binpickle.lenskit.org