binpickle - Python Package: Comparing version 0.3.4 to 0.4.0a1
.copier-answers.yml
# Changes here will be overwritten by Copier
_commit: 0e64af4
_src_path: https://github.com/lenskit/lk-project-template
package_name: binpickle
project_descr: Optimized format for pickling binary data.
project_name: binpickle
project_title: BinPickle
require_lint: true
start_year: 2020
* text=auto
*.sh text eol=lf
*.bat text eol=crlf
*.cmd text eol=crlf
name: Validate Source Rules
on:
push:
branches:
- main
pull_request:
concurrency:
group: check-${{github.ref}}
cancel-in-progress: true
jobs:
lint:
name: Check Source Style
runs-on: ubuntu-latest
steps:
- name: 📥 Check out source code
uses: actions/checkout@v2
with:
fetch-depth: 0
- name: 🐍 Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.11"
cache: 'pip'
- name: 🛠️ Install tools
run: |
pip install ruff
- name: 🪮 Check source code formatting
id: format
run: |
if ruff format --diff $PKG_DIR; then
echo passed=yes >>"$GITHUB_OUTPUT"
else
echo passed=no >>"$GITHUB_OUTPUT"
echo "::error::source code not formatted"
fi
env:
PKG_DIR: binpickle
- name: 🐜 Check source code lint rules
id: lint
run: |
if ruff check --output-format=github $PKG_DIR; then
echo passed=yes >>"$GITHUB_OUTPUT"
else
echo passed=no >>"$GITHUB_OUTPUT"
echo "::error::source code lint check failed"
fi
env:
PKG_DIR: binpickle
- name: 🧾 Checking results
run: |
if [ "$FMT_PASSED" = no ]; then
echo "::error::format failed, failing build"
exit 1
fi
if [ "$LINT_PASSED" = no ]; then
if [ "$LINT_REQUIRED" = true ]; then
echo "::error::lint failed, failing build"
exit 2
else
echo "::error::lint failed but non-mandatory"
fi
fi
env:
FMT_PASSED: ${{ steps.format.outputs.passed }}
LINT_PASSED: ${{ steps.lint.outputs.passed }}
LINT_REQUIRED: true
name: Test and Package
on:
push:
branches:
- main
release:
types: [created,published]
pull_request:
concurrency:
group: test-${{github.ref}}
cancel-in-progress: true
jobs:
test:
name: Test with Python ${{matrix.python}} on ${{matrix.platform}}
runs-on: ${{matrix.platform}}-latest
strategy:
matrix:
platform:
- macos
- windows
- ubuntu
python:
- "3.10"
- "3.11"
- "3.12"
exclude:
- platform: macos
python: 3.9
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{matrix.python}}
- name: Set up dependencies
run: |
pip install -e '.[test]'
- name: Run tests
run: python -m pytest --cov=binpickle --cov-report=xml tests
- name: Save test results
uses: lenskit/lkbuild/actions/save-test-results@main
with:
artifact-name: test-${{matrix.platform}}-py${{matrix.python}}
report:
name: Process test results
runs-on: ubuntu-latest
needs: [test]
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Report test results
uses: lenskit/lkbuild/actions/report-test-results@main
sdist:
name: Build Source Packages
runs-on: ubuntu-latest
needs: [test]
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Fetch Git tags
run: git fetch --tags
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: "3.10"
- name: Install Python deps
run: pip install -U build
- name: Build distribution
run: python -m build
- name: Save archive
uses: actions/upload-artifact@v1
with:
name: pypi-pkgs
path: dist
- name: List dist dir
run: ls -R dist
- name: Publish PyPI packages
if: github.event_name == 'release'
run: |
twine upload dist/*
shell: bash
env:
TWINE_NON_INTERACTIVE: y
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.TWINE_TOKEN }}
{
"mypy-type-checker.reportingScope": "workspace",
"[python]": {
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.formatOnSave": true,
},
}
Metadata-Version: 2.1
Name: binpickle
Version: 0.4.0a1
Summary: Optimized format for pickling binary data.
Author-email: Michael Ekstrand <mdekstrand@drexel.edu>
License: Copyright (c) 2020–2023 Boise State University
Copyright (c) 2023 Michael Ekstrand
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
> The above copyright notice and this permission notice shall be included in
> all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Project-URL: Homepage, https://binpickle.lenskit.org
Project-URL: GitHub, https://github.com/lenskit/binpickle
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Operating System :: OS Independent
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE.md
Requires-Dist: msgpack>=1.0
Requires-Dist: numcodecs>=0.12
Requires-Dist: typing-extensions~=4.8
Provides-Extra: dev
Requires-Dist: setuptools>=64; extra == "dev"
Requires-Dist: setuptools_scm>=8; extra == "dev"
Requires-Dist: ruff; extra == "dev"
Requires-Dist: mypy~=1.5; extra == "dev"
Requires-Dist: copier; extra == "dev"
Requires-Dist: sphinx-autobuild; extra == "dev"
Requires-Dist: humanize~=4.0; extra == "dev"
Requires-Dist: msgpack-types; extra == "dev"
Requires-Dist: pandas-stubs; extra == "dev"
Provides-Extra: test
Requires-Dist: pytest>=5; extra == "test"
Requires-Dist: pytest-cov; extra == "test"
Requires-Dist: hypothesis>=6; extra == "test"
Requires-Dist: pandas>=1.4; extra == "test"
Requires-Dist: numpy>=1.22; extra == "test"
Provides-Extra: doc
Requires-Dist: sphinx>=4.2; extra == "doc"
Requires-Dist: sphinxext-opengraph>=0.5; extra == "doc"
Requires-Dist: furo; extra == "doc"
# BinPickle - efficient binary pickled data
[![PyPI version](https://badge.fury.io/py/binpickle.svg)](https://badge.fury.io/py/binpickle)
![Test and Build](https://github.com/lenskit/binpickle/workflows/Test%20and%20Package/badge.svg)
[![codecov](https://codecov.io/gh/lenskit/binpickle/branch/main/graph/badge.svg)](https://codecov.io/gh/lenskit/binpickle)
This package uses the new Pickle Protocol 5 added in Python 3.8 to efficiently
serialize large objects, particularly from scientific Python packages, to an
on-disk format. This format is designed to support two use cases:
1. Serializing data-intensive statistical models in a memory-mappable format so
multiple processes can share the same (read-only) model memory.
2. Serializing data-intensive statistical models with good compression for long-term
storage and cross-machine transportation.
BinPickle does this by using Pickle 5's out-of-band buffer serialization support to
write buffers uncompressed and page-aligned for memory mapping (use case 1) or with
per-buffer efficient compression with libraries like Blosc (use case 2).
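The out-of-band mechanism BinPickle builds on can be seen with the standard library alone. Below is a minimal sketch of Pickle Protocol 5 with a `buffer_callback`, for illustration only; it is not BinPickle's actual file format:

```python
import pickle

# Objects that expose large buffers hand them to buffer_callback instead of
# embedding them in the pickle byte stream.
data = bytearray(b"x" * 4096)  # stand-in for a large binary buffer

buffers = []
payload = pickle.dumps(
    pickle.PickleBuffer(data),
    protocol=5,
    buffer_callback=buffers.append,  # collect buffers out-of-band
)

# The pickle stream itself stays tiny; the 4 KiB live in `buffers`, where a
# writer like BinPickle can store them page-aligned or compressed.
print(len(payload) < 100)        # True
print(buffers[0].raw().nbytes)   # 4096

# Reading back: supply the same buffers to the unpickler.
restored = pickle.loads(payload, buffers=buffers)
print(bytes(restored) == bytes(data))  # True
```

This is what lets BinPickle choose, per buffer, between uncompressed page-aligned storage (for memory mapping) and per-buffer compression.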
## Format Stability
We do **not** yet guarantee the stability of the BinPickle format. We will avoid gratuitous changes,
but BinPickle 1.0 will be the first with a stability guarantee.
## Acknowledgements
This material is based upon work supported by the National Science Foundation under
Grant No. IIS 17-51278. Any opinions, findings, and conclusions or recommendations
expressed in this material are those of the author(s) and do not necessarily reflect
the views of the National Science Foundation. This page has not been approved by
Boise State University and does not reflect official university positions.
msgpack>=1.0
numcodecs>=0.12
typing-extensions~=4.8
[dev]
setuptools>=64
setuptools_scm>=8
ruff
mypy~=1.5
copier
sphinx-autobuild
humanize~=4.0
msgpack-types
pandas-stubs
[doc]
sphinx>=4.2
sphinxext-opengraph>=0.5
furo
[test]
pytest>=5
pytest-cov
hypothesis>=6
pandas>=1.4
numpy>=1.22
.copier-answers.yml
.editorconfig
.gitattributes
.gitignore
.readthedocs.yml
LICENSE.md
README.md
codecov.yml
conftest.py
pyproject.toml
.github/workflows/check-sources.yml
.github/workflows/test.yml
.vscode/settings.json
binpickle/__init__.py
binpickle/_util.py
binpickle/encode.py
binpickle/errors.py
binpickle/format.py
binpickle/read.py
binpickle/write.py
binpickle.egg-info/PKG-INFO
binpickle.egg-info/SOURCES.txt
binpickle.egg-info/dependency_links.txt
binpickle.egg-info/requires.txt
binpickle.egg-info/top_level.txt
docs/conf.py
docs/format.rst
docs/index.rst
docs/read.rst
docs/write.rst
docs/_templates/base.html
stubs/numcodecs/__init__.pyi
stubs/numcodecs/abc.pyi
stubs/numcodecs/registry.pyi
tests/test_file_info.py
tests/test_format.py
tests/test_rw.py
tests/test_util.py
tests/test_validation.py
"""
Internal utility functions for Binpickle.
"""
from __future__ import annotations
from typing import Optional, Any
import hashlib
from typing_extensions import Buffer
naturalsize: Optional[Any]
try:
from humanize import naturalsize
except ImportError:
naturalsize = None
def human_size(bytes: int | float) -> str:
if naturalsize:
return naturalsize(bytes, binary=True, format="%.2f")
else:
return "{:.2f} MiB".format(bytes / (1024 * 1024))
def hash_buffer(buf: Buffer) -> bytes:
if not isinstance(buf, memoryview):
buf = memoryview(buf)
return hashlib.sha256(buf).digest()
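This release replaces Adler-32 checksums with SHA-256 digests throughout. What `hash_buffer` computes can be reproduced with the standard library alone; `sha256_of_buffer` below is a hypothetical stand-alone name, not an import from binpickle:

```python
import hashlib

def sha256_of_buffer(buf) -> bytes:
    # Accept any buffer-protocol object (bytes, bytearray, array views, ...)
    # by viewing it as a memoryview before hashing, as hash_buffer does.
    if not isinstance(buf, memoryview):
        buf = memoryview(buf)
    return hashlib.sha256(buf).digest()

# The digest depends only on the underlying bytes, not the container type.
assert sha256_of_buffer(b"binpickle") == sha256_of_buffer(bytearray(b"binpickle"))
assert len(sha256_of_buffer(b"")) == 32  # SHA-256 digests are 32 bytes
```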
"""
Support for encoding and decoding.
"""
from __future__ import annotations
from typing import Optional, TypeAlias, Callable, overload
from typing_extensions import Buffer
from numcodecs.abc import Codec
from numcodecs.registry import get_codec
from binpickle.format import CodecSpec
CodecFunc: TypeAlias = Callable[[Buffer], Codec | str | CodecSpec | None]
CodecArg: TypeAlias = Codec | str | CodecSpec | CodecFunc
ResolvedCodec: TypeAlias = Codec | CodecFunc
@overload
def resolve_codec(codec: CodecSpec) -> Codec:
...
@overload
def resolve_codec(codec: CodecArg) -> ResolvedCodec:
...
@overload
def resolve_codec(codec: CodecArg, buf: Buffer) -> Codec | None:
...
def resolve_codec(codec: CodecArg, buf: Optional[Buffer] = None) -> ResolvedCodec | None:
"""
Resolve a codec arg into an instantiated codec.
"""
if isinstance(codec, str):
return resolve_codec({"id": codec})
elif isinstance(codec, dict):
return get_codec(codec)
elif isinstance(codec, Codec):
return codec
elif hasattr(codec, "__call__"):
if buf is None:
return codec
else:
spec = codec(buf)
if spec is None:
return None
else:
return resolve_codec(spec, buf)
else:
raise TypeError(f"invalid codec argument {type(codec)}")
class BinPickleError(Exception):
"""
Base class for Binpickle errors.
"""
class FormatError(BinPickleError):
"""
The Binpickle file is invalid.
"""
class IntegrityError(BinPickleError):
"""
The Binpickle file failed an integrity check.
"""
{% extends '!base.html' %}
{% block theme_scripts %}
<script data-goatcounter="https://binpickle.goatcounter.com/count"
async src="//gc.zgo.at/count.js"></script>
{% endblock %}
Copyright (c) 2020–2023 Boise State University
Copyright (c) 2023 Michael Ekstrand
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
> The above copyright notice and this permission notice shall be included in
> all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
from . import abc
from . import registry
from abc import ABC
from typing_extensions import Buffer, Optional, Self
class Codec(ABC):
codec_id: Optional[str]
def encode(self, buf: Buffer) -> Buffer: ...
def decode(self, buf: Buffer, out: Optional[Buffer] = None) -> Buffer: ...
def get_config(self) -> dict: ...
@classmethod
def from_config(cls, cfg: dict) -> Self: ...
from .abc import Codec
codec_registry: dict[str, Codec]
def get_codec(config: dict) -> Codec: ...
from pathlib import Path
from binpickle import file_info
from binpickle.read import FileStatus
from binpickle.write import dump
def test_missing_file(tmp_path: Path):
file = tmp_path / "data.bpk"
info = file_info(file)
assert info.status == FileStatus.MISSING
assert not info.is_valid
def test_empty_file(tmp_path: Path):
file = tmp_path / "data.bpk"
file.write_bytes(b"")
info = file_info(file)
assert info.status == FileStatus.INVALID
assert not info.is_valid
def test_invalid_file(tmp_path: Path):
file = tmp_path / "data.bpk"
file.write_bytes(b"0" * 4096)
info = file_info(file)
assert info.status == FileStatus.INVALID
assert not info.is_valid
def test_valid_file(tmp_path: Path):
file = tmp_path / "data.bpk"
dump(None, file)
info = file_info(file)
assert info.status == FileStatus.BINPICKLE
assert info.is_valid
assert info.size == file.stat().st_size
import os
import logging
import numpy as np
import pandas as pd
import pytest
from binpickle import dump, BinPickleFile
from binpickle.errors import IntegrityError
_log = logging.getLogger(__name__)
def test_verify_index(tmp_path, rng: np.random.Generator):
"Index hash mismatch should fail"
file = tmp_path / "data.bpk"
df = pd.DataFrame(
{
"key": np.arange(0, 5000),
"count": rng.integers(0, 1000, 5000),
"score": rng.normal(10, 2, 5000),
}
)
dump(df, file, codecs=["lz4"])
# corrupt the file
stat = os.stat(file)
_log.info("%s: length %d", file, stat.st_size)
with open(file, "r+b") as f:
f.seek(stat.st_size - 2)
f.write(b"XX")
# try to read the file
with pytest.raises(IntegrityError, match=r"incorrect hash"):
with BinPickleFile(file) as _bpf:
pass
def test_verify_buffer(tmp_path, rng: np.random.Generator):
"Corrupt buffer should fail hash."
file = tmp_path / "data.bpk"
df = pd.DataFrame(
{
"key": np.arange(0, 5000),
"count": rng.integers(0, 1000, 5000),
"score": rng.normal(10, 2, 5000),
}
)
dump(df, file, codecs=["lz4"])
# corrupt the file
stat = os.stat(file)
_log.info("%s: length %d", file, stat.st_size)
with open(file, "r+b") as f:
f.seek(32)
f.write(b"XXXXXXXX")
# try to read the file
with BinPickleFile(file) as bpf:
with pytest.raises(IntegrityError, match=r"incorrect hash"):
bpf.load()
+16
-4
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
insert_final_newline = true
trim_trailing_whitespace = true
[{*.json,*.yml,*.yaml,*.yml.jinja}]
indent_size = 2
[*.toml]
indent_size = 2
[*.sh]
end_of_line = lf
[*.{bat,cmd}]
end_of_line = crlf
[*.md]
trim_trailing_whitespace = false
+48
-8

@@ -1,10 +0,50 @@

/.vscode
/build
/dist
/.hypothesis
*.egg-info/
*.dist-info/
.coverage
# log and debug outputs
*.log
*.pdb
*.prof
*.lprof
emissions.csv
intel_power_gadget_log.csv
.coverage*
coverage.xml
cov-reports/
test-logs/
htmlcov/
# caches and working directories
__pycache__/
*.pyc
__pycache__
.ipynb_checkpoints/
dask-worker-space/
.idea/
.*_cache/
.hypothesis/
.tox/
.vagrant/
.venv/
scratch/
# build outputs
build/
dist/
*.egg-info
*.pyd
*.so
*.dll
*.exp
*.lib
*.o
*.obj
# environment locks that aren't committed
/*env*.yml
conda-lock.yml
*.lock
*.lock.yml
# Editor and OS cruft
.DS_Store
._.DS_Store
*~
*.tmp
.vs/
version: 2
build:
os: ubuntu-22.04
tools:
python: "3.11"
sphinx:

@@ -7,3 +12,2 @@ configuration: docs/conf.py

python:
version: 3.8
install:

@@ -10,0 +14,0 @@ - method: pip

@@ -5,5 +5,13 @@ """

__version__ = '0.3.4'
from importlib.metadata import version, PackageNotFoundError
from .write import dump, BinPickler # noqa: F401
from .read import load, BinPickleFile # noqa: F401
from .write import dump, BinPickler
from .read import load, BinPickleFile, file_info
try:
__version__ = version("binpickle")
except PackageNotFoundError:
# package is not installed
pass
__all__ = ["dump", "BinPickler", "load", "BinPickleFile", "file_info"]

@@ -5,12 +5,27 @@ """

from dataclasses import dataclass, field, fields
import struct
from typing import NamedTuple
from typing import TypeAlias
MAGIC = b'BPCK'
VERSION = 1
HEADER_FORMAT = struct.Struct('!4sHHq')
TRAILER_FORMAT = struct.Struct('!QLL')
from binpickle.errors import FormatError
CodecSpec: TypeAlias = dict[str, str | bool | int | float | None]
"""
Type of codec specification dictionaries, to be passed to
:func:`numcodecs.registry.get_codec`.
"""
class FileHeader(NamedTuple):
BufferTypeInfo: TypeAlias = tuple[str, str, tuple[int, ...]]
"""
Type of buffer type (and size/shape) information.
"""
MAGIC = b"BPCK"
VERSION = 2
HEADER_FORMAT = struct.Struct("!4sHHq")
TRAILER_FORMAT = struct.Struct("!QL32s")
@dataclass
class FileHeader:
"""

@@ -20,7 +35,10 @@ File header for a BinPickle file. The header is a 16-byte sequence containing the

1. File version (2 bytes, big-endian). Currently only version 1 exists.
2. Reserved (2 bytes). Set to 0.
1. File version (2 bytes, big-endian).
2. Flags (2 bytes). Currently no flags are defined, so this is set to 0.
3. File length (8 bytes, big-endian). Length is signed; if the file length is not known,
this field is set to -1.
"""
SIZE = HEADER_FORMAT.size
version: int = VERSION

@@ -36,11 +54,14 @@ "The NumPy file version."

@classmethod
def decode(cls, buf, *, verify=True):
def decode(cls, buf: bytes, *, verify=True):
"Decode a file header from bytes."
m, v, pad, off = HEADER_FORMAT.unpack(buf)
if len(buf) != HEADER_FORMAT.size:
raise FormatError("incorrect header length")
m, v, flags, off = HEADER_FORMAT.unpack(buf)
if verify and m != MAGIC:
raise ValueError('invalid magic {}'.format(m))
raise FormatError("invalid magic {}".format(m))
if verify and v != VERSION:
raise ValueError('invalid version {}'.format(v))
if verify and pad != 0:
raise ValueError('invalid padding')
raise FormatError("invalid version {}".format(v))
if verify and flags != 0:
raise FormatError("unsupported flags")
return cls(v, off)

@@ -58,3 +79,3 @@

elif self.length > 0:
raise ValueError('file size {} not enough for BinPickle'.format(self.length))
raise FormatError("file size {} not enough for BinPickle".format(self.length))
else:

@@ -64,5 +85,6 @@ return None # We do not know the file size

class FileTrailer(NamedTuple):
@dataclass
class FileTrailer:
"""
File trailer for a BinPickle file. The trailer is a 16-byte sequence that tells the
File trailer for a BinPickle file. The trailer is a 44-byte sequence that tells the
reader where to find the rest of the binpickle data. It consists of the following

@@ -73,12 +95,14 @@ fields:

2. Index length (4 bytes, big-endian). The number of bytes in the index.
3. Index checksum (4 bytes, big-endian). The Adler32 checksum of the index data.
3. Index digest (32 bytes). The SHA256 digest of the index data.
"""
SIZE = TRAILER_FORMAT.size
offset: int
length: int
checksum: int
hash: bytes
def encode(self):
"Encode the file trailer as bytes."
return TRAILER_FORMAT.pack(self.offset, self.length, self.checksum)
return TRAILER_FORMAT.pack(self.offset, self.length, self.hash)

@@ -88,10 +112,12 @@ @classmethod

"Decode a file trailer from bytes."
o, l, c = TRAILER_FORMAT.unpack(buf)
return cls(o, l, c)
off, len, ck = TRAILER_FORMAT.unpack(buf)
return cls(off, len, ck)
class IndexEntry(NamedTuple):
@dataclass
class IndexEntry:
"""
Index entry for a buffer in the BinPickle index.
"""
offset: int

@@ -103,10 +129,12 @@ "The position in the file where the buffer begins (bytes)."

"The decoded length of the buffer in bytes."
checksum: int
"The Adler-32 checksum of the encoded buffer data."
codec: tuple = None
"The codec used to encode the buffer, or None."
hash: bytes
"The SHA-256 checksum of the encoded buffer data."
info: BufferTypeInfo | None
"Type information for the buffer (if available)."
codecs: list[CodecSpec] = field(default_factory=list)
"The sequence of codecs used to encode the buffer."
def to_repr(self):
"Convert an index entry to its MsgPack-compatible representation"
return dict((k, getattr(self, k)) for k in self._fields)
return dict((f.name, getattr(self, f.name)) for f in fields(self))

@@ -113,0 +141,0 @@ @classmethod

@@ -0,10 +1,18 @@

from dataclasses import dataclass
from enum import Enum
import hashlib
import mmap
import logging
import io
from zlib import adler32
from os import PathLike
from typing import Optional
from typing_extensions import Buffer
import pickle
import msgpack
from .compat import pickle
from binpickle.encode import resolve_codec
from binpickle.errors import BinPickleError, FormatError, IntegrityError
from .format import FileHeader, IndexEntry, FileTrailer
from .codecs import get_codec
from ._util import hash_buffer

@@ -14,2 +22,18 @@ _log = logging.getLogger(__name__)

class FileStatus(Enum):
MISSING = 0
INVALID = 1
BINPICKLE = 2
@dataclass
class BPKInfo:
status: FileStatus
size: int
@property
def is_valid(self):
return self.status == FileStatus.BINPICKLE
class BinPickleFile:

@@ -27,11 +51,23 @@ """

:meth:`close` is called.
verify(bool):
If ``True`` (the default), verify file checksums while reading.
"""
def __init__(self, filename, *, direct=False):
filename: str | PathLike
direct: bool
verify: bool
header: FileHeader
trailer: FileTrailer
_map: Optional[mmap.mmap]
_mv: Optional[memoryview]
_index_buf: Optional[memoryview]
entries: list[IndexEntry]
def __init__(self, filename, *, direct: bool = False, verify: bool = True):
self.filename = filename
self.direct = direct
with open(filename, 'rb') as bpf:
self.verify = verify
with open(filename, "rb") as bpf:
self.header = FileHeader.read(bpf)
self._map = mmap.mmap(bpf.fileno(), self.header.length,
access=mmap.ACCESS_READ)
self._map = mmap.mmap(bpf.fileno(), self.header.length, access=mmap.ACCESS_READ)
self._mv = memoryview(self._map)

@@ -47,3 +83,3 @@ self._read_index()

def load(self):
def load(self) -> object:
"""

@@ -53,6 +89,7 @@ Load the object from the binpickle file.

if not self.entries:
raise ValueError('empty pickle file has no objects')
raise ValueError("empty pickle file has no objects")
p_bytes = self._read_buffer(self.entries[-1], direct=True)
_log.debug('unpickling %d bytes and %d buffers',
len(p_bytes), len(self.entries) - 1)
_log.debug(
"unpickling %d bytes and %d buffers", memoryview(p_bytes).nbytes, len(self.entries) - 1
)

@@ -63,3 +100,8 @@ buf_gen = (self._read_buffer(e) for e in self.entries[:-1])

def find_errors(self):
@property
def is_mappable(self) -> bool:
"Query whether this file can be memory-mapped."
return all(not e.codecs for e in self.entries)
def find_errors(self) -> list[str]:
"""

@@ -71,9 +113,10 @@ Verify binpickle data structure validity. If the file is invalid, returns

invalid msgpack formats in the index won't be detected here. This method checks
buffer checksums, offset overlaps, and such.
buffer hashes, offset overlaps, and such.
"""
errors = []
assert self._index_buf is not None, "file not loaded"
i_sum = adler32(self._index_buf)
if i_sum != self.trailer.checksum:
errors.append(f'invalid index checksum ({i_sum} != {self.trailer.checksum})')
i_sum = hashlib.sha256(self._index_buf).digest()
if i_sum != self.trailer.hash:
errors.append("index hash mismatch")

@@ -83,14 +126,14 @@ position = 16

if e.offset < position:
errors.append(f'entry {i}: offset {e.offset} before expected start {position}')
errors.append(f"entry {i}: offset {e.offset} before expected start {position}")
buf = self._read_buffer(e, direct=True)
ndec = len(buf)
ndec = memoryview(buf).nbytes
if ndec != e.dec_length:
errors.append(f'entry {i}: decoded to {ndec} bytes, expected {e.dec_length}')
cks = adler32(self._read_buffer(e, direct=True, decode=False))
if cks != e.checksum:
errors.append('entry {i}: invalid checksum ({cks} != {e.checksum}')
errors.append(f"entry {i}: decoded to {ndec} bytes, expected {e.dec_length}")
cks = hashlib.sha256(self._read_buffer(e, direct=True, decode=False)).digest()
if cks != e.hash:
errors.append("entry {i}: invalid digest")
return errors
def close(self):
def close(self) -> None:
"""

@@ -106,9 +149,10 @@ Close the BinPickle file. If the file is in direct mode, all

def _read_index(self):
def _read_index(self) -> None:
tpos = self.header.trailer_pos()
if tpos is None:
raise ValueError('no file length, corrupt binpickle file?')
raise FormatError("no file length, corrupt binpickle file?")
assert self._mv is not None, "file not open"
buf = self._mv[tpos:]
assert len(buf) == 16
assert len(buf) == 44
self.trailer = FileTrailer.decode(buf)

@@ -119,6 +163,17 @@

self._index_buf = self._mv[i_start:i_end]
try:
self._verify_buffer(self._index_buf, self.trailer.hash, "index")
except Exception as e:
self._index_buf.release()
self._index_buf = None
raise e
self.entries = [IndexEntry.from_repr(e) for e in msgpack.unpackb(self._index_buf)]
_log.debug('read %d entries from file', len(self.entries))
_log.debug("read %d entries from file", len(self.entries))
def _read_buffer(self, entry: IndexEntry, *, direct=None, decode=True):
def _read_buffer(
self, entry: IndexEntry, *, direct: Optional[bool] = None, decode: bool = True
) -> Buffer:
assert self._mv is not None, "file not open"
assert self._map is not None, "file not open"
start = entry.offset

@@ -130,18 +185,35 @@ length = entry.enc_length

if decode and entry.codec:
name, cfg = entry.codec
_log.debug('decoding %d bytes from %d with %s', length, start, name)
out = bytearray(entry.dec_length)
codec = get_codec(name, cfg)
codec.decode_to(self._mv[start:end], out)
buf = self._mv[start:end]
try:
self._verify_buffer(buf, entry.hash)
except Exception as e:
# make sure we release the buffer, even if it's captured by the stack trace
buf.release()
raise e
_log.debug("decoding %d bytes from %d with %s", length, start, entry.codecs)
if decode and entry.codecs:
codecs = [resolve_codec(c) for c in entry.codecs]
out: Buffer = buf
for codec in codecs[::-1]:
out = codec.decode(out)
return out
if direct:
_log.debug('mapping %d bytes from %d', length, start)
return self._mv[start:end]
_log.debug("mapping %d bytes from %d", length, start)
return buf
else:
_log.debug('copying %d bytes from %d', length, start)
return self._map[start:end]
_log.debug("copying %d bytes from %d", length, start)
return buf.tobytes()
def _verify_buffer(self, buf: memoryview, hash: bytes, msg: str = "buffer"):
if self.verify:
_log.debug("verifying %s", msg)
bhash = hash_buffer(buf)
if bhash != hash:
raise IntegrityError(f"{msg} has incorrect hash, corrupt file?")
def load(file):
def load(file: str | PathLike) -> object:
"""

@@ -156,1 +228,16 @@ Load an object from a BinPickle file.

return bpf.load()
def file_info(file: str | PathLike) -> BPKInfo:
"""
Test whether a file is a BinPickle file, and if so, return basic information
about it.
"""
try:
with open(file, "rb") as f:
info = FileHeader.read(f)
return BPKInfo(FileStatus.BINPICKLE, info.length)
except FileNotFoundError:
return BPKInfo(FileStatus.MISSING, 0)
except BinPickleError:
return BPKInfo(FileStatus.INVALID, 0)
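The fixed-size framing that `file_info` relies on can be illustrated with `struct` alone, using the format strings defined in `binpickle/format.py`. This is a sketch for illustration, not the library's own reader:

```python
import struct

MAGIC = b"BPCK"
HEADER_FORMAT = struct.Struct("!4sHHq")   # magic, version, flags, file length
TRAILER_FORMAT = struct.Struct("!QL32s")  # index offset, index length, SHA-256 digest

# Sizes match the docstrings: 16-byte header, 44-byte trailer.
print(HEADER_FORMAT.size)   # 16
print(TRAILER_FORMAT.size)  # 44

# Pack a version-2 header for a file of unknown length (the -1 sentinel),
# then decode it the way FileHeader.decode does.
raw = HEADER_FORMAT.pack(MAGIC, 2, 0, -1)
magic, version, flags, length = HEADER_FORMAT.unpack(raw)
assert (magic, version, flags, length) == (MAGIC, 2, 0, -1)
```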
import mmap
from os import PathLike
import warnings
import logging
import io
from zlib import adler32
import hashlib
import pickle
import msgpack
from .compat import pickle
from .format import FileHeader, FileTrailer, IndexEntry
from . import codecs
from typing_extensions import Buffer, List, Optional, Self
import numpy as np
from .format import CodecSpec, FileHeader, FileTrailer, IndexEntry
from .encode import ResolvedCodec, resolve_codec, CodecArg
from ._util import human_size
_log = logging.getLogger(__name__)
def _align_pos(pos, size=mmap.PAGESIZE):
def _align_pos(pos: int, size: int = mmap.PAGESIZE) -> int:
"Advance a position to be aligned."

@@ -24,23 +30,2 @@ rem = pos % size

class CKOut:
"""
Wrapper for binary output that computes checksums and sizes on the fly.
"""
def __init__(self, base):
self.bytes = 0
self.checksum = 1
self.delegate = base
def write(self, data):
# get a memory view so we have a portable count of bytes
mv = memoryview(data)
self.bytes += mv.nbytes
self.checksum = adler32(data, self.checksum)
return self.delegate.write(data)
def flush(self):
self.delegate.flush()
class BinPickler:

@@ -68,13 +53,30 @@ """

def __init__(self, filename, *, align=False, codec=None):
filename: str | PathLike
align: bool
codecs: list[ResolvedCodec]
entries: List[IndexEntry]
_file: io.BufferedWriter
def __init__(
self,
filename: str | PathLike,
*,
align=False,
codecs: Optional[list[CodecArg]] = None,
):
self.filename = filename
self.align = align
self._file = open(filename, 'wb')
self._file = open(filename, "wb")
self.entries = []
self.codec = codec
if codecs is None:
self.codecs = []
else:
# pre-resolve the codecs
self.codecs = [resolve_codec(c) for c in codecs]
self._init_header()
@classmethod
def mappable(cls, filename):
def mappable(cls, filename: str | PathLike):
"Convenience method to construct a pickler for memory-mapped use."

@@ -84,22 +86,32 @@ return cls(filename, align=True)

@classmethod
def compressed(cls, filename, codec=codecs.GZ()):
def compressed(cls, filename: str | PathLike, codec: CodecArg = "gzip"):
"Convenience method to construct a pickler for compressed storage."
return cls(filename, codec=codec)
return cls(filename, codecs=[codec])
def dump(self, obj):
def dump(self, obj: object) -> None:
"Dump an object to the file. Can only be called once."
bio = io.BytesIO()
pk = pickle.Pickler(bio, protocol=pickle.HIGHEST_PROTOCOL,
buffer_callback=self._write_buffer)
pk = pickle.Pickler(
bio, protocol=pickle.HIGHEST_PROTOCOL, buffer_callback=self._write_buffer
)
pk.dump(obj)
buf = bio.getbuffer()
_log.info('pickled %d bytes with %d buffers', buf.nbytes, len(self.entries))
tot_enc = sum(e.enc_length for e in self.entries)
tot_dec = sum(e.dec_length for e in self.entries)
_log.info(
"pickled %d bytes with %d buffers totaling %s (%s encoded)",
buf.nbytes,
len(self.entries),
human_size(tot_dec),
human_size(tot_enc),
)
self._write_buffer(buf)
self._finish_file()
def close(self):
def close(self) -> None:
"Close the bin pickler."
self._file.close()
def __enter__(self):
def __enter__(self) -> Self:
return self

@@ -111,27 +123,30 @@

def _init_header(self):
def _init_header(self) -> None:
pos = self._file.tell()
if pos > 0:
warnings.warn('BinPickler not at beginning of file')
warnings.warn("BinPickler not at beginning of file")
h = FileHeader()
_log.debug('initializing header for %s', self.filename)
_log.debug("initializing header for %s", self.filename)
self._file.write(h.encode())
assert self._file.tell() == pos + 16
assert self._file.tell() == pos + FileHeader.SIZE
def _encode_buffer(self, buf, out):
if self.codec is None:
out.write(buf)
return None
elif hasattr(self.codec, '__call__'):
# codec is callable, call it to get the codec
codec = self.codec(buf)
codec = codecs.make_codec(codec)
else:
codec = codecs.make_codec(self.codec)
def _encode_buffer(
self,
buf: Buffer,
) -> tuple[Buffer, list[CodecSpec]]:
# fast-path empty buffers
if memoryview(buf).nbytes == 0:
return b"", []
codec.encode_to(buf, out)
return (codec.NAME, codec.config())
# resolve any deferred codecs
codecs = [resolve_codec(c, buf) for c in self.codecs]
def _write_buffer(self, buf):
mv = memoryview(buf)
for codec in codecs:
if codec is not None:
buf = codec.encode(buf)
return buf, [c.get_config() for c in codecs if c is not None]
def _write_buffer(self, buf: Buffer) -> None:
mv = buf.raw() if isinstance(buf, pickle.PickleBuffer) else memoryview(buf)
offset = self._file.tell()

@@ -143,3 +158,3 @@

nzeds = off2 - offset
zeds = b'\x00' * nzeds
zeds = b"\x00" * nzeds
self._file.write(zeds)

@@ -151,30 +166,42 @@ assert self._file.tell() == off2

_log.debug('writing %d bytes at position %d', length, offset)
cko = CKOut(self._file)
c_spec = self._encode_buffer(buf, cko)
_log.debug('encoded %d bytes to %d (%.2f%% saved)', length, cko.bytes,
(length - cko.bytes) / length * 100 if length else -0.0)
_log.debug('used codec %s', c_spec)
binfo = None
if isinstance(mv.obj, np.ndarray):
binfo = ("ndarray", str(mv.obj.dtype), mv.obj.shape)
assert self._file.tell() == offset + cko.bytes
_log.debug("writing %d bytes at position %d", length, offset)
buf, c_spec = self._encode_buffer(buf)
enc_len = memoryview(buf).nbytes
_log.debug(
"encoded %d bytes to %d (%.2f%% saved)",
length,
enc_len,
(length - enc_len) / length * 100 if length else -0.0,
)
_log.debug("used codecs %s", c_spec)
hash = hashlib.sha256(buf)
_log.debug("has hash %s", hash.hexdigest())
self._file.write(buf)
self.entries.append(IndexEntry(offset, cko.bytes, length, cko.checksum,
c_spec))
assert self._file.tell() == offset + enc_len
def _write_index(self):
self.entries.append(IndexEntry(offset, enc_len, length, hash.digest(), binfo, c_spec))
def _write_index(self) -> FileTrailer:
buf = msgpack.packb([e.to_repr() for e in self.entries])
pos = self._file.tell()
nbs = len(buf)
_log.debug('writing %d index entries (%d bytes) at position %d',
len(self.entries), nbs, pos)
_log.debug(
"writing %d index entries (%d bytes) at position %d", len(self.entries), nbs, pos
)
self._file.write(buf)
ft = FileTrailer(pos, nbs, adler32(buf))
hash = hashlib.sha256(buf)
ft = FileTrailer(pos, nbs, hash.digest())
self._file.write(ft.encode())
return ft
def _finish_file(self):
def _finish_file(self) -> None:
self._write_index()
pos = self._file.tell()
_log.debug('finalizing file with length %d', pos)
_log.debug("finalizing file with length %d", pos)
h = FileHeader(length=pos)

@@ -186,3 +213,3 @@ self._file.seek(0)

def dump(obj, file, *, mappable=False, codec=codecs.GZ()):
def dump(obj, file: str | PathLike, *, mappable: bool = False, codecs: list[CodecArg] = ["gzip"]):
"""

@@ -206,4 +233,4 @@ Dump an object to a BinPickle file. This is a convenience wrapper

in this case.
codec(codecs.Codec):
The codec to use to compress the data, when not saving for
codecs:
The codecs to use to compress the data, when not saving for
memory-mapping.

@@ -215,4 +242,4 @@ """

else:
bpk = BinPickler(file, align=False, codec=codec)
bpk = BinPickler(file, align=False, codecs=codecs)
with bpk:
bpk.dump(obj)
ignore:
- build*.py
- build-tools/
- build-tools/*
- lkbuild/*
from hypothesis import settings
import pytest
import numpy as np
@pytest.fixture
def rng():
return np.random.default_rng()
# set up profiles
settings.register_profile('default', deadline=500)
settings.register_profile('large', max_examples=5000)
settings.register_profile('fast', max_examples=10)
settings.load_profile('default')
settings.register_profile("default", deadline=1000)
settings.register_profile("large", max_examples=5000)
settings.register_profile("fast", max_examples=10)
settings.load_profile("default")
import os
import sys
sys.path.insert(0, os.path.abspath('..'))
sys.path.insert(0, os.path.abspath(".."))
import sphinx_rtd_theme
import binpickle
project = 'BinPickle'
copyright = '2020 Boise State University'
author = 'Michael D. Ekstrand'
project = "BinPickle"
copyright = "2023 Michael Ekstrand"
author = "Michael D. Ekstrand"

@@ -16,31 +14,28 @@ release = binpickle.__version__

extensions = [
'sphinx.ext.napoleon',
'sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'sphinx.ext.intersphinx',
'sphinx_rtd_theme'
"sphinx.ext.napoleon",
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.intersphinx",
"sphinxext.opengraph",
]
source_suffix = '.rst'
source_suffix = ".rst"
pygments_style = 'sphinx'
highlight_language = 'python3'
pygments_style = "sphinx"
highlight_language = "python3"
html_theme = 'sphinx_rtd_theme'
html_theme = "furo"
html_theme_options = {
'github_user': 'lenskit',
'github_repo': 'binpickle',
'travis_button': False,
'canonical_url': 'https://binpickle.lenskit.org/',
'font_family': 'Georgia, Charter, serif'
}
templates_path = ['_templates']
templates_path = ["_templates"]
intersphinx_mapping = {
'python': ('https://docs.python.org/3/', None)
"python": ("https://docs.python.org/3/", None),
"numpy": ("https://docs.scipy.org/doc/numpy/", None),
"sklearn": ("https://scikit-learn.org/stable/", None),
}
autodoc_default_options = {
'members': True,
'member-order': 'bysource'
"members": True,
"member-order": "bysource"
}

@@ -10,4 +10,6 @@ Format

Users will not need these classes. They are documented here in the interest of documenting
the file format.
the file format. The current format version is **2**, first used in binpickle 0.4.0; this
is not compatible with prior versions.
File Structure

@@ -34,3 +36,3 @@ --------------

4. The file index, stored as a list of :py:class:`IndexEntry` objects encoded in MsgPack.
5. 16-byte trailer (see :py:class:`FileTrailer`).
5. 44-byte trailer (see :py:class:`FileTrailer`).
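The 44-byte size is consistent with an 8-byte index offset, a 4-byte index length, and a 32-byte SHA-256 digest. A hypothetical layout (the exact field order and endianness here are assumptions, not taken from the source) can be checked with the standard library:

```python
import struct

# Hypothetical trailer layout totaling 44 bytes: 8-byte index offset,
# 4-byte index length, and the 32-byte SHA-256 digest of the index.
TRAILER = struct.Struct("<QI32s")

print(TRAILER.size)  # 44
```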

@@ -50,1 +52,28 @@ The position and length of each buffer is stored in the index, so buffers can have arbitrary

.. autoclass:: IndexEntry
Format History
--------------
The current file format version is **2**, introduced in BinPickle 0.4.0.
.. _format-v2:
Version 2
~~~~~~~~~
Version 2 introduced the following:
* Replaced Adler32 checksums with SHA-256 digests.
* Replaced the single ``codec`` field with a ``codecs`` list field. The new
field directly specifies a list of :py:mod:`numcodecs` codec configurations
in the order they were applied to encode the buffer. The old native codecs
have been removed; all codecs now come from numcodecs.
* Added the ``info`` field to :py:class:`IndexEntry` to store information about
the buffer's data, when available (currently stores NumPy data type and shape
when serializing a NumPy array).
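The checksum change can be illustrated with the standard library alone: a SHA-256 digest is 32 bytes, versus Adler32's 4-byte checksum, which accounts for most of the growth in the trailer and index entries.

```python
import hashlib
import zlib

payload = b"binpickle buffer contents"

# Version 1 stored a 32-bit Adler32 checksum per buffer.
v1_checksum = zlib.adler32(payload)  # an int that fits in 4 bytes

# Version 2 stores the full 32-byte SHA-256 digest instead.
v2_digest = hashlib.sha256(payload).digest()

print(len(v2_digest))  # 32
```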
Version 1
~~~~~~~~~
Version 1 is the original BinPickle format, used through the 0.3 release series. It
is no longer supported.

@@ -15,8 +15,9 @@ BinPickle

BinPickle wraps Python's pickling functionality, so any object that can be pickled
(including SciKit models) can be stored with BinPickle. If the object supports
Pickle Protocol 5 (or stores most of its data in NumPy arrays, which in recent
versions support Pickle 5), then large array data will be efficiently stored,
either compressed (with Blosc compression by default) or page-aligned and ready
for memory-mapping, possibly into multiple processes simultaneously.
BinPickle wraps Python's pickling functionality, so any object that can be
pickled (including SciKit models) can be stored with BinPickle. If the object
supports Pickle Protocol 5 (or stores most of its data in NumPy arrays, which in
recent versions support Pickle 5), then large array data will be efficiently
stored, either compressed (using any compressor supported by
:py:mod:`numcodecs`) or page-aligned and ready for memory-mapping, possibly into
multiple processes simultaneously.

@@ -43,3 +44,2 @@ Quick Start

read
codecs
format

@@ -55,1 +55,13 @@

.. _`joblib`: https://github.com/joblib/joblib
Acknowledgements
----------------
This material is based upon work supported by the National Science Foundation under
Grant No. `IIS 17-51278`_. Any
opinions, findings, and conclusions or recommendations expressed in this material
are those of the author(s) and do not necessarily reflect the views of the
National Science Foundation. This page has not been approved by
Boise State University and does not reflect official university positions.
.. _`IIS 17-51278`: https://md.ekstrandom.net/research/career

@@ -0,0 +0,0 @@ Reading BinPickle Files

@@ -0,0 +0,0 @@ Writing BinPickle Files

@@ -1,7 +0,91 @@

Metadata-Version: 1.1
Metadata-Version: 2.1
Name: binpickle
Version: 0.3.4
Version: 0.4.0a1
Summary: Optimized format for pickling binary data.
Home-page: https://binpickle.lenskit.org
Author: Michael Ekstrand
Author-email: michaelekstrand@boisestate.edu
Author-email: Michael Ekstrand <mdekstrand@drexel.edu>
License: Copyright (c) 2020–2023 Boise State University
Copyright (c) 2023 Michael Ekstrand
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
> The above copyright notice and this permission notice shall be included in
> all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Project-URL: Homepage, https://binpickle.lenskit.org
Project-URL: GitHub, https://github.com/lenskit/binpickle
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Operating System :: OS Independent
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE.md
Requires-Dist: msgpack>=1.0
Requires-Dist: numcodecs>=0.12
Requires-Dist: typing-extensions~=4.8
Provides-Extra: dev
Requires-Dist: setuptools>=64; extra == "dev"
Requires-Dist: setuptools_scm>=8; extra == "dev"
Requires-Dist: ruff; extra == "dev"
Requires-Dist: mypy~=1.5; extra == "dev"
Requires-Dist: copier; extra == "dev"
Requires-Dist: sphinx-autobuild; extra == "dev"
Requires-Dist: humanize~=4.0; extra == "dev"
Requires-Dist: msgpack-types; extra == "dev"
Requires-Dist: pandas-stubs; extra == "dev"
Provides-Extra: test
Requires-Dist: pytest>=5; extra == "test"
Requires-Dist: pytest-cov; extra == "test"
Requires-Dist: hypothesis>=6; extra == "test"
Requires-Dist: pandas>=1.4; extra == "test"
Requires-Dist: numpy>=1.22; extra == "test"
Provides-Extra: doc
Requires-Dist: sphinx>=4.2; extra == "doc"
Requires-Dist: sphinxext-opengraph>=0.5; extra == "doc"
Requires-Dist: furo; extra == "doc"
# BinPickle - efficient binary pickled data
[![PyPI version](https://badge.fury.io/py/binpickle.svg)](https://badge.fury.io/py/binpickle)
![Test and Build](https://github.com/lenskit/binpickle/workflows/Test%20and%20Package/badge.svg)
[![codecov](https://codecov.io/gh/lenskit/binpickle/branch/master/graph/badge.svg)](https://codecov.io/gh/lenskit/binpickle)
This package uses the new Pickle Protocol 5 added in Python 3.8 to efficiently
serialize large objects, particularly from scientific Python packages, to an
on-disk format. This format is designed to support two use cases:
1. Serializing data-intensive statistical models in a memory-mappable format so
multiple processes can share the same (read-only) model memory.
2. Serializing data-intensive statistical models with good compression for long-term
storage and cross-machine transportation.
BinPickle does this by using Pickle 5's out-of-band buffer serialization support to
write buffers uncompressed and page-aligned for memory mapping (use case 1) or with
per-buffer efficient compression with libraries like Blosc (use case 2).
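The out-of-band mechanism can be sketched with the standard library alone (this is the underlying Pickle 5 machinery that BinPickle builds on, not BinPickle's own API):

```python
import pickle

# Protocol 5 hands large buffers to a callback instead of embedding
# them in the pickle stream, so a writer can place them however it
# likes: page-aligned for mmap, or compressed per buffer.
captured = []

def grab(buf):
    captured.append(buf.raw().tobytes())
    # returning None (falsy) tells pickle the buffer went out of band

data = bytearray(b"x" * 4096)
stream = pickle.dumps(pickle.PickleBuffer(data), protocol=5, buffer_callback=grab)

# The payload bytes travel out of band; to unpickle, the reader
# supplies them back through the `buffers` argument.
restored = pickle.loads(stream, buffers=captured)
assert bytes(restored) == bytes(data)
```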
## Format Stability
We do **not** yet guarantee the stability of the BinPickle format. We will avoid gratuitous changes,
but BinPickle 1.0 will be the first with a stability guarantee.
## Acknowledgements
This material is based upon work supported by the National Science Foundation under
Grant No. IIS 17-51278. Any opinions, findings, and conclusions or recommendations
expressed in this material are those of the author(s) and do not necessarily reflect
the views of the National Science Foundation. This page has not been approved by
Boise State University and does not reflect official university positions.
[build-system]
requires = ["flit_core >=2,<4"]
build-backend = "flit_core.buildapi"
requires = ["setuptools>=64", "setuptools_scm>=8"]
build-backend = "setuptools.build_meta"
[tool.flit.metadata]
module = "binpickle"
author = "Michael Ekstrand"
author-email = "michaelekstrand@boisestate.edu"
home-page = "https://binpickle.lenskit.org"
classifiers = ["License :: OSI Approved :: MIT License"]
description-file = "README.md"
requires-python = ">= 3.6.1"
requires = [
"msgpack >= 1.0",
"pickle5; python_version < '3.8'"
[project]
name = "binpickle"
description = "Optimized format for pickling binary data."
authors = [
{name="Michael Ekstrand", email="mdekstrand@drexel.edu"}
]
classifiers = [
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Operating System :: OS Independent",
]
requires-python = ">= 3.10"
readme = "README.md"
license = { file = "LICENSE.md" }
dynamic = ["version"]
dependencies = [
"msgpack >= 1.0",
"numcodecs >= 0.12",
"typing-extensions ~= 4.8",
]
[tool.flit.metadata.urls]
GitHub = "https://github.com/lenskit/binpickle"
[tool.flit.metadata.requires-extra]
blosc = [ "blosc" ]
numcodecs = [ "numcodecs >= 0.7" ]
[project.optional-dependencies]
dev = [
"setuptools>=64",
"setuptools_scm>=8",
"ruff",
"mypy ~=1.5",
"copier",
"sphinx-autobuild",
"humanize ~=4.0",
"msgpack-types",
"pandas-stubs",
]
test = [
"pytest >= 5",
"pytest-cov",
"hypothesis >= 6",
"pandas >= 1.0",
"numpy >= 1.17"
"pytest >= 5",
"pytest-cov",
"hypothesis >= 6",
"pandas >= 1.4",
"numpy >= 1.22",
]
doc = ["sphinx"]
dev = [
"flake8",
"rstcheck"
doc = [
"sphinx >=4.2",
"sphinxext-opengraph >= 0.5",
"furo",
]
[tool.flit.sdist]
include = ["tests/*"]
[project.urls]
Homepage = "https://binpickle.lenskit.org"
GitHub = "https://github.com/lenskit/binpickle"
# configure build tools
[tool.setuptools]
packages = ["binpickle"]
[tool.setuptools_scm]
version_scheme = "release-branch-semver"
# settings for generating conda environments for dev & CI, when needed
[tool.pyproject2conda]
channels = ["conda-forge"]
[tool.ruff]
line-length = 100
target-version = "py310"
exclude = [
".github",
".git",
"__pycache__",
"docs/conf.py",
"build",
"dist",
]
[tool.envtool.conda]
name = "binpickle"
channels = ["conda-forge"]
[tool.envtool.conda.overrides]
msgpack = "msgpack-python"
[tool.mypy]
mypy_path = "$MYPY_CONFIG_FILE_DIR/stubs"
exclude = "^docs/"

@@ -7,5 +7,5 @@ # BinPickle - efficient binary pickled data

This package uses the new Pickle Protocol 5 in Python 3.8 (or its `pickle5` backport)
to efficiently serialize large objects, particularly from scientific Python packages,
to an on-disk format. This format is designed to support two use cases:
This package uses the new Pickle Protocol 5 added in Python 3.8 to efficiently
serialize large objects, particularly from scientific Python packages, to an
on-disk format. This format is designed to support two use cases:

@@ -12,0 +12,0 @@ 1. Serializing data-intensive statistical models in a memory-mappable format so

@@ -1,12 +0,4 @@

[flake8]
max-line-length = 100
exclude =
.git
__pycache__
docs/conf.py
build
dist
tests
[egg_info]
tag_build =
tag_date = 0
[pep8]
max-line-length = 100
from pytest import raises
from binpickle.format import *
from binpickle.errors import FormatError
from binpickle.format import FileHeader, FileTrailer, HEADER_FORMAT, TRAILER_FORMAT

@@ -8,3 +9,5 @@

assert HEADER_FORMAT.size == 16
assert TRAILER_FORMAT.size == 16
assert FileHeader.SIZE == 16
assert TRAILER_FORMAT.size == 44
assert FileTrailer.SIZE == 44

@@ -39,18 +42,16 @@

def test_catch_bad_magic():
with raises(ValueError) as exc:
FileHeader.decode(b'BNPQ\x00\x00\x00\x00' + (b'\x00' * 8))
assert 'magic' in str(exc.value)
with raises(FormatError) as exc:
FileHeader.decode(b"BNPQ\x00\x00\x00\x00" + (b"\x00" * 8))
assert "magic" in str(exc.value)
def test_catch_bad_version():
with raises(ValueError) as exc:
FileHeader.decode(b'BPCK\x00\x02\x00\x00' + (b'\x00' * 8))
assert 'version' in str(exc.value)
with raises(FormatError) as exc:
FileHeader.decode(b"BPCK\x00\x12\x00\x00" + (b"\x00" * 8))
assert "invalid version" in str(exc.value)
def test_catch_bad_padding():
with raises(ValueError) as exc:
FileHeader.decode(b'BPCK\x00\x01\x00\xff' + (b'\x00' * 8))
assert 'padding' in str(exc.value)
with raises(FormatError) as exc:
FileHeader.decode(b"BPCK\x00\x02\x00\xff" + (b"\x00" * 8))
assert "unsupported flags" in str(exc.value)

@@ -7,2 +7,5 @@ import itertools as it

import pandas as pd
import numcodecs as nc
from numcodecs.registry import codec_registry
from numcodecs.abc import Codec

@@ -16,34 +19,27 @@ import pytest

from binpickle.write import BinPickler, dump
from binpickle import codecs
RW_CTORS = [BinPickler, BinPickler.mappable, BinPickler.compressed]
RW_CODECS = [st.just(None), st.builds(codecs.GZ)]
if codecs.Blosc.AVAILABLE:
RW_CTORS.append(lambda f: BinPickler.compressed(f, codecs.Blosc('zstd', 5)))
RW_CODECS.append(st.builds(codecs.Blosc))
RW_CODECS.append(st.builds(codecs.Blosc, st.just('zstd')))
if codecs.NC.AVAILABLE:
import numcodecs
RW_CTORS.append(lambda f: BinPickler.compressed(f, numcodecs.LZMA()))
RW_CODECS.append(st.builds(codecs.NC, st.just(numcodecs.LZMA())))
# also build a chain test
RW_CTORS.append(lambda f: BinPickler.compressed(f, codecs.Chain([numcodecs.MsgPack(), codecs.GZ()])))
RW_CTORS = [
BinPickler,
BinPickler.mappable,
BinPickler.compressed,
lambda f: BinPickler.compressed(f, nc.LZMA()),
]
RW_CODECS: list[st.SearchStrategy[Codec | str | None]] = [
st.just(None),
st.just("gzip"),
st.builds(nc.GZip),
st.builds(nc.LZMA),
]
if "blosc" in codec_registry:
RW_CODECS.append(st.builds(nc.Blosc))
RW_CODECS.append(st.builds(nc.Blosc, st.one_of(st.just("zstd"), st.just("lz4"))))
RW_CONFIGS = it.product(
RW_CTORS,
[False, True]
)
RW_PARAMS = ['writer', 'direct']
RW_CONFIGS = it.product(RW_CTORS, [False, True])
RW_PARAMS = ["writer", "direct"]
@pytest.fixture
def rng():
return np.random.default_rng()
def test_empty(tmp_path):
"Write a file with nothing in it"
file = tmp_path / 'data.bpk'
file = tmp_path / "data.bpk"

@@ -53,3 +49,3 @@ with BinPickler(file) as w:

assert file.stat().st_size == 33
assert file.stat().st_size == 61

@@ -62,5 +58,5 @@ with BinPickleFile(file) as bpf:

"Write a file with a single array"
file = tmp_path / 'data.bpk'
file = tmp_path / "data.bpk"
a = rng.integers(0, 5000, 1024, dtype='i4')
a = rng.integers(0, 5000, 1024, dtype="i4")

@@ -78,3 +74,3 @@ with BinPickler(file) as w:

assert b2.nbytes == e.dec_length
a2 = np.frombuffer(b2, dtype='i4')
a2 = np.frombuffer(b2, dtype="i4")
assert len(a2) == len(a)

@@ -87,9 +83,8 @@ assert all(a2 == a)

@settings(deadline=None)
@given(st.lists(st.binary()),
st.one_of(RW_CODECS))
@given(st.lists(st.binary()), st.one_of(RW_CODECS))
def test_write_encoded_arrays(arrays, codec):
with TemporaryDirectory('.test', 'binpickle-') as path:
file = Path(path) / 'data.bpk'
with TemporaryDirectory(".test", "binpickle-") as path:
file = Path(path) / "data.bpk"
with BinPickler.compressed(file, codec) as w:
with BinPickler(file, codecs=[codec] if codec else []) as w:
for a in arrays:

@@ -104,4 +99,4 @@ w._write_buffer(a)

try:
if codec is not None:
assert e.codec
if codec is not None and e.dec_length > 0:
assert e.codecs
assert e.dec_length == len(a)

@@ -118,5 +113,5 @@ dat = bpf._read_buffer(e)

"Pickle a NumPy array"
file = tmp_path / 'data.bpk'
file = tmp_path / "data.bpk"
a = rng.integers(0, 5000, 1024, dtype='i4')
a = rng.integers(0, 5000, 1024, dtype="i4")

@@ -136,9 +131,11 @@ with BinPickler(file) as w:

"Pickle a Pandas data frame"
file = tmp_path / 'data.bpk'
file = tmp_path / "data.bpk"
df = pd.DataFrame({
'key': np.arange(0, 5000),
'count': rng.integers(0, 1000, 5000),
'score': rng.normal(10, 2, 5000)
})
df = pd.DataFrame(
{
"key": np.arange(0, 5000),
"count": rng.integers(0, 1000, 5000),
"score": rng.normal(10, 2, 5000),
}
)

@@ -158,11 +155,12 @@ with writer(file) as w:

@pytest.mark.skipif(not codecs.NC.AVAILABLE, reason='numcodecs not available')
def test_pickle_frame_dyncodec(tmp_path, rng: np.random.Generator):
file = tmp_path / 'data.bpk'
file = tmp_path / "data.bpk"
df = pd.DataFrame({
'key': np.arange(0, 5000, dtype='i4'),
'count': rng.integers(0, 1000, 5000),
'score': rng.normal(10, 2, 5000)
})
df = pd.DataFrame(
{
"key": np.arange(0, 5000, dtype="i4"),
"count": rng.integers(0, 1000, 5000),
"score": rng.normal(10, 2, 5000),
}
)

@@ -172,8 +170,8 @@ def codec(buf):

if isinstance(obj, np.ndarray) and obj.dtype == np.float64:
print('compacting double array')
return codecs.Chain([numcodecs.AsType('f4', 'f8'), codecs.Blosc('zstd', 9)])
print("compacting double array")
return nc.AsType("f4", "f8")
else:
return codecs.Blosc('zstd', 9)
return None
with BinPickler.compressed(file, codec) as w:
with BinPickler(file, codecs=[codec, nc.Blosc("zstd", 3)]) as w:
w.dump(df)

@@ -186,6 +184,7 @@

assert all(df2.columns == df.columns)
assert all(df2['key'] == df['key'])
assert all(df2['count'] == df['count'])
assert all(df2['score'].astype('f4') == df['score'].astype('f4'))
assert all(df2["key"] == df["key"])
assert all(df2["count"] == df["count"])
assert all(df2["score"].astype("f4") == df["score"].astype("f4"))
del df2
assert bpf.entries[0].info

@@ -195,12 +194,14 @@

"Pickle a Pandas data frame"
file = tmp_path / 'data.bpk'
file = tmp_path / "data.bpk"
df = pd.DataFrame({
'key': np.arange(0, 5000),
'count': rng.integers(0, 1000, 5000),
'score': rng.normal(10, 2, 5000)
})
df = pd.DataFrame(
{
"key": np.arange(0, 5000),
"count": rng.integers(0, 1000, 5000),
"score": rng.normal(10, 2, 5000),
}
)
dump(df, file)
df2 = load(file)
df2: pd.DataFrame = load(file)

@@ -212,2 +213,3 @@ assert all(df2.columns == df.columns)

@settings(deadline=None)
@given(arrays(scalar_dtypes(), st.integers(500, 10000)))

@@ -218,4 +220,4 @@ def test_compress_many_arrays(a):

with TemporaryDirectory('.test', 'binpickle') as path:
file = Path(path) / 'data.bpk'
with TemporaryDirectory(".test", "binpickle") as path:
file = Path(path) / "data.bpk"

@@ -227,2 +229,3 @@ with BinPickler.compressed(file) as w:

assert not bpf.find_errors()
assert not bpf.is_mappable
assert len(bpf.entries) in (1, 2)

@@ -239,4 +242,4 @@ a2 = bpf.load()

assume(not any(np.isnan(a)))
with TemporaryDirectory('.test', 'binpickle') as path:
file = Path(path) / 'data.bpk'
with TemporaryDirectory(".test", "binpickle") as path:
file = Path(path) / "data.bpk"

@@ -248,2 +251,3 @@ with BinPickler.mappable(file) as w:

assert not bpf.find_errors()
assert bpf.is_mappable
assert len(bpf.entries) in (1, 2)

@@ -250,0 +254,0 @@ a2 = bpf.load()

import logging
import io
import zlib
import functools as ft
import numpy as np
from hypothesis import given, settings, HealthCheck
from hypothesis import given
import hypothesis.strategies as st
import pytest
from binpickle.write import _align_pos, CKOut
from binpickle.write import _align_pos

@@ -17,7 +11,2 @@ _log = logging.getLogger(__name__)

def _split_blocks(*args):
blosc = pytest.importorskip('binpickle.codecs.blosc')
return blosc._split_blocks(*args)
@given(st.integers(100, 10000000))

@@ -28,92 +17,1 @@ def test_align(n):

assert res % 1024 == 0
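The alignment helper under test can be sketched as follows (a guess at `_align_pos`'s behavior based only on the assertion above; the real page size and signature may differ):

```python
def align_pos(pos: int, page: int = 4096) -> int:
    # Round a file position up to the next page boundary, so that the
    # following buffer starts page-aligned and can be memory-mapped.
    rem = pos % page
    return pos + (page - rem) if rem else pos

print(align_pos(100))   # 4096
print(align_pos(4096))  # 4096
print(align_pos(5000))  # 8192
```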
@given(st.binary())
def test_checksum_bytes(data):
out = io.BytesIO()
cko = CKOut(out)
cko.write(data)
assert out.getbuffer() == data
assert cko.bytes == len(data)
assert cko.checksum == zlib.adler32(data)
@given(st.lists(st.binary(), min_size=1, max_size=10))
def test_checksum_multi_bytes(arrays):
out = io.BytesIO()
cko = CKOut(out)
for a in arrays:
cko.write(a)
cat = ft.reduce(lambda b1, b2: b1 + b2, arrays)
assert out.getbuffer() == cat
assert cko.bytes == len(cat)
assert cko.checksum == zlib.adler32(cat)
def test_split_empty_block():
blocks = _split_blocks(memoryview(b''), 10)
assert len(blocks) == 1
assert blocks[0] == b''
def test_split_one_block():
blocks = _split_blocks(memoryview(b'asdf'), 10)
assert len(blocks) == 1
assert blocks[0] == b'asdf'
def test_split_two_blocks():
blocks = _split_blocks(memoryview(b'asdf'), 2)
assert len(blocks) == 2
assert blocks[0] == b'as'
assert blocks[1] == b'df'
assert blocks[0].nbytes == 2
assert blocks[1].nbytes == 2
def test_split_blocks_mismatch():
blocks = _split_blocks(memoryview(b'asdfg'), 2)
assert len(blocks) == 3
assert blocks[0] == b'as'
assert blocks[0].nbytes == 2
assert blocks[1] == b'df'
assert blocks[1].nbytes == 2
assert blocks[2] == b'g'
assert blocks[2].nbytes == 1
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(st.data())
def test_split_blocks(data):
bs = data.draw(st.integers(8, 4096))
input = data.draw(st.binary(min_size=bs//2, max_size=bs*8))
_log.info('input size %d, block size %d', len(input), bs)
blocks = _split_blocks(memoryview(input), bs)
_log.info('split into %d blocks', len(blocks))
assert all(b.nbytes <= bs for b in blocks)
assert all(len(b) <= bs for b in blocks)
assert sum(b.nbytes for b in blocks) == len(input)
reconst = ft.reduce(lambda buf, block: buf + block, blocks, bytes())
assert len(reconst) == len(input)
assert reconst == input
@settings(suppress_health_check=[HealthCheck.too_slow])
@given(st.data())
def test_split_arrays(data):
bs = data.draw(st.integers(8, 4096))
size = data.draw(st.integers(bs//8, bs*4))
array = np.random.randn(size)
input = memoryview(array)
_log.info('input size %d (%d bytes), block size %d', len(input), input.nbytes, bs)
blocks = _split_blocks(memoryview(input), bs)
_log.info('split into %d blocks', len(blocks))
assert all(b.nbytes <= bs for b in blocks)
assert all(len(b) <= bs for b in blocks)
assert sum(b.nbytes for b in blocks) == input.nbytes
reconst = ft.reduce(lambda buf, block: buf + block, blocks, bytes())
assert len(reconst) == input.nbytes
rcv = memoryview(reconst).cast(input.format)
assert rcv == input
a2 = np.frombuffer(reconst, array.dtype)
assert all(a2 == array)
{
"problemMatcher": [
{
"owner": "flake8",
"pattern": [
{
"regexp": "^([^:]*):(\\d+):(\\d+): (error|warning): (\\w\\d\\d\\d) (.*)$",
"file": 1,
"line": 2,
"column": 3,
"severity": 4,
"message": 6
}
]
}
]
}
template: |
## Merged PRs
$CHANGES
name: Draft Release
on:
push:
# branches to consider in the event; optional, defaults to all
branches:
- master
jobs:
update_release_draft:
runs-on: ubuntu-latest
steps:
# Drafts your next Release notes as Pull Requests are merged into "master"
- uses: release-drafter/release-drafter@v5
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
name: Test and Package
on:
push:
branches:
- main
release:
types: [created,published]
pull_request:
jobs:
lint:
name: Check Source Style
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Set up Python ${{matrix.python}}
uses: actions/setup-python@v2
with:
python-version: 3.8
- name: Prep Pip caching
id: pip-cache
run: |
echo "::set-output name=dir::$(pip cache dir)"
shell: bash
- name: Cache Pip wheels
uses: actions/cache@v1
with:
path: ${{ steps.pip-cache.outputs.dir }}
key: py38-lint-pip-${{ hashFiles('*.egg-info/requires.txt')}}
- name: Install environment
run: |
pip install -U flit
- name: Install package
run: |
flit install --pth-file --extras dev
- name: Run lint
run: |
# Flake8 problem matcher & transform regex from https://github.com/TrueBrain/actions-flake8
echo "::add-matcher::.github/flake8-matcher.json"
set -o pipefail
flake8 |sed -r 's/: ([^W][0-9][0-9][0-9])/: error: \1/;s/: (W[0-9][0-9][0-9])/: warning: \1/'
echo "::remove-matcher owner=flake8::"
test:
name: Test with Python ${{matrix.python}} on ${{matrix.platform}}
runs-on: ${{matrix.platform}}-latest
strategy:
matrix:
platform:
- macos
- windows
- ubuntu
python:
- 3.6
- 3.7
- 3.8
- 3.9
exclude:
- platform: macos
python: 3.9
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Set up Python ${{matrix.python}}
uses: actions/setup-python@v2
with:
python-version: ${{matrix.python}}
- name: Prep Pip cache
id: pip-cache
run: |
echo "::set-output name=dir::$(pip cache dir)"
shell: bash
- name: Cache Pip wheels
uses: actions/cache@v1
with:
path: ${{ steps.pip-cache.outputs.dir }}
key: ${{ matrix.platform }}-py${{ matrix.python }}-pip-${{ hashFiles('*.egg-info/requires.txt')}}
- name: Install environment
run: |
pip install -U flit
- name: Install package
run: |
flit install --pth-file --extras all
- name: Run tests
run: python -m pytest --cov=binpickle --cov-report=xml tests
- name: Upload coverage
uses: codecov/codecov-action@v1
no-extras:
name: Test without extras
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Set up Python ${{matrix.python}}
uses: actions/setup-python@v2
with:
python-version: 3.8
- name: Get Pip cache dir
id: pip-cache
run: |
echo "::set-output name=dir::$(pip cache dir)"
shell: bash
- name: Cache Pip wheels
uses: actions/cache@v1
with:
path: ${{ steps.pip-cache.outputs.dir }}
key: no-extras-pip-${{ hashFiles('*.egg-info/requires.txt')}}
- name: Install environment
run: |
pip install -U flit
- name: Install package
run: |
flit install --pth-file --extras dev,test
- name: Run tests
run: python -m pytest --cov=binpickle --cov-report=xml tests
- name: Upload coverage
uses: codecov/codecov-action@v1
sdist:
name: Build Source Packages
runs-on: ubuntu-latest
needs: [test, lint, no-extras]
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Fetch Git tags
run: git fetch --tags
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.8
- name: Install Python deps
run: pip install -U flit
- name: Build distribution
run: flit build
- name: Save archive
uses: actions/upload-artifact@v1
with:
name: pypi-pkgs
path: dist
- name: List dist dir
run: ls -R dist
- name: Publish PyPI packages
if: github.event_name == 'release'
run: |
flit publish
shell: bash
env:
TWINE_NON_INTERACTIVE: y
FLIT_USERNAME: __token__
FLIT_PASSWORD: ${{ secrets.TWINE_TOKEN }}
"""
Codecs for encoding and decoding buffers in BinPickle.
This is similar in spirit to numcodecs_, but automatically handles some cases
such as splitting arrays into blocks.
.. _numcodecs: https://numcodecs.readthedocs.io/en/stable/
"""
from ._base import Codec # noqa: F401
import logging
from . import null
from . import gz
from . import blosc
from . import numcodecs
_log = logging.getLogger(__name__)
CODECS = {}
Null = null.Null
GZ = gz.GZ
Blosc = blosc.Blosc
NC = numcodecs.NC
def register(cls):
CODECS[cls.NAME] = cls
def make_codec(codec, *, null_as_none=False, list_is_tuple=False):
"""
Resolve a codec into a BinPickle-compatible codec.
Args:
codec(obj):
The codec to resolve into a codec. Can be one of:
* ``None`` (returns :class:`Null`)
* A :class:`Codec` object (returned as-is)
* A string (look up codec by name and return with default options)
* A tuple ``(name, config)`` (pass to :func:`get_codec`)
* A list (wrapped in :class:`Chain`)
* A :class:`numcodecs.abc.Codec` (wrapped in :class:`NC` and returned)
Returns:
Codec: the codec.
"""
if codec is None and not null_as_none:
return Null()
elif isinstance(codec, str):
return CODECS[codec]()
elif isinstance(codec, tuple) or (list_is_tuple and isinstance(codec, list)):
name, config = codec
return get_codec(name, config)
elif isinstance(codec, list):
return Chain(codec)
elif numcodecs.is_numcodec(codec):
return NC(codec)
elif isinstance(codec, Null) and null_as_none:
return None
else:
return codec
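The dispatch rules above can be illustrated with a minimal, self-contained sketch; the stand-in `Null` and `GZ` classes below are simplified stubs, not the real implementations in `binpickle.codecs`:

```python
# Minimal sketch of make_codec's dispatch rules with stub codec classes.
class Null:
    NAME = 'null'
    def __init__(self, **cfg): pass

class GZ:
    NAME = 'gz'
    def __init__(self, level=9): self.level = level

CODECS = {c.NAME: c for c in (Null, GZ)}

def make_codec(codec):
    if codec is None:
        return Null()                  # None -> null (passthrough) codec
    elif isinstance(codec, str):
        return CODECS[codec]()         # name -> codec with default options
    elif isinstance(codec, tuple):
        name, config = codec           # (name, config) as stored in a manifest
        return CODECS[name](**config)
    else:
        return codec                   # already a codec instance

assert isinstance(make_codec(None), Null)
assert make_codec(('gz', {'level': 5})).level == 5
```

The real function additionally handles lists (wrapped in `Chain`) and raw numcodecs objects (wrapped in `NC`), but the shape of the dispatch is the same.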
def get_codec(name, config):
"""
Get a codec by name and configuration (as stored in the BinPickle manifest).
Args:
name(str or None): the codec name.
config: the codec configuration, as returned by :meth:`Codec.config`.
Returns:
Codec: the configured codec.
"""
if name is None:
return Null()
elif name in CODECS:
_log.debug('configuring %s: %s', name, config)
return CODECS[name](**config)
else:
raise ValueError(f'unknown codec {name}')
from .chain import Chain # noqa: E402
register(Null)
register(Chain)
register(GZ)
if Blosc.AVAILABLE:
register(Blosc)
if NC.AVAILABLE:
register(NC)
from abc import ABC, abstractmethod
import io
class Codec(ABC):
"""
Base class for a codec.
Attributes:
NAME(str): the name for this codec, used by :func:`get_codec` and in index entries.
"""
def encode(self, buf):
"""
Encode a buffer.
Args:
buf(bytes-like): the buffer to encode.
Returns:
bytes-like: the encoded data
"""
out = io.BytesIO()
self.encode_to(buf, out)
return out.getbuffer()
@abstractmethod
def encode_to(self, buf, out):
"""
Encode a buffer to a binary output stream.
Args:
buf(bytes-like): the buffer to encode.
out(file-like):
the output stream. Must have a ``write`` method
taking a :class:`bytes`.
"""
def decode(self, buf):
"""
Decode a buffer.
Args:
buf(bytes-like): the buffer to decode.
Returns:
bytes-like: the decoded data
"""
out = bytearray()
self.decode_to(buf, out)
return out
@abstractmethod
def decode_to(self, buf, out):
"""
Decode a buffer into a bytearray.
Args:
buf(bytes-like): the buffer to decode.
out(bytearray):
the bytearray to receive the output. This method will resize the
bytearray as needed to accommodate the output.
"""
@abstractmethod
def config(self):
"""
Get a JSON-serializable configuration for this codec. The result must be
usable as ``**kwargs`` to the constructor to reconstruct the codec.
"""
import logging
import msgpack
from importlib.util import find_spec
from ._base import Codec
DEFAULT_BLOCKSIZE = 1024 * 1024 * 1024
_log = logging.getLogger(__name__)
def _split_blocks(buf, blocksize):
if buf.itemsize > 1:
buf = buf.cast('B')
length = buf.nbytes
chunks = []
for start in range(0, length, blocksize):
end = start + blocksize
if end > length:
end = length
chunks.append(buf[start:end])
if not chunks:
chunks.append(memoryview(b''))
return chunks
class Blosc(Codec):
"""
Blosc codec.
"""
NAME = 'blosc'
AVAILABLE = find_spec('blosc') is not None
def __init__(self, name='blosclz', level=9,
shuffle=1, blocksize=DEFAULT_BLOCKSIZE):
if not self.AVAILABLE:
raise ImportError('blosc is not available')
self.name = name
self.level = level
self.shuffle = shuffle
self.blocksize = blocksize
def encode_to(self, buf, out):
# We have to encode by chunks
import blosc
pack = msgpack.Packer()
mv = memoryview(buf)
_log.debug('encoding %d bytes (itemsize=%d, format=%s)',
mv.nbytes, mv.itemsize, mv.format)
_log.debug('splitting with block size %d', self.blocksize)
blocks = _split_blocks(mv, self.blocksize)
out.write(pack.pack_array_header(len(blocks)))
for block in blocks:
assert block.nbytes <= self.blocksize
comp = blosc.compress(block, cname=self.name, clevel=self.level,
shuffle=self.shuffle, typesize=mv.itemsize)
out.write(pack.pack(comp))
block.release()
def decode_to(self, buf, out):
import blosc
blocks = msgpack.unpackb(buf, use_list=True)
pos = 0
for block in blocks:
dec = blosc.decompress(block)
dmv = memoryview(dec) # to reduce copies
n = len(dec)
e1 = min(pos + n, len(out))
n1 = e1 - pos
out[pos:e1] = dmv[:n1]
if n1 < n:
out.extend(dmv[n1:])
pos += n
if len(out) > pos:
del out[pos:]
def config(self):
return {
'name': self.name,
'level': self.level,
'shuffle': self.shuffle
}
from ._base import Codec
from . import make_codec
class Chain(Codec):
"""
Codec that chains together other codecs in sequence. The codecs are applied
in the provided order for encoding, and reverse order for decoding.
"""
NAME = 'chain'
def __init__(self, codecs=()):
self.codecs = [make_codec(c, list_is_tuple=True) for c in codecs]
def encode(self, buf):
data = buf
for codec in self.codecs:
data = codec.encode(data)
return data
def encode_to(self, buf, w):
w.write(self.encode(buf))
def decode(self, buf):
data = buf
for codec in self.codecs[::-1]:
data = codec.decode(data)
return data
def decode_to(self, buf, out):
out[:] = self.decode(buf)
def config(self):
return {
'codecs': [(c.NAME, c.config()) for c in self.codecs]
}
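The ordering contract is the important part of `Chain`: codecs apply in the given order on encode and in reverse order on decode. A toy, self-contained demonstration (the `AddPrefix` codec is hypothetical, standing in for real compressors):

```python
# Toy illustration of Chain's ordering: forward on encode, reversed on decode.
class AddPrefix:
    def __init__(self, tag): self.tag = tag
    def encode(self, data): return self.tag + data
    def decode(self, data):
        assert data.startswith(self.tag)
        return data[len(self.tag):]

class Chain:
    def __init__(self, codecs): self.codecs = list(codecs)
    def encode(self, data):
        for c in self.codecs:          # apply in order
            data = c.encode(data)
        return data
    def decode(self, data):
        for c in self.codecs[::-1]:    # undo in reverse order
            data = c.decode(data)
        return data

chain = Chain([AddPrefix(b'A'), AddPrefix(b'B')])
assert chain.encode(b'x') == b'BAx'    # A applied first, then B
assert chain.decode(b'BAx') == b'x'
```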
import zlib
from ._base import Codec
class GZ(Codec):
"""
Zlib (gzip-compatible) codec.
"""
NAME = 'gz'
def __init__(self, level=9):
self.level = level
def encode(self, buf):
return zlib.compress(buf, self.level)
def encode_to(self, buf, out):
# zlib compresses the whole buffer in one call; no chunking needed
out.write(self.encode(buf))
def decode(self, buf):
return zlib.decompress(buf)
def decode_to(self, buf, out):
out[:] = self.decode(buf)
def config(self):
return {
'level': self.level
}
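Since `GZ` is a thin wrapper over the standard library, its round trip boils down to a plain zlib compress/decompress pair:

```python
import zlib

# The zlib round trip that the GZ codec wraps.
level = 9
payload = b'binpickle ' * 100
compressed = zlib.compress(payload, level)
assert len(compressed) < len(payload)   # repetitive data compresses well
assert zlib.decompress(compressed) == payload
```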
from ._base import Codec
class Null(Codec):
"""
Null codec (passthrough).
"""
NAME = 'null'
def encode(self, buf):
return buf
def encode_to(self, buf, out):
out.write(buf)
def decode(self, buf, length=None):
return buf
def decode_to(self, buf, out):
out[:] = buf
def config(self):
return {}
from importlib.util import find_spec
from ._base import Codec
def is_numcodec(codec):
"Test whether a codec is a NumCodecs codec."
if NC.AVAILABLE:
import numcodecs
return isinstance(codec, numcodecs.abc.Codec)
else:
return False  # if numcodecs isn't available, it can't be one
class NC(Codec):
"""
NumCodec wrapper.
"""
NAME = 'numcodec'
AVAILABLE = find_spec('numcodecs') is not None
def __init__(self, codec=None, **kwargs):
if codec is None:
import numcodecs
self.codec = numcodecs.get_codec(kwargs)
else:
self.codec = codec
def encode(self, buf):
return self.codec.encode(buf)
def encode_to(self, buf, w):
w.write(self.encode(buf))
def decode(self, buf):
return memoryview(self.codec.decode(buf))
def decode_to(self, buf, out):
out[:] = self.decode(buf)
def config(self):
return self.codec.get_config()
"""
Compatibility support.
"""
import pickle
# Make sure we have Pickle 5
if pickle.HIGHEST_PROTOCOL < 5:
import pickle5 as pickle
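The shim exists because BinPickle depends on pickle protocol 5 (PEP 574), which supports out-of-band buffer serialization: large buffers can travel separately from the pickle stream instead of being copied into it. A minimal standard-library sketch of the mechanism:

```python
import pickle

# Out-of-band buffers with pickle protocol 5 (PEP 574).
data = bytearray(b'x' * 1024)
buffers = []
payload = pickle.dumps(pickle.PickleBuffer(data), protocol=5,
                       buffer_callback=buffers.append)
# The 1 KiB of data is NOT in the pickle stream itself:
assert len(payload) < 100
# Supplying the collected buffers on load reconstructs it without copying:
restored = pickle.loads(payload, buffers=buffers)
assert bytes(memoryview(restored)) == bytes(data)
```

On Python 3.8+ this is built in; on older interpreters the `pickle5` backport provides the same API, which is what the conditional import above arranges.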
"""
Environment management tool to instantiate Conda environments from Flit.
Requires flit-core and packaging to be installed.
"""
import os
import sys
import tempfile
import subprocess
from pathlib import Path
import argparse
from flit_core.config import read_flit_config, toml
from packaging.requirements import Requirement
from packaging.markers import default_environment
def write_env(obj, out):
try:
import yaml
yaml.safe_dump(obj, out)
except ImportError:
import json
json.dump(obj, out, indent=2)
def parse_args():
parser = argparse.ArgumentParser(description='Manage development environments.')
parser.add_argument('--python-version', '-V', metavar='VER',
help='use Python version VER')
parser.add_argument('--extra', '-E', metavar='EXTRA', action='append',
help='include EXTRA')
parser.add_argument('--name', '-n', metavar='NAME',
help='name Conda environment NAME')
parser.add_argument('--no-dev', action='store_true', help='skip dev dependencies')
parser.add_argument('--save-env', metavar='FILE',
help='save environment to FILE')
parser.add_argument('--create-env', action='store_true',
help='create Conda environment')
parser.add_argument('--update-env', action='store_true',
help='update Conda environment')
args = parser.parse_args()
return args
def load_project():
tp = Path('pyproject.toml')
fc = read_flit_config(tp)
pyp = toml.loads(tp.read_text())
return pyp, fc
class conda_config:
def __init__(self, project):
cfg = project.get('tool', {})
cfg = cfg.get('envtool', {})
self.config = cfg.get('conda', {})
@property
def name(self):
return str(self.config.get('name', 'dev-env'))
@property
def channels(self):
return [str(c) for c in self.config.get('channels', [])]
@property
def extras(self):
return self.config.get('extras', {})
def get_override(self, dep):
ovr = self.config.get('overrides', {})
dep_over = ovr.get(dep, {})
if isinstance(dep_over, str):
dep_over = {'name': dep_over}
return dep_over
def source(self, dep):
dov = self.get_override(dep)
return dov.get('source', None)
def conda_name(self, dep):
dov = self.get_override(dep)
return str(dov.get('name', dep))
def marker_env(args):
"Get the marker environment"
env = {}
env.update(default_environment())
if args.python_version:
env['python_version'] = args.python_version
env['python_full_version'] = args.python_version
return env
def req_active(env, req):
if req.marker:
return req.marker.evaluate(env)
else:
return True
def dep_str(cfg, req):
dep = cfg.conda_name(req.name)
if req.specifier:
dep += f' {req.specifier}'
return dep
def conda_env(args, pyp, flp):
cfg = conda_config(pyp)
mkenv = marker_env(args)
name = args.name
if name is None:
name = cfg.name
env = {'name': name}
if cfg.channels:
env['channels'] = cfg.channels
deps = []
if args.python_version:
deps.append(f'python ={args.python_version}')
elif flp.metadata['requires_python']:
deps.append('python ' + str(flp.metadata['requires_python']))
deps.append('pip')
extras = set(['.none'])
if not args.no_dev:
extras |= set(['dev', 'doc', 'test'])
if args.extra:
for e in args.extra:
if e == 'all':
extras |= set(flp.reqs_by_extra.keys())
else:
extras.add(e)
pip_deps = []
for e in extras:
for req in flp.reqs_by_extra.get(e, []):
req = Requirement(req)
if req_active(mkenv, req):
if req.url or cfg.source(req.name) == 'pip':
pip_deps.append(req)
else:
deps.append(dep_str(cfg, req))
for cr in cfg.extras.get(e, []):
deps.append(str(cr))
if pip_deps:
deps.append({'pip': [str(r) for r in pip_deps]})
env['dependencies'] = deps
return env
def env_command(env, cmd):
with tempfile.TemporaryDirectory() as td:
path = Path(td)
ef = path / 'environment.yml'
with ef.open('w') as f:
write_env(env, f)
print(cmd, 'environment', ef)
subprocess.run(['conda', 'env', cmd, '-f', os.fspath(ef)], check=True)
def main(args):
py_p, flit_p = load_project()
env = conda_env(args, py_p, flit_p)
if args.save_env:
with open(args.save_env, 'w') as ef:
write_env(env, ef)
elif args.create_env:
env_command(env, 'create')
elif args.update_env:
env_command(env, 'update')
else:
write_env(env, sys.stdout)
if __name__ == '__main__':
main(parse_args())
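The script reads its settings from a `[tool.envtool.conda]` table in `pyproject.toml`. A hypothetical fragment showing the keys the code above consumes (`name`, `channels`, `extras`, and `overrides` entries that either rename a dependency for Conda or route it through pip); the specific values are illustrative only:

```toml
[tool.envtool.conda]
name = "binpickle-dev"
channels = ["conda-forge"]

[tool.envtool.conda.overrides]
# A string override maps a PyPI name to a different Conda package name:
blosc = "python-blosc"
# A table override can instead force installation via pip:
msgpack = { source = "pip" }
```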
The MIT License (MIT)
Copyright (c) 2021 Boise State University
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
{% extends '!footer.html' %}
{% block extrafooter %}
<p>This material is based upon work supported by the National Science Foundation under
Grant No. <a href="https://md.ekstrandom.net/research/career">IIS 17-51278</a>. Any
opinions, findings, and conclusions or recommendations expressed in this material
are those of the author(s) and do not necessarily reflect the views of the
National Science Foundation. This page has not been approved by
Boise State University and does not reflect official university positions.</p>
<script data-goatcounter="https://binpickle.goatcounter.com/count"
async src="//gc.zgo.at/count.js"></script>
{% endblock %}
Codecs
======
.. py:module:: binpickle.codecs
BinPickle supports codecs to compress buffer content.
These are similar in spirit to numcodecs_, but automatically handle some cases
such as splitting arrays into blocks and can reduce copying in some situations.
.. _numcodecs: https://numcodecs.readthedocs.io/en/stable/
.. toctree::
.. autofunction:: make_codec
Codec API
---------
.. autoclass:: Codec
Codec Implementations
---------------------
Null codec
~~~~~~~~~~
.. autoclass:: Null
Chain codec
~~~~~~~~~~~
.. autoclass:: Chain
Blosc codec
~~~~~~~~~~~
.. autoclass:: Blosc
Gzip codec
~~~~~~~~~~
.. autoclass:: GZ
NumCodecs
~~~~~~~~~
BinPickle also supports any codec from numcodecs_ through the :class:`NC` wrapper. This
is automatically used by the :func:`make_codec` function, so you can also pass a NumCodecs
codec directly to :meth:`binpickle.BinPickler.compressed`.
# Dev environment
name: binpickle
channels:
- conda-forge
# - defaults
dependencies:
- python=3.8
- pip
- msgpack-python
- python-blosc
- numcodecs
- numpy>=1.17
- pandas>=1.0
- pytest
- pytest-cov
- hypothesis
- sphinx
- flake8
- twine
- pip:
- rstcheck
The MIT License (MIT)
Copyright (c) 2020 Boise State University
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
#!/usr/bin/env python
# setup.py generated by flit for tools that don't yet use PEP 517
from distutils.core import setup
packages = \
['binpickle', 'binpickle.codecs']
package_data = \
{'': ['*']}
install_requires = \
['msgpack >= 1.0']
extras_require = \
{":python_version < '3.8'": ['pickle5'],
'blosc': ['blosc'],
'dev': ['flake8', 'rstcheck'],
'doc': ['sphinx'],
'numcodecs': ['numcodecs >= 0.7'],
'test': ['pytest >= 5',
'pytest-cov',
'hypothesis >= 6',
'pandas >= 1.0',
'numpy >= 1.17']}
setup(name='binpickle',
version='0.3.4',
description='Optimized format for pickling binary data.',
author='Michael Ekstrand',
author_email='michaelekstrand@boisestate.edu',
url='https://binpickle.lenskit.org',
packages=packages,
package_data=package_data,
install_requires=install_requires,
extras_require=extras_require,
python_requires='>= 3.6.1',
)
import pytest
import numpy as np
from hypothesis import given, assume, settings
import hypothesis.strategies as st
from hypothesis.extra.numpy import arrays, integer_dtypes, floating_dtypes
from binpickle.codecs import *
if NC.AVAILABLE:
from numcodecs import LZ4, LZMA
KNOWN_CODECS = [c for c in CODECS.values() if c.NAME != 'numcodec'] # exclude numcodec from common tests
need_blosc = pytest.mark.skipif(not Blosc.AVAILABLE, reason='Blosc not available')
need_numcodecs = pytest.mark.skipif(not NC.AVAILABLE, reason='numcodecs not available')
def test_make_codec_none():
assert isinstance(make_codec(None), Null)
def test_make_codec_null_str():
assert isinstance(make_codec('null'), Null)
def test_make_codec_gz_str():
assert isinstance(make_codec('gz'), GZ)
def test_make_codec_return():
codec = GZ()
assert make_codec(codec) is codec
@need_numcodecs
def test_make_codec_wrap():
inner = LZ4()
codec = make_codec(inner)
assert isinstance(codec, NC)
assert codec.codec is inner
def test_make_codec_to_none():
"Test internal-use none codec"
assert make_codec(None, null_as_none=True) is None
assert make_codec(Null(), null_as_none=True) is None
def test_get_null_with_none():
codec = get_codec(None, {})
assert isinstance(codec, Null)
def test_get_null():
codec = get_codec('null', {})
assert isinstance(codec, Null)
def test_get_gz():
codec = get_codec('gz', {})
assert isinstance(codec, GZ)
assert codec.level == 9
def test_get_gz_level():
codec = get_codec('gz', {'level': 5})
assert isinstance(codec, GZ)
assert codec.level == 5
@need_blosc
def test_get_blosc():
codec = get_codec('blosc', {})
assert isinstance(codec, Blosc)
assert codec.level == 9
@need_blosc
def test_get_blosc_lvl():
codec = get_codec('blosc', {'name': 'zstd', 'level': 5})
assert isinstance(codec, Blosc)
assert codec.name == 'zstd'
assert codec.level == 5
@pytest.mark.parametrize('codec', KNOWN_CODECS)
@settings(deadline=500)
@given(st.binary())
def test_codec_roundtrip(codec, data):
"Round-trip a codec"
c = codec()
enc = c.encode(data)
dec = c.decode(enc)
assert len(dec) == len(data)
assert dec == data
@pytest.mark.parametrize('codec', KNOWN_CODECS)
@settings(deadline=500)
@given(arrays(st.one_of(integer_dtypes(), floating_dtypes()),
st.integers(10, 10000)))
def test_codec_roundtrip_array(codec, data):
"Round-trip a codec"
assume(not any(np.isnan(data)))
c = codec()
enc = c.encode(data)
dec = c.decode(enc)
a2 = np.frombuffer(dec, dtype=data.dtype)
assert len(a2) == len(data)
assert all(a2 == data)
@pytest.mark.parametrize('codec', KNOWN_CODECS)
def test_codec_decode_oversize(codec):
"Test decoding data to an oversized bytearray"
c = codec()
data = bytearray(np.random.randn(500))
out = bytearray(len(data) * 2)
enc = c.encode(data)
c.decode_to(enc, out)
assert len(out) == len(data)
assert out == data
@need_blosc
def test_large_blosc_encode():
"Test encoding Blosc data that needs to be split"
c = Blosc(blocksize=4096)
data = np.random.randn(10000)
enc = c.encode(data)
dec = c.decode(enc)
assert len(enc) < len(dec) # we should have compressed
assert len(dec) == data.nbytes
assert dec == memoryview(data)
a2 = np.frombuffer(dec)
assert len(a2) == len(data)
assert all(a2 == data)
@need_numcodecs
@given(st.binary())
def test_numcodec_roundtrip(data):
c = NC(LZMA())
buf = c.encode(data)
d2 = c.decode(buf)
assert len(d2) == len(data)
assert d2 == data
@need_numcodecs
@given(st.binary())
def test_chain(data):
# Useless but a test
codec = Chain([LZMA(), GZ()])
buf = codec.encode(data)
d2 = codec.decode(buf)
assert len(d2) == len(data)
assert d2 == data
@need_numcodecs
def test_chain_config():
codec = Chain([LZMA(), GZ()])
assert len(codec.codecs) == 2
assert isinstance(codec.codecs[0], NC)
assert isinstance(codec.codecs[1], GZ)
cfg = codec.config()
c2 = get_codec(Chain.NAME, cfg)
assert len(c2.codecs) == 2
assert isinstance(c2.codecs[0], NC)
assert isinstance(c2.codecs[1], GZ)
def test_is_not_numcodec():
assert not numcodecs.is_numcodec(GZ())
@need_numcodecs
def test_is_numcodec():
assert numcodecs.is_numcodec(LZ4())