binpickle
Advanced tools
| # Changes here will be overwritten by Copier | ||
| _commit: 0e64af4 | ||
| _src_path: https://github.com/lenskit/lk-project-template | ||
| package_name: binpickle | ||
| project_descr: Optimized format for pickling binary data. | ||
| project_name: binpickle | ||
| project_title: BinPickle | ||
| require_lint: true | ||
| start_year: 2020 |
| * text=auto | ||
| *.sh text eol=lf | ||
| *.bat text eol=crlf | ||
| *.cmd text eol=crlf |
# Lint & format validation for pushes to main and pull requests.
name: Validate Source Rules
on:
  push:
    branches:
      - main
  pull_request:

# Cancel superseded runs for the same ref to save CI time.
concurrency:
  group: check-${{github.ref}}
  cancel-in-progress: true

jobs:
  lint:
    name: Check Source Style
    runs-on: ubuntu-latest
    steps:
      - name: 📥 Check out source code
        uses: actions/checkout@v2
        with:
          fetch-depth: 0

      - name: 🐍 Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"
          cache: 'pip'

      - name: 🛠️ Install tools
        run: |
          pip install ruff

      - name: 🪮 Check source code formatting
        id: format
        run: |
          # BUG FIX: invoke the ruff installed above instead of `pipx run ruff`,
          # which would fetch a possibly different version on the fly.
          if ruff format --diff $PKG_DIR; then
            echo passed=yes >>"$GITHUB_OUTPUT"
          else
            echo passed=no >>"$GITHUB_OUTPUT"
            echo "::error::source code not formatted"
          fi
        env:
          PKG_DIR: binpickle

      - name: 🐜 Check source code lint rules
        id: lint
        run: |
          if ruff check --output-format=github $PKG_DIR; then
            echo passed=yes >>"$GITHUB_OUTPUT"
          else
            echo passed=no >>"$GITHUB_OUTPUT"
            echo "::error::source code lint check failed"
          fi
        env:
          PKG_DIR: binpickle

      - name: 🧾 Checking results
        run: |
          if [ "$FMT_PASSED" = no ]; then
            echo "::error::format failed, failing build"
            exit 1
          fi
          if [ "$LINT_PASSED" = no ]; then
            if [ "$LINT_REQUIRED" = true ]; then
              echo "::error::lint failed, failing build"
              exit 2
            else
              echo "::error::lint failed but non-mandatory"
            fi
          fi
        env:
          # BUG FIX: the format step's id is `format`, not `fmt`; the old
          # `steps.fmt.outputs.passed` always expanded to an empty string,
          # so formatting failures never failed the build.
          FMT_PASSED: ${{ steps.format.outputs.passed }}
          LINT_PASSED: ${{ steps.lint.outputs.passed }}
          # BUG FIX: the shell compares against lowercase `true`; the old
          # value `True` never matched, making lint effectively optional.
          LINT_REQUIRED: true
# Test matrix + source-distribution build and PyPI publish on release.
name: Test and Package
on:
  push:
    branches:
      - main
  release:
    types: [created,published]
  pull_request:

concurrency:
  group: test-${{github.ref}}
  cancel-in-progress: true

jobs:
  test:
    name: Test with Python ${{matrix.python}} on ${{matrix.platform}}
    runs-on: ${{matrix.platform}}-latest
    strategy:
      matrix:
        platform:
          - macos
          - windows
          - ubuntu
        python:
          - "3.10"
          - "3.11"
          - "3.12"
        # NOTE: removed a dead `exclude` entry for macos/python 3.9 —
        # 3.9 is not in the matrix, so the exclusion could never match.
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{matrix.python}}
      - name: Set up dependencies
        run: |
          pip install -e '.[test]'
      - name: Run tests
        run: python -m pytest --cov=binpickle --cov-report=xml tests
      - name: Save test results
        uses: lenskit/lkbuild/actions/save-test-results@main
        with:
          artifact-name: test-${{matrix.platform}}-py${{matrix.python}}

  report:
    name: Process test results
    runs-on: ubuntu-latest
    needs: [test]
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Report test results
        uses: lenskit/lkbuild/actions/report-test-results@main

  sdist:
    name: Build Source Packages
    runs-on: ubuntu-latest
    needs: [test]
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Fetch Git tags
        run: git fetch --tags
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: "3.10"
      - name: Install Python deps
        # BUG FIX: twine is invoked by the publish step below but was never
        # installed; install it alongside `build` so release uploads work.
        run: pip install -U build twine
      - name: Build distribution
        run: python -m build
      - name: Save archive
        uses: actions/upload-artifact@v1
        with:
          name: pypi-pkgs
          path: dist
      - name: List dist dir
        run: ls -R dist
      - name: Publish PyPI packages
        if: github.event_name == 'release'
        run: |
          twine upload dist/*
        shell: bash
        env:
          TWINE_NON_INTERACTIVE: y
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.TWINE_TOKEN }}
| { | ||
| "mypy-type-checker.reportingScope": "workspace", | ||
| "[python]": { | ||
| "editor.defaultFormatter": "charliermarsh.ruff", | ||
| "editor.formatOnSave": true, | ||
| }, | ||
| } |
| Metadata-Version: 2.1 | ||
| Name: binpickle | ||
| Version: 0.4.0a1 | ||
| Summary: Optimized format for pickling binary data. | ||
| Author-email: Michael Ekstrand <mdekstrand@drexel.edu> | ||
| License: Copyright (c) 2020–2023 Boise State University | ||
| Copyright (c) 2023 Michael Ekstrand | ||
| Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| of this software and associated documentation files (the "Software"), to deal | ||
| in the Software without restriction, including without limitation the rights | ||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| copies of the Software, and to permit persons to whom the Software is | ||
| furnished to do so, subject to the following conditions: | ||
| > The above copyright notice and this permission notice shall be included in | ||
| > all copies or substantial portions of the Software. | ||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| SOFTWARE. | ||
| Project-URL: Homepage, https://binpickle.lenskit.org | ||
| Project-URL: GitHub, https://github.com/lenskit/binpickle | ||
| Classifier: License :: OSI Approved :: MIT License | ||
| Classifier: Programming Language :: Python :: 3 | ||
| Classifier: Programming Language :: Python :: 3.10 | ||
| Classifier: Programming Language :: Python :: 3.11 | ||
| Classifier: Operating System :: OS Independent | ||
| Requires-Python: >=3.10 | ||
| Description-Content-Type: text/markdown | ||
| License-File: LICENSE.md | ||
| Requires-Dist: msgpack>=1.0 | ||
| Requires-Dist: numcodecs>=0.12 | ||
| Requires-Dist: typing-extensions~=4.8 | ||
| Provides-Extra: dev | ||
| Requires-Dist: setuptools>=64; extra == "dev" | ||
| Requires-Dist: setuptools_scm>=8; extra == "dev" | ||
| Requires-Dist: ruff; extra == "dev" | ||
| Requires-Dist: mypy~=1.5; extra == "dev" | ||
| Requires-Dist: copier; extra == "dev" | ||
| Requires-Dist: sphinx-autobuild; extra == "dev" | ||
| Requires-Dist: humanize~=4.0; extra == "dev" | ||
| Requires-Dist: msgpack-types; extra == "dev" | ||
| Requires-Dist: pandas-stubs; extra == "dev" | ||
| Provides-Extra: test | ||
| Requires-Dist: pytest>=5; extra == "test" | ||
| Requires-Dist: pytest-cov; extra == "test" | ||
| Requires-Dist: hypothesis>=6; extra == "test" | ||
| Requires-Dist: pandas>=1.4; extra == "test" | ||
| Requires-Dist: numpy>=1.22; extra == "test" | ||
| Provides-Extra: doc | ||
| Requires-Dist: sphinx>=4.2; extra == "doc" | ||
| Requires-Dist: sphinxext-opengraph>=0.5; extra == "doc" | ||
| Requires-Dist: furo; extra == "doc" | ||
| # BinPickle - efficient binary pickled data | ||
| [](https://badge.fury.io/py/binpickle) | ||
|  | ||
| [](https://codecov.io/gh/lenskit/binpickle) | ||
| This package uses the new Pickle Protocol 5 added in Python 3.8 to efficiently | ||
| serialize large objects, particularly from scientific Python packages, to an | ||
| on-disk format. This format is designed to support two use cases: | ||
| 1. Serializing data-intensive statistical models in a memory-mappable format so | ||
| multiple processes can share the same (read-only) model memory. | ||
| 2. Serializing data-intensive statistical models with good compression for long-term | ||
| storage and cross-machine transportation. | ||
| BinPickle does this by using Pickle 5's out-of-band buffer serialization support to | ||
| write buffers uncompressed and page-aligned for memory mapping (use case 1) or with | ||
| per-buffer efficient compression with libraries like Blosc (use case 2). | ||
| ## Format Stability | ||
| We do **not** yet guarantee the stability of the BinPickle format. We will avoid gratuitous changes, | ||
| but BinPickle 1.0 will be the first with a stability guarantee. | ||
| ## Acknowledgements | ||
| This material is based upon work supported by the National Science Foundation under | ||
| Grant No. IIS 17-51278. Any opinions, findings, and conclusions or recommendations | ||
| expressed in this material are those of the author(s) and do not necessarily reflect | ||
| the views of the National Science Foundation. This page has not been approved by | ||
| Boise State University and does not reflect official university positions. |
| msgpack>=1.0 | ||
| numcodecs>=0.12 | ||
| typing-extensions~=4.8 | ||
| [dev] | ||
| setuptools>=64 | ||
| setuptools_scm>=8 | ||
| ruff | ||
| mypy~=1.5 | ||
| copier | ||
| sphinx-autobuild | ||
| humanize~=4.0 | ||
| msgpack-types | ||
| pandas-stubs | ||
| [doc] | ||
| sphinx>=4.2 | ||
| sphinxext-opengraph>=0.5 | ||
| furo | ||
| [test] | ||
| pytest>=5 | ||
| pytest-cov | ||
| hypothesis>=6 | ||
| pandas>=1.4 | ||
| numpy>=1.22 |
| .copier-answers.yml | ||
| .editorconfig | ||
| .gitattributes | ||
| .gitignore | ||
| .readthedocs.yml | ||
| LICENSE.md | ||
| README.md | ||
| codecov.yml | ||
| conftest.py | ||
| pyproject.toml | ||
| .github/workflows/check-sources.yml | ||
| .github/workflows/test.yml | ||
| .vscode/settings.json | ||
| binpickle/__init__.py | ||
| binpickle/_util.py | ||
| binpickle/encode.py | ||
| binpickle/errors.py | ||
| binpickle/format.py | ||
| binpickle/read.py | ||
| binpickle/write.py | ||
| binpickle.egg-info/PKG-INFO | ||
| binpickle.egg-info/SOURCES.txt | ||
| binpickle.egg-info/dependency_links.txt | ||
| binpickle.egg-info/requires.txt | ||
| binpickle.egg-info/top_level.txt | ||
| docs/conf.py | ||
| docs/format.rst | ||
| docs/index.rst | ||
| docs/read.rst | ||
| docs/write.rst | ||
| docs/_templates/base.html | ||
| stubs/numcodecs/__init__.pyi | ||
| stubs/numcodecs/abc.pyi | ||
| stubs/numcodecs/registry.pyi | ||
| tests/test_file_info.py | ||
| tests/test_format.py | ||
| tests/test_rw.py | ||
| tests/test_util.py | ||
| tests/test_validation.py |
| binpickle |
| """ | ||
| Internal utility functions for Binpickle. | ||
| """ | ||
| from __future__ import annotations | ||
| from typing import Optional, Any | ||
| import hashlib | ||
| from typing_extensions import Buffer | ||
| naturalsize: Optional[Any] | ||
| try: | ||
| from humanize import naturalsize | ||
| except ImportError: | ||
| naturalsize = None | ||
def human_size(bytes: int | float) -> str:
    """Format a byte count as a human-readable size string.

    Uses :func:`humanize.naturalsize` when the optional ``humanize``
    package is installed; otherwise falls back to a plain MiB rendering.
    """
    if naturalsize is None:
        mib = bytes / (1024 * 1024)
        return f"{mib:.2f} MiB"
    return naturalsize(bytes, binary=True, format="%.2f")
def hash_buffer(buf: Buffer) -> bytes:
    """Compute the SHA-256 digest of a buffer's contents."""
    # take a zero-copy view when the input is not already a memoryview
    view = buf if isinstance(buf, memoryview) else memoryview(buf)
    return hashlib.sha256(view).digest()
| """ | ||
| Support for encoding and decoding. | ||
| """ | ||
| from __future__ import annotations | ||
| from typing import Optional, TypeAlias, Callable, overload | ||
| from typing_extensions import Buffer | ||
| from numcodecs.abc import Codec | ||
| from numcodecs.registry import get_codec | ||
| from binpickle.format import CodecSpec | ||
| CodecFunc: TypeAlias = Callable[[Buffer], Codec | str | CodecSpec | None] | ||
| CodecArg: TypeAlias = Codec | str | CodecSpec | CodecFunc | ||
| ResolvedCodec: TypeAlias = Codec | CodecFunc | ||
@overload
def resolve_codec(codec: CodecSpec) -> Codec:
    ...


@overload
def resolve_codec(codec: CodecArg) -> ResolvedCodec:
    ...


@overload
def resolve_codec(codec: CodecArg, buf: Buffer) -> Codec | None:
    ...


def resolve_codec(codec: CodecArg, buf: Optional[Buffer] = None) -> ResolvedCodec | None:
    """
    Resolve a codec arg into an instantiated codec.

    Args:
        codec: A codec id string, a spec dict for
            :func:`numcodecs.registry.get_codec`, an instantiated
            :class:`~numcodecs.abc.Codec`, or a callable mapping a buffer
            to any of those forms (or ``None`` to mean "no codec").
        buf: The buffer the codec will apply to.  Only consulted when
            ``codec`` is callable; when omitted, callables are returned
            unresolved so they can be applied per-buffer later.

    Returns:
        The resolved codec, the original callable (when ``buf`` was not
        supplied), or ``None`` when a callable declined to pick a codec.

    Raises:
        TypeError: if ``codec`` is not one of the supported argument forms.
    """
    if isinstance(codec, str):
        # a bare id string is shorthand for a {"id": ...} spec dict
        return resolve_codec({"id": codec})
    elif isinstance(codec, dict):
        return get_codec(codec)
    elif isinstance(codec, Codec):
        return codec
    elif hasattr(codec, "__call__"):
        if buf is None:
            # no buffer yet — defer resolution to a later per-buffer call
            return codec
        else:
            spec = codec(buf)
            if spec is None:
                return None
            else:
                # the callable may return any codec-arg form; resolve recursively
                return resolve_codec(spec, buf)
    else:
        raise TypeError(f"invalid codec argument {type(codec)}")
class BinPickleError(Exception):
    """
    Base class for Binpickle errors.

    All binpickle-specific exceptions derive from this class, so callers
    can catch it to handle any binpickle failure.
    """


class FormatError(BinPickleError):
    """
    The Binpickle file is invalid.

    Raised when the file's structure (magic bytes, version, header or
    trailer layout) does not conform to the BinPickle format.
    """


class IntegrityError(BinPickleError):
    """
    The Binpickle file failed an integrity check.

    Raised when a stored hash does not match the file contents,
    indicating corruption.
    """
| {% extends '!base.html' %} | ||
| {% block theme_scripts %} | ||
| <script data-goatcounter="https://binpickle.goatcounter.com/count" | ||
| async src="//gc.zgo.at/count.js"></script> | ||
| {% endblock %} |
+20
| Copyright (c) 2020–2023 Boise State University | ||
| Copyright (c) 2023 Michael Ekstrand | ||
| Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| of this software and associated documentation files (the "Software"), to deal | ||
| in the Software without restriction, including without limitation the rights | ||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| copies of the Software, and to permit persons to whom the Software is | ||
| furnished to do so, subject to the following conditions: | ||
| > The above copyright notice and this permission notice shall be included in | ||
| > all copies or substantial portions of the Software. | ||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| SOFTWARE. |
| from . import abc | ||
| from . import registry |
from abc import ABC
from typing_extensions import Buffer, Optional, Self

class Codec(ABC):
    """Type stub for :class:`numcodecs.abc.Codec`."""

    # registry identifier for the codec (None on abstract bases)
    codec_id: Optional[str]
    def encode(self, buf: Buffer) -> Buffer: ...
    def decode(self, buf: Buffer, out: Optional[Buffer] = None) -> Buffer: ...
    def get_config(self) -> dict: ...
    @classmethod
    def from_config(cls, cfg: dict) -> Self: ...
from .abc import Codec

# Type stubs for numcodecs.registry.
# registry of available codec classes, keyed by codec id
codec_registry: dict[str, Codec]
def get_codec(config: dict) -> Codec: ...
| from pathlib import Path | ||
| from binpickle import file_info | ||
| from binpickle.read import FileStatus | ||
| from binpickle.write import dump | ||
def test_missing_file(tmp_path: Path):
    # A path that does not exist reports MISSING and is not valid.
    file = tmp_path / "data.bpk"
    info = file_info(file)
    assert info.status == FileStatus.MISSING
    assert not info.is_valid


def test_empty_file(tmp_path: Path):
    # A zero-byte file cannot be a BinPickle file.
    file = tmp_path / "data.bpk"
    file.write_bytes(b"")
    info = file_info(file)
    assert info.status == FileStatus.INVALID
    assert not info.is_valid


def test_invalid_file(tmp_path: Path):
    # A file of plausible size but with wrong content is invalid.
    file = tmp_path / "data.bpk"
    file.write_bytes(b"0" * 4096)
    info = file_info(file)
    assert info.status == FileStatus.INVALID
    assert not info.is_valid


def test_valid_file(tmp_path: Path):
    # Dumping an object (even None) yields a recognized BinPickle file
    # whose reported size matches the on-disk size.
    file = tmp_path / "data.bpk"
    dump(None, file)
    info = file_info(file)
    assert info.status == FileStatus.BINPICKLE
    assert info.is_valid
    assert info.size == file.stat().st_size
| import os | ||
| import logging | ||
| import numpy as np | ||
| import pandas as pd | ||
| import pytest | ||
| from binpickle import dump, BinPickleFile | ||
| from binpickle.errors import IntegrityError | ||
| _log = logging.getLogger(__name__) | ||
def test_verify_index(tmp_path, rng: np.random.Generator):
    """Corrupting the index/trailer region should fail the hash check on open."""
    # BUG FIX: renamed from `test_verfy_index` (typo in "verify").
    file = tmp_path / "data.bpk"
    df = pd.DataFrame(
        {
            "key": np.arange(0, 5000),
            "count": rng.integers(0, 1000, 5000),
            "score": rng.normal(10, 2, 5000),
        }
    )
    dump(df, file, codecs=["lz4"])

    # corrupt the file: overwrite the final two bytes, which fall in the
    # trailer at the end of the file
    stat = os.stat(file)
    _log.info("%s: length %d", file, stat.st_size)
    with open(file, "r+b") as f:
        f.seek(stat.st_size - 2)
        f.write(b"XX")

    # opening the file verifies the index hash, so this should raise
    with pytest.raises(IntegrityError, match=r"incorrect hash"):
        with BinPickleFile(file) as _bpf:
            pass


def test_verify_buffer(tmp_path, rng: np.random.Generator):
    """Corrupting buffer data should fail that buffer's hash check on load."""
    # BUG FIX: renamed from `test_verfy_buffer` (typo in "verify").
    file = tmp_path / "data.bpk"
    df = pd.DataFrame(
        {
            "key": np.arange(0, 5000),
            "count": rng.integers(0, 1000, 5000),
            "score": rng.normal(10, 2, 5000),
        }
    )
    dump(df, file, codecs=["lz4"])

    # corrupt the file: overwrite bytes shortly after the header, inside
    # the first encoded buffer (the index itself stays intact)
    stat = os.stat(file)
    _log.info("%s: length %d", file, stat.st_size)
    with open(file, "r+b") as f:
        f.seek(32)
        f.write(b"XXXXXXXX")

    # the index is intact so the file opens; loading verifies buffer hashes
    with BinPickleFile(file) as bpf:
        with pytest.raises(IntegrityError, match=r"incorrect hash"):
            bpf.load()
+16
-4
| root = true | ||
| [*] | ||
| indent_style = space | ||
| indent_size = 4 | ||
| charset = utf-8 | ||
| insert_final_newline = true | ||
| trim_trailing_whitespace = true | ||
| insert_final_newline = true | ||
| indent_size = 4 | ||
| indent_style = space | ||
| [{*.yaml,*.yml}] | ||
| [{*.json,*.yml,*.yaml,*.yml.jinja}] | ||
| indent_size = 2 | ||
| [*.toml] | ||
| indent_size = 2 | ||
| [*.sh] | ||
| end_of_line = lf | ||
| [*.{bat,cmd}] | ||
| end_of_line = crlf | ||
| [*.md] | ||
| trim_trailing_whitespace = false |
+48
-8
@@ -1,10 +0,50 @@ | ||
| /.vscode | ||
| /build | ||
| /dist | ||
| /.hypothesis | ||
| *.egg-info/ | ||
| *.dist-info/ | ||
| .coverage | ||
| # log and debug outputs | ||
| *.log | ||
| *.pdb | ||
| *.prof | ||
| *.lprof | ||
| emissions.csv | ||
| intel_power_gadget_log.csv | ||
| .coverage* | ||
| coverage.xml | ||
| cov-reports/ | ||
| test-logs/ | ||
| htmlcov/ | ||
| # caches and working directories | ||
| __pycache__/ | ||
| *.pyc | ||
| __pycache__ | ||
| .ipynb_checkpoints/ | ||
| dask-worker-space/ | ||
| .idea/ | ||
| .*_cache/ | ||
| .hypothesis/ | ||
| .tox/ | ||
| .vagrant/ | ||
| .venv/ | ||
| scratch/ | ||
| # build outputs | ||
| build/ | ||
| dist/ | ||
| *.egg-info | ||
| *.pyd | ||
| *.so | ||
| *.dll | ||
| *.exp | ||
| *.lib | ||
| *.o | ||
| *.obj | ||
| # environment locks that aren't committed | ||
| /*env*.yml | ||
| conda-lock.yml | ||
| *.lock | ||
| *.lock.yml | ||
| # Editor and OS cruft | ||
| .DS_Store | ||
| ._.DS_Store | ||
| *~ | ||
| *.tmp | ||
| .vs/ |
+5
-1
| version: 2 | ||
| build: | ||
| os: ubuntu-22.04 | ||
| tools: | ||
| python: "3.11" | ||
| sphinx: | ||
@@ -7,3 +12,2 @@ configuration: docs/conf.py | ||
| python: | ||
| version: 3.8 | ||
| install: | ||
@@ -10,0 +14,0 @@ - method: pip |
@@ -5,5 +5,13 @@ """ | ||
| __version__ = '0.3.4' | ||
| from importlib.metadata import version, PackageNotFoundError | ||
| from .write import dump, BinPickler # noqa: F401 | ||
| from .read import load, BinPickleFile # noqa: F401 | ||
| from .write import dump, BinPickler | ||
| from .read import load, BinPickleFile, file_info | ||
| try: | ||
| __version__ = version("binpickle") | ||
| except PackageNotFoundError: | ||
| # package is not installed | ||
| pass | ||
| __all__ = ["dump", "BinPickler", "load", "BinPickleFile", "file_info"] |
+56
-28
@@ -5,12 +5,27 @@ """ | ||
| from dataclasses import dataclass, field, fields | ||
| import struct | ||
| from typing import NamedTuple | ||
| from typing import TypeAlias | ||
| MAGIC = b'BPCK' | ||
| VERSION = 1 | ||
| HEADER_FORMAT = struct.Struct('!4sHHq') | ||
| TRAILER_FORMAT = struct.Struct('!QLL') | ||
| from binpickle.errors import FormatError | ||
| CodecSpec: TypeAlias = dict[str, str | bool | int | float | None] | ||
| """ | ||
| Type of codec specification dictionaries, to be passed to | ||
| :func:`numcodecs.registry.get_codec`. | ||
| """ | ||
| class FileHeader(NamedTuple): | ||
| BufferTypeInfo: TypeAlias = tuple[str, str, tuple[int, ...]] | ||
| """ | ||
| Type of buffer type (and size/shape) information. | ||
| """ | ||
| MAGIC = b"BPCK" | ||
| VERSION = 2 | ||
| HEADER_FORMAT = struct.Struct("!4sHHq") | ||
| TRAILER_FORMAT = struct.Struct("!QL32s") | ||
| @dataclass | ||
| class FileHeader: | ||
| """ | ||
@@ -20,7 +35,10 @@ File header for a BinPickle file. The header is a 16-byte sequence containing the | ||
| 1. File version (2 bytes, big-endian). Currently only version 1 exists. | ||
| 2. Reserved (2 bytes). Set to 0. | ||
| 1. File version (2 bytes, big-endian). | ||
| 2. Flags (2 bytes). Currently no flags are defined, so this is set to 0. | ||
| 3. File length (8 bytes, big-endian). Length is signed; if the file length is not known, | ||
| this field is set to -1. | ||
| """ | ||
| SIZE = HEADER_FORMAT.size | ||
| version: int = VERSION | ||
@@ -36,11 +54,14 @@ "The NumPy file version." | ||
| @classmethod | ||
| def decode(cls, buf, *, verify=True): | ||
| def decode(cls, buf: bytes, *, verify=True): | ||
| "Decode a file header from bytes." | ||
| m, v, pad, off = HEADER_FORMAT.unpack(buf) | ||
| if len(buf) != HEADER_FORMAT.size: | ||
| raise FormatError("incorrect header length") | ||
| m, v, flags, off = HEADER_FORMAT.unpack(buf) | ||
| if verify and m != MAGIC: | ||
| raise ValueError('invalid magic {}'.format(m)) | ||
| raise FormatError("invalid magic {}".format(m)) | ||
| if verify and v != VERSION: | ||
| raise ValueError('invalid version {}'.format(v)) | ||
| if verify and pad != 0: | ||
| raise ValueError('invalid padding') | ||
| raise FormatError("invalid version {}".format(v)) | ||
| if verify and flags != 0: | ||
| raise FormatError("unsupported flags") | ||
| return cls(v, off) | ||
@@ -58,3 +79,3 @@ | ||
| elif self.length > 0: | ||
| raise ValueError('file size {} not enough for BinPickle'.format(self.length)) | ||
| raise FormatError("file size {} not enough for BinPickle".format(self.length)) | ||
| else: | ||
@@ -64,5 +85,6 @@ return None # We do not know the file size | ||
| class FileTrailer(NamedTuple): | ||
| @dataclass | ||
| class FileTrailer: | ||
| """ | ||
| File trailer for a BinPickle file. The trailer is a 16-byte sequence that tells the | ||
| File trailer for a BinPickle file. The trailer is a 44-byte sequence that tells the | ||
| reader where to find the rest of the binpickle data. It consists of the following | ||
@@ -73,12 +95,14 @@ fields: | ||
| 2. Index length (4 bytes, big-endian). The number of bytes in the index. | ||
| 3. Index checksum (4 bytes, big-endian). The Adler32 checksum of the index data. | ||
| 3. Index digest (32 bytes). The SHA256 digest of the index data. | ||
| """ | ||
| SIZE = TRAILER_FORMAT.size | ||
| offset: int | ||
| length: int | ||
| checksum: int | ||
| hash: bytes | ||
| def encode(self): | ||
| "Encode the file trailer as bytes." | ||
| return TRAILER_FORMAT.pack(self.offset, self.length, self.checksum) | ||
| return TRAILER_FORMAT.pack(self.offset, self.length, self.hash) | ||
@@ -88,10 +112,12 @@ @classmethod | ||
| "Decode a file trailer from bytes." | ||
| o, l, c = TRAILER_FORMAT.unpack(buf) | ||
| return cls(o, l, c) | ||
| off, len, ck = TRAILER_FORMAT.unpack(buf) | ||
| return cls(off, len, ck) | ||
| class IndexEntry(NamedTuple): | ||
| @dataclass | ||
| class IndexEntry: | ||
| """ | ||
| Index entry for a buffer in the BinPickle index. | ||
| """ | ||
| offset: int | ||
@@ -103,10 +129,12 @@ "The position in the file where the buffer begins (bytes)." | ||
| "The decoded length of the buffer in bytes." | ||
| checksum: int | ||
| "The Adler-32 checksum of the encoded buffer data." | ||
| codec: tuple = None | ||
| "The codec used to encode the buffer, or None." | ||
| hash: bytes | ||
| "The SHA-256 checksum of the encoded buffer data." | ||
| info: BufferTypeInfo | None | ||
| "Type information for the buffer (if available)." | ||
| codecs: list[CodecSpec] = field(default_factory=list) | ||
| "The sequence of codecs used to encode the buffer." | ||
| def to_repr(self): | ||
| "Convert an index entry to its MsgPack-compatible representation" | ||
| return dict((k, getattr(self, k)) for k in self._fields) | ||
| return dict((f.name, getattr(self, f.name)) for f in fields(self)) | ||
@@ -113,0 +141,0 @@ @classmethod |
+126
-39
@@ -0,10 +1,18 @@ | ||
| from dataclasses import dataclass | ||
| from enum import Enum | ||
| import hashlib | ||
| import mmap | ||
| import logging | ||
| import io | ||
| from zlib import adler32 | ||
| from os import PathLike | ||
| from typing import Optional | ||
| from typing_extensions import Buffer | ||
| import pickle | ||
| import msgpack | ||
| from .compat import pickle | ||
| from binpickle.encode import resolve_codec | ||
| from binpickle.errors import BinPickleError, FormatError, IntegrityError | ||
| from .format import FileHeader, IndexEntry, FileTrailer | ||
| from .codecs import get_codec | ||
| from ._util import hash_buffer | ||
@@ -14,2 +22,18 @@ _log = logging.getLogger(__name__) | ||
| class FileStatus(Enum): | ||
| MISSING = 0 | ||
| INVALID = 1 | ||
| BINPICKLE = 2 | ||
| @dataclass | ||
| class BPKInfo: | ||
| status: FileStatus | ||
| size: int | ||
| @property | ||
| def is_valid(self): | ||
| return self.status == FileStatus.BINPICKLE | ||
| class BinPickleFile: | ||
@@ -27,11 +51,23 @@ """ | ||
| :meth:`close` is called. | ||
| verify(bool): | ||
| If ``True`` (the default), verify file checksums while reading. | ||
| """ | ||
| def __init__(self, filename, *, direct=False): | ||
| filename: str | PathLike | ||
| direct: bool | ||
| verify: bool | ||
| header: FileHeader | ||
| trailer: FileTrailer | ||
| _map: Optional[mmap.mmap] | ||
| _mv: Optional[memoryview] | ||
| _index_buf: Optional[memoryview] | ||
| entries: list[IndexEntry] | ||
| def __init__(self, filename, *, direct: bool = False, verify: bool = True): | ||
| self.filename = filename | ||
| self.direct = direct | ||
| with open(filename, 'rb') as bpf: | ||
| self.verify = verify | ||
| with open(filename, "rb") as bpf: | ||
| self.header = FileHeader.read(bpf) | ||
| self._map = mmap.mmap(bpf.fileno(), self.header.length, | ||
| access=mmap.ACCESS_READ) | ||
| self._map = mmap.mmap(bpf.fileno(), self.header.length, access=mmap.ACCESS_READ) | ||
| self._mv = memoryview(self._map) | ||
@@ -47,3 +83,3 @@ self._read_index() | ||
| def load(self): | ||
| def load(self) -> object: | ||
| """ | ||
@@ -53,6 +89,7 @@ Load the object from the binpickle file. | ||
| if not self.entries: | ||
| raise ValueError('empty pickle file has no objects') | ||
| raise ValueError("empty pickle file has no objects") | ||
| p_bytes = self._read_buffer(self.entries[-1], direct=True) | ||
| _log.debug('unpickling %d bytes and %d buffers', | ||
| len(p_bytes), len(self.entries) - 1) | ||
| _log.debug( | ||
| "unpickling %d bytes and %d buffers", memoryview(p_bytes).nbytes, len(self.entries) - 1 | ||
| ) | ||
@@ -63,3 +100,8 @@ buf_gen = (self._read_buffer(e) for e in self.entries[:-1]) | ||
| def find_errors(self): | ||
| @property | ||
| def is_mappable(self) -> bool: | ||
| "Query whether this file can be memory-mapped." | ||
| return all(not e.codecs for e in self.entries) | ||
| def find_errors(self) -> list[str]: | ||
| """ | ||
@@ -71,9 +113,10 @@ Verify binpickle data structure validity. If the file is invalid, returns | ||
| invalid msgpack formats in the index won't be detected here. This method checks | ||
| buffer checksums, offset overlaps, and such. | ||
| buffer hashes, offset overlaps, and such. | ||
| """ | ||
| errors = [] | ||
| assert self._index_buf is not None, "file not loaded" | ||
| i_sum = adler32(self._index_buf) | ||
| if i_sum != self.trailer.checksum: | ||
| errors.append(f'invalid index checksum ({i_sum} != {self.trailer.checksum})') | ||
| i_sum = hashlib.sha256(self._index_buf).digest() | ||
| if i_sum != self.trailer.hash: | ||
| errors.append("index hash mismatch") | ||
@@ -83,14 +126,14 @@ position = 16 | ||
| if e.offset < position: | ||
| errors.append(f'entry {i}: offset {e.offset} before expected start {position}') | ||
| errors.append(f"entry {i}: offset {e.offset} before expected start {position}") | ||
| buf = self._read_buffer(e, direct=True) | ||
| ndec = len(buf) | ||
| ndec = memoryview(buf).nbytes | ||
| if ndec != e.dec_length: | ||
| errors.append(f'entry {i}: decoded to {ndec} bytes, expected {e.dec_length}') | ||
| cks = adler32(self._read_buffer(e, direct=True, decode=False)) | ||
| if cks != e.checksum: | ||
| errors.append('entry {i}: invalid checksum ({cks} != {e.checksum}') | ||
| errors.append(f"entry {i}: decoded to {ndec} bytes, expected {e.dec_length}") | ||
| cks = hashlib.sha256(self._read_buffer(e, direct=True, decode=False)).digest() | ||
| if cks != e.hash: | ||
| errors.append("entry {i}: invalid digest") | ||
| return errors | ||
| def close(self): | ||
| def close(self) -> None: | ||
| """ | ||
@@ -106,9 +149,10 @@ Close the BinPickle file. If the file is in direct mode, all | ||
| def _read_index(self): | ||
| def _read_index(self) -> None: | ||
| tpos = self.header.trailer_pos() | ||
| if tpos is None: | ||
| raise ValueError('no file length, corrupt binpickle file?') | ||
| raise FormatError("no file length, corrupt binpickle file?") | ||
| assert self._mv is not None, "file not open" | ||
| buf = self._mv[tpos:] | ||
| assert len(buf) == 16 | ||
| assert len(buf) == 44 | ||
| self.trailer = FileTrailer.decode(buf) | ||
@@ -119,6 +163,17 @@ | ||
| self._index_buf = self._mv[i_start:i_end] | ||
| try: | ||
| self._verify_buffer(self._index_buf, self.trailer.hash, "index") | ||
| except Exception as e: | ||
| self._index_buf.release() | ||
| self._index_buf = None | ||
| raise e | ||
| self.entries = [IndexEntry.from_repr(e) for e in msgpack.unpackb(self._index_buf)] | ||
| _log.debug('read %d entries from file', len(self.entries)) | ||
| _log.debug("read %d entries from file", len(self.entries)) | ||
| def _read_buffer(self, entry: IndexEntry, *, direct=None, decode=True): | ||
| def _read_buffer( | ||
| self, entry: IndexEntry, *, direct: Optional[bool] = None, decode: bool = True | ||
| ) -> Buffer: | ||
| assert self._mv is not None, "file not open" | ||
| assert self._map is not None, "file not open" | ||
| start = entry.offset | ||
@@ -130,18 +185,35 @@ length = entry.enc_length | ||
| if decode and entry.codec: | ||
| name, cfg = entry.codec | ||
| _log.debug('decoding %d bytes from %d with %s', length, start, name) | ||
| out = bytearray(entry.dec_length) | ||
| codec = get_codec(name, cfg) | ||
| codec.decode_to(self._mv[start:end], out) | ||
| buf = self._mv[start:end] | ||
| try: | ||
| self._verify_buffer(buf, entry.hash) | ||
| except Exception as e: | ||
| # make sure we release the buffer, even if it's captured by the stack trace | ||
| buf.release() | ||
| raise e | ||
| _log.debug("decoding %d bytes from %d with %s", length, start, entry.codecs) | ||
| if decode and entry.codecs: | ||
| codecs = [resolve_codec(c) for c in entry.codecs] | ||
| out: Buffer = buf | ||
| for codec in codecs[::-1]: | ||
| out = codec.decode(out) | ||
| return out | ||
| if direct: | ||
| _log.debug('mapping %d bytes from %d', length, start) | ||
| return self._mv[start:end] | ||
| _log.debug("mapping %d bytes from %d", length, start) | ||
| return buf | ||
| else: | ||
| _log.debug('copying %d bytes from %d', length, start) | ||
| return self._map[start:end] | ||
| _log.debug("copying %d bytes from %d", length, start) | ||
| return buf.tobytes() | ||
| def _verify_buffer(self, buf: memoryview, hash: bytes, msg: str = "buffer"): | ||
| if self.verify: | ||
| _log.debug("verifying %s", msg) | ||
| bhash = hash_buffer(buf) | ||
| if bhash != hash: | ||
| raise IntegrityError(f"{msg} has incorrect hash, corrupt file?") | ||
| def load(file): | ||
| def load(file: str | PathLike) -> object: | ||
| """ | ||
@@ -156,1 +228,16 @@ Load an object from a BinPickle file. | ||
| return bpf.load() | ||
| def file_info(file: str | PathLike) -> BPKInfo: | ||
| """ | ||
| Test whether a file is a BinPickle file, and if so, return basic information | ||
| about it. | ||
| """ | ||
| try: | ||
| with open(file, "rb") as f: | ||
| info = FileHeader.read(f) | ||
| return BPKInfo(FileStatus.BINPICKLE, info.length) | ||
| except FileNotFoundError: | ||
| return BPKInfo(FileStatus.MISSING, 0) | ||
| except BinPickleError: | ||
| return BPKInfo(FileStatus.INVALID, 0) |
+103
-76
| import mmap | ||
| from os import PathLike | ||
| import warnings | ||
| import logging | ||
| import io | ||
| from zlib import adler32 | ||
| import hashlib | ||
| import pickle | ||
| import msgpack | ||
| from .compat import pickle | ||
| from .format import FileHeader, FileTrailer, IndexEntry | ||
| from . import codecs | ||
| from typing_extensions import Buffer, List, Optional, Self | ||
| import numpy as np | ||
| from .format import CodecSpec, FileHeader, FileTrailer, IndexEntry | ||
| from .encode import ResolvedCodec, resolve_codec, CodecArg | ||
| from ._util import human_size | ||
| _log = logging.getLogger(__name__) | ||
| def _align_pos(pos, size=mmap.PAGESIZE): | ||
| def _align_pos(pos: int, size: int = mmap.PAGESIZE) -> int: | ||
| "Advance a position to be aligned." | ||
@@ -24,23 +30,2 @@ rem = pos % size | ||
| class CKOut: | ||
| """ | ||
| Wrapper for binary output that computes checksums and sizes on the fly. | ||
| """ | ||
| def __init__(self, base): | ||
| self.bytes = 0 | ||
| self.checksum = 1 | ||
| self.delegate = base | ||
| def write(self, data): | ||
| # get a memory view so we have a portable count of bytes | ||
| mv = memoryview(data) | ||
| self.bytes += mv.nbytes | ||
| self.checksum = adler32(data, self.checksum) | ||
| return self.delegate.write(data) | ||
| def flush(self): | ||
| self.delegate.flush() | ||
| class BinPickler: | ||
@@ -68,13 +53,30 @@ """ | ||
| def __init__(self, filename, *, align=False, codec=None): | ||
| filename: str | PathLike | ||
| align: bool | ||
| codecs: list[ResolvedCodec] | ||
| entries: List[IndexEntry] | ||
| _file: io.BufferedWriter | ||
| def __init__( | ||
| self, | ||
| filename: str | PathLike, | ||
| *, | ||
| align=False, | ||
| codecs: Optional[list[CodecArg]] = None, | ||
| ): | ||
| self.filename = filename | ||
| self.align = align | ||
| self._file = open(filename, 'wb') | ||
| self._file = open(filename, "wb") | ||
| self.entries = [] | ||
| self.codec = codec | ||
| if codecs is None: | ||
| self.codecs = [] | ||
| else: | ||
| # pre-resolve the codecs | ||
| self.codecs = [resolve_codec(c) for c in codecs] | ||
| self._init_header() | ||
| @classmethod | ||
| def mappable(cls, filename): | ||
| def mappable(cls, filename: str | PathLike): | ||
| "Convenience method to construct a pickler for memory-mapped use." | ||
@@ -84,22 +86,32 @@ return cls(filename, align=True) | ||
| @classmethod | ||
| def compressed(cls, filename, codec=codecs.GZ()): | ||
| def compressed(cls, filename: str | PathLike, codec: CodecArg = "gzip"): | ||
| "Convenience method to construct a pickler for compressed storage." | ||
| return cls(filename, codec=codec) | ||
| return cls(filename, codecs=[codec]) | ||
| def dump(self, obj): | ||
| def dump(self, obj: object) -> None: | ||
| "Dump an object to the file. Can only be called once." | ||
| bio = io.BytesIO() | ||
| pk = pickle.Pickler(bio, protocol=pickle.HIGHEST_PROTOCOL, | ||
| buffer_callback=self._write_buffer) | ||
| pk = pickle.Pickler( | ||
| bio, protocol=pickle.HIGHEST_PROTOCOL, buffer_callback=self._write_buffer | ||
| ) | ||
| pk.dump(obj) | ||
| buf = bio.getbuffer() | ||
| _log.info('pickled %d bytes with %d buffers', buf.nbytes, len(self.entries)) | ||
| tot_enc = sum(e.enc_length for e in self.entries) | ||
| tot_dec = sum(e.dec_length for e in self.entries) | ||
| _log.info( | ||
| "pickled %d bytes with %d buffers totaling %s (%s encoded)", | ||
| buf.nbytes, | ||
| len(self.entries), | ||
| human_size(tot_dec), | ||
| human_size(tot_enc), | ||
| ) | ||
| self._write_buffer(buf) | ||
| self._finish_file() | ||
| def close(self): | ||
| def close(self) -> None: | ||
| "Close the bin pickler." | ||
| self._file.close() | ||
| def __enter__(self): | ||
| def __enter__(self) -> Self: | ||
| return self | ||
@@ -111,27 +123,30 @@ | ||
| def _init_header(self): | ||
| def _init_header(self) -> None: | ||
| pos = self._file.tell() | ||
| if pos > 0: | ||
| warnings.warn('BinPickler not at beginning of file') | ||
| warnings.warn("BinPickler not at beginning of file") | ||
| h = FileHeader() | ||
| _log.debug('initializing header for %s', self.filename) | ||
| _log.debug("initializing header for %s", self.filename) | ||
| self._file.write(h.encode()) | ||
| assert self._file.tell() == pos + 16 | ||
| assert self._file.tell() == pos + FileHeader.SIZE | ||
| def _encode_buffer(self, buf, out): | ||
| if self.codec is None: | ||
| out.write(buf) | ||
| return None | ||
| elif hasattr(self.codec, '__call__'): | ||
| # codec is callable, call it to get the codec | ||
| codec = self.codec(buf) | ||
| codec = codecs.make_codec(codec) | ||
| else: | ||
| codec = codecs.make_codec(self.codec) | ||
| def _encode_buffer( | ||
| self, | ||
| buf: Buffer, | ||
| ) -> tuple[Buffer, list[CodecSpec]]: | ||
| # fast-path empty buffers | ||
| if memoryview(buf).nbytes == 0: | ||
| return b"", [] | ||
| codec.encode_to(buf, out) | ||
| return (codec.NAME, codec.config()) | ||
| # resolve any deferred codecs | ||
| codecs = [resolve_codec(c, buf) for c in self.codecs] | ||
| def _write_buffer(self, buf): | ||
| mv = memoryview(buf) | ||
| for codec in codecs: | ||
| if codec is not None: | ||
| buf = codec.encode(buf) | ||
| return buf, [c.get_config() for c in codecs if c is not None] | ||
| def _write_buffer(self, buf: Buffer) -> None: | ||
| mv = buf.raw() if isinstance(buf, pickle.PickleBuffer) else memoryview(buf) | ||
| offset = self._file.tell() | ||
@@ -143,3 +158,3 @@ | ||
| nzeds = off2 - offset | ||
| zeds = b'\x00' * nzeds | ||
| zeds = b"\x00" * nzeds | ||
| self._file.write(zeds) | ||
@@ -151,30 +166,42 @@ assert self._file.tell() == off2 | ||
| _log.debug('writing %d bytes at position %d', length, offset) | ||
| cko = CKOut(self._file) | ||
| c_spec = self._encode_buffer(buf, cko) | ||
| _log.debug('encoded %d bytes to %d (%.2f%% saved)', length, cko.bytes, | ||
| (length - cko.bytes) / length * 100 if length else -0.0) | ||
| _log.debug('used codec %s', c_spec) | ||
| binfo = None | ||
| if isinstance(mv.obj, np.ndarray): | ||
| binfo = ("ndarray", str(mv.obj.dtype), mv.obj.shape) | ||
| assert self._file.tell() == offset + cko.bytes | ||
| _log.debug("writing %d bytes at position %d", length, offset) | ||
| buf, c_spec = self._encode_buffer(buf) | ||
| enc_len = memoryview(buf).nbytes | ||
| _log.debug( | ||
| "encoded %d bytes to %d (%.2f%% saved)", | ||
| length, | ||
| enc_len, | ||
| (length - enc_len) / length * 100 if length else -0.0, | ||
| ) | ||
| _log.debug("used codecs %s", c_spec) | ||
| hash = hashlib.sha256(buf) | ||
| _log.debug("has hash %s", hash.hexdigest()) | ||
| self._file.write(buf) | ||
| self.entries.append(IndexEntry(offset, cko.bytes, length, cko.checksum, | ||
| c_spec)) | ||
| assert self._file.tell() == offset + enc_len | ||
| def _write_index(self): | ||
| self.entries.append(IndexEntry(offset, enc_len, length, hash.digest(), binfo, c_spec)) | ||
| def _write_index(self) -> FileTrailer: | ||
| buf = msgpack.packb([e.to_repr() for e in self.entries]) | ||
| pos = self._file.tell() | ||
| nbs = len(buf) | ||
| _log.debug('writing %d index entries (%d bytes) at position %d', | ||
| len(self.entries), nbs, pos) | ||
| _log.debug( | ||
| "writing %d index entries (%d bytes) at position %d", len(self.entries), nbs, pos | ||
| ) | ||
| self._file.write(buf) | ||
| ft = FileTrailer(pos, nbs, adler32(buf)) | ||
| hash = hashlib.sha256(buf) | ||
| ft = FileTrailer(pos, nbs, hash.digest()) | ||
| self._file.write(ft.encode()) | ||
| return ft | ||
| def _finish_file(self): | ||
| def _finish_file(self) -> None: | ||
| self._write_index() | ||
| pos = self._file.tell() | ||
| _log.debug('finalizing file with length %d', pos) | ||
| _log.debug("finalizing file with length %d", pos) | ||
| h = FileHeader(length=pos) | ||
@@ -186,3 +213,3 @@ self._file.seek(0) | ||
| def dump(obj, file, *, mappable=False, codec=codecs.GZ()): | ||
| def dump(obj, file: str | PathLike, *, mappable: bool = False, codecs: list[CodecArg] = ["gzip"]): | ||
| """ | ||
@@ -206,4 +233,4 @@ Dump an object to a BinPickle file. This is a convenience wrapper | ||
| in this case. | ||
| codec(codecs.Codec): | ||
| The codec to use to compress the data, when not saving for | ||
| codecs: | ||
| The codecs to use to compress the data, when not saving for | ||
| memory-mapping. | ||
@@ -215,4 +242,4 @@ """ | ||
| else: | ||
| bpk = BinPickler(file, align=False, codec=codec) | ||
| bpk = BinPickler(file, align=False, codecs=codecs) | ||
| with bpk: | ||
| bpk.dump(obj) |
+2
-1
| ignore: | ||
| - build*.py | ||
| - build-tools/ | ||
| - build-tools/* | ||
| - lkbuild/* |
+12
-4
| from hypothesis import settings | ||
| import pytest | ||
| import numpy as np | ||
| @pytest.fixture | ||
| def rng(): | ||
| return np.random.default_rng() | ||
| # set up profiles | ||
| settings.register_profile('default', deadline=500) | ||
| settings.register_profile('large', max_examples=5000) | ||
| settings.register_profile('fast', max_examples=10) | ||
| settings.load_profile('default') | ||
| settings.register_profile("default", deadline=1000) | ||
| settings.register_profile("large", max_examples=5000) | ||
| settings.register_profile("fast", max_examples=10) | ||
| settings.load_profile("default") |
+19
-24
| import os | ||
| import sys | ||
| sys.path.insert(0, os.path.abspath('..')) | ||
| sys.path.insert(0, os.path.abspath("..")) | ||
| import sphinx_rtd_theme | ||
| import binpickle | ||
| project = 'BinPickle' | ||
| copyright = '2020 Boise State University' | ||
| author = 'Michael D. Ekstrand' | ||
| project = "BinPickle" | ||
| copyright = "2023 Michael Ekstrand" | ||
| author = "Michael D. Ekstrand" | ||
@@ -16,31 +14,28 @@ release = binpickle.__version__ | ||
| extensions = [ | ||
| 'sphinx.ext.napoleon', | ||
| 'sphinx.ext.autodoc', | ||
| 'sphinx.ext.autosummary', | ||
| 'sphinx.ext.intersphinx', | ||
| 'sphinx_rtd_theme' | ||
| "sphinx.ext.napoleon", | ||
| "sphinx.ext.autodoc", | ||
| "sphinx.ext.autosummary", | ||
| "sphinx.ext.intersphinx", | ||
| "sphinxext.opengraph", | ||
| ] | ||
| source_suffix = '.rst' | ||
| source_suffix = ".rst" | ||
| pygments_style = 'sphinx' | ||
| highlight_language = 'python3' | ||
| pygments_style = "sphinx" | ||
| highlight_language = "python3" | ||
| html_theme = 'sphinx_rtd_theme' | ||
| html_theme = "furo" | ||
| html_theme_options = { | ||
| 'github_user': 'lenskit', | ||
| 'github_repo': 'binpickle', | ||
| 'travis_button': False, | ||
| 'canonical_url': 'https://binpickle.lenskit.org/', | ||
| 'font_family': 'Georgia, Charter, serif' | ||
| } | ||
| templates_path = ['_templates'] | ||
| templates_path = ["_templates"] | ||
| intersphinx_mapping = { | ||
| 'python': ('https://docs.python.org/3/', None) | ||
| "python": ("https://docs.python.org/3/", None), | ||
| "numpy": ("https://docs.scipy.org/doc/numpy/", None), | ||
| "sklearn": ("https://scikit-learn.org/stable/", None), | ||
| } | ||
| autodoc_default_options = { | ||
| 'members': True, | ||
| 'member-order': 'bysource' | ||
| "members": True, | ||
| "member-order": "bysource" | ||
| } |
+31
-2
@@ -10,4 +10,6 @@ Format | ||
| Users will not need these classes. They are documented here in the interest of documenting | ||
| the file format. | ||
| the file format. The current format version is **2**, first used in binpickle 0.4.0; this | ||
| is not compatible with prior versions. | ||
| File Structure | ||
@@ -34,3 +36,3 @@ -------------- | ||
| 4. The file index, stored as a list of :py:class:`IndexEntry` objects encoded in MsgPack. | ||
| 5. 16-byte trailer (see :py:class:`FileTrailer`). | ||
| 5. 44-byte trailer (see :py:class:`FileTrailer`). | ||
@@ -50,1 +52,28 @@ The position and length of each buffer is stored in the index, so buffers can have arbitrary | ||
| .. autoclass:: IndexEntry | ||
| Format History | ||
| -------------- | ||
| The current file format version is **2**, introduced in BinPickle 0.4.0. | ||
| .. _format-v2: | ||
| Version 2 | ||
| ~~~~~~~~~ | ||
| Version 2 introduced the following: | ||
| * Replaced Adler32 checksums with SHA-256 digests. | ||
| * Replaced the single ``codec`` field with a ``codecs`` list field. The new | ||
| field directly specifies a list of :py:mod:`numcodecs` codec configurations | ||
| in the order they were applied to encode the buffer. The old native codecs | ||
| have been removed, all codecs come from numcodecs. | ||
| * Added the ``info`` field to :py:class:`IndexEntry` to store information about | ||
| the buffer's data, when available (currently stores NumPy data type and shape | ||
| when serializing a NumPy array). | ||
| Version 1 | ||
| ~~~~~~~~~ | ||
| Version 1 is the original BinPickle format, used through the 0.3 release series. It | ||
| is no longer supported. |
+19
-7
@@ -15,8 +15,9 @@ BinPickle | ||
| BinPickle wraps Python's pickling functionality, so any object that can be pickled | ||
| (including SciKit models) can be stored with BinPickle. If the object supports | ||
| Pickle Protocol 5 (or stores most of its data in NumPy arrays, which in recent | ||
| versions support Pickle 5), then large array data will be efficiently stored, | ||
| either compressed (with Blosc compression by default) or page-aligned and ready | ||
| for memory-mapping, possibly into multiple processes simultaneously. | ||
| BinPickle wraps Python's pickling functionality, so any object that can be | ||
| pickled (including SciKit models) can be stored with BinPickle. If the object | ||
| supports Pickle Protocol 5 (or stores most of its data in NumPy arrays, which in | ||
| recent versions support Pickle 5), then large array data will be efficiently | ||
| stored, either compressed (using any compressor supported by | ||
| :py:mod:`numcodecs`) or page-aligned and ready for memory-mapping, possibly into | ||
| multiple processes simultaneously. | ||
@@ -43,3 +44,2 @@ Quick Start | ||
| read | ||
| codecs | ||
| format | ||
@@ -55,1 +55,13 @@ | ||
| .. _`joblib`: https://github.com/joblib/joblib | ||
| Acknowledgements | ||
| ---------------- | ||
| This material is based upon work supported by the National Science Foundation under | ||
| Grant No. `IIS 17-51278`_. Any | ||
| opinions, findings, and conclusions or recommendations expressed in this material | ||
| are those of the author(s) and do not necessarily reflect the views of the | ||
| National Science Foundation. This page has not been approved by | ||
| Boise State University and does not reflect official university positions. | ||
| .. _`IIS 17-51278`: https://md.ekstrandom.net/research/career |
+0
-0
@@ -0,0 +0,0 @@ Reading BinPickle Files |
+0
-0
@@ -0,0 +0,0 @@ Writing BinPickle Files |
+89
-5
@@ -1,7 +0,91 @@ | ||
| Metadata-Version: 1.1 | ||
| Metadata-Version: 2.1 | ||
| Name: binpickle | ||
| Version: 0.3.4 | ||
| Version: 0.4.0a1 | ||
| Summary: Optimized format for pickling binary data. | ||
| Home-page: https://binpickle.lenskit.org | ||
| Author: Michael Ekstrand | ||
| Author-email: michaelekstrand@boisestate.edu | ||
| Author-email: Michael Ekstrand <mdekstrand@drexel.edu> | ||
| License: Copyright (c) 2020–2023 Boise State University | ||
| Copyright (c) 2023 Michael Ekstrand | ||
| Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| of this software and associated documentation files (the "Software"), to deal | ||
| in the Software without restriction, including without limitation the rights | ||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| copies of the Software, and to permit persons to whom the Software is | ||
| furnished to do so, subject to the following conditions: | ||
| > The above copyright notice and this permission notice shall be included in | ||
| > all copies or substantial portions of the Software. | ||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| SOFTWARE. | ||
| Project-URL: Homepage, https://binpickle.lenksit.org | ||
| Project-URL: GitHub, https://github.com/lenskit/binpickle | ||
| Classifier: License :: OSI Approved :: MIT License | ||
| Classifier: Programming Language :: Python :: 3 | ||
| Classifier: Programming Language :: Python :: 3.10 | ||
| Classifier: Programming Language :: Python :: 3.11 | ||
| Classifier: Operating System :: OS Independent | ||
| Requires-Python: >=3.10 | ||
| Description-Content-Type: text/markdown | ||
| License-File: LICENSE.md | ||
| Requires-Dist: msgpack>=1.0 | ||
| Requires-Dist: numcodecs>=0.12 | ||
| Requires-Dist: typing-extensions~=4.8 | ||
| Provides-Extra: dev | ||
| Requires-Dist: setuptools>=64; extra == "dev" | ||
| Requires-Dist: setuptools_scm>=8; extra == "dev" | ||
| Requires-Dist: ruff; extra == "dev" | ||
| Requires-Dist: mypy~=1.5; extra == "dev" | ||
| Requires-Dist: copier; extra == "dev" | ||
| Requires-Dist: sphinx-autobuild; extra == "dev" | ||
| Requires-Dist: humanize~=4.0; extra == "dev" | ||
| Requires-Dist: msgpack-types; extra == "dev" | ||
| Requires-Dist: pandas-stubs; extra == "dev" | ||
| Provides-Extra: test | ||
| Requires-Dist: pytest>=5; extra == "test" | ||
| Requires-Dist: pytest-cov; extra == "test" | ||
| Requires-Dist: hypothesis>=6; extra == "test" | ||
| Requires-Dist: pandas>=1.4; extra == "test" | ||
| Requires-Dist: numpy>=1.22; extra == "test" | ||
| Provides-Extra: doc | ||
| Requires-Dist: sphinx>=4.2; extra == "doc" | ||
| Requires-Dist: sphinxext-opengraph>=0.5; extra == "doc" | ||
| Requires-Dist: furo; extra == "doc" | ||
| # BinPickle - efficient binary pickled data | ||
| [](https://badge.fury.io/py/binpickle) | ||
|  | ||
| [](https://codecov.io/gh/lenskit/binpickle) | ||
| This package uses the new Pickle Protocol 5 added in Python 3.8 to efficiently | ||
| serialize large objects, particularly from scientific Python packages, to an | ||
| on-disk format. This format is designed to support two use cases: | ||
| 1. Serializing data-intensive statistical models in a memory-mappable format so | ||
| multiple processes can share the same (read-only) model memory. | ||
| 2. Serializing data-intensive statistical models with good compression for long-term | ||
| storage and cross-machine transportation. | ||
| BinPickle does this by using Pickle 5's out-of-band buffer serialization support to | ||
| write buffers uncompressed and page-aligned for memory mapping (use case 1) or with | ||
| per-buffer efficient compression with libraries like Blosc (use case 2). | ||
| ## Format Stability | ||
| We do **not** yet guarantee the stability of the BinPickle format. We will avoid gratuitous changes, | ||
| but BinPickle 1.0 will be the first with a stability guarantee. | ||
| ## Acknowledgements | ||
| This material is based upon work supported by the National Science Foundation under | ||
| Grant No. IIS 17-51278. Any opinions, findings, and conclusions or recommendations | ||
| expressed in this material are those of the author(s) and do not necessarily reflect | ||
| the views of the National Science Foundation. This page has not been approved by | ||
| Boise State University and does not reflect official university positions. |
+70
-37
| [build-system] | ||
| requires = ["flit_core >=2,<4"] | ||
| build-backend = "flit_core.buildapi" | ||
| requires = ["setuptools>=64", "setuptools_scm>=8"] | ||
| build-backend = "setuptools.build_meta" | ||
| [tool.flit.metadata] | ||
| module = "binpickle" | ||
| author = "Michael Ekstrand" | ||
| author-email = "michaelekstrand@boisestate.edu" | ||
| home-page = "https://binpickle.lenskit.org" | ||
| classifiers = ["License :: OSI Approved :: MIT License"] | ||
| description-file = "README.md" | ||
| requires-python = ">= 3.6.1" | ||
| requires = [ | ||
| "msgpack >= 1.0", | ||
| "pickle5; python_version < '3.8'" | ||
| [project] | ||
| name = "binpickle" | ||
| description = "Optimized format for pickling binary data." | ||
| authors = [ | ||
| {name="Michael Ekstrand", email="mdekstrand@drexel.edu"} | ||
| ] | ||
| classifiers = [ | ||
| "License :: OSI Approved :: MIT License", | ||
| "Programming Language :: Python :: 3", | ||
| "Programming Language :: Python :: 3.10", | ||
| "Programming Language :: Python :: 3.11", | ||
| "Operating System :: OS Independent", | ||
| ] | ||
| requires-python = ">= 3.10" | ||
| readme = "README.md" | ||
| license = { file = "LICENSE.md" } | ||
| dynamic = ["version"] | ||
| dependencies = [ | ||
| "msgpack >= 1.0", | ||
| "numcodecs >= 0.12", | ||
| "typing-extensions ~= 4.8", | ||
| ] | ||
| [tool.flit.metadata.urls] | ||
| GitHub = "https://github.com/lenskit/binpickle" | ||
| [tool.flit.metadata.requires-extra] | ||
| blosc = [ "blosc" ] | ||
| numcodecs = [ "numcodecs >= 0.7" ] | ||
| [project.optional-dependencies] | ||
| dev = [ | ||
| "setuptools>=64", | ||
| "setuptools_scm>=8", | ||
| "ruff", | ||
| "mypy ~=1.5", | ||
| "copier", | ||
| "sphinx-autobuild", | ||
| "humanize ~=4.0", | ||
| "msgpack-types", | ||
| "pandas-stubs", | ||
| ] | ||
| test = [ | ||
| "pytest >= 5", | ||
| "pytest-cov", | ||
| "hypothesis >= 6", | ||
| "pandas >= 1.0", | ||
| "numpy >= 1.17" | ||
| "pytest >= 5", | ||
| "pytest-cov", | ||
| "hypothesis >= 6", | ||
| "pandas >= 1.4", | ||
| "numpy >= 1.22", | ||
| ] | ||
| doc = ["sphinx"] | ||
| dev = [ | ||
| "flake8", | ||
| "rstcheck" | ||
| doc = [ | ||
| "sphinx >=4.2", | ||
| "sphinxext-opengraph >= 0.5", | ||
| "furo", | ||
| ] | ||
| [tools.flit.sdist] | ||
| include = ["tests/*"] | ||
| [project.urls] | ||
| Homepage = "https://binpickle.lenksit.org" | ||
| GitHub = "https://github.com/lenskit/binpickle" | ||
| # configure build tools | ||
| [tool.setuptools] | ||
| packages = ["binpickle"] | ||
| [tool.setuptools_scm] | ||
| version_scheme = "release-branch-semver" | ||
| # settings for generating conda environments for dev & CI, when needed | ||
| [tool.pyproject2conda] | ||
| channels = ["conda-forge"] | ||
| [tool.ruff] | ||
| line-length = 100 | ||
| target-version = "py310" | ||
| exclude = [ | ||
| ".github" | ||
| ".git", | ||
| "__pycache__", | ||
| "docs/conf.py", | ||
| "build", | ||
| "dist", | ||
| ] | ||
| [tool.envtool.conda] | ||
| name = "binpickle" | ||
| channels = ["conda-forge"] | ||
| [tool.envtool.conda.overrides] | ||
| msgpack = "msgpack-python" | ||
| [tool.mypy] | ||
| mypy_path = "$MYPY_CONFIG_FILE_DIR/stubs" | ||
| exclude = "^docs/" |
+3
-3
@@ -7,5 +7,5 @@ # BinPickle - efficient binary pickled data | ||
| This package uses the new Pickle Protocol 5 in Python 3.8 (or its `pickle5` backport) | ||
| to efficiently serialize large objects, particularly from scientific Python packages, | ||
| to an on-disk format. This format is designed to support two use cases: | ||
| This package uses the new Pickle Protocol 5 added in Python 3.8 to efficiently | ||
| serialize large objects, particularly from scientific Python packages, to an | ||
| on-disk format. This format is designed to support two use cases: | ||
@@ -12,0 +12,0 @@ 1. Serializing data-intensive statistical models in a memory-mappable format so |
+3
-11
@@ -1,12 +0,4 @@ | ||
| [flake8] | ||
| max-line-length = 100 | ||
| exclude = | ||
| .git | ||
| __pycache__ | ||
| docs/conf.py | ||
| build | ||
| dist | ||
| tests | ||
| [egg_info] | ||
| tag_build = | ||
| tag_date = 0 | ||
| [pep8] | ||
| max-line-length = 100 |
+14
-13
| from pytest import raises | ||
| from binpickle.format import * | ||
| from binpickle.errors import FormatError | ||
| from binpickle.format import FileHeader, FileTrailer, HEADER_FORMAT, TRAILER_FORMAT | ||
@@ -8,3 +9,5 @@ | ||
| assert HEADER_FORMAT.size == 16 | ||
| assert TRAILER_FORMAT.size == 16 | ||
| assert FileHeader.SIZE == 16 | ||
| assert TRAILER_FORMAT.size == 44 | ||
| assert FileTrailer.SIZE == 44 | ||
@@ -39,18 +42,16 @@ | ||
| def test_catch_bad_magic(): | ||
| with raises(ValueError) as exc: | ||
| FileHeader.decode(b'BNPQ\x00\x00\x00\x00' + (b'\x00' * 8)) | ||
| assert 'magic' in str(exc.value) | ||
| with raises(FormatError) as exc: | ||
| FileHeader.decode(b"BNPQ\x00\x00\x00\x00" + (b"\x00" * 8)) | ||
| assert "magic" in str(exc.value) | ||
| def test_catch_bad_version(): | ||
| with raises(ValueError) as exc: | ||
| FileHeader.decode(b'BPCK\x00\x02\x00\x00' + (b'\x00' * 8)) | ||
| assert 'version' in str(exc.value) | ||
| with raises(FormatError) as exc: | ||
| FileHeader.decode(b"BPCK\x00\x12\x00\x00" + (b"\x00" * 8)) | ||
| assert "invalid version" in str(exc.value) | ||
| def test_catch_bad_padding(): | ||
| with raises(ValueError) as exc: | ||
| FileHeader.decode(b'BPCK\x00\x01\x00\xff' + (b'\x00' * 8)) | ||
| assert 'padding' in str(exc.value) | ||
| with raises(FormatError) as exc: | ||
| FileHeader.decode(b"BPCK\x00\x02\x00\xff" + (b"\x00" * 8)) | ||
| assert "unsupported flags" in str(exc.value) |
+73
-69
@@ -7,2 +7,5 @@ import itertools as it | ||
| import pandas as pd | ||
| import numcodecs as nc | ||
| from numcodecs.registry import codec_registry | ||
| from numcodecs.abc import Codec | ||
@@ -16,34 +19,27 @@ import pytest | ||
| from binpickle.write import BinPickler, dump | ||
| from binpickle import codecs | ||
| RW_CTORS = [BinPickler, BinPickler.mappable, BinPickler.compressed] | ||
| RW_CODECS = [st.just(None), st.builds(codecs.GZ)] | ||
| if codecs.Blosc.AVAILABLE: | ||
| RW_CTORS.append(lambda f: BinPickler.compressed(f, codecs.Blosc('zstd', 5))) | ||
| RW_CODECS.append(st.builds(codecs.Blosc)) | ||
| RW_CODECS.append(st.builds(codecs.Blosc, st.just('zstd'))) | ||
| if codecs.NC.AVAILABLE: | ||
| import numcodecs | ||
| RW_CTORS.append(lambda f: BinPickler.compressed(f, numcodecs.LZMA())) | ||
| RW_CODECS.append(st.builds(codecs.NC, st.just(numcodecs.LZMA()))) | ||
| # also build a chain test | ||
| RW_CTORS.append(lambda f: BinPickler.compressed(f, codecs.Chain([numcodecs.MsgPack(), codecs.GZ()]))) | ||
| RW_CTORS = [ | ||
| BinPickler, | ||
| BinPickler.mappable, | ||
| BinPickler.compressed, | ||
| lambda f: BinPickler.compressed(f, nc.LZMA()), | ||
| ] | ||
| RW_CODECS: list[st.SearchStrategy[Codec | str | None]] = [ | ||
| st.just(None), | ||
| st.just("gzip"), | ||
| st.builds(nc.GZip), | ||
| st.builds(nc.LZMA), | ||
| ] | ||
| if "blosc" in codec_registry: | ||
| RW_CODECS.append(st.builds(nc.Blosc)) | ||
| RW_CODECS.append(st.builds(nc.Blosc, st.one_of(st.just("zstd"), st.just("lz4")))) | ||
| RW_CONFIGS = it.product( | ||
| RW_CTORS, | ||
| [False, True] | ||
| ) | ||
| RW_PARAMS = ['writer', 'direct'] | ||
| RW_CONFIGS = it.product(RW_CTORS, [False, True]) | ||
| RW_PARAMS = ["writer", "direct"] | ||
| @pytest.fixture | ||
| def rng(): | ||
| return np.random.default_rng() | ||
| def test_empty(tmp_path): | ||
| "Write a file with nothing in it" | ||
| file = tmp_path / 'data.bpk' | ||
| file = tmp_path / "data.bpk" | ||
@@ -53,3 +49,3 @@ with BinPickler(file) as w: | ||
| assert file.stat().st_size == 33 | ||
| assert file.stat().st_size == 61 | ||
@@ -62,5 +58,5 @@ with BinPickleFile(file) as bpf: | ||
| "Write a file with a single array" | ||
| file = tmp_path / 'data.bpk' | ||
| file = tmp_path / "data.bpk" | ||
| a = rng.integers(0, 5000, 1024, dtype='i4') | ||
| a = rng.integers(0, 5000, 1024, dtype="i4") | ||
@@ -78,3 +74,3 @@ with BinPickler(file) as w: | ||
| assert b2.nbytes == e.dec_length | ||
| a2 = np.frombuffer(b2, dtype='i4') | ||
| a2 = np.frombuffer(b2, dtype="i4") | ||
| assert len(a2) == len(a) | ||
@@ -87,9 +83,8 @@ assert all(a2 == a) | ||
| @settings(deadline=None) | ||
| @given(st.lists(st.binary()), | ||
| st.one_of(RW_CODECS)) | ||
| @given(st.lists(st.binary()), st.one_of(RW_CODECS)) | ||
| def test_write_encoded_arrays(arrays, codec): | ||
| with TemporaryDirectory('.test', 'binpickle-') as path: | ||
| file = Path(path) / 'data.bpk' | ||
| with TemporaryDirectory(".test", "binpickle-") as path: | ||
| file = Path(path) / "data.bpk" | ||
| with BinPickler.compressed(file, codec) as w: | ||
| with BinPickler(file, codecs=[codec] if codec else []) as w: | ||
| for a in arrays: | ||
@@ -104,4 +99,4 @@ w._write_buffer(a) | ||
| try: | ||
| if codec is not None: | ||
| assert e.codec | ||
| if codec is not None and e.dec_length > 0: | ||
| assert e.codecs | ||
| assert e.dec_length == len(a) | ||
@@ -118,5 +113,5 @@ dat = bpf._read_buffer(e) | ||
| "Pickle a NumPy array" | ||
| file = tmp_path / 'data.bpk' | ||
| file = tmp_path / "data.bpk" | ||
| a = rng.integers(0, 5000, 1024, dtype='i4') | ||
| a = rng.integers(0, 5000, 1024, dtype="i4") | ||
@@ -136,9 +131,11 @@ with BinPickler(file) as w: | ||
| "Pickle a Pandas data frame" | ||
| file = tmp_path / 'data.bpk' | ||
| file = tmp_path / "data.bpk" | ||
| df = pd.DataFrame({ | ||
| 'key': np.arange(0, 5000), | ||
| 'count': rng.integers(0, 1000, 5000), | ||
| 'score': rng.normal(10, 2, 5000) | ||
| }) | ||
| df = pd.DataFrame( | ||
| { | ||
| "key": np.arange(0, 5000), | ||
| "count": rng.integers(0, 1000, 5000), | ||
| "score": rng.normal(10, 2, 5000), | ||
| } | ||
| ) | ||
@@ -158,11 +155,12 @@ with writer(file) as w: | ||
| @pytest.mark.skipif(not codecs.NC.AVAILABLE, reason='numcodecs not available') | ||
| def test_pickle_frame_dyncodec(tmp_path, rng: np.random.Generator): | ||
| file = tmp_path / 'data.bpk' | ||
| file = tmp_path / "data.bpk" | ||
| df = pd.DataFrame({ | ||
| 'key': np.arange(0, 5000, dtype='i4'), | ||
| 'count': rng.integers(0, 1000, 5000), | ||
| 'score': rng.normal(10, 2, 5000) | ||
| }) | ||
| df = pd.DataFrame( | ||
| { | ||
| "key": np.arange(0, 5000, dtype="i4"), | ||
| "count": rng.integers(0, 1000, 5000), | ||
| "score": rng.normal(10, 2, 5000), | ||
| } | ||
| ) | ||
@@ -172,8 +170,8 @@ def codec(buf): | ||
| if isinstance(obj, np.ndarray) and obj.dtype == np.float64: | ||
| print('compacting double array') | ||
| return codecs.Chain([numcodecs.AsType('f4', 'f8'), codecs.Blosc('zstd', 9)]) | ||
| print("compacting double array") | ||
| return nc.AsType("f4", "f8") | ||
| else: | ||
| return codecs.Blosc('zstd', 9) | ||
| None | ||
| with BinPickler.compressed(file, codec) as w: | ||
| with BinPickler(file, codecs=[codec, nc.Blosc("zstd", 3)]) as w: | ||
| w.dump(df) | ||
@@ -186,6 +184,7 @@ | ||
| assert all(df2.columns == df.columns) | ||
| assert all(df2['key'] == df['key']) | ||
| assert all(df2['count'] == df['count']) | ||
| assert all(df2['score'].astype('f4') == df['score'].astype('f4')) | ||
| assert all(df2["key"] == df["key"]) | ||
| assert all(df2["count"] == df["count"]) | ||
| assert all(df2["score"].astype("f4") == df["score"].astype("f4")) | ||
| del df2 | ||
| assert bpf.entries[0].info | ||
@@ -195,12 +194,14 @@ | ||
| "Pickle a Pandas data frame" | ||
| file = tmp_path / 'data.bpk' | ||
| file = tmp_path / "data.bpk" | ||
| df = pd.DataFrame({ | ||
| 'key': np.arange(0, 5000), | ||
| 'count': rng.integers(0, 1000, 5000), | ||
| 'score': rng.normal(10, 2, 5000) | ||
| }) | ||
| df = pd.DataFrame( | ||
| { | ||
| "key": np.arange(0, 5000), | ||
| "count": rng.integers(0, 1000, 5000), | ||
| "score": rng.normal(10, 2, 5000), | ||
| } | ||
| ) | ||
| dump(df, file) | ||
| df2 = load(file) | ||
| df2: pd.DataFrame = load(file) | ||
@@ -212,2 +213,3 @@ assert all(df2.columns == df.columns) | ||
| @settings(deadline=None) | ||
| @given(arrays(scalar_dtypes(), st.integers(500, 10000))) | ||
@@ -218,4 +220,4 @@ def test_compress_many_arrays(a): | ||
| with TemporaryDirectory('.test', 'binpickle') as path: | ||
| file = Path(path) / 'data.bpk' | ||
| with TemporaryDirectory(".test", "binpickle") as path: | ||
| file = Path(path) / "data.bpk" | ||
@@ -227,2 +229,3 @@ with BinPickler.compressed(file) as w: | ||
| assert not bpf.find_errors() | ||
| assert not bpf.is_mappable | ||
| assert len(bpf.entries) in (1, 2) | ||
@@ -239,4 +242,4 @@ a2 = bpf.load() | ||
| assume(not any(np.isnan(a))) | ||
| with TemporaryDirectory('.test', 'binpickle') as path: | ||
| file = Path(path) / 'data.bpk' | ||
| with TemporaryDirectory(".test", "binpickle") as path: | ||
| file = Path(path) / "data.bpk" | ||
@@ -248,2 +251,3 @@ with BinPickler.mappable(file) as w: | ||
| assert not bpf.find_errors() | ||
| assert bpf.is_mappable | ||
| assert len(bpf.entries) in (1, 2) | ||
@@ -250,0 +254,0 @@ a2 = bpf.load() |
+2
-104
| import logging | ||
| import io | ||
| import zlib | ||
| import functools as ft | ||
| import numpy as np | ||
| from hypothesis import given, settings, HealthCheck | ||
| from hypothesis import given | ||
| import hypothesis.strategies as st | ||
| import pytest | ||
| from binpickle.write import _align_pos, CKOut | ||
| from binpickle.write import _align_pos | ||
@@ -17,7 +11,2 @@ _log = logging.getLogger(__name__) | ||
| def _split_blocks(*args): | ||
| blosc = pytest.importorskip('binpickle.codecs.blosc') | ||
| return blosc._split_blocks(*args) | ||
| @given(st.integers(100, 10000000)) | ||
@@ -28,92 +17,1 @@ def test_align(n): | ||
| assert res % 1024 == 0 | ||
| @given(st.binary()) | ||
| def test_checksum_bytes(data): | ||
| out = io.BytesIO() | ||
| cko = CKOut(out) | ||
| cko.write(data) | ||
| assert out.getbuffer() == data | ||
| assert cko.bytes == len(data) | ||
| assert cko.checksum == zlib.adler32(data) | ||
| @given(st.lists(st.binary(), min_size=1, max_size=10)) | ||
| def test_checksum_multi_bytes(arrays): | ||
| out = io.BytesIO() | ||
| cko = CKOut(out) | ||
| for a in arrays: | ||
| cko.write(a) | ||
| cat = ft.reduce(lambda b1, b2: b1 + b2, arrays) | ||
| assert out.getbuffer() == cat | ||
| assert cko.bytes == len(cat) | ||
| assert cko.checksum == zlib.adler32(cat) | ||
| def test_split_empty_block(): | ||
| blocks = _split_blocks(memoryview(b''), 10) | ||
| assert len(blocks) == 1 | ||
| assert blocks[0] == b'' | ||
| def test_split_one_block(): | ||
| blocks = _split_blocks(memoryview(b'asdf'), 10) | ||
| assert len(blocks) == 1 | ||
| assert blocks[0] == b'asdf' | ||
| def test_split_two_blocks(): | ||
| blocks = _split_blocks(memoryview(b'asdf'), 2) | ||
| assert len(blocks) == 2 | ||
| assert blocks[0] == b'as' | ||
| assert blocks[1] == b'df' | ||
| assert blocks[0].nbytes == 2 | ||
| assert blocks[1].nbytes == 2 | ||
| def test_split_blocks_mismatch(): | ||
| blocks = _split_blocks(memoryview(b'asdfg'), 2) | ||
| assert len(blocks) == 3 | ||
| assert blocks[0] == b'as' | ||
| assert blocks[0].nbytes == 2 | ||
| assert blocks[1] == b'df' | ||
| assert blocks[1].nbytes == 2 | ||
| assert blocks[2] == b'g' | ||
| assert blocks[2].nbytes == 1 | ||
| @settings(suppress_health_check=[HealthCheck.too_slow]) | ||
| @given(st.data()) | ||
| def test_split_blocks(data): | ||
| bs = data.draw(st.integers(8, 4096)) | ||
| input = data.draw(st.binary(min_size=bs//2, max_size=bs*8)) | ||
| _log.info('input size %d, block size %d', len(input), bs) | ||
| blocks = _split_blocks(memoryview(input), bs) | ||
| _log.info('split into %d blocks', len(blocks)) | ||
| assert all(b.nbytes <= bs for b in blocks) | ||
| assert all(len(b) <= bs for b in blocks) | ||
| assert sum(b.nbytes for b in blocks) == len(input) | ||
| reconst = ft.reduce(lambda buf, block: buf + block, blocks, bytes()) | ||
| assert len(reconst) == len(input) | ||
| assert reconst == input | ||
| @settings(suppress_health_check=[HealthCheck.too_slow]) | ||
| @given(st.data()) | ||
| def test_split_arrays(data): | ||
| bs = data.draw(st.integers(8, 4096)) | ||
| size = data.draw(st.integers(bs//8, bs*4)) | ||
| array = np.random.randn(size) | ||
| input = memoryview(array) | ||
| _log.info('input size %d (%d bytes), block size %d', len(input), input.nbytes, bs) | ||
| blocks = _split_blocks(memoryview(input), bs) | ||
| _log.info('split into %d blocks', len(blocks)) | ||
| assert all(b.nbytes <= bs for b in blocks) | ||
| assert all(len(b) <= bs for b in blocks) | ||
| assert sum(b.nbytes for b in blocks) == input.nbytes | ||
| reconst = ft.reduce(lambda buf, block: buf + block, blocks, bytes()) | ||
| assert len(reconst) == input.nbytes | ||
| rcv = memoryview(reconst).cast(input.format) | ||
| assert rcv == input | ||
| a2 = np.frombuffer(reconst, array.dtype) | ||
| assert all(a2 == array) |
| { | ||
| "problemMatcher": [ | ||
| { | ||
| "owner": "flake8", | ||
| "pattern": [ | ||
| { | ||
| "regexp": "^([^:]*):(\\d+):(\\d+): (error|warning): (\\w\\d\\d\\d) (.*)$", | ||
| "file": 1, | ||
| "line": 2, | ||
| "column": 3, | ||
| "severity": 4, | ||
| "message": 6 | ||
| } | ||
| ] | ||
| } | ||
| ] | ||
| } |
| template: | | ||
| ## Merged PRs | ||
| $CHANGES |
| name: Draft Release | ||
| on: | ||
| push: | ||
| # branches to consider in the event; optional, defaults to all | ||
| branches: | ||
| - master | ||
| jobs: | ||
| update_release_draft: | ||
| runs-on: ubuntu-latest | ||
| steps: | ||
| # Drafts your next Release notes as Pull Requests are merged into "master" | ||
| - uses: release-drafter/release-drafter@v5 | ||
| env: | ||
| GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} |
| name: Test and Package | ||
| on: | ||
| push: | ||
| branches: | ||
| - main | ||
| release: | ||
| types: [created,published] | ||
| pull_request: | ||
| jobs: | ||
| lint: | ||
| name: Check Source Style | ||
| runs-on: ubuntu-latest | ||
| steps: | ||
| - uses: actions/checkout@v2 | ||
| with: | ||
| fetch-depth: 0 | ||
| - name: Set up Python ${{matrix.python}} | ||
| uses: actions/setup-python@v2 | ||
| with: | ||
| python-version: 3.8 | ||
| - name: Prep Pip caching | ||
| id: pip-cache | ||
| run: | | ||
| echo "::set-output name=dir::$(pip cache dir)" | ||
| shell: bash | ||
| - name: Cache Pip wheels | ||
| uses: actions/cache@v1 | ||
| with: | ||
| path: ${{ steps.pip-cache.outputs.dir }} | ||
| key: py38-lint-pip-${{ hashFiles('*.egg-info/requires.txt')}} | ||
| - name: Install environment | ||
| run: | | ||
| pip install -U flit | ||
| - name: Install package | ||
| run: | | ||
| flit install --pth-file --extras dev | ||
| - name: Run lint | ||
| run: | | ||
| # Flake8 problem matcher & transform regex from https://github.com/TrueBrain/actions-flake8 | ||
| echo "::add-matcher::.github/flake8-matcher.json" | ||
| set -o pipefail | ||
| flake8 |sed -r 's/: ([^W][0-9][0-9][0-9])/: error: \1/;s/: (W[0-9][0-9][0-9])/: warning: \1/' | ||
| echo "::remove-matcher owner=flake8::" | ||
| test: | ||
| name: Test with Python ${{matrix.python}} on ${{matrix.platform}} | ||
| runs-on: ${{matrix.platform}}-latest | ||
| strategy: | ||
| matrix: | ||
| platform: | ||
| - macos | ||
| - windows | ||
| - ubuntu | ||
| python: | ||
| - 3.6 | ||
| - 3.7 | ||
| - 3.8 | ||
| - 3.9 | ||
| exclude: | ||
| - platform: macos | ||
| python: 3.9 | ||
| steps: | ||
| - uses: actions/checkout@v2 | ||
| with: | ||
| fetch-depth: 0 | ||
| - name: Set up Python ${{matrix.python}} | ||
| uses: actions/setup-python@v2 | ||
| with: | ||
| python-version: ${{matrix.python}} | ||
| - name: Prep Pip cache | ||
| id: pip-cache | ||
| run: | | ||
| echo "::set-output name=dir::$(pip cache dir)" | ||
| shell: bash | ||
| - name: Cache Pip wheels | ||
| uses: actions/cache@v1 | ||
| with: | ||
| path: ${{ steps.pip-cache.outputs.dir }} | ||
| key: ${{ matrix.platform }}-py${{ matrix.python }}-pip-${{ hashFiles('*.egg-info/requires.txt')}} | ||
| - name: Install environment | ||
| run: | | ||
| pip install -U flit | ||
| - name: Install package | ||
| run: | | ||
| flit install --pth-file --extras all | ||
| - name: Run tests | ||
| run: python -m pytest --cov=binpickle --cov-report=xml tests | ||
| - name: Upload coverage | ||
| uses: codecov/codecov-action@v1 | ||
| no-extras: | ||
| name: Test without extras | ||
| runs-on: ubuntu-latest | ||
| steps: | ||
| - uses: actions/checkout@v2 | ||
| with: | ||
| fetch-depth: 0 | ||
| - name: Set up Python ${{matrix.python}} | ||
| uses: actions/setup-python@v2 | ||
| with: | ||
| python-version: 3.8 | ||
| - name: Get Pip cache dir | ||
| id: pip-cache | ||
| run: | | ||
| echo "::set-output name=dir::$(pip cache dir)" | ||
| shell: bash | ||
| - name: Cache Pip wheels | ||
| uses: actions/cache@v1 | ||
| with: | ||
| path: ${{ steps.pip-cache.outputs.dir }} | ||
| key: no-extras-pip-${{ hashFiles('*.egg-info/requires.txt')}} | ||
| - name: Install environment | ||
| run: | | ||
| pip install -U flit | ||
| - name: Install package | ||
| run: | | ||
| flit install --pth-file --extras dev,test | ||
| - name: Run tests | ||
| run: python -m pytest --cov=binpickle --cov-report=xml tests | ||
| - name: Upload coverage | ||
| uses: codecov/codecov-action@v1 | ||
| sdist: | ||
| name: Build Source Packages | ||
| runs-on: ubuntu-latest | ||
| needs: [test, lint, no-extras] | ||
| steps: | ||
| - uses: actions/checkout@v2 | ||
| with: | ||
| fetch-depth: 0 | ||
| - name: Fetch Git tags | ||
| run: git fetch --tags | ||
| - name: Set up Python | ||
| uses: actions/setup-python@v2 | ||
| with: | ||
| python-version: 3.8 | ||
| - name: Install Python deps | ||
| run: pip install -U flit | ||
| - name: Build distribution | ||
| run: flit build | ||
| - name: Save archive | ||
| uses: actions/upload-artifact@v1 | ||
| with: | ||
| name: pypi-pkgs | ||
| path: dist | ||
| - name: List dist dir | ||
| run: ls -R dist | ||
| - name: Publish PyPI packages | ||
| if: github.event_name == 'release' | ||
| run: | | ||
| flit publish | ||
| shell: bash | ||
| env: | ||
| TWINE_NON_INTERACTIVE: y | ||
| FLIT_USERNAME: __token__ | ||
| FLIT_PASSWORD: ${{ secrets.TWINE_TOKEN }} |
| """ | ||
| Codecs for encoding and decoding buffers in BinPickle. | ||
| This is similar in spirit to numcodecs_, but automatically handles some cases | ||
| such as splitting arrays into blocks. | ||
| .. _numcodecs: https://numcodecs.readthedocs.io/en/stable/ | ||
| """ | ||
| from ._base import Codec # noqa: F401 | ||
| import logging | ||
| from . import null | ||
| from . import gz | ||
| from . import blosc | ||
| from . import numcodecs | ||
_log = logging.getLogger(__name__)

# Global registry mapping codec NAME -> codec class; populated via register().
CODECS = {}

# Convenience aliases so callers can use binpickle.codecs.GZ etc. without
# importing the submodules directly.
Null = null.Null
GZ = gz.GZ
Blosc = blosc.Blosc
NC = numcodecs.NC
def register(cls):
    """
    Register a codec class in the global :data:`CODECS` registry under its
    ``NAME`` attribute, so :func:`get_codec` can find it by name.

    Returns the class unchanged, so this can also be used as a class
    decorator.

    Args:
        cls(type): the :class:`Codec` subclass to register.

    Returns:
        type: ``cls``, unchanged.
    """
    CODECS[cls.NAME] = cls
    return cls
def make_codec(codec, *, null_as_none=False, list_is_tuple=False):
    """
    Resolve a codec into a BinPickle-compatible codec.

    Args:
        codec(obj):
            The codec to resolve into a codec. Can be one of:

            * ``None`` (returns :class:`Null`)
            * A :class:`Codec` object (returned as-is)
            * A string (look up codec by name and return with default options)
            * A tuple ``(name, config)`` (pass to :func:`get_config`)
            * A list (wrapped in :class:`Chain`)
            * A :class:`numcodecs.abc.Codec` (wrapped in :class:`NC` and returned)
        null_as_none(bool): if ``True``, map the null codec to ``None``
            instead of a :class:`Null` instance.
        list_is_tuple(bool): if ``True``, treat a two-element list like a
            ``(name, config)`` tuple (needed when configs come from msgpack,
            which decodes tuples as lists).

    Returns:
        Codec: the codec.
    """
    # guard-clause dispatch; order matters (e.g. list-as-tuple before list)
    if codec is None and not null_as_none:
        return Null()
    if isinstance(codec, str):
        return CODECS[codec]()
    if isinstance(codec, tuple) or (list_is_tuple and isinstance(codec, list)):
        name, config = codec
        return get_codec(name, config)
    if isinstance(codec, list):
        return Chain(codec)
    if numcodecs.is_numcodec(codec):
        return NC(codec)
    if null_as_none and isinstance(codec, Null):
        return None
    return codec
def get_codec(name, config):
    """
    Get a codec by name and configuration (as stored in the BinPickle manifest).

    Args:
        name(str or None): the codec name (``None`` yields :class:`Null`).
        config: the codec configuration, as returned by :meth:`Codec.config`.

    Returns:
        Codec: the configured codec.

    Raises:
        ValueError: if ``name`` is not a registered codec.
    """
    if name is None:
        return Null()
    ctor = CODECS.get(name)
    if ctor is None:
        raise ValueError(f'unknown codec {name}')
    _log.debug('configuring %s: %s', name, config)
    return ctor(**config)
# Imported late to avoid a circular import: chain.py imports make_codec
# from this module.
from .chain import Chain  # noqa: E402

# Register the built-in codecs; optional backends are registered only when
# their backing libraries are importable.
register(Null)
register(Chain)
register(GZ)
if Blosc.AVAILABLE:
    register(Blosc)
if NC.AVAILABLE:
    register(NC)
| from abc import ABC, abstractmethod | ||
| import io | ||
class Codec(ABC):
    """
    Base class for a codec.

    Subclasses must implement :meth:`encode_to`, :meth:`decode_to`, and
    :meth:`config`; the buffer-returning :meth:`encode` and :meth:`decode`
    convenience methods are built on top of them.

    Attributes:
        NAME(str): the name for this codec, used by :func:`get_codec` and in index entries.
    """

    def encode(self, buf):
        """
        Encode a buffer.

        Args:
            buf(bytes-like): the buffer to encode.

        Returns:
            bytes-like: the encoded data
        """
        sink = io.BytesIO()
        self.encode_to(buf, sink)
        return sink.getbuffer()

    @abstractmethod
    def encode_to(self, buf, out):
        """
        Encode a buffer to a binary output stream.

        Args:
            buf(bytes-like): the buffer to encode.
            out(file-like):
                the output stream. Must have a ``write`` method
                taking a :class:`bytes`.
        """

    def decode(self, buf):
        """
        Decode a buffer.

        Args:
            buf(bytes-like): the buffer to decode.

        Returns:
            bytes-like: the decoded data
        """
        result = bytearray()
        self.decode_to(buf, result)
        return result

    @abstractmethod
    def decode_to(self, buf, out):
        """
        Decode a buffer into a bytearray.

        Args:
            buf(bytes-like): the buffer to decode.
            out(bytearray):
                the bytearray to receive the output. This method will resize the
                bytearray as needed to accommodate the output.
        """

    @abstractmethod
    def config(self):
        """
        Get a JSON-serializable configuration for this codec. It should be able
        to be passed as ``**kwargs`` to the constructor.
        """
| import logging | ||
| import msgpack | ||
| from importlib.util import find_spec | ||
| from ._base import Codec | ||
# Default maximum uncompressed size of a single Blosc block (1 GiB).
DEFAULT_BLOCKSIZE = 1024 * 1024 * 1024

_log = logging.getLogger(__name__)
| def _split_blocks(buf, blocksize): | ||
| if buf.itemsize > 1: | ||
| buf = buf.cast('B') | ||
| length = buf.nbytes | ||
| chunks = [] | ||
| for start in range(0, length, blocksize): | ||
| end = start + blocksize | ||
| if end > length: | ||
| end = length | ||
| chunks.append(buf[start:end]) | ||
| if not chunks: | ||
| chunks.append(memoryview(b'')) | ||
| return chunks | ||
class Blosc(Codec):
    """
    Blosc codec.  Buffers are split into blocks of at most ``blocksize``
    bytes and each block is compressed separately with Blosc; the compressed
    blocks are framed as a msgpack array.

    Args:
        name(str): the Blosc compressor name (e.g. ``'blosclz'``, ``'zstd'``).
        level(int): the compression level.
        shuffle(int): the Blosc shuffle filter mode.
        blocksize(int): maximum uncompressed size of a single block, in bytes.

    Raises:
        ImportError: if the ``blosc`` package is not installed.
    """
    NAME = 'blosc'
    AVAILABLE = find_spec('blosc') is not None

    def __init__(self, name='blosclz', level=9,
                 shuffle=1, blocksize=DEFAULT_BLOCKSIZE):
        if not self.AVAILABLE:
            raise ImportError('blosc is not available')
        self.name = name
        self.level = level
        self.shuffle = shuffle
        self.blocksize = blocksize

    def encode_to(self, buf, out):
        # We have to encode by chunks, compressing each block independently.
        import blosc
        pack = msgpack.Packer()
        mv = memoryview(buf)
        _log.debug('encoding %d bytes (itemsize=%d, format=%s)',
                   mv.nbytes, mv.itemsize, mv.format)
        _log.debug('splitting with block size %d', self.blocksize)
        blocks = _split_blocks(mv, self.blocksize)
        out.write(pack.pack_array_header(len(blocks)))
        for block in blocks:
            assert block.nbytes <= self.blocksize
            comp = blosc.compress(block, cname=self.name, clevel=self.level,
                                  shuffle=self.shuffle, typesize=mv.itemsize)
            out.write(pack.pack(comp))
            block.release()

    def decode_to(self, buf, out):
        import blosc
        blocks = msgpack.unpackb(buf, use_list=True)
        pos = 0
        for block in blocks:
            dec = blosc.decompress(block)
            dmv = memoryview(dec)  # to reduce copies
            n = len(dec)
            # overwrite in place as far as the output already extends,
            # then append any remainder
            e1 = min(pos + n, len(out))
            n1 = e1 - pos
            out[pos:e1] = dmv[:n1]
            if n1 < n:
                out.extend(dmv[n1:])
            pos += n
        if len(out) > pos:
            # output was pre-sized larger than the decoded data; trim it
            del out[pos:]

    def config(self):
        # FIX: include blocksize so a codec reconstructed from a saved
        # config (via get_codec) re-encodes with the same block boundaries.
        # Decoding does not depend on it, so configs without it still work.
        return {
            'name': self.name,
            'level': self.level,
            'shuffle': self.shuffle,
            'blocksize': self.blocksize
        }
| from ._base import Codec | ||
| from . import make_codec | ||
class Chain(Codec):
    """
    Codec that chains together other codecs in sequence. The codecs are applied
    in the provided order for encoding, and reverse order for decoding.
    """
    NAME = 'chain'

    def __init__(self, codecs=()):
        # normalize each entry through make_codec; saved configs arrive as
        # [name, config] lists, which list_is_tuple treats as (name, config)
        self.codecs = [make_codec(spec, list_is_tuple=True) for spec in codecs]

    def encode(self, buf):
        result = buf
        for stage in self.codecs:
            result = stage.encode(result)
        return result

    def encode_to(self, buf, w):
        w.write(self.encode(buf))

    def decode(self, buf):
        # apply the stages in reverse order to undo the encoding
        result = buf
        for stage in reversed(self.codecs):
            result = stage.decode(result)
        return result

    def decode_to(self, buf, out):
        out[:] = self.decode(buf)

    def config(self):
        return {
            'codecs': [(stage.NAME, stage.config()) for stage in self.codecs]
        }
| import zlib | ||
| from ._base import Codec | ||
class GZ(Codec):
    """
    Zlib (gzip-compatible) codec.

    Args:
        level(int): the zlib compression level.
    """
    NAME = 'gz'

    def __init__(self, level=9):
        self.level = level

    def config(self):
        return {'level': self.level}

    def encode(self, buf):
        return zlib.compress(buf, self.level)

    def encode_to(self, buf, out):
        # zlib compresses whole buffers, so encode then write in one shot
        out.write(self.encode(buf))

    def decode(self, buf):
        return zlib.decompress(buf)

    def decode_to(self, buf, out):
        out[:] = self.decode(buf)
| from ._base import Codec | ||
class Null(Codec):
    """
    Null codec (passthrough): encoding and decoding return the buffer
    unchanged.
    """
    NAME = 'null'

    def config(self):
        # no configurable options
        return {}

    def encode(self, buf):
        return buf

    def encode_to(self, buf, out):
        out.write(buf)

    def decode(self, buf, length=None):
        # length is accepted for interface compatibility but ignored
        return buf

    def decode_to(self, buf, out):
        out[:] = buf
| from importlib.util import find_spec | ||
| from ._base import Codec | ||
def is_numcodec(codec):
    "Test whether a codec is a NumCodecs codec."
    if not NC.AVAILABLE:
        # numcodecs isn't installed, so nothing can be one
        return False
    import numcodecs
    return isinstance(codec, numcodecs.abc.Codec)
class NC(Codec):
    """
    NumCodec wrapper: adapts a :mod:`numcodecs` codec to the BinPickle
    codec interface.
    """
    NAME = 'numcodec'
    AVAILABLE = find_spec('numcodecs') is not None

    def __init__(self, codec=None, **kwargs):
        if codec is not None:
            self.codec = codec
        else:
            # reconstruct from a saved configuration dict (see config())
            import numcodecs
            self.codec = numcodecs.get_codec(kwargs)

    def encode(self, buf):
        return self.codec.encode(buf)

    def encode_to(self, buf, w):
        w.write(self.encode(buf))

    def decode(self, buf):
        # wrap in a memoryview so slicing downstream avoids copies
        return memoryview(self.codec.decode(buf))

    def decode_to(self, buf, out):
        out[:] = self.decode(buf)

    def config(self):
        return self.codec.get_config()
| """ | ||
| Compatibility support. | ||
| """ | ||
import pickle

# Make sure we have Pickle protocol 5 (PEP 574, out-of-band buffers); on
# interpreters whose stdlib pickle tops out below 5, substitute the
# 'pickle5' backport package under the same name.
if pickle.HIGHEST_PROTOCOL < 5:
    import pickle5 as pickle
| """ | ||
| Environment management tool to instantiate Conda environments from Flit. | ||
| Requires flit-core and packaging to be installed. | ||
| """ | ||
| import os | ||
| import sys | ||
| import tempfile | ||
| import subprocess | ||
| from pathlib import Path | ||
| import argparse | ||
| from flit_core.config import read_flit_config, toml | ||
| from packaging.requirements import Requirement | ||
| from packaging.markers import default_environment | ||
def write_env(obj, out):
    """
    Serialize an environment dict to a stream, as YAML when PyYAML is
    installed and as JSON otherwise (Conda accepts both).

    Args:
        obj(dict): the environment specification to write.
        out(file-like): a writable text stream.
    """
    # keep the try body to just the import, so a real serialization error
    # in yaml.safe_dump is not mistaken for "yaml missing"
    try:
        import yaml
    except ImportError:
        import json
        json.dump(obj, out, indent=2)
    else:
        yaml.safe_dump(obj, out)
def parse_args(argv=None):
    """
    Parse command-line arguments for the environment tool.

    Args:
        argv(list of str or None):
            the arguments to parse; ``None`` (the default) parses
            ``sys.argv[1:]`` as before, so existing callers are unaffected.

    Returns:
        argparse.Namespace: the parsed options.
    """
    parser = argparse.ArgumentParser(description='Manage development environments.')
    parser.add_argument('--python-version', '-V', metavar='VER',
                        help='use Python version VER')
    parser.add_argument('--extra', '-E', metavar='EXTRA', action='append',
                        help='include EXTRA')
    parser.add_argument('--name', '-n', metavar='NAME',
                        help='name Conda environment NAME')
    parser.add_argument('--no-dev', action='store_true', help='skip dev dependencies')
    parser.add_argument('--save-env', metavar='FILE',
                        help='save environment to FILE')
    parser.add_argument('--create-env', action='store_true',
                        help='create Conda environment')
    parser.add_argument('--update-env', action='store_true',
                        help='update Conda environment')
    args = parser.parse_args(argv)
    return args
def load_project():
    """Read pyproject.toml, returning (raw TOML dict, parsed Flit config)."""
    proj_path = Path('pyproject.toml')
    flit_cfg = read_flit_config(proj_path)
    raw = toml.loads(proj_path.read_text())
    return raw, flit_cfg
class conda_config:
    """Accessor for the ``tool.envtool.conda`` table of a pyproject dict."""

    def __init__(self, project):
        tool = project.get('tool', {})
        self.config = tool.get('envtool', {}).get('conda', {})

    @property
    def name(self):
        # default environment name when none is configured
        return str(self.config.get('name', 'dev-env'))

    @property
    def channels(self):
        return [str(ch) for ch in self.config.get('channels', [])]

    @property
    def extras(self):
        return self.config.get('extras', {})

    def get_override(self, dep):
        # a bare-string override is shorthand for {'name': <string>}
        override = self.config.get('overrides', {}).get(dep, {})
        if isinstance(override, str):
            override = {'name': override}
        return override

    def source(self, dep):
        return self.get_override(dep).get('source', None)

    def conda_name(self, dep):
        return str(self.get_override(dep).get('name', dep))
def marker_env(args):
    "Get the marker environment"
    env = dict(default_environment())
    version = args.python_version
    if version:
        # evaluate markers as if we were running the requested Python
        env['python_version'] = version
        env['python_full_version'] = version
    return env
def req_active(env, req):
    "Check whether a requirement applies under marker environment *env*."
    marker = req.marker
    # an unmarked requirement is unconditionally active
    return marker.evaluate(env) if marker else True
def dep_str(cfg, req):
    "Render a requirement as a Conda dependency string, applying name overrides."
    name = cfg.conda_name(req.name)
    spec = req.specifier
    return f'{name} {spec}' if spec else name
def conda_env(args, pyp, flp):
    """
    Build a Conda environment specification (environment.yml structure).

    Args:
        args: parsed command-line options (see parse_args).
        pyp(dict): the raw pyproject.toml data.
        flp: the parsed Flit configuration, with ``metadata`` and
            ``reqs_by_extra`` attributes.

    Returns:
        dict: an environment dict with ``name``, optional ``channels``,
        and ``dependencies`` keys.
    """
    cfg = conda_config(pyp)
    mkenv = marker_env(args)
    # explicit --name wins over the configured name
    name = args.name
    if name is None:
        name = cfg.name
    env = {'name': name}
    if cfg.channels:
        env['channels'] = cfg.channels
    deps = []
    # pin Python: exact requested version, or the project's requires-python
    if args.python_version:
        deps.append(f'python ={args.python_version}')
    elif flp.metadata['requires_python']:
        deps.append('python ' + str(flp.metadata['requires_python']))
    deps.append('pip')
    # '.none' is the key flit uses for the base (extra-less) requirements
    # in reqs_by_extra -- TODO confirm against the flit_core version in use
    extras = set(['.none'])
    if not args.no_dev:
        extras |= set(['dev', 'doc', 'test'])
    if args.extra:
        for e in args.extra:
            if e == 'all':
                extras |= set(flp.reqs_by_extra.keys())
            else:
                extras.add(e)
    pip_deps = []
    for e in extras:
        for req in flp.reqs_by_extra.get(e, []):
            req = Requirement(req)
            if req_active(mkenv, req):
                # URL requirements and explicit pip-sourced overrides go
                # through pip; everything else becomes a Conda dependency
                if req.url or cfg.source(req.name) == 'pip':
                    pip_deps.append(req)
                else:
                    deps.append(dep_str(cfg, req))
        # extra Conda-only dependencies configured per extra
        for cr in cfg.extras.get(e, []):
            deps.append(str(cr))
    if pip_deps:
        deps.append({'pip': [str(r) for r in pip_deps]})
    env['dependencies'] = deps
    return env
def env_command(env, cmd):
    """Run ``conda env <cmd>`` against *env* via a temporary environment.yml."""
    with tempfile.TemporaryDirectory() as tmpdir:
        env_file = Path(tmpdir) / 'environment.yml'
        with env_file.open('w') as fh:
            write_env(env, fh)
        print(cmd, 'environment', env_file)
        subprocess.run(['conda', 'env', cmd, '-f', os.fspath(env_file)], check=True)
def main(args):
    """Build the environment spec and dispatch on the requested action."""
    pyproject, flit_cfg = load_project()
    env = conda_env(args, pyproject, flit_cfg)
    if args.save_env:
        with open(args.save_env, 'w') as fh:
            write_env(env, fh)
    elif args.create_env:
        env_command(env, 'create')
    elif args.update_env:
        env_command(env, 'update')
    else:
        # no action requested: print the environment to stdout
        write_env(env, sys.stdout)
# Script entry point: parse CLI options and run the requested action.
if __name__ == '__main__':
    main(parse_args())
| The MIT License (MIT) | ||
| Copyright (c) 2021 Boise State University | ||
| Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| of this software and associated documentation files (the "Software"), to deal | ||
| in the Software without restriction, including without limitation the rights | ||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| copies of the Software, and to permit persons to whom the Software is | ||
| furnished to do so, subject to the following conditions: | ||
| The above copyright notice and this permission notice shall be included in | ||
| all copies or substantial portions of the Software. | ||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
| THE SOFTWARE. |
| {% extends '!footer.html' %} | ||
| {% block extrafooter %} | ||
| <p>This material is based upon work supported by the National Science Foundation under | ||
| Grant No. <a href="https://md.ekstrandom.net/research/career">IIS 17-51278</a>. Any | ||
| opinions, findings, and conclusions or recommendations expressed in this material | ||
| are those of the author(s) and do not necessarily reflect the views of the | ||
| National Science Foundation. This page has not been approved by | ||
| Boise State University and does not reflect official university positions.</p> | ||
| <script data-goatcounter="https://binpickle.goatcounter.com/count" | ||
| async src="//gc.zgo.at/count.js"></script> | ||
| {% endblock %} |
| Codecs | ||
| ====== | ||
| .. py:module:: binpickle.codecs | ||
| BinPickle supports codecs to compress buffer content. | ||
| These are similar in spirit to numcodecs_, but automatically handle some cases | ||
| such as splitting arrays into blocks and can reduce copying in some situations. | ||
| .. _numcodecs: https://numcodecs.readthedocs.io/en/stable/ | ||
| .. toctree:: | ||
| .. autofunction:: make_codec | ||
| Codec API | ||
| --------- | ||
| .. autoclass:: Codec | ||
| Codec Implementations | ||
| --------------------- | ||
| Null codec | ||
| ~~~~~~~~~~ | ||
| .. autoclass:: Null | ||
| Chain codec | ||
| ~~~~~~~~~~~ | ||
| .. autoclass:: Chain | ||
| Blosc codec | ||
| ~~~~~~~~~~~ | ||
| .. autoclass:: Blosc | ||
| Gzip codec | ||
| ~~~~~~~~~~ | ||
| .. autoclass:: GZ | ||
| NumCodecs | ||
| ~~~~~~~~~ | ||
| BinPickle also supports any codec from numcodecs_ through the :class:`NC` wrapper. This | ||
| is automatically used by the :func:`make_codec` function, so you can also pass a NumCodecs | ||
| codec directly to :meth:`binpickle.BinPickler.compressed`. | ||
| # Dev environment | ||
| name: binpickle | ||
| channels: | ||
| - conda-forge | ||
| # - defaults | ||
| dependencies: | ||
| - python=3.8 | ||
| - pip | ||
| - msgpack-python | ||
| - python-blosc | ||
| - numcodecs | ||
| - numpy>=1.17 | ||
| - pandas>=1.0 | ||
| - pytest | ||
| - pytest-cov | ||
| - hypothesis | ||
| - sphinx | ||
| - flake8 | ||
| - twine | ||
| - pip: | ||
| - rstcheck |
-21
| The MIT License (MIT) | ||
| Copyright (c) 2020 Boise State University | ||
| Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| of this software and associated documentation files (the "Software"), to deal | ||
| in the Software without restriction, including without limitation the rights | ||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| copies of the Software, and to permit persons to whom the Software is | ||
| furnished to do so, subject to the following conditions: | ||
| The above copyright notice and this permission notice shall be included in | ||
| all copies or substantial portions of the Software. | ||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
| THE SOFTWARE. |
-38
#!/usr/bin/env python
# setup.py generated by flit for tools that don't yet use PEP 517
# NOTE(review): auto-generated fallback; the canonical metadata lives in
# pyproject.toml -- regenerate with flit rather than hand-editing this file.
from distutils.core import setup

packages = \
['binpickle', 'binpickle.codecs']

package_data = \
{'': ['*']}

# msgpack is the only hard runtime dependency
install_requires = \
['msgpack >= 1.0']

# optional codec backends (blosc, numcodecs) and dev/test/doc tooling
extras_require = \
{":python_version < '3.8'": ['pickle5'],
 'blosc': ['blosc'],
 'dev': ['flake8', 'rstcheck'],
 'doc': ['sphinx'],
 'numcodecs': ['numcodecs >= 0.7'],
 'test': ['pytest >= 5',
          'pytest-cov',
          'hypothesis >= 6',
          'pandas >= 1.0',
          'numpy >= 1.17']}

setup(name='binpickle',
      version='0.3.4',
      description='Optimized format for pickling binary data.',
      author='Michael Ekstrand',
      author_email='michaelekstrand@boisestate.edu',
      url='https://binpickle.lenskit.org',
      packages=packages,
      package_data=package_data,
      install_requires=install_requires,
      extras_require=extras_require,
      python_requires='>= 3.6.1',
      )
| import pytest | ||
| import numpy as np | ||
| from hypothesis import given, assume, settings | ||
| import hypothesis.strategies as st | ||
| from hypothesis.extra.numpy import arrays, integer_dtypes, floating_dtypes | ||
| from binpickle.codecs import * | ||
# The numcodecs-backed classes are optional; only import concrete
# numcodecs codecs when that backend is actually installed.
if NC.AVAILABLE:
    from numcodecs import LZ4, LZMA
# Codec classes exercised by the generic round-trip tests below.
KNOWN_CODECS = [c for c in CODECS.values() if c.NAME != 'numcodec']  # exclude numcodec from common tests
# Skip markers for tests that require an optional compression backend.
need_blosc = pytest.mark.skipif(not Blosc.AVAILABLE, reason='Blosc not available')
need_numcodecs = pytest.mark.skipif(not NC.AVAILABLE, reason='numcodecs not available')
def test_make_codec_none():
    """make_codec(None) falls back to the Null codec."""
    result = make_codec(None)
    assert isinstance(result, Null)
def test_make_codec_null_str():
    """The string 'null' resolves to the Null codec."""
    result = make_codec('null')
    assert isinstance(result, Null)
def test_make_codec_gz_str():
    """The string 'gz' resolves to the GZ codec."""
    result = make_codec('gz')
    assert isinstance(result, GZ)
def test_make_codec_return():
    """An existing codec instance is passed through unchanged."""
    gz = GZ()
    assert make_codec(gz) is gz
@need_numcodecs
def test_make_codec_wrap():
    """A raw numcodecs codec is wrapped in the NC adapter."""
    raw = LZ4()
    wrapped = make_codec(raw)
    assert isinstance(wrapped, NC)
    assert wrapped.codec is raw
def test_make_codec_to_none():
    """Test internal-use none codec: null specs map to None."""
    for spec in (None, Null()):
        assert make_codec(spec, null_as_none=True) is None
def test_get_null_with_none():
    """get_codec(None, ...) yields the Null codec."""
    assert isinstance(get_codec(None, {}), Null)
def test_get_null():
    """get_codec('null', ...) yields the Null codec."""
    assert isinstance(get_codec('null', {}), Null)
def test_get_gz():
    """'gz' with an empty config gives GZ at maximum compression."""
    gz = get_codec('gz', {})
    assert isinstance(gz, GZ)
    assert gz.level == 9
def test_get_gz_level():
    """The 'level' config key is honored for the GZ codec."""
    gz = get_codec('gz', {'level': 5})
    assert isinstance(gz, GZ)
    assert gz.level == 5
@need_blosc
def test_get_blosc():
    """'blosc' with an empty config gives Blosc at maximum level."""
    blosc = get_codec('blosc', {})
    assert isinstance(blosc, Blosc)
    assert blosc.level == 9
@need_blosc
def test_get_blosc_lvl():
    """Blosc name and level config keys are honored."""
    blosc = get_codec('blosc', {'name': 'zstd', 'level': 5})
    assert isinstance(blosc, Blosc)
    assert blosc.name == 'zstd'
    assert blosc.level == 5
@pytest.mark.parametrize('codec', KNOWN_CODECS)
@settings(deadline=500)
@given(st.binary())
def test_codec_roundtrip(codec, data):
    """Encoding then decoding arbitrary bytes is lossless."""
    instance = codec()
    decoded = instance.decode(instance.encode(data))
    assert len(decoded) == len(data)
    assert decoded == data
@pytest.mark.parametrize('codec', KNOWN_CODECS)
@settings(deadline=500)
@given(arrays(st.one_of(integer_dtypes(), floating_dtypes()),
              st.integers(10, 10000)))
def test_codec_roundtrip_array(codec, data):
    """Encoding then decoding a NumPy array buffer is lossless."""
    # NaN never compares equal to itself, so skip arrays containing it.
    assume(not any(np.isnan(data)))
    instance = codec()
    decoded = instance.decode(instance.encode(data))
    rebuilt = np.frombuffer(decoded, dtype=data.dtype)
    assert len(rebuilt) == len(data)
    assert all(rebuilt == data)
@pytest.mark.parametrize('codec', KNOWN_CODECS)
def test_codec_decode_oversize(codec):
    """Test decoding data to an oversized bytearray."""
    instance = codec()
    payload = bytearray(np.random.randn(500))
    # Target buffer starts at twice the payload size; decode_to should
    # leave it holding exactly the decoded payload.
    target = bytearray(len(payload) * 2)
    instance.decode_to(instance.encode(payload), target)
    assert len(target) == len(payload)
    assert target == payload
@need_blosc
def test_large_blosc_encode():
    """Test encoding Blosc data that needs to be split.

    With blocksize=4096, a 10000-double buffer must span multiple Blosc
    blocks; the decoded bytes must still reproduce the original array.
    """
    c = Blosc(blocksize=4096)
    data = np.random.randn(10000)
    enc = c.encode(data)
    dec = c.decode(enc)
    assert len(enc) < len(dec)  # we should have compressed
    assert len(dec) == data.nbytes
    assert dec == memoryview(data)
    # BUG FIX: rebuild the array from the *decoded* bytes (was
    # np.frombuffer(data), which compared the original with itself and
    # never exercised the round trip).
    a2 = np.frombuffer(dec)
    assert len(a2) == len(data)
    assert all(a2 == data)
@need_numcodecs
@given(st.binary())
def test_numcodec_roundtrip(data):
    """A wrapped numcodecs LZMA codec round-trips arbitrary bytes."""
    wrapper = NC(LZMA())
    decoded = wrapper.decode(wrapper.encode(data))
    assert len(decoded) == len(data)
    assert decoded == data
@need_numcodecs
@given(st.binary())
def test_chain(data):
    """A chained LZMA+GZ pipeline round-trips (useless but a test)."""
    chain = Chain([LZMA(), GZ()])
    decoded = chain.decode(chain.encode(data))
    assert len(decoded) == len(data)
    assert decoded == data
@need_numcodecs
def test_chain_config():
    """A Chain codec's configuration round-trips through get_codec.

    Builds a chain, serializes its config, reconstructs a codec from it,
    and verifies the reconstructed chain has the same codec structure.
    """
    codec = Chain([LZMA(), GZ()])
    assert len(codec.codecs) == 2
    assert isinstance(codec.codecs[0], NC)
    assert isinstance(codec.codecs[1], GZ)
    cfg = codec.config()
    c2 = get_codec(Chain.NAME, cfg)
    # BUG FIX: verify the *reconstructed* codec `c2` (the original
    # asserted on `codec` again, so the config round trip was untested).
    assert len(c2.codecs) == 2
    assert isinstance(c2.codecs[0], NC)
    assert isinstance(c2.codecs[1], GZ)
def test_is_not_numcodec():
    """A native binpickle codec is not detected as a numcodecs codec."""
    gz = GZ()
    assert not numcodecs.is_numcodec(gz)
@need_numcodecs
def test_is_numcodec():
    """A genuine numcodecs codec is recognized by is_numcodec."""
    lz4 = LZ4()
    assert numcodecs.is_numcodec(lz4)
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
61336
-9.43%960
-29.98%