diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cc87a2e..ac6d6d2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -54,6 +54,9 @@ jobs:
         python -m pip install --upgrade pip setuptools
         pip install -r requirements.d/dev.txt
     - name: Install borghash
-      run: pip install -ve .
+      run: |
+        python setup.py build_ext --inplace
+        python -m build
+        pip install -v dist/borghash*.tar.gz
     - name: run tox env
       run: tox --skip-missing-interpreters
diff --git a/.gitignore b/.gitignore
index b88f4f8..6353c31 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,10 @@
 .idea
 .pytest_cache
 .tox
+build
 dist
 __pycache__
-src/borghash.egg-info
 src/borghash/_version.py
-src/borghash/borghash.cpp
-src/*.so
+**/*.so
+**/*.c
+**/*.egg-info
diff --git a/README.rst b/README.rst
index 947ab41..a04c307 100644
--- a/README.rst
+++ b/README.rst
@@ -129,6 +129,15 @@ Results on an Apple MacBook Pro (M3 Pro CPU) are like:
 
     HashTableNT serialization (count=50000): write: 0.020s, read: 0.021s.
 
+Building / Installing
+---------------------
+::
+
+    python setup.py build_ext --inplace
+    python -m build
+    pip install dist/borghash*.tar.gz
+
+
 State of this project
 ---------------------
 
diff --git a/pyproject.toml b/pyproject.toml
index 0507d42..4ab1dba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,19 +30,10 @@ dependencies = []
 "Changelog" = "https://github.com/borgbackup/borghash/blob/master/changes.rst"
 
 [project.scripts]
-borghash-demo = "borghash:demo"
-
-[tool.setuptools]
-# See also the MANIFEST.in file.
-# We want to install all the files in the package directories...
-include-package-data = true
-
-[tool.setuptools.exclude-package-data]
-# ...except the source files which have been compiled (C extensions):
-"*" = ["*.c", "*.h", "*.pyx"]
+borghash-demo = "borghash.__main__:demo"
 
 [build-system]
-requires = ["setuptools", "wheel", "Cython>=3.0.3", "setuptools_scm[toml]>=6.2"]
+requires = ["setuptools", "wheel", "setuptools_scm[toml]>=6.2"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools_scm]
diff --git a/requirements.d/dev.txt b/requirements.d/dev.txt
index 12626f0..41584bc 100644
--- a/requirements.d/dev.txt
+++ b/requirements.d/dev.txt
@@ -3,3 +3,4 @@ pytest
 pytest-benchmark
 build
 twine
+Cython
diff --git a/setup.py b/setup.py
index 7e41827..27475bb 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,21 @@
-from setuptools import setup
-from Cython.Build import cythonize
+from setuptools import Extension, setup
+
+try:
+    from Cython.Build import cythonize
+except ImportError:
+    cythonize = None  # we don't have cython installed
+
+ext = '.pyx' if cythonize else '.c'
+
+extensions = [
+    Extension("borghash.HashTable", ["src/borghash/HashTable" + ext]),
+    Extension("borghash.HashTableNT", ["src/borghash/HashTableNT" + ext]),
+]
+
+if cythonize:
+    extensions = cythonize(extensions, language_level="3str")
 
 setup(
-    package_data=dict(borghash=["borghash.pxd"]),
-    ext_modules=cythonize("borghash.pyx")
+    package_data={"borghash": ["*.pxd", "*.pyx"]},
+    ext_modules=extensions,
 )
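Note on the setup.py change above: an sdist built via ``python -m build`` ships the Cython-generated ``.c`` files, so installing from the sdist (as the CI job now does) works without Cython; only development builds regenerate the C sources. A small illustrative sketch of that source-selection logic, runnable outside the build (the file layout is taken from this diff, but the script itself is hypothetical and not part of the project)::

    # sketch: mirrors setup.py's .pyx/.c fallback to show which
    # sources a build in the current environment would compile.
    import os

    try:
        from Cython.Build import cythonize  # dev environment with Cython
    except ImportError:
        cythonize = None  # e.g. an sdist consumer without Cython

    ext = '.pyx' if cythonize else '.c'
    for mod in ("HashTable", "HashTableNT"):
        src = os.path.join("src", "borghash", mod + ext)
        print(src, "exists:", os.path.exists(src))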
diff --git a/borghash.pxd b/src/borghash/HashTable.pxd
similarity index 75%
rename from borghash.pxd
rename to src/borghash/HashTable.pxd
index 1a3584d..3315427 100644
--- a/borghash.pxd
+++ b/src/borghash/HashTable.pxd
@@ -2,7 +2,8 @@ from libc.stdint cimport uint8_t, uint32_t
 
 cdef class HashTable:
     cdef int ksize, vsize
-    cdef int initial_capacity, capacity, used, tombstones
+    cdef readonly int capacity, used
+    cdef int initial_capacity, tombstones
     cdef float max_load_factor, min_load_factor, shrink_factor, grow_factor
    cdef uint32_t* table
     cdef int kv_capacity, kv_used
@@ -16,11 +17,3 @@ cdef class HashTable:
     cdef int _lookup_index(self, uint8_t* key_ptr, int* index_ptr)
     cdef void _resize_table(self, int new_capacity)
     cdef void _resize_kv(self, int new_capacity)
-
-
-cdef class HashTableNT:
-    cdef int key_size
-    cdef object value_type
-    cdef object value_struct
-    cdef int value_size
-    cdef HashTable inner
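The ``cdef readonly`` change above is what makes ``capacity`` and ``used`` readable (but not writable) from Python-level code; ``HashTableNT._write_fd()`` relies on this via ``self.inner.capacity`` and ``self.inner.used`` further down in this diff. A minimal usage sketch, assuming a built and installed package::

    from borghash import HashTable

    ht = HashTable(key_size=8, value_size=4)
    print(ht.capacity, ht.used)  # readable thanks to "cdef readonly"
    # ht.used = 0  # would raise AttributeError: readonly attribute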
diff --git a/borghash.pyx b/src/borghash/HashTable.pyx
similarity index 59%
rename from borghash.pyx
rename to src/borghash/HashTable.pyx
index c03e5ce..4614bf1 100644
--- a/borghash.pyx
+++ b/src/borghash/HashTable.pyx
@@ -1,12 +1,8 @@
 """
-borghash - hashtable implementations in cython.
-
 HashTable: low-level ht mapping fully random bytes keys to bytes values.
            key and value length can be chosen, but is fixed afterwards.
            the keys and values are stored in arrays separate from the hashtable.
            the hashtable only stores the 32bit indexes into the key/value arrays.
-
-HashTableNT: wrapper around HashTable, providing namedtuple values and serialization.
 """
 from __future__ import annotations
 from typing import BinaryIO, Iterator, Any
@@ -15,10 +11,7 @@ from libc.stdlib cimport malloc, free, realloc
 from libc.string cimport memcpy, memset, memcmp
 from libc.stdint cimport uint8_t, uint32_t
 
-from collections import namedtuple
 from collections.abc import Mapping
-import json
-import struct
 
 MAGIC = b"BORGHASH"
 assert len(MAGIC) == 8
@@ -335,246 +328,3 @@ cdef class HashTable:
             "resize_table": self.stats_resize_table,
             "resize_kv": self.stats_resize_kv,
         }
-
-
-cdef class HashTableNT:
-    def __init__(self, items=None, *,
-                 key_size: int = 0, value_format: str = "", value_type: Any = None,
-                 capacity: int = MIN_CAPACITY) -> None:
-        if not key_size:
-            raise ValueError("key_size must be specified and must be > 0.")
-        if not value_format:
-            raise ValueError("value_format must be specified and must be non-empty.")
-        if value_type is None:
-            raise ValueError("value_type must be specified (a namedtuple type corresponding to value_format).")
-        self.key_size = key_size
-        self.value_struct = struct.Struct(value_format)
-        self.value_size = self.value_struct.size
-        self.value_type = value_type
-        self.inner = HashTable(key_size=self.key_size, value_size=self.value_size, capacity=capacity)
-        _fill(self, items)
-
-    def clear(self) -> None:
-        self.inner.clear()
-
-    def _check_key(self, key: bytes) -> None:
-        if not isinstance(key, bytes):
-            raise TypeError(f"Expected an instance of bytes, got {type(key)}")
-        if len(key) != self.key_size:
-            raise ValueError(f"Key must be {self.key_size} bytes long")
-
-    def _to_binary_value(self, value: Any) -> bytes:
-        if not isinstance(value, self.value_type):
-            if isinstance(value, tuple):
-                value = self.value_type(*value)
-            else:
-                raise TypeError(f"Expected an instance of {self.value_type}, got {type(value)}")
-        return self.value_struct.pack(*value)
-
-    def _to_namedtuple_value(self, binary_value: bytes) -> Any:
-        unpacked_data = self.value_struct.unpack(binary_value)
-        return self.value_type(*unpacked_data)
-
-    def _set_raw(self, key: bytes, value: bytes) -> None:
-        self.inner[key] = value
-
-    def _get_raw(self, key: bytes) -> bytes:
-        return self.inner[key]
-
-    def __setitem__(self, key: bytes, value: Any) -> None:
-        self._check_key(key)
-        self.inner[key] = self._to_binary_value(value)
-
-    def __getitem__(self, key: bytes) -> Any:
-        self._check_key(key)
-        binary_value = self.inner[key]
-        return self._to_namedtuple_value(binary_value)
-
-    def __delitem__(self, key: bytes) -> None:
-        self._check_key(key)
-        del self.inner[key]
-
-    def __contains__(self, key: bytes) -> bool:
-        self._check_key(key)
-        return key in self.inner
-
-    def items(self) -> Iterator[tuple[bytes, Any]]:
-        for key, binary_value in self.inner.items():
-            yield (key, self._to_namedtuple_value(binary_value))
-
-    def __len__(self) -> int:
-        return len(self.inner)
-
-    def get(self, key: bytes, default: Any = None) -> Any:
-        self._check_key(key)
-        try:
-            binary_value = self.inner[key]
-        except KeyError:
-            return default
-        else:
-            return self._to_namedtuple_value(binary_value)
-
-    def setdefault(self, key: bytes, default: Any) -> Any:
-        self._check_key(key)
-        binary_default = self._to_binary_value(default)
-        binary_value = self.inner.setdefault(key, binary_default)
-        return self._to_namedtuple_value(binary_value)
-
-    def pop(self, key: bytes, default: Any = _NoDefault) -> Any:
-        self._check_key(key)
-        try:
-            binary_value = self.inner.pop(key)
-        except KeyError:
-            if default is _NoDefault:
-                raise
-            return default
-        else:
-            return self._to_namedtuple_value(binary_value)
-
-    def k_to_idx(self, key: bytes) -> int:
-        return self.inner.k_to_idx(key)
-
-    def idx_to_k(self, idx: int) -> bytes:
-        return self.inner.idx_to_k(idx)
-
-    def kv_to_idx(self, key: bytes, value: Any) -> int:
-        binary_value = self._to_binary_value(value)
-        return self.inner.kv_to_idx(key, binary_value)
-
-    def idx_to_kv(self, idx: int) -> tuple[bytes, Any]:
-        key, binary_value = self.inner.idx_to_kv(idx)
-        return key, self._to_namedtuple_value(binary_value)
-
-    @property
-    def stats(self) -> dict[str, int]:
-        return self.inner.stats
-
-    def write(self, file: BinaryIO|str|bytes):
-        if isinstance(file, (str, bytes)):
-            with open(file, 'wb') as fd:
-                self._write_fd(fd)
-        else:
-            self._write_fd(file)
-
-    def _write_fd(self, fd: BinaryIO):
-        meta = {
-            'key_size': self.key_size,
-            'value_size': self.value_size,
-            'value_format': self.value_struct.format,
-            'value_type_name': self.value_type.__name__,
-            'value_type_fields': self.value_type._fields,
-            'capacity': self.inner.capacity,
-            'used': self.inner.used,  # count of keys / values
-        }
-        meta_bytes = json.dumps(meta).encode("utf-8")
-        meta_size = len(meta_bytes)
-        header_bytes = struct.pack(HEADER_FMT, MAGIC, VERSION, meta_size)
-        fd.write(header_bytes)
-        fd.write(meta_bytes)
-        count = 0
-        for key, value in self.inner.items():
-            fd.write(key)
-            fd.write(value)
-            count += 1
-        assert count == self.inner.used
-
-    @classmethod
-    def read(cls, file: BinaryIO|str|bytes):
-        if isinstance(file, (str, bytes)):
-            with open(file, 'rb') as fd:
-                return cls._read_fd(fd)
-        else:
-            return cls._read_fd(file)
-
-    @classmethod
-    def _read_fd(cls, fd: BinaryIO):
-        header_size = struct.calcsize(HEADER_FMT)
-        header_bytes = fd.read(header_size)
-        if len(header_bytes) < header_size:
-            raise ValueError(f"Invalid file, file is too short.")
-        magic, version, meta_size = struct.unpack(HEADER_FMT, header_bytes)
-        if magic != MAGIC:
-            raise ValueError(f"Invalid file, magic {MAGIC.decode()} not found.")
-        if version != VERSION:
-            raise ValueError(f"Unsupported file version {version}.")
-        meta_bytes = fd.read(meta_size)
-        if len(meta_bytes) < meta_size:
-            raise ValueError(f"Invalid file, file is too short.")
-        meta = json.loads(meta_bytes.decode("utf-8"))
-        value_type = namedtuple(meta['value_type_name'], meta['value_type_fields'])
-        ht = cls(key_size=meta['key_size'], value_format=meta['value_format'], value_type=value_type, capacity=meta['capacity'])
-        count = 0
-        ksize, vsize = meta['key_size'], meta['value_size']
-        for i in range(meta['used']):
-            key = fd.read(ksize)
-            value = fd.read(vsize)
-            ht._set_raw(key, value)
-        return ht
-
-    def size(self) -> int:
-        """
-        do a rough worst-case estimate of the on-disk size when using .write().
-
-        the serialized size of the metadata is a bit hard to predict, but we cover that with one_time_overheads.
-        """
-        one_time_overheads = 4096  # very rough
-        N = self.inner.used
-        return int(N * (self.key_size + self.value_size) + one_time_overheads)
-
-
-def demo():
-    print("BorgHash demo")
-    print("=============")
-    print("Code:")
-    code = """
-from tempfile import NamedTemporaryFile
-from time import time
-
-count = 50000
-value_type = namedtuple("Chunk", ["refcount", "size"])
-# 256bit (32Byte) key, 2x 32bit (4Byte) values
-ht = HashTableNT(key_size=32, value_format="<II", value_type=value_type)
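The docstring retained in HashTable.pyx summarizes the data layout: the open-addressed table stores only 32-bit indexes, while keys and values sit in separate, densely packed arrays. A pure-Python toy model of that indirection (illustrative only; the real implementation uses malloc'ed C arrays, probing, tombstones and resizing)::

    # toy model: the "table" maps a key's bucket to a uint32 index
    # into separate key/value arrays, as the module docstring describes.
    table = {}              # real ht: C array of uint32 bucket slots
    keys, values = [], []   # real ht: densely packed kv arrays

    def put(key: bytes, value: bytes) -> None:
        idx = table.get(key)         # real ht: probe + memcmp on the key
        if idx is None:
            table[key] = len(keys)   # store an index, not the kv data itself
            keys.append(key)
            values.append(value)
        else:
            values[idx] = value

    put(b"k" * 8, b"v" * 4)
    print(values[table[b"k" * 8]])  # b'vvvv'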
diff --git a/src/borghash/HashTableNT.pyx b/src/borghash/HashTableNT.pyx
new file mode 100644
--- /dev/null
+++ b/src/borghash/HashTableNT.pyx
+cdef class HashTableNT:
+    def __init__(self, items=None, *,
+                 key_size: int = 0, value_format: str = "", value_type: Any = None,
+                 capacity: int = MIN_CAPACITY) -> None:
+        if not key_size:
+            raise ValueError("key_size must be specified and must be > 0.")
+        if not value_format:
+            raise ValueError("value_format must be specified and must be non-empty.")
+        if value_type is None:
+            raise ValueError("value_type must be specified (a namedtuple type corresponding to value_format).")
+        self.key_size = key_size
+        self.value_struct = struct.Struct(value_format)
+        self.value_size = self.value_struct.size
+        self.value_type = value_type
+        self.inner = HashTable(key_size=self.key_size, value_size=self.value_size, capacity=capacity)
+        _fill(self, items)
+
+    def clear(self) -> None:
+        self.inner.clear()
+
+    def _check_key(self, key: bytes) -> None:
+        if not isinstance(key, bytes):
+            raise TypeError(f"Expected an instance of bytes, got {type(key)}")
+        if len(key) != self.key_size:
+            raise ValueError(f"Key must be {self.key_size} bytes long")
+
+    def _to_binary_value(self, value: Any) -> bytes:
+        if not isinstance(value, self.value_type):
+            if isinstance(value, tuple):
+                value = self.value_type(*value)
+            else:
+                raise TypeError(f"Expected an instance of {self.value_type}, got {type(value)}")
+        return self.value_struct.pack(*value)
+
+    def _to_namedtuple_value(self, binary_value: bytes) -> Any:
+        unpacked_data = self.value_struct.unpack(binary_value)
+        return self.value_type(*unpacked_data)
+
+    def _set_raw(self, key: bytes, value: bytes) -> None:
+        self.inner[key] = value
+
+    def _get_raw(self, key: bytes) -> bytes:
+        return self.inner[key]
+
+    def __setitem__(self, key: bytes, value: Any) -> None:
+        self._check_key(key)
+        self.inner[key] = self._to_binary_value(value)
+
+    def __getitem__(self, key: bytes) -> Any:
+        self._check_key(key)
+        binary_value = self.inner[key]
+        return self._to_namedtuple_value(binary_value)
+
+    def __delitem__(self, key: bytes) -> None:
+        self._check_key(key)
+        del self.inner[key]
+
+    def __contains__(self, key: bytes) -> bool:
+        self._check_key(key)
+        return key in self.inner
+
+    def items(self) -> Iterator[tuple[bytes, Any]]:
+        for key, binary_value in self.inner.items():
+            yield (key, self._to_namedtuple_value(binary_value))
+
+    def __len__(self) -> int:
+        return len(self.inner)
+
+    def get(self, key: bytes, default: Any = None) -> Any:
+        self._check_key(key)
+        try:
+            binary_value = self.inner[key]
+        except KeyError:
+            return default
+        else:
+            return self._to_namedtuple_value(binary_value)
+
+    def setdefault(self, key: bytes, default: Any) -> Any:
+        self._check_key(key)
+        binary_default = self._to_binary_value(default)
+        binary_value = self.inner.setdefault(key, binary_default)
+        return self._to_namedtuple_value(binary_value)
+
+    def pop(self, key: bytes, default: Any = _NoDefault) -> Any:
+        self._check_key(key)
+        try:
+            binary_value = self.inner.pop(key)
+        except KeyError:
+            if default is _NoDefault:
+                raise
+            return default
+        else:
+            return self._to_namedtuple_value(binary_value)
+
+    def k_to_idx(self, key: bytes) -> int:
+        return self.inner.k_to_idx(key)
+
+    def idx_to_k(self, idx: int) -> bytes:
+        return self.inner.idx_to_k(idx)
+
+    def kv_to_idx(self, key: bytes, value: Any) -> int:
+        binary_value = self._to_binary_value(value)
+        return self.inner.kv_to_idx(key, binary_value)
+
+    def idx_to_kv(self, idx: int) -> tuple[bytes, Any]:
+        key, binary_value = self.inner.idx_to_kv(idx)
+        return key, self._to_namedtuple_value(binary_value)
+
+    @property
+    def stats(self) -> dict[str, int]:
+        return self.inner.stats
+
+    def write(self, file: BinaryIO|str|bytes):
+        if isinstance(file, (str, bytes)):
+            with open(file, 'wb') as fd:
+                self._write_fd(fd)
+        else:
+            self._write_fd(file)
+
+    def _write_fd(self, fd: BinaryIO):
+        meta = {
+            'key_size': self.key_size,
+            'value_size': self.value_size,
+            'value_format': self.value_struct.format,
+            'value_type_name': self.value_type.__name__,
+            'value_type_fields': self.value_type._fields,
+            'capacity': self.inner.capacity,
+            'used': self.inner.used,  # count of keys / values
+        }
+        meta_bytes = json.dumps(meta).encode("utf-8")
+        meta_size = len(meta_bytes)
+        header_bytes = struct.pack(HEADER_FMT, MAGIC, VERSION, meta_size)
+        fd.write(header_bytes)
+        fd.write(meta_bytes)
+        count = 0
+        for key, value in self.inner.items():
+            fd.write(key)
+            fd.write(value)
+            count += 1
+        assert count == self.inner.used
+
+    @classmethod
+    def read(cls, file: BinaryIO|str|bytes):
+        if isinstance(file, (str, bytes)):
+            with open(file, 'rb') as fd:
+                return cls._read_fd(fd)
+        else:
+            return cls._read_fd(file)
+
+    @classmethod
+    def _read_fd(cls, fd: BinaryIO):
+        header_size = struct.calcsize(HEADER_FMT)
+        header_bytes = fd.read(header_size)
+        if len(header_bytes) < header_size:
+            raise ValueError(f"Invalid file, file is too short.")
+        magic, version, meta_size = struct.unpack(HEADER_FMT, header_bytes)
+        if magic != MAGIC:
+            raise ValueError(f"Invalid file, magic {MAGIC.decode()} not found.")
+        if version != VERSION:
+            raise ValueError(f"Unsupported file version {version}.")
+        meta_bytes = fd.read(meta_size)
+        if len(meta_bytes) < meta_size:
+            raise ValueError(f"Invalid file, file is too short.")
+        meta = json.loads(meta_bytes.decode("utf-8"))
+        value_type = namedtuple(meta['value_type_name'], meta['value_type_fields'])
+        ht = cls(key_size=meta['key_size'], value_format=meta['value_format'], value_type=value_type, capacity=meta['capacity'])
+        count = 0
+        ksize, vsize = meta['key_size'], meta['value_size']
+        for i in range(meta['used']):
+            key = fd.read(ksize)
+            value = fd.read(vsize)
+            ht._set_raw(key, value)
+        return ht
+
+    def size(self) -> int:
+        """
+        do a rough worst-case estimate of the on-disk size when using .write().
+
+        the serialized size of the metadata is a bit hard to predict, but we cover that with one_time_overheads.
+        """
+        one_time_overheads = 4096  # very rough
+        N = self.inner.used
+        return int(N * (self.key_size + self.value_size) + one_time_overheads)
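``write()``/``read()`` above define the serialization format: a fixed-size header (MAGIC, VERSION, metadata length), a JSON metadata blob, then exactly ``used`` raw key/value records. ``HEADER_FMT`` itself is not visible in this diff; the sketch below assumes ``"<8sII"``, which is consistent with the 8-byte MAGIC and the two integer header fields used above::

    import json
    import struct

    HEADER_FMT = "<8sII"  # ASSUMPTION: magic (8 bytes), version, meta_size

    def peek_meta(path):
        # read only the header and JSON metadata of a HashTableNT dump,
        # without loading the key/value records that follow
        with open(path, "rb") as fd:
            header = fd.read(struct.calcsize(HEADER_FMT))
            magic, version, meta_size = struct.unpack(HEADER_FMT, header)
            if magic != b"BORGHASH":
                raise ValueError("not a borghash file")
            return version, json.loads(fd.read(meta_size).decode("utf-8"))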
+""" +from .HashTable import HashTable +from .HashTableNT import HashTableNT diff --git a/src/borghash/__main__.py b/src/borghash/__main__.py new file mode 100644 index 0000000..ff85df9 --- /dev/null +++ b/src/borghash/__main__.py @@ -0,0 +1,67 @@ +""" +Demonstration of borghash. +""" + +def demo(): + print("BorgHash demo") + print("=============") + print("Code:") + code = """ +from tempfile import NamedTemporaryFile +from time import time +from collections import namedtuple + +from .HashTableNT import HashTableNT + +count = 50000 +value_type = namedtuple("Chunk", ["refcount", "size"]) +# 256bit (32Byte) key, 2x 32bit (4Byte) values +ht = HashTableNT(key_size=32, value_format="