diff --git a/README.rst b/README.rst index 7716441..ed64a37 100644 --- a/README.rst +++ b/README.rst @@ -34,6 +34,9 @@ Testing Examples -------- + +Create dataset +************** .. code:: python In [1]: import scipy.sparse as ss @@ -47,9 +50,17 @@ Examples ...: [1, 1, 0]], ...: dtype=np.float64) - In [3]: with h5sparse.File("test.h5") as h5f: - ...: h5f.create_dataset("sparse/matrix", data=sparse_matrix) - ...: + In [3]: # create dataset from scipy sparse matrix + ...: with h5sparse.File("test.h5") as h5f: + ...: h5f.create_dataset('sparse/matrix', data=sparse_matrix) + + In [4]: # you can also create dataset from another dataset + ...: with h5sparse.File("test.h5") as h5f: + ...: h5f.create_dataset('sparse/matrix2', data=h5f['sparse/matrix']) + +Read dataset +************ +.. code:: python In [5]: h5f = h5sparse.File("test.h5") @@ -114,6 +125,33 @@ Examples <4x3 sparse matrix of type '' with 4 stored elements in Compressed Sparse Row format> +Append dataset +************** +.. code:: python + + In [22]: to_append = ss.csr_matrix([[0, 1, 1], + ...: [1, 0, 0]], + ...: dtype=np.float64) + + In [23]: h5f.create_dataset('matrix', data=sparse_matrix, chunks=(100000,), + ...: maxshape=(None,)) + + In [24]: h5f['matrix'].append(to_append) + + In [25]: h5f['matrix'].value + Out[25]: + <6x3 sparse matrix of type '' + with 7 stored elements in Compressed Sparse Row format> + + In [26]: h5f['matrix'].value.toarray() + Out[26]: + array([[ 0., 1., 0.], + [ 0., 0., 1.], + [ 0., 0., 0.], + [ 1., 1., 0.], + [ 0., 1., 1.], + [ 1., 0., 0.]]) + Version scheme -------------- diff --git a/h5sparse/h5sparse.py b/h5sparse/h5sparse.py index a007015..b60ff02 100644 --- a/h5sparse/h5sparse.py +++ b/h5sparse/h5sparse.py @@ -1,5 +1,6 @@ import six import h5py +import numpy as np import scipy.sparse as ss @@ -49,7 +50,8 @@ def __getitem__(self, key): raise ValueError("Unexpected item type.") def create_dataset(self, name, shape=None, dtype=None, data=None, - format='csr', **kwargs): + format='csr', indptr_dtype=np.int64, indices_dtype=np.int32, + **kwargs): """Create 4 datasets in a group to represent the sparse array.""" if data is None: raise NotImplementedError("Only support create_dataset with " @@ -58,16 +60,21 @@ def create_dataset(self, name, shape=None, dtype=None, data=None, group = self.h5py_group.create_group(name) group.attrs['h5sparse_format'] = data.h5py_group.attrs['h5sparse_format'] group.attrs['h5sparse_shape'] = data.h5py_group.attrs['h5sparse_shape'] - group.create_dataset('data', data=data.h5py_group['data'], **kwargs) - group.create_dataset('indices', data=data.h5py_group['indices'], **kwargs) - group.create_dataset('indptr', data=data.h5py_group['indptr'], **kwargs) + group.create_dataset('data', data=data.h5py_group['data'], + dtype=dtype, **kwargs) + group.create_dataset('indices', data=data.h5py_group['indices'], + dtype=indices_dtype, **kwargs) + group.create_dataset('indptr', data=data.h5py_group['indptr'], + dtype=indptr_dtype, **kwargs) else: group = self.h5py_group.create_group(name) group.attrs['h5sparse_format'] = get_format_str(data) group.attrs['h5sparse_shape'] = data.shape - group.create_dataset('data', data=data.data, **kwargs) - group.create_dataset('indices', data=data.indices, **kwargs) - group.create_dataset('indptr', data=data.indptr, **kwargs) + group.create_dataset('data', data=data.data, dtype=dtype, **kwargs) + group.create_dataset('indices', data=data.indices, + dtype=indices_dtype, **kwargs) + group.create_dataset('indptr', data=data.indptr, + dtype=indptr_dtype, **kwargs) class File(Group): @@ -131,3 +138,42 @@ def value(self): shape = self.h5py_group.attrs['h5sparse_shape'] format_class = get_format_class(self.h5py_group.attrs['h5sparse_format']) return format_class((data, indices, indptr), shape=shape) + + def append(self, sparse_matrix): + shape = self.h5py_group.attrs['h5sparse_shape'] + format_str = self.h5py_group.attrs['h5sparse_format'] + + if format_str != get_format_str(sparse_matrix): + raise ValueError("Format not the same.") + + if format_str == 'csr': + # data + data = self.h5py_group['data'] + orig_data_size = data.shape[0] + new_shape = (orig_data_size + sparse_matrix.data.shape[0],) + data.resize(new_shape) + data[orig_data_size:] = sparse_matrix.data + + # indptr + indptr = self.h5py_group['indptr'] + orig_data_size = indptr.shape[0] + append_offset = indptr[-1] + new_shape = (orig_data_size + sparse_matrix.indptr.shape[0] - 1,) + indptr.resize(new_shape) + indptr[orig_data_size:] = (sparse_matrix.indptr[1:].astype(np.int64) + + append_offset) + + # indices + indices = self.h5py_group['indices'] + orig_data_size = indices.shape[0] + new_shape = (orig_data_size + sparse_matrix.indices.shape[0],) + indices.resize(new_shape) + indices[orig_data_size:] = sparse_matrix.indices + + # shape + self.h5py_group.attrs['h5sparse_shape'] = ( + shape[0] + sparse_matrix.shape[0], + max(shape[1], sparse_matrix.shape[1])) + else: + raise NotImplementedError("The append method for format {} is not " + "implemented.".format(format_str)) diff --git a/h5sparse/tests.py b/h5sparse/tests.py index 0fa5078..cd42336 100644 --- a/h5sparse/tests.py +++ b/h5sparse/tests.py @@ -45,3 +45,24 @@ def test_create_dataset_from_dataset(): os.remove(from_h5_path) os.remove(to_h5_path) + + +def test_dataset_append(): + h5_path = mkstemp(suffix=".h5")[1] + sparse_matrix = ss.csr_matrix([[0, 1, 0], + [0, 0, 1], + [0, 0, 0], + [1, 1, 0]], + dtype=np.float64) + to_append = ss.csr_matrix([[0, 1, 1], + [1, 0, 0]], + dtype=np.float64) + appended_matrix = ss.vstack((sparse_matrix, to_append)) + + with h5sparse.File(h5_path) as h5f: + h5f.create_dataset('matrix', data=sparse_matrix, chunks=(100000,), + maxshape=(None,)) + h5f['matrix'].append(to_append) + assert (h5f['matrix'].value != appended_matrix).size == 0 + + os.remove(h5_path) diff --git a/setup.py b/setup.py index fc5baad..622b1ff 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ setup( name='h5sparse', - version="0.0.2", + version="0.0.3", description=description, long_description=long_description, author='Appier Inc.',