Skip to content

Commit

Permalink
Merge pull request #3 from appier/resize
Browse files Browse the repository at this point in the history
append
  • Loading branch information
ianlini authored Feb 6, 2017
2 parents 0e4b58d + 574c0a3 commit 6c2e8a3
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 11 deletions.
44 changes: 41 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ Testing
Examples
--------

Create dataset
**************
.. code:: python
In [1]: import scipy.sparse as ss
Expand All @@ -47,9 +50,17 @@ Examples
...: [1, 1, 0]],
...: dtype=np.float64)
In [3]: with h5sparse.File("test.h5") as h5f:
...: h5f.create_dataset("sparse/matrix", data=sparse_matrix)
...:
In [3]: # create a dataset from a scipy sparse matrix
...: with h5sparse.File("test.h5") as h5f:
...: h5f.create_dataset('sparse/matrix', data=sparse_matrix)
In [4]: # you can also create a dataset from another dataset
...: with h5sparse.File("test.h5") as h5f:
...: h5f.create_dataset('sparse/matrix2', data=h5f['sparse/matrix'])
Read dataset
************
.. code:: python
In [5]: h5f = h5sparse.File("test.h5")
Expand Down Expand Up @@ -114,6 +125,33 @@ Examples
<4x3 sparse matrix of type '<class 'numpy.float64'>'
with 4 stored elements in Compressed Sparse Row format>
Append dataset
**************
.. code:: python
In [22]: to_append = ss.csr_matrix([[0, 1, 1],
...: [1, 0, 0]],
...: dtype=np.float64)
In [23]: h5f.create_dataset('matrix', data=sparse_matrix, chunks=(100000,),
...: maxshape=(None,))
In [24]: h5f['matrix'].append(to_append)
In [25]: h5f['matrix'].value
Out[25]:
<6x3 sparse matrix of type '<class 'numpy.float64'>'
with 7 stored elements in Compressed Sparse Row format>
In [26]: h5f['matrix'].value.toarray()
Out[26]:
array([[ 0., 1., 0.],
[ 0., 0., 1.],
[ 0., 0., 0.],
[ 1., 1., 0.],
[ 0., 1., 1.],
[ 1., 0., 0.]])
Version scheme
--------------
Expand Down
60 changes: 53 additions & 7 deletions h5sparse/h5sparse.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import six
import h5py
import numpy as np
import scipy.sparse as ss


Expand Down Expand Up @@ -49,7 +50,8 @@ def __getitem__(self, key):
raise ValueError("Unexpected item type.")

def create_dataset(self, name, shape=None, dtype=None, data=None,
format='csr', **kwargs):
format='csr', indptr_dtype=np.int64, indices_dtype=np.int32,
**kwargs):
"""Create 4 datasets in a group to represent the sparse array."""
if data is None:
raise NotImplementedError("Only support create_dataset with "
Expand All @@ -58,16 +60,21 @@ def create_dataset(self, name, shape=None, dtype=None, data=None,
group = self.h5py_group.create_group(name)
group.attrs['h5sparse_format'] = data.h5py_group.attrs['h5sparse_format']
group.attrs['h5sparse_shape'] = data.h5py_group.attrs['h5sparse_shape']
group.create_dataset('data', data=data.h5py_group['data'], **kwargs)
group.create_dataset('indices', data=data.h5py_group['indices'], **kwargs)
group.create_dataset('indptr', data=data.h5py_group['indptr'], **kwargs)
group.create_dataset('data', data=data.h5py_group['data'],
dtype=dtype, **kwargs)
group.create_dataset('indices', data=data.h5py_group['indices'],
dtype=indices_dtype, **kwargs)
group.create_dataset('indptr', data=data.h5py_group['indptr'],
dtype=indptr_dtype, **kwargs)
else:
group = self.h5py_group.create_group(name)
group.attrs['h5sparse_format'] = get_format_str(data)
group.attrs['h5sparse_shape'] = data.shape
group.create_dataset('data', data=data.data, **kwargs)
group.create_dataset('indices', data=data.indices, **kwargs)
group.create_dataset('indptr', data=data.indptr, **kwargs)
group.create_dataset('data', data=data.data, dtype=dtype, **kwargs)
group.create_dataset('indices', data=data.indices,
dtype=indices_dtype, **kwargs)
group.create_dataset('indptr', data=data.indptr,
dtype=indptr_dtype, **kwargs)


class File(Group):
Expand Down Expand Up @@ -131,3 +138,42 @@ def value(self):
shape = self.h5py_group.attrs['h5sparse_shape']
format_class = get_format_class(self.h5py_group.attrs['h5sparse_format'])
return format_class((data, indices, indptr), shape=shape)

def append(self, sparse_matrix):
    """Append the rows of ``sparse_matrix`` to the stored sparse matrix.

    The ``data``, ``indptr`` and ``indices`` datasets of the underlying
    group are resized and extended (in that order), after which the
    ``h5sparse_shape`` attribute is rewritten with the new row count and
    the larger of the two column counts.

    Parameters
    ----------
    sparse_matrix
        Matrix to append. Its format string (as reported by
        ``get_format_str``) must equal the group's ``h5sparse_format``
        attribute.

    Raises
    ------
    ValueError
        If the format of ``sparse_matrix`` differs from the stored format.
    NotImplementedError
        If the stored format is anything other than ``'csr'``.
    """
    group = self.h5py_group
    stored_shape = group.attrs['h5sparse_shape']
    stored_format = group.attrs['h5sparse_format']

    if stored_format != get_format_str(sparse_matrix):
        raise ValueError("Format not the same.")

    if stored_format != 'csr':
        raise NotImplementedError(
            "The append method for format {} is not implemented."
            .format(stored_format))

    # data: grow by the number of incoming stored values, then fill the tail.
    values = group['data']
    old_value_count = values.shape[0]
    values.resize((old_value_count + sparse_matrix.data.shape[0],))
    values[old_value_count:] = sparse_matrix.data

    # indptr: the incoming leading pointer is skipped and the remaining
    # pointers are shifted by the current last pointer, which has to be
    # read before the dataset is resized.
    row_pointers = group['indptr']
    old_pointer_count = row_pointers.shape[0]
    pointer_base = row_pointers[-1]
    row_pointers.resize(
        (old_pointer_count + sparse_matrix.indptr.shape[0] - 1,))
    shifted_pointers = (
        sparse_matrix.indptr[1:].astype(np.int64) + pointer_base)
    row_pointers[old_pointer_count:] = shifted_pointers

    # indices: same grow-and-fill step as for data.
    column_indices = group['indices']
    old_index_count = column_indices.shape[0]
    column_indices.resize(
        (old_index_count + sparse_matrix.indices.shape[0],))
    column_indices[old_index_count:] = sparse_matrix.indices

    # shape: rows add up; the column count becomes the wider of the two.
    total_rows = stored_shape[0] + sparse_matrix.shape[0]
    total_cols = max(stored_shape[1], sparse_matrix.shape[1])
    group.attrs['h5sparse_shape'] = (total_rows, total_cols)
21 changes: 21 additions & 0 deletions h5sparse/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,24 @@ def test_create_dataset_from_dataset():

os.remove(from_h5_path)
os.remove(to_h5_path)


def test_dataset_append():
    """Appending a CSR matrix to a stored dataset matches ``ss.vstack``.

    A 4x3 matrix is written as a resizable dataset, a 2x3 matrix is
    appended to it, and the value read back is compared with the result of
    stacking the two matrices in memory.
    """
    # mkstemp returns an open OS-level descriptor together with the path;
    # close it right away so the handle is not leaked for the test run.
    fd, h5_path = mkstemp(suffix=".h5")
    os.close(fd)

    try:
        sparse_matrix = ss.csr_matrix([[0, 1, 0],
                                       [0, 0, 1],
                                       [0, 0, 0],
                                       [1, 1, 0]],
                                      dtype=np.float64)
        to_append = ss.csr_matrix([[0, 1, 1],
                                   [1, 0, 0]],
                                  dtype=np.float64)
        appended_matrix = ss.vstack((sparse_matrix, to_append))

        with h5sparse.File(h5_path) as h5f:
            # chunks/maxshape make the underlying datasets resizable,
            # which append() relies on.
            h5f.create_dataset('matrix', data=sparse_matrix,
                               chunks=(100000,), maxshape=(None,))
            h5f['matrix'].append(to_append)
            result = h5f['matrix'].value
            assert result.shape == appended_matrix.shape
            # An element-wise "!=" with no stored entries means the two
            # matrices are equal everywhere.
            assert (result != appended_matrix).size == 0
    finally:
        # Remove the temporary file even when an assertion above fails.
        os.remove(h5_path)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

setup(
name='h5sparse',
version="0.0.2",
version="0.0.3",
description=description,
long_description=long_description,
author='Appier Inc.',
Expand Down

0 comments on commit 6c2e8a3

Please sign in to comment.