Skip to content

Commit

Permalink
Add nice error if scipy not install
Browse files Browse the repository at this point in the history
  • Loading branch information
CarlKCarlK committed Nov 3, 2023
1 parent 69b2077 commit 3fa073a
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 35 deletions.
56 changes: 42 additions & 14 deletions bed_reader/_open_bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@

import numpy as np

try:
from scipy import sparse
except ImportError:
sparse = None

# cmk remove pandas
import pandas as pd

Expand Down Expand Up @@ -1187,15 +1192,17 @@ def _read_fam_or_bim(self, suffix):
output = np.array(output, dtype=mm.dtype)
self.properties_dict[key] = output

def read_csc_inputs(
def read_sparse(
self,
index: Optional[Any] = None,
dtype: Optional[Union[type, str]] = "float32",
batch_size: Optional[int] = None,
format: Optional[str] = "csc",
force_python_only: Optional[bool] = False,
num_threads=None,
) -> np.ndarray:
) -> (sparse.csc_matrix | sparse.csr_matrix) if sparse is not None else None:
"""
Read genotype information for use by `scipy.sparse.csc_matrix`.
Read genotype information into a `scipy` sparse matrix.
Parameters
----------
Expand All @@ -1210,12 +1217,17 @@ def read_csc_inputs(
dtype: {'float32' (default), 'float64', 'int8'}, optional
The desired data-type for the returned array.
order : {'F','C'}, optional
The desired memory layout for the returned array.
Defaults to ``F`` (Fortran order, which is SNP-major).
batch_size: Used internally. Number of dense SNPs (variants) to read at a time.
Defaults to round(sqrt(total-number-of-SNPs-to-read)).
batch_size: Used internally. Number of dense columns (or rows) to read at a time.
Defaults to round(sqrt(total-number-of-columns-or-rows-to-read)).
Larger values will be faster. Smaller values will use less memory.
Format 'csc' reads dense columns of SNPs (variants).
Format 'csr' reads dense rows of individuals (samples).
format : {'csc','csr'}, optional
The desired format of the sparse matrix.
Defaults to ``csc`` (Compressed Sparse Column, which is SNP-major).
force_python_only: bool, optional
If False (default), uses the faster Rust code; otherwise it uses the slower
pure Python code.
num_threads: None or int, optional
The number of threads with which to read data. Defaults to all available
processors.
Expand All @@ -1225,10 +1237,7 @@ def read_csc_inputs(
Returns
-------
three numpy.ndarray arrays (data, indices, indptr) and a shape.
These can be used to create
a sparse matrix with
``scipy.sparse.csc_matrix(data, indices, indptr, shape)``.
a `scipy.sparse.csc_matrix` or `scipy.sparse.csr_matrix`
Rows represent individuals (samples). Columns represent SNPs (variants).
Expand Down Expand Up @@ -1302,10 +1311,21 @@ def read_csc_inputs(
>>> del bed # optional: delete bed object
"""
if sparse is None:
raise ImportError(
"The function read_sparse() requires scipy. "
+ "Install it with 'pip install --upgrade bed-reader[sparse]'."
)
iid_index_or_slice_etc, sid_index_or_slice_etc = self._split_index(index)

dtype = np.dtype(dtype)
order = "F"

if format == "csc":
order = "F"
elif format == "csr":
order = "C"
else:
raise ValueError(f"format '{format}' not known. Expected 'csc' or 'csr'.")

# Similar code in read().
# Later happy with _iid_range and _sid_range or could it be done with
Expand Down Expand Up @@ -1389,7 +1409,15 @@ def read_csc_inputs(
data = np.concatenate(data)
indices = np.concatenate(indices)
indptr = np.concatenate(indptr)
return ((data, indices, indptr), (len(iid_index), len(sid_index)))

if format == "csc":
return sparse.csc_matrix(
(data, indices, indptr), (len(iid_index), len(sid_index))
)
else: # cmk not likely to be correct
return sparse.csr_matrix(
(data, indices, indptr), (len(iid_index), len(sid_index))
)


if __name__ == "__main__":
Expand Down
7 changes: 3 additions & 4 deletions bed_reader/_sample_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,10 @@ def sample_file(filepath: Union[str, Path]) -> str:
The local file name is '...small.bed'
"""
if pooch is None:
print(
"The function 'sample_file' requires the 'pooch' package, which is not installed."
raise ImportError(
"The function sample_file() requires pooch. "
+ "Install it with 'pip install --upgrade bed-reader[sample]'."
)
print("You can do: pip install --upgrade bed-reader[samples]")
return None

filepath = Path(filepath)
file_string = str(filepath)
Expand Down
30 changes: 13 additions & 17 deletions bed_reader/tests/test_open_bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@

import numpy as np
import pytest
import scipy as sp

from bed_reader import open_bed, subset_f64_f64, to_bed

# cmk somehow add a test that checks we get a nice message when pooch or scipy is not installed.


def test_read1(shared_datadir):
import math
Expand All @@ -23,8 +24,8 @@ def test_read1(shared_datadir):
val = bed.read(dtype="int8")
# really shouldn't do mean on data where -127 represents missing
assert val.mean() == -13.142
val = sp.sparse.csc_matrix(*bed.read_csc_inputs(dtype="int8"))
assert math.isclose(val.mean(), -13.142, rel_tol=1e-9)
val_sparse = bed.read_sparse(dtype="int8")
assert math.isclose(val_sparse.mean(), -13.142, rel_tol=1e-9)
assert bed.chromosome[-1] == "1"
assert bed.bp_position[-1] == 100

Expand All @@ -45,7 +46,7 @@ def test_write(tmp_path, shared_datadir):
to_bed(out_file, val0, properties=properties0)
with open_bed(out_file) as bed1:
assert np.allclose(val0, bed1.read(), equal_nan=True)
val_sparse = sp.sparse.csc_matrix(*bed1.read_csc_inputs())
val_sparse = bed1.read_sparse()
assert np.allclose(val0, val_sparse.toarray(), equal_nan=True)
assert np.array_equal(bed.fid, properties0["fid"])
assert np.array_equal(bed.iid, properties0["iid"])
Expand Down Expand Up @@ -181,7 +182,7 @@ def test_bad_dtype_or_order(shared_datadir):
with pytest.raises(ValueError):
open_bed(shared_datadir / "some_missing.bed").read(order="X")
with pytest.raises(ValueError):
open_bed(shared_datadir / "some_missing.bed").read_csc_inputs(dtype=np.int32)
open_bed(shared_datadir / "some_missing.bed").read_sparse(dtype=np.int32)


def setting_generator(seq_dict, seed=9392):
Expand Down Expand Up @@ -270,7 +271,7 @@ def _not_set_to_none(settings, key):
len(iid_list),
len(sid_list),
)
val_sparse = sp.sparse.csc_matrix(*bed.read_csc_inputs())
val_sparse = bed.read_sparse()
assert np.allclose(val, val_sparse.toarray(), equal_nan=True)
if settings["iid_after_read"]:
if _not_set_to_none(settings, "iid"):
Expand Down Expand Up @@ -300,7 +301,7 @@ def test_c_reader_bed(shared_datadir):
ref_val = ref_val * -1 + 2
assert np.allclose(ref_val, val, rtol=1e-05, atol=1e-05, equal_nan=True)

val_sparse = sp.sparse.csc_matrix(*bed.read_csc_inputs())
val_sparse = bed.read_sparse()
assert val_sparse.dtype == np.float32
assert np.allclose(
ref_val, val_sparse.toarray(), rtol=1e-05, atol=1e-05, equal_nan=True
Expand All @@ -320,7 +321,7 @@ def test_c_reader_bed(shared_datadir):
)
ref_val = reference_val(shared_datadir)
assert np.allclose(ref_val, val, rtol=1e-05, atol=1e-05, equal_nan=True)
val_sparse = sp.sparse.csc_matrix(*bed.read_csc_inputs(dtype="float64"))
val_sparse = bed.read_sparse(dtype="float64")
assert np.allclose(
ref_val, val_sparse.toarray(), rtol=1e-05, atol=1e-05, equal_nan=True
)
Expand Down Expand Up @@ -361,18 +362,14 @@ def test_bed_int8(tmp_path, shared_datadir):
),
ref_val,
)
val_sparse = sp.sparse.csc_matrix(
*bed2.read_csc_inputs(dtype="int8")
)
val_sparse = bed2.read_sparse(dtype="int8")
assert np.allclose(val_sparse.toarray(), ref_val)


def test_write1_bed_f64cpp(tmp_path, shared_datadir):
with open_bed(shared_datadir / "some_missing.bed") as bed:
for iid_index in [0, 1, 5]:
val_sparse = sp.sparse.csc_matrix(
*bed.read_csc_inputs(np.s_[0:iid_index, :], dtype=np.float64)
)
val_sparse = bed.read_sparse(np.s_[0:iid_index, :], dtype=np.float64)
assert val_sparse.shape == (iid_index, 100)
for force_python_only in [False, True]:
val = bed.read(
Expand Down Expand Up @@ -834,9 +831,8 @@ def test_sparse():

file_name = sample_file("small.bed")
with open_bed(file_name, count_A1=False) as bed:
csc_inputs = bed.read_csc_inputs(index=np.s_[:, :3], dtype="int8")
val = sp.sparse.csc_matrix(*csc_inputs)
print(val.shape)
val_sparse = bed.read_sparse(index=np.s_[:, :3], dtype="int8")
print(val_sparse.shape)


if __name__ == "__main__":
Expand Down

0 comments on commit 3fa073a

Please sign in to comment.