Add nice error if scipy is not installed

fastlmm · Nov 3, 2023 · 3fa073a · 3fa073a
1 parent 69b2077
commit 3fa073a
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 35 deletions.
diff --git a/bed_reader/_open_bed.py b/bed_reader/_open_bed.py
@@ -8,6 +8,11 @@
 
 import numpy as np
 
+try:
+    from scipy import sparse
+except ImportError:
+    sparse = None
+
 # cmk remove pandas
 import pandas as pd
 
@@ -1187,15 +1192,17 @@ def _read_fam_or_bim(self, suffix):
                     output = np.array(output, dtype=mm.dtype)
             self.properties_dict[key] = output
 
-    def read_csc_inputs(
+    def read_sparse(
         self,
         index: Optional[Any] = None,
         dtype: Optional[Union[type, str]] = "float32",
         batch_size: Optional[int] = None,
+        format: Optional[str] = "csc",
+        force_python_only: Optional[bool] = False,
         num_threads=None,
-    ) -> np.ndarray:
+    ) -> (sparse.csc_matrix | sparse.csr_matrix) if sparse is not None else None:
         """
-        Read genotype information for use by `scipy.sparse.csc_matrix`.
+        Read genotype information into a `scipy` sparse matrix.
 
         Parameters
         ----------
@@ -1210,12 +1217,17 @@ def read_csc_inputs(
 
         dtype: {'float32' (default), 'float64', 'int8'}, optional
             The desired data-type for the returned array.
-        order : {'F','C'}, optional
-            The desired memory layout for the returned array.
-            Defaults to ``F`` (Fortran order, which is SNP-major).
-        batch_size: Used internally. Number of dense SNPs (variants) to read at a time.
-            Defaults to round(sqrt(total-number-of-SNPs-to-read)).
+        batch_size: Used internally. Number of dense columns (or rows) to read at a time.
+            Defaults to round(sqrt(total-number-of-columns-or-rows-to-read)).
             Larger values will be faster. Smaller values will use less memory.
+            Format 'csc' reads dense columns of SNPs (variants).
+            Format 'csr' reads dense rows of individuals (samples).
+        format : {'csc','csr'}, optional
+            The desired format of the sparse matrix.
+            Defaults to ``csc`` (Compressed Sparse Column, which is SNP-major).
+        force_python_only: bool, optional
+            If False (default), uses the faster Rust code; otherwise it uses the slower
+            pure Python code.
         num_threads: None or int, optional
             The number of threads with which to read data. Defaults to all available
             processors.
@@ -1225,10 +1237,7 @@ def read_csc_inputs(
 
         Returns
         -------
-        three numpy.ndarray arrays (data, indices, indptr) and a shape.
-            These can be used to create
-            a sparse matrix with
-            ``scipy.sparse.csc_matrix(data, indices, indptr, shape)``.
+        a `scipy.sparse.csc_matrix` or `scipy.sparse.csr_matrix`
 
         Rows represent individuals (samples). Columns represent SNPs (variants).
 
@@ -1302,10 +1311,21 @@ def read_csc_inputs(
             >>> del bed  # optional: delete bed object
 
         """
+        if sparse is None:
+            raise ImportError(
+                "The function read_sparse() requires scipy. "
+                + "Install it with 'pip install --upgrade bed-reader[sparse]'."
+            )
         iid_index_or_slice_etc, sid_index_or_slice_etc = self._split_index(index)
 
         dtype = np.dtype(dtype)
-        order = "F"
+
+        if format == "csc":
+            order = "F"
+        elif format == "csr":
+            order = "C"
+        else:
+            raise ValueError(f"format '{format}' not known. Expected 'csc' or 'csr'.")
 
         # Similar code in read().
         # Later happy with _iid_range and _sid_range or could it be done with
@@ -1389,7 +1409,15 @@ def read_csc_inputs(
         data = np.concatenate(data)
         indices = np.concatenate(indices)
         indptr = np.concatenate(indptr)
-        return ((data, indices, indptr), (len(iid_index), len(sid_index)))
+
+        if format == "csc":
+            return sparse.csc_matrix(
+                (data, indices, indptr), (len(iid_index), len(sid_index))
+            )
+        else:  # cmk not likely to be correct
+            return sparse.csr_matrix(
+                (data, indices, indptr), (len(iid_index), len(sid_index))
+            )
 
 
 if __name__ == "__main__":

diff --git a/bed_reader/_sample_data.py b/bed_reader/_sample_data.py
@@ -62,11 +62,10 @@ def sample_file(filepath: Union[str, Path]) -> str:
         The local file name is '...small.bed'
     """
     if pooch is None:
-        print(
-            "The function 'sample_file' requires the 'pooch' package, which is not installed."
+        raise ImportError(
+            "The function sample_file() requires pooch. "
+            + "Install it with 'pip install --upgrade bed-reader[sample]'."
         )
-        print("You can do: pip install --upgrade bed-reader[samples]")
-        return None
 
     filepath = Path(filepath)
     file_string = str(filepath)

diff --git a/bed_reader/tests/test_open_bed.py b/bed_reader/tests/test_open_bed.py
@@ -5,10 +5,11 @@
 
 import numpy as np
 import pytest
-import scipy as sp
 
 from bed_reader import open_bed, subset_f64_f64, to_bed
 
+# cmk somehow add a test that we get a nice message when pooch or scipy is missing.
+
 
 def test_read1(shared_datadir):
     import math
@@ -23,8 +24,8 @@ def test_read1(shared_datadir):
         val = bed.read(dtype="int8")
         # really shouldn't do mean on data where -127 represents missing
         assert val.mean() == -13.142
-        val = sp.sparse.csc_matrix(*bed.read_csc_inputs(dtype="int8"))
-        assert math.isclose(val.mean(), -13.142, rel_tol=1e-9)
+        val_sparse = bed.read_sparse(dtype="int8")
+        assert math.isclose(val_sparse.mean(), -13.142, rel_tol=1e-9)
         assert bed.chromosome[-1] == "1"
         assert bed.bp_position[-1] == 100
 
@@ -45,7 +46,7 @@ def test_write(tmp_path, shared_datadir):
         to_bed(out_file, val0, properties=properties0)
         with open_bed(out_file) as bed1:
             assert np.allclose(val0, bed1.read(), equal_nan=True)
-            val_sparse = sp.sparse.csc_matrix(*bed1.read_csc_inputs())
+            val_sparse = bed1.read_sparse()
             assert np.allclose(val0, val_sparse.toarray(), equal_nan=True)
             assert np.array_equal(bed.fid, properties0["fid"])
             assert np.array_equal(bed.iid, properties0["iid"])
@@ -181,7 +182,7 @@ def test_bad_dtype_or_order(shared_datadir):
     with pytest.raises(ValueError):
         open_bed(shared_datadir / "some_missing.bed").read(order="X")
     with pytest.raises(ValueError):
-        open_bed(shared_datadir / "some_missing.bed").read_csc_inputs(dtype=np.int32)
+        open_bed(shared_datadir / "some_missing.bed").read_sparse(dtype=np.int32)
 
 
 def setting_generator(seq_dict, seed=9392):
@@ -270,7 +271,7 @@ def _not_set_to_none(settings, key):
                 len(iid_list),
                 len(sid_list),
             )
-            val_sparse = sp.sparse.csc_matrix(*bed.read_csc_inputs())
+            val_sparse = bed.read_sparse()
             assert np.allclose(val, val_sparse.toarray(), equal_nan=True)
             if settings["iid_after_read"]:
                 if _not_set_to_none(settings, "iid"):
@@ -300,7 +301,7 @@ def test_c_reader_bed(shared_datadir):
         ref_val = ref_val * -1 + 2
         assert np.allclose(ref_val, val, rtol=1e-05, atol=1e-05, equal_nan=True)
 
-        val_sparse = sp.sparse.csc_matrix(*bed.read_csc_inputs())
+        val_sparse = bed.read_sparse()
         assert val_sparse.dtype == np.float32
         assert np.allclose(
             ref_val, val_sparse.toarray(), rtol=1e-05, atol=1e-05, equal_nan=True
@@ -320,7 +321,7 @@ def test_c_reader_bed(shared_datadir):
             )
             ref_val = reference_val(shared_datadir)
             assert np.allclose(ref_val, val, rtol=1e-05, atol=1e-05, equal_nan=True)
-            val_sparse = sp.sparse.csc_matrix(*bed.read_csc_inputs(dtype="float64"))
+            val_sparse = bed.read_sparse(dtype="float64")
             assert np.allclose(
                 ref_val, val_sparse.toarray(), rtol=1e-05, atol=1e-05, equal_nan=True
             )
@@ -361,18 +362,14 @@ def test_bed_int8(tmp_path, shared_datadir):
                             ),
                             ref_val,
                         )
-                        val_sparse = sp.sparse.csc_matrix(
-                            *bed2.read_csc_inputs(dtype="int8")
-                        )
+                        val_sparse = bed2.read_sparse(dtype="int8")
                         assert np.allclose(val_sparse.toarray(), ref_val)
 
 
 def test_write1_bed_f64cpp(tmp_path, shared_datadir):
     with open_bed(shared_datadir / "some_missing.bed") as bed:
         for iid_index in [0, 1, 5]:
-            val_sparse = sp.sparse.csc_matrix(
-                *bed.read_csc_inputs(np.s_[0:iid_index, :], dtype=np.float64)
-            )
+            val_sparse = bed.read_sparse(np.s_[0:iid_index, :], dtype=np.float64)
             assert val_sparse.shape == (iid_index, 100)
             for force_python_only in [False, True]:
                 val = bed.read(
@@ -834,9 +831,8 @@ def test_sparse():
 
     file_name = sample_file("small.bed")
     with open_bed(file_name, count_A1=False) as bed:
-        csc_inputs = bed.read_csc_inputs(index=np.s_[:, :3], dtype="int8")
-        val = sp.sparse.csc_matrix(*csc_inputs)
-        print(val.shape)
+        val_sparse = bed.read_sparse(index=np.s_[:, :3], dtype="int8")
+        print(val_sparse.shape)
 
 
 if __name__ == "__main__":