
(feat): pre-processing functions for dask with sparse chunks #2856

Merged
merged 96 commits into main from dask-sparse-mean-var on Mar 22, 2024
Changes from 86 commits
Commits
d3163cc
(chore): add dask sparse chunks creation
ilan-gold Feb 27, 2024
2a7a54c
(feat): add dask summation
ilan-gold Feb 27, 2024
8dd9a7a
(refactor): `materialize_as_ndarray` needs to operate on indidiual da…
ilan-gold Feb 27, 2024
d55b6a4
(feat): `filter_genes` and `filter_cells`
ilan-gold Feb 27, 2024
41a5f15
(feat): normalization
ilan-gold Feb 27, 2024
e36699e
(fix) `lop1p` tests working
ilan-gold Feb 27, 2024
da6eff0
(refactor): clean up writing test
ilan-gold Feb 27, 2024
63ca2f0
(refactor): us `da.chunk.sum`
ilan-gold Feb 27, 2024
fd22a19
(fix): remove `Client`
ilan-gold Feb 27, 2024
8b9a792
(refactor): remove unnecessary `count_nonzero`
ilan-gold Feb 27, 2024
1592571
(fix): change expected fail on sparse normalization
ilan-gold Feb 27, 2024
6ac32e5
(fix): update comment
ilan-gold Feb 28, 2024
78a3ab6
(feat): `_get_mean_var` dask
ilan-gold Feb 28, 2024
60bbdb8
(feat): clean up tests for what should/should not work
ilan-gold Feb 28, 2024
0c1f254
(refactor): `_compat.sum` to `_utils.elem_sum`
ilan-gold Feb 28, 2024
2f4d11a
(chore): add `elem_sum` test
ilan-gold Feb 28, 2024
12502e8
(refactor): `elem_sum` -> `axis_sum`
ilan-gold Feb 29, 2024
b3bb95a
(feat): add `scale` support
ilan-gold Feb 29, 2024
2b6f717
(fix): maintain dtype
ilan-gold Feb 29, 2024
448dc40
(chore): add back condition
ilan-gold Feb 29, 2024
7226bf0
(fix): use `sum` when needed
ilan-gold Feb 29, 2024
2bc7c3a
(chore): release notes
ilan-gold Feb 29, 2024
62c75fc
(fx): don't use `mean_func` name twice
ilan-gold Feb 29, 2024
b987a68
(chore): revert sparse-chunks-in-dask
ilan-gold Feb 29, 2024
902238a
(chore): type hint
ilan-gold Feb 29, 2024
0abbab5
(chore): check `test_compare_to_upstream`
ilan-gold Feb 29, 2024
a8606ae
(chore): remove comment
ilan-gold Feb 29, 2024
feac6bc
(chore): allow passing `dtype` arg in `axis_sum`
ilan-gold Feb 29, 2024
bcdeddb
(fix): revert fixture changes
ilan-gold Feb 29, 2024
4716d8f
(refactor): cleaner with `array_type` conversion before if-then
ilan-gold Feb 29, 2024
3912b63
(chore): clarify hvg support
ilan-gold Feb 29, 2024
c884c63
(chore): handle array types better
ilan-gold Mar 1, 2024
af351d4
(chore): clean up `materialize_as_ndarray`
ilan-gold Mar 1, 2024
da22953
(chore): fix typing/dispatch problem in 3.9
ilan-gold Mar 1, 2024
dbbc6a2
(chore): `list` type -> `Callable`
ilan-gold Mar 1, 2024
6a4f0c5
(feat): `row_divide` for better division handling
ilan-gold Mar 1, 2024
c0182cb
(fix): use `tuple` for `ARRAY_TYPEXXXX`
ilan-gold Mar 1, 2024
a065a78
(refactor): `mean_func` -> `axis_mean` + types
ilan-gold Mar 1, 2024
c3ee138
(chore): remove unnecessary aggregation
ilan-gold Mar 1, 2024
c246a41
(fix): raise `ValueError` for summing over more than one axis
ilan-gold Mar 1, 2024
743c327
(fix): grammar
ilan-gold Mar 1, 2024
db88560
(fix): better type hints
ilan-gold Mar 1, 2024
2a5faa6
(revert): use old `test_normalize_total` siince we have `csr`
ilan-gold Mar 1, 2024
d6ceb4c
(revert): extraneous diff
ilan-gold Mar 1, 2024
48a1a1e
(fix): try `Union`
ilan-gold Mar 1, 2024
07fc5ba
(chore): add column division ability
ilan-gold Mar 1, 2024
3cc4be2
(chore): add scale test
ilan-gold Mar 1, 2024
4cc9eef
(fix): duplicate in release note
ilan-gold Mar 1, 2024
d8afe5c
(refactor): guard clause + comments
ilan-gold Mar 1, 2024
271d5d8
(chore): add `out` check for `dask`
ilan-gold Mar 1, 2024
c61324b
(chore): add `divisor` type hints
ilan-gold Mar 1, 2024
c688aff
(fix): remove some erroneous diffs
ilan-gold Mar 1, 2024
02be7a7
(chore): `axis_{sum,mean}` type hint fixes
ilan-gold Mar 1, 2024
6acc08c
(refactor): generalize to scaling
ilan-gold Mar 4, 2024
0944429
(chore): remove erroneous comment
ilan-gold Mar 4, 2024
3538572
(chore): remove non-public API
ilan-gold Mar 4, 2024
5ef1487
(fix): import from `sc._utils`
ilan-gold Mar 4, 2024
0f43362
(fix): `inidices` -> `indices`
ilan-gold Mar 4, 2024
c100a8f
(fix): remove erroneous `axis_sum` calls
ilan-gold Mar 5, 2024
22b4e90
(fix): return statements for `axis_scale`
ilan-gold Mar 5, 2024
4fef58e
(refactor): return out of `axis_sum` if `X._meta` is `np.ndarray`
ilan-gold Mar 5, 2024
ce574e3
(core): comment fix
ilan-gold Mar 5, 2024
e5a82fc
(fix): use `normalize_total` in HVG test for true reproducibility
ilan-gold Mar 5, 2024
a4e53a6
(refactor): separate out `out` test for dask
ilan-gold Mar 6, 2024
f0b2d97
(fix): correct chunking/rechunking behavior
ilan-gold Mar 6, 2024
f9ea93d
(chore): add guard clause for `sparse` `out != X != None` in scaling
ilan-gold Mar 6, 2024
66f04b6
(fix): guard clause condition
ilan-gold Mar 6, 2024
daca210
(fix): try finishing `|` typing for 3.9
ilan-gold Mar 6, 2024
036391e
(fix): call `register` to allow unions?
ilan-gold Mar 6, 2024
cac4160
(fix): clarify warning
ilan-gold Mar 6, 2024
9ec6935
(feat): test for `max_value`/`zero_center` combos
ilan-gold Mar 6, 2024
0ae76ee
(fix): allow settings of `X` in `scale_array`
ilan-gold Mar 6, 2024
2367f46
(chore): add tests for `normalize` correctness
ilan-gold Mar 6, 2024
b2c3a96
(fix): refactor for pure dask in `median`
ilan-gold Mar 6, 2024
340894b
Merge branch 'main' into dask-sparse-mean-var
ilan-gold Mar 6, 2024
fa66f58
(refactor): add clarifying condition
ilan-gold Mar 6, 2024
2601fe8
Merge branch 'dask-sparse-mean-var' of github.com:scverse/scanpy into…
ilan-gold Mar 6, 2024
750af59
(chore): skip warning computations + tests
ilan-gold Mar 6, 2024
25fe1f9
(fix): actually skip computation in `normalize_total` condition
ilan-gold Mar 6, 2024
57c8389
(fix): actually skip in `filter_genes` + tests
ilan-gold Mar 6, 2024
69ebf98
(fix): use all-in-one median implemetation
ilan-gold Mar 7, 2024
67f47f4
(refactor): remove erreous dask warnings
ilan-gold Mar 7, 2024
e328eb5
(chore): add note about `exclude_highly_expressed`
ilan-gold Mar 7, 2024
0aafabd
(feat): `axis_scale` -> `axis_mul_or_truediv`
ilan-gold Mar 7, 2024
be988c9
(feat): `allow_divide_by_zero`
ilan-gold Mar 7, 2024
3166909
(chore): add notes + type hints
ilan-gold Mar 7, 2024
6552324
Have hvg compute earlier and only once
ivirshup Mar 20, 2024
936eb87
Merge branch 'main' into dask-sparse-mean-var
ilan-gold Mar 21, 2024
37eb1a0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 21, 2024
0cecb94
(refactor): make codecov better by removing dead code/refactoring
ilan-gold Mar 22, 2024
21faa0d
(fix): `np.clip` in dask does not take min/max as `kwargs`
ilan-gold Mar 22, 2024
5998ae8
Update docs/release-notes/1.11.0.md
ilan-gold Mar 22, 2024
f49b929
(chore): move release note
ilan-gold Mar 22, 2024
937c6db
Merge branch 'main' into dask-sparse-mean-var
ilan-gold Mar 22, 2024
ba445f8
(chore): remove erroneous comment
ilan-gold Mar 22, 2024
b3581ea
Merge branch 'dask-sparse-mean-var' of github.com:scverse/scanpy into…
ilan-gold Mar 22, 2024
1 change: 1 addition & 0 deletions docs/release-notes/1.11.0.md
@@ -2,6 +2,7 @@

```{rubric} Features
```
* Support sparse chunks in dask {func}`~scanpy.pp.scale`, {func}`~scanpy.pp.filter_cells`, {func}`~scanpy.pp.filter_genes`, {func}`~scanpy.pp.normalize_total` and {func}`~scanpy.pp.highly_variable_genes` (`seurat` and `cell-ranger` tested) {pr}`2856` {smaller}`ilan-gold`

```{rubric} Docs
```
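
To illustrate the entry above, here is a minimal sketch of how the new support might be exercised end to end. The sparse-chunk construction and parameter values are illustrative and are not taken from the PR's test fixtures:

```python
import anndata as ad
import dask.array as da
import numpy as np
import scanpy as sc
from scipy import sparse

# build a dask array whose blocks are CSR matrices (illustrative construction)
rng = np.random.default_rng(0)
counts = rng.poisson(1.0, size=(1000, 200)).astype(np.float32)
X = da.from_array(counts, chunks=(250, 200)).map_blocks(sparse.csr_matrix)

adata = ad.AnnData(X=X)
sc.pp.filter_cells(adata, min_counts=1)
sc.pp.filter_genes(adata, min_cells=1)
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, flavor="seurat")  # seurat and cell-ranger are the tested flavors
sc.pp.scale(adata)
```

Per the release note, the listed functions are expected to operate on the dask-backed `adata.X` without materializing it up front.
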
226 changes: 225 additions & 1 deletion scanpy/_utils/__init__.py
@@ -14,9 +14,18 @@
from contextlib import contextmanager
from enum import Enum
from functools import partial, singledispatch, wraps
from operator import mul, truediv
from textwrap import dedent
from types import MethodType, ModuleType
from typing import TYPE_CHECKING, Any, Callable, Literal, Union
from typing import (
TYPE_CHECKING,
Any,
Callable,
Literal,
TypeVar,
Union,
overload,
)
from weakref import WeakSet

import numpy as np
@@ -560,6 +569,221 @@
return da.map_blocks(elem_mul, x, y)


Scaling_T = TypeVar("Scaling_T", DaskArray, np.ndarray)


def broadcast_axis(divisor: Scaling_T, axis: Literal[0, 1]) -> Scaling_T:
divisor = np.ravel(divisor)
if axis:
return divisor[None, :]
return divisor[:, None]


@singledispatch
def axis_mul_or_truediv(
X: sparse.spmatrix,
scaling_array,
axis: Literal[0, 1],
op: Callable[[Any, Any], Any],
*,
allow_divide_by_zero: bool = True,
out: sparse.spmatrix | None = None,
) -> sparse.spmatrix:
if op not in {truediv, mul}:
raise ValueError(f"{op} not one of truediv or mul")

if out is not None:
if X.data is not out.data:
raise ValueError(

"`out` argument provided but not equal to X. This behavior is not supported for sparse matrix scaling."
)
if not allow_divide_by_zero and op is truediv:
scaling_array = scaling_array.copy() + (scaling_array == 0)

row_scale = axis == 0
column_scale = axis == 1
if row_scale:

def new_data_op(x):
return op(x.data, np.repeat(scaling_array, np.diff(x.indptr)))

elif column_scale:

def new_data_op(x):
return op(x.data, scaling_array.take(x.indices, mode="clip"))

if X.format == "csr":
indices = X.indices
indptr = X.indptr
if out is not None:
X.data = new_data_op(X)
return X
return sparse.csr_matrix(
(new_data_op(X), indices.copy(), indptr.copy()), shape=X.shape
)
transposed = X.T
return axis_mul_or_truediv(
transposed,
scaling_array,
op=op,
axis=1 - axis,
out=transposed,
allow_divide_by_zero=allow_divide_by_zero,
).T
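
A small usage sketch of the sparse path above. `axis_mul_or_truediv` is a private helper in `scanpy._utils`, so the import here is for illustration only:

```python
from operator import truediv

import numpy as np
from scipy import sparse

from scanpy._utils import axis_mul_or_truediv  # private helper added in this PR

X = sparse.random(4, 5, density=0.5, format="csr", dtype=np.float64)
row_sums = np.ravel(X.sum(axis=1))
# divide every row by its total; allow_divide_by_zero=False replaces zero
# divisors with one, so all-zero rows pass through unchanged
X_norm = axis_mul_or_truediv(
    X, row_sums, axis=0, op=truediv, allow_divide_by_zero=False
)
assert np.allclose(np.ravel(X_norm[row_sums > 0].sum(axis=1)), 1.0)
```
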


@axis_mul_or_truediv.register(np.ndarray)
def _(
X: np.ndarray,
scaling_array: np.ndarray,
axis: Literal[0, 1],
op: Callable[[Any, Any], Any],
*,
allow_divide_by_zero: bool = True,
out: np.ndarray | None = None,
) -> np.ndarray:
if op not in {truediv, mul}:
raise ValueError(f"{op} not one of truediv or mul")

scaling_array = broadcast_axis(scaling_array, axis)
if op is mul:
return np.multiply(X, scaling_array, out=out)
if not allow_divide_by_zero:
scaling_array = scaling_array.copy() + (scaling_array == 0)
return np.true_divide(X, scaling_array, out=out)


def make_axis_chunks(X: DaskArray, axis: Literal[0, 1], pad=True) -> tuple[tuple[int]]:
if axis == 0:
if pad:
return (X.chunks[axis], (1,))
return X.chunks[axis]
if pad:
return ((1,), X.chunks[axis])
return X.chunks[axis]
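
The chunk specs this helper builds are easiest to read off a toy array; a sketch, assuming the helper is imported from `scanpy._utils` where it is defined:

```python
import dask.array as da

from scanpy._utils import make_axis_chunks  # private helper added in this PR

X = da.zeros((6, 4), chunks=(3, 2))
make_axis_chunks(X, axis=0)             # ((3, 3), (1,)): row chunks, padded singleton column axis
make_axis_chunks(X, axis=1)             # ((1,), (2, 2)): column chunks, padded singleton row axis
make_axis_chunks(X, axis=1, pad=False)  # (2, 2)
```
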


@axis_mul_or_truediv.register(DaskArray)
def _(
X: DaskArray,
scaling_array: Scaling_T,
axis: Literal[0, 1],
op: Callable[[Any, Any], Any],
*,
allow_divide_by_zero: bool = True,
out: None = None,
) -> DaskArray:
if op not in {truediv, mul}:
raise ValueError(f"{op} not one of truediv or mul")
if out is not None:
raise TypeError(

"`out` is not `None`. Do not do in-place modifications on dask arrays."
)

import dask.array as da

scaling_array = broadcast_axis(scaling_array, axis)
row_scale = axis == 0
column_scale = axis == 1

if isinstance(scaling_array, DaskArray):
if (row_scale and not X.chunksize[0] == scaling_array.chunksize[0]) or (

column_scale
and (
(
len(scaling_array.chunksize) == 1
and X.chunksize[1] != scaling_array.chunksize[0]
)
or (
len(scaling_array.chunksize) == 2
and X.chunksize[1] != scaling_array.chunksize[1]
)
)
):
warnings.warn("Rechunking scaling_array in user operation", UserWarning)
scaling_array = scaling_array.rechunk(

make_axis_chunks(X, axis, pad=len(scaling_array.shape) == 2)
)
else:
scaling_array = da.from_array(

scaling_array,
chunks=make_axis_chunks(X, axis, pad=len(scaling_array.shape) == 2),
)
return da.map_blocks(

axis_mul_or_truediv,
X,
scaling_array,
axis,
op,
meta=X._meta,
out=out,
allow_divide_by_zero=allow_divide_by_zero,
)
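
A hedged sketch of the dask path: the divisor here is a plain ndarray, so it is wrapped with `da.from_array` using chunks from `make_axis_chunks`, and no rechunking warning should be emitted (the private import is again for illustration):

```python
from operator import mul

import dask.array as da
import numpy as np

from scanpy._utils import axis_mul_or_truediv  # private helper added in this PR

rng = np.random.default_rng(0)
dense = rng.random((1000, 50))
X = da.from_array(dense, chunks=(250, 50))
scale = np.arange(1, 1001, dtype=np.float64)  # one factor per row

# the ndarray divisor is chunked to match X's row chunks, so each block is
# scaled against the matching slice of the factor vector
Y = axis_mul_or_truediv(X, scale, axis=0, op=mul)
np.testing.assert_allclose(Y.compute(), dense * scale[:, None])
```
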


@overload
def axis_sum(
X: sparse.spmatrix,
*,
axis: tuple[Literal[0, 1], ...] | Literal[0, 1] | None = None,
dtype: np.typing.DTypeLike | None = None,
) -> np.matrix:
...


@singledispatch
def axis_sum(
X: np.ndarray,
*,
axis: tuple[Literal[0, 1], ...] | Literal[0, 1] | None = None,
dtype: np.typing.DTypeLike | None = None,
) -> np.ndarray:
return np.sum(X, axis=axis, dtype=dtype)


@axis_sum.register(DaskArray)
def _(
X: DaskArray,
*,
axis: tuple[Literal[0, 1], ...] | Literal[0, 1] | None = None,
dtype: np.typing.DTypeLike | None = None,
) -> DaskArray:
import dask.array as da

if dtype is None:
dtype = getattr(np.zeros(1, dtype=X.dtype).sum(), "dtype", object)

if isinstance(X._meta, np.ndarray) and not isinstance(X._meta, np.matrix):
return X.sum(axis=axis, dtype=dtype)

def sum_drop_keepdims(*args, **kwargs):
kwargs.pop("computing_meta", None)

ilan-gold (Feb 29, 2024):

This one I am not so sure about. It doesn't seem to have an impact and also I'm not sure it's used looking at the dask code: https://docs.dask.org/en/stable/search.html?q=computing_meta

# masked operations on sparse matrices produce numpy matrices, which give the same API issues handled here
if isinstance(X._meta, (sparse.spmatrix, np.matrix)) or isinstance(

args[0], (sparse.spmatrix, np.matrix)
):
kwargs.pop("keepdims", None)
This is definitely something we don't want to run with the sparse matrices and probably (due to interop) the dense either

axis = kwargs["axis"]
if isinstance(axis, tuple):
if len(axis) != 1:
raise ValueError(

f"`axis_sum` can only sum over one axis when `axis` arg is provided but got {axis} instead"
)
kwargs["axis"] = axis[0]

# returns a np.matrix normally, which is undesirable
return np.array(np.sum(*args, dtype=dtype, **kwargs))

def aggregate_sum(*args, **kwargs):
return np.sum(args[0], dtype=dtype, **kwargs)

return da.reduction(

X,
sum_drop_keepdims,
aggregate_sum,
axis=axis,
dtype=dtype,
meta=np.array([], dtype=dtype),
)
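
A sketch of the reduction path over sparse chunks; the way the sparse-chunked array is built here is illustrative, not the PR's test fixture:

```python
import dask.array as da
import numpy as np
from scipy import sparse

from scanpy._utils import axis_sum  # private helper reworked in this PR

rng = np.random.default_rng(0)
dense = rng.poisson(1.0, size=(400, 30)).astype(np.float64)
X = da.from_array(dense, chunks=(100, 30)).map_blocks(sparse.csr_matrix)

# per-gene sums computed block-wise; the np.matrix each sparse block returns
# is coerced back to a plain ndarray inside sum_drop_keepdims
gene_sums = axis_sum(X, axis=0)
np.testing.assert_allclose(np.ravel(gene_sums.compute()), dense.sum(axis=0))
```
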


@singledispatch
def check_nonnegative_integers(X: _SupportedArray) -> bool | DaskArray:
"""Checks values of X to ensure it is count data"""
4 changes: 3 additions & 1 deletion scanpy/preprocessing/_distributed.py
@@ -35,9 +35,11 @@


def materialize_as_ndarray(
a: ArrayLike | tuple[ArrayLike | ZappyArray | DaskArray, ...],
a: DaskArray | ArrayLike | tuple[ArrayLike | ZappyArray | DaskArray, ...],
) -> tuple[np.ndarray] | np.ndarray:
"""Compute distributed arrays and convert them to numpy ndarrays."""
if isinstance(a, DaskArray):
return a.compute()

if not isinstance(a, tuple):
return np.asarray(a)

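
A minimal sketch of the new `DaskArray` shortcut, using the private module path from the diff header:

```python
import dask.array as da
import numpy as np

from scanpy.preprocessing._distributed import materialize_as_ndarray

x = da.ones((10, 3), chunks=(5, 3))
arr = materialize_as_ndarray(x)  # hits the new DaskArray branch: triggers .compute()
assert isinstance(arr, np.ndarray) and arr.shape == (10, 3)
```
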
66 changes: 44 additions & 22 deletions scanpy/preprocessing/_normalization.py
@@ -1,17 +1,24 @@
from __future__ import annotations

from operator import truediv
from typing import TYPE_CHECKING, Literal
from warnings import warn

import numpy as np
from scipy.sparse import issparse
from sklearn.utils import sparsefuncs

from .. import logging as logg
from .._compat import DaskArray, old_positionals
from .._utils import view_to_actual
from .._utils import axis_mul_or_truediv, axis_sum, view_to_actual
from ..get import _get_obs_rep, _set_obs_rep

try:
import dask
import dask.array as da
except ImportError:
da = None
dask = None

if TYPE_CHECKING:
from collections.abc import Iterable

@@ -22,21 +29,30 @@
X = X.copy() if copy else X
if issubclass(X.dtype.type, (int, np.integer)):
X = X.astype(np.float32) # TODO: Check if float64 should be used
if isinstance(counts, DaskArray):
counts_greater_than_zero = counts[counts > 0].compute_chunk_sizes()
else:
counts_greater_than_zero = counts[counts > 0]
if after is None:
if isinstance(counts, DaskArray):

def nonzero_median(x):
return np.ma.median(np.ma.masked_array(x, x == 0)).item()

after = np.median(counts_greater_than_zero, axis=0) if after is None else after
counts += counts == 0
after = da.from_delayed(

dask.delayed(nonzero_median)(counts),
shape=(),
meta=counts._meta,
dtype=counts.dtype,
)
else:
counts_greater_than_zero = counts[counts > 0]
after = np.median(counts_greater_than_zero, axis=0)
counts = counts / after
if issparse(X):
sparsefuncs.inplace_row_scale(X, 1 / counts)
elif isinstance(counts, np.ndarray):
np.divide(X, counts[:, None], out=X)
else:
X = np.divide(X, counts[:, None]) # dask does not support kwarg "out"
return X
return axis_mul_or_truediv(
X,
counts,
op=truediv,
out=X if isinstance(X, np.ndarray) or issparse(X) else None,
allow_divide_by_zero=False,
axis=0,
)
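
The delayed `nonzero_median` helper above boils down to a masked median over the nonzero counts; a standalone sketch of just that arithmetic:

```python
import numpy as np

counts = np.array([0.0, 2.0, 0.0, 4.0, 6.0])
# mask zeros so the median is taken over nonzero counts only,
# mirroring what the delayed nonzero_median helper computes
after = np.ma.median(np.ma.masked_array(counts, counts == 0)).item()
assert after == 4.0
```
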


@old_positionals(
@@ -78,6 +94,11 @@
Similar functions are used, for example, by Seurat [Satija15]_, Cell Ranger
[Zheng17]_ or SPRING [Weinreb17]_.

.. note::
When used with a :class:`~dask.array.Array` in `adata.X`, this function will have to
call functions that trigger `.compute()` on the :class:`~dask.array.Array` if `exclude_highly_expressed`
is `True`, `layer_norm` is not `None`, or if `key_added` is not `None`.

Params
------
adata
@@ -92,7 +113,8 @@
normalization factor (size factor) for each cell. A gene is considered
highly expressed, if it has more than `max_fraction` of the total counts
in at least one cell. The not-excluded genes will sum up to
`target_sum`.
`target_sum`. Providing this argument when `adata.X` is a :class:`~dask.array.Array`
will incur blocking `.compute()` calls on the array.
max_fraction
If `exclude_highly_expressed=True`, consider cells as highly expressed
that have more counts than `max_fraction` of the original total counts
@@ -187,27 +209,27 @@

gene_subset = None
msg = "normalizing counts per cell"

counts_per_cell = axis_sum(X, axis=1)
if exclude_highly_expressed:
counts_per_cell = X.sum(1) # original counts per cell
counts_per_cell = np.ravel(counts_per_cell)

# at least one cell has more than max_fraction of counts per cell

gene_subset = (X > counts_per_cell[:, None] * max_fraction).sum(0)
gene_subset = axis_sum((X > counts_per_cell[:, None] * max_fraction), axis=0)
gene_subset = np.asarray(np.ravel(gene_subset) == 0)

msg += (
". The following highly-expressed genes are not considered during "
f"normalization factor computation:\n{adata.var_names[~gene_subset].tolist()}"
)
counts_per_cell = X[:, gene_subset].sum(1)
else:
counts_per_cell = X.sum(1)
counts_per_cell = axis_sum(X[:, gene_subset], axis=1)

start = logg.info(msg)
counts_per_cell = np.ravel(counts_per_cell)

cell_subset = counts_per_cell > 0
if not np.all(cell_subset):
if not isinstance(cell_subset, DaskArray) and not np.all(cell_subset):
warn(UserWarning("Some cells have zero counts"))

if inplace:
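
To make the reworked `exclude_highly_expressed` counting in the last hunk concrete, here is a small standalone sketch using the same `axis_sum` helper (the values are invented for illustration):

```python
import numpy as np

from scanpy._utils import axis_sum  # private helper used in the diff above

X = np.array([[1.0, 9.0], [1.0, 1.0]])
max_fraction = 0.5

counts_per_cell = np.ravel(axis_sum(X, axis=1))                     # [10., 2.]
# a gene is "highly expressed" if some cell spends > max_fraction of its counts on it
gene_subset = axis_sum(X > counts_per_cell[:, None] * max_fraction, axis=0)
gene_subset = np.asarray(np.ravel(gene_subset) == 0)                # [True, False]
# size factors are then computed from the remaining genes only
counts_per_cell = np.ravel(axis_sum(X[:, gene_subset], axis=1))     # [1., 1.]
```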