# Module: src/rapids_singlecell/read.py
# The package ``__init__`` re-exports this reader: ``from .read import read_mtx``.
from typing import Literal

import numpy as np
import pandas as pd


def read_mtx(
    filename,
    backend: Literal["cudf", "dask_cudf"] = "cudf",
    output: Literal["CPU", "GPU"] = "CPU",
):
    """
    Read a Matrix Market (mtx) file using the GPU; the matrix is transposed.

    The first non-comment line of the file (``n_rows n_cols n_entries``) is
    used for the shape, and the returned matrix has shape
    ``(n_cols, n_rows)``.

    Parameters
    ----------
    filename
        Name of the matrix file, in mtx or compressed gz format.
    backend
        Which backend to use. `dask_cudf` comes in handy when there is not
        enough GPU memory; in that case the output is always sent to CPU.
    output
        Where to keep the matrix: either keep it in GPU memory (`"GPU"`),
        or send it to RAM (`"CPU"`).

    Returns
    -------
    An :class:`~anndata.AnnData` object wrapping the transposed matrix in
    CSR format.

    Raises
    ------
    ValueError
        If `backend` or `output` is not one of the supported values.
    """
    # Validate first so that a typo fails fast with a clear message instead
    # of an unbound-variable error (unknown backend) or a silent fall-through
    # to the GPU output path (unknown output).
    if backend not in ("cudf", "dask_cudf"):
        raise ValueError(
            f"backend must be 'cudf' or 'dask_cudf', got {backend!r}"
        )
    if output not in ("CPU", "GPU"):
        raise ValueError(f"output must be 'CPU' or 'GPU', got {output!r}")

    # Heavy / GPU-only dependencies are imported lazily.
    import cupyx.scipy.sparse as csp
    import scipy.sparse as sp
    from anndata import AnnData

    # Size line: first row that is not a '%' comment.
    size_line = pd.read_csv(
        filename, nrows=1, sep=" ", comment="%", header=None
    ).values[0]
    # Swapped on purpose: the matrix is stored transposed.
    shape = (int(size_line[1]), int(size_line[0]))

    # Indices are parsed as integers: float32 cannot represent every integer
    # above 2**24, which would corrupt the coordinates of large matrices.
    read_kwargs = {
        "sep": " ",
        "dtype": ["int32", "int32", "float32"],
        "comment": "%",
        "header": None,
    }

    if backend == "cudf":
        import cudf

        # Comment lines are dropped by ``comment="%"``; the first remaining
        # row is the size line, so it is removed by position. This does not
        # depend on how many comment lines precede it.
        entries = cudf.read_csv(filename, **read_kwargs).iloc[1:]

        # Offset the 1-based mtx indices to 0-based indexing.
        row_idx = entries["0"].values - 1
        col_idx = entries["1"].values - 1
        values = entries["2"].values

        gpu_coo = csp.coo_matrix(
            (values, (col_idx, row_idx)),
            shape=shape,
            dtype=np.float32,
        )
        toadata = gpu_coo.get().tocsr() if output == "CPU" else gpu_coo.tocsr()

    else:  # backend == "dask_cudf"
        import dask_cudf

        # The dask backend always returns a host matrix.
        output = "CPU"
        lazy_frame = dask_cudf.read_csv(filename, **read_kwargs)
        # Move partitions to host and materialise once, so the file is read a
        # single time and the size line can be dropped by global position
        # (a label-based slice would act on every partition separately).
        host_frame = lazy_frame.to_dask_dataframe().compute()
        entries = host_frame.iloc[1:]

        toadata = sp.coo_matrix(
            (
                entries["2"].to_numpy(),
                (entries["1"].to_numpy() - 1, entries["0"].to_numpy() - 1),
            ),
            shape=shape,
            dtype=np.float32,
        ).tocsr()

    return AnnData(toadata)