diff --git a/src/rapids_singlecell/__init__.py b/src/rapids_singlecell/__init__.py
index d002ec00..74997228 100644
--- a/src/rapids_singlecell/__init__.py
+++ b/src/rapids_singlecell/__init__.py
@@ -1,2 +1,3 @@
 from . import cunnData, dcg, gr, pp, tl, utils
 from ._version import __version__
+from .read import read_mtx
diff --git a/src/rapids_singlecell/read.py b/src/rapids_singlecell/read.py
new file mode 100644
index 00000000..99bd635a
--- /dev/null
+++ b/src/rapids_singlecell/read.py
@@ -0,0 +1,96 @@
+from typing import Literal
+
+import numpy as np
+import pandas as pd
+from anndata import AnnData
+
+
+def read_mtx(
+    filename,
+    backend: Literal["cudf", "dask_cudf"] = "cudf",
+    output: Literal["CPU", "GPU"] = "CPU",
+) -> AnnData:
+    """
+    Read a Matrix Market (mtx) file using the GPU; the matrix is transposed by default.
+
+    Parameters
+    ----------
+    filename
+        Name of the matrix file, in mtx or compressed gz format.
+    backend
+        Which backend to use, `dask_cudf` comes handy when there is not enough GPU memory, in such case the output will be automatically sent to CPU.
+    output
+        Where to keep the matrix, either keep to the GPU memory, or send it to RAM.
+        Only `"CPU"` moves the result to RAM; the `dask_cudf` backend ignores this argument.
+
+    Returns
+    -------
+    :class:`~anndata.AnnData` whose `X` is the transposed matrix in CSR format with `float32` values.
+
+    Raises
+    ------
+    ValueError
+        If `backend` is neither `"cudf"` nor `"dask_cudf"`.
+    """
+    if backend not in ("cudf", "dask_cudf"):
+        # Fail early instead of hitting an unbound result variable at the return.
+        raise ValueError(
+            f"Unknown backend {backend!r}, expected 'cudf' or 'dask_cudf'."
+        )
+
+    # The first non-comment line is the size line "n_rows n_cols n_entries"; it is
+    # parsed on the host because only the two dimensions are needed from it.
+    size_line = pd.read_csv(
+        filename, nrows=1, sep=" ", comment="%", header=None
+    ).values[0]
+    # Dimensions are swapped on purpose: the returned matrix is the transpose.
+    shape = (int(size_line[1]), int(size_line[0]))
+
+    # Indices are parsed as integers: float32 cannot represent every integer above
+    # 2**24, which would silently corrupt coordinates of very large matrices.
+    csv_kwargs = {
+        "sep": " ",
+        "dtype": ["int32", "int32", "float32"],
+        "comment": "%",
+        "header": None,
+    }
+
+    if backend == "cudf":
+        import cudf
+        import cupyx.scipy.sparse as csp
+
+        # With `comment="%"` the first parsed row is the size line however many
+        # comment lines precede it, so it is dropped by position rather than with
+        # a fixed `skiprows` that assumes exactly one comment line.
+        entries = cudf.read_csv(filename, **csv_kwargs).iloc[1:]
+        # Matrix Market indices are 1-based; shift them to 0-based.
+        row_idx = entries["0"].to_cupy() - 1
+        col_idx = entries["1"].to_cupy() - 1
+        values = entries["2"].to_cupy()
+
+        gpu_coo = csp.coo_matrix(
+            (values, (col_idx, row_idx)), shape=shape, dtype=np.float32
+        )
+        toadata = gpu_coo.get().tocsr() if output == "CPU" else gpu_coo.tocsr()
+    else:
+        import dask_cudf
+        import scipy.sparse as sp
+
+        # Convert to a host-backed dask dataframe and materialise it once, instead
+        # of evaluating the lazy graph separately for each of the three columns.
+        entries = dask_cudf.read_csv(filename, **csv_kwargs)
+        entries = entries.to_dask_dataframe().compute()
+        # Drop the size line by position: partition indices may not be globally
+        # unique, so label-based slicing of the lazy frame is avoided.
+        entries = entries.iloc[1:]
+        host_coo = sp.coo_matrix(
+            (
+                entries["2"].to_numpy(),
+                (entries["1"].to_numpy() - 1, entries["0"].to_numpy() - 1),
+            ),
+            shape=shape,
+            dtype=np.float32,
+        )
+        toadata = host_coo.tocsr()
+
+    return AnnData(toadata)