diff --git a/.gitignore b/.gitignore
index 667392bd..7acc6f84 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,7 +3,7 @@
 __pycache__/
 /*cache/
 .ipynb_checkpoints/
-/data/
+data/
 
 # Distribution / packaging
 /dist/
@@ -16,3 +16,6 @@ __pycache__/
 
 # Venvs
 *venv/
+
+# asv
+.asv/
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 00000000..d6215c47
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,81 @@
+# rapids_singlecell Benchmarks
+
+This repo contains some work in progress benchmarks for [rapids_singlecell](https://github.com/scverse/rapids_singlecell) using [asv](https://asv.readthedocs.io).
+
+## Setup
+
+I definitely recommend reading through the asv docs. Currently, this assumes the benchmark suite can reach the `rapids_singlecell` repo via the path `../src/rapids_singlecell`. Otherwise, all you'll need to do is create a [machine file](https://asv.readthedocs.io/en/stable/commands.html#asv-machine) for your system and make sure `anndata`'s dependencies are installable via `conda`.
+
+```shell
+pip install chardet
+conda install mamba
+```
+
+### Data
+
+Data will need to be retrieved for these benchmarks. It can be downloaded using the script `fetch_datasets.py`.
+
+Note that the `h5ad` format has changed since its inception. While the `rapids_singlecell` package maintains backwards compatibility, older versions of `anndata` will not be able to read files written by more recent versions. To get around this for the benchmarks, datasets have to be readable by all versions, which can require a setup function that creates the anndata object.
+
+## Usage
+
+### Running the benchmarks:
+
+To run benchmarks for a particular commit: `asv run {commit} --steps 1 -b`
+
+To run benchmarks for a range of commits: `asv run {commit1}..{commit2}`
+
+You can select which benchmarks are run with the `-b {pattern}` flag.
+
+### Accessing the benchmarks
+
+You can see what benchmarks you've already run using `asv show`. If you don't specify a commit, it will search for the available commits. If you specify a commit it'll show you those results. For example:
+
+```bash
+$ asv show -b "views"
+Commits with results:
+
+Machine    : mimir.mobility.unimelb.net.au
+Environment: conda-py3.7-h5py-memory_profiler-natsort-numpy-pandas-scipy
+
+    61eb5bb7
+    e9ccfc33
+    22f12994
+    0ebe187e
+```
+
+```bash
+$ asv show -b "views" 0ebe187e
+Commit: 0ebe187e <views-of-views>
+
+views.SubsetMemorySuite.track_repeated_subset_memratio [mimir.mobility.unimelb.net.au/conda-py3.7-h5py-memory_profiler-natsort-numpy-pandas-scipy]
+  ok
+  ======= ======= ========== ============ ===================== ====================== ======================
+  --                                                                   index_kind
+  --------------------------------------- -------------------------------------------------------------------
+   n_obs   n_var   attr_set   subset_dim         intarray             boolarray                slice
+  ======= ======= ========== ============ ===================== ====================== ======================
+    100     100     X-csr        obs               2.84           1.7916666666666667            0.5
+    100     100     X-csr        var        2.5357142857142856    1.8695652173913044     0.5652173913043478
+    100     100    X-dense       obs        3.1739130434782608    1.6538461538461537            0.6
+...
+```
+
+You can compare two commits with `asv compare`
+
+```bash
+$ asv compare e9ccfc 0ebe187e
+All benchmarks:
+
+       before           after         ratio
+     [e9ccfc33]       [0ebe187e]
+     <master>         <views-of-views>
+-            2.16  1.7916666666666667     0.83  views.SubsetMemorySuite.track_repeated_subset_memratio(100, 100, 'X-csr', 'obs', 'boolarray')
++ 2.533333333333333             2.84     1.12  views.SubsetMemorySuite.track_repeated_subset_memratio(100, 100, 'X-csr', 'obs', 'intarray')
+- 1.1923076923076923              0.5     0.42  views.SubsetMemorySuite.track_repeated_subset_memratio(100, 100, 'X-csr', 'obs', 'slice')
+  1.9615384615384615  1.8695652173913044     0.95  views.SubsetMemorySuite.track_repeated_subset_memratio(100, 100, 'X-csr', 'var', 'boolarray')
+```
+
+### View in the browser:
+
+You can view the benchmarks in the browser with `asv publish` followed by `asv preview`. If you want to include benchmarks of a local branch, I think you'll have to add that branch to the `"branches"` list in `asv.conf.json`.
diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json
new file mode 100644
index 00000000..871ca00c
--- /dev/null
+++ b/benchmarks/asv.conf.json
@@ -0,0 +1,175 @@
+{
+    // The version of the config file format.  Do not change, unless
+    // you know what you are doing.
+    "version": 1,
+    // The name of the project being benchmarked
+    "project": "rapids_singlecell",
+    // The project's homepage
+    "project_url": "https://rapids-singlecell.readthedocs.io/",
+    // The URL or local path of the source code repository for the
+    // project being benchmarked
+    "repo": "../",
+    // The Python project's subdirectory in your repo.  If missing or
+    // the empty string, the project is assumed to be located at the root
+    // of the repository.
+    // "repo_subdir": "",
+    // Customizable commands for building, installing, and
+    // uninstalling the project. See asv.conf.json documentation.
+    //
+    // "install_command": ["python -mpip install {wheel_file}"],
+    // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
+    "build_command": [
+        "python -m pip install build",
+        "python -m build --wheel -o {build_cache_dir} {build_dir}",
+    ],
+    // List of branches to benchmark. If not provided, defaults to "master"
+    // (for git) or "default" (for mercurial).
+    "branches": [
+        "main"
+    ], // for git
+    // "branches": ["default"],    // for mercurial
+    // The DVCS being used.  If not set, it will be automatically
+    // determined from "repo" by looking at the protocol in the URL
+    // (if remote), or by looking for special directories, such as
+    // ".git" (if local).
+    "dvcs": "git",
+    // The tool to use to create environments.  May be "conda",
+    // "virtualenv" or other value depending on the plugins in use.
+    // If missing or the empty string, the tool will be automatically
+    // determined by looking for tools on the PATH environment
+    // variable.
+    "environment_type": "conda",
+    // timeout in seconds for installing any dependencies in environment
+    // defaults to 10 min
+    //"install_timeout": 600,
+    // the base URL to show a commit for the project.
+    "show_commit_url": "https://github.com/scverse/rapids_singlecell/commit/",
+    // The Pythons you'd like to test against.  If not provided, defaults
+    // to the current version of Python used to run `asv`.
+    // "pythons": ["2.7", "3.6"],
+    // The list of conda channel names to be searched for benchmark
+    // dependency packages in the specified order
+    // "conda_channels": [
+    //     "conda-forge",
+    //     "defaults",
+    //     "rapidsai",
+    //     "nvidia"
+    // ],
+    // The matrix of dependencies to test.  Each key is the name of a
+    // package (in PyPI) and the values are version numbers.  An empty
+    // list or empty string indicates to just test against the default
+    // (latest) version. null indicates that the package is to not be
+    // installed. If the package to be tested is only available from
+    // PyPi, and the 'environment_type' is conda, then you can preface
+    // the package name by 'pip+', and the package will be installed via
+    // pip (with all the conda available packages installed first,
+    // followed by the pip installed packages).
+    //
+    "conda_environment_file": "environment.yml",
+    // "matrix": {
+    //     "cuda-version": [
+    //         "12.2"
+    //     ],
+    //     "cudf": [
+    //         "24.4"
+    //     ],
+    //     "cuml": [
+    //         "24.4"
+    //     ],
+    //     "cugraph": [
+    //         "24.4"
+    //     ],
+    //     "pandas": [
+    //         ""
+    //     ],
+    //     "memory_profiler": [
+    //         ""
+    //     ],
+    //     "anndata": [
+    //         ""
+    //     ],
+    //     "scanpy": [
+    //         ""
+    //     ],
+    //     "numpy": [
+    //         ""
+    //     ],
+    //     "scipy": [
+    //         ""
+    //     ]
+    //     // "scanpy": [""],
+    //     // "psutil": [""]
+    // },
+    // Combinations of libraries/python versions can be excluded/included
+    // from the set to test. Each entry is a dictionary containing additional
+    // key-value pairs to include/exclude.
+    //
+    // An exclude entry excludes entries where all values match. The
+    // values are regexps that should match the whole string.
+    //
+    // An include entry adds an environment. Only the packages listed
+    // are installed. The 'python' key is required. The exclude rules
+    // do not apply to includes.
+    //
+    // In addition to package names, the following keys are available:
+    //
+    // - python
+    //     Python version, as in the *pythons* variable above.
+    // - environment_type
+    //     Environment type, as above.
+    // - sys_platform
+    //     Platform, as in sys.platform. Possible values for the common
+    //     cases: 'linux2', 'win32', 'cygwin', 'darwin'.
+    //
+    // "exclude": [
+    //     {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
+    //     {"environment_type": "conda", "six": null}, // don't run without six on conda
+    // ],
+    //
+    // "include": [
+    //     // additional env for python2.7
+    //     {"python": "2.7", "numpy": "1.8"},
+    //     // additional env if run on windows+conda
+    //     {"platform": "win32", "environment_type": "mamba", "python": "2.7", "libpython": ""},
+    // ],
+    // The directory (relative to the current directory) that benchmarks are
+    // stored in.  If not provided, defaults to "benchmarks"
+    // "benchmark_dir": "benchmarks",
+    // The directory (relative to the current directory) to cache the Python
+    // environments in.  If not provided, defaults to "env"
+    "env_dir": ".asv/env",
+    // The directory (relative to the current directory) that raw benchmark
+    // results are stored in.  If not provided, defaults to "results".
+    "results_dir": ".asv/results",
+    // The directory (relative to the current directory) that the html tree
+    // should be written to.  If not provided, defaults to "html".
+    "html_dir": ".asv/html",
+    // The number of characters to retain in the commit hashes.
+    // "hash_length": 8,
+    // `asv` will cache results of the recent builds in each
+    // environment, making them faster to install next time.  This is
+    // the number of builds to keep, per environment.
+    // "build_cache_size": 2,
+    // The commits after which the regression search in `asv publish`
+    // should start looking for regressions. Dictionary whose keys are
+    // regexps matching to benchmark names, and values corresponding to
+    // the commit (exclusive) after which to start looking for
+    // regressions.  The default is to start from the first commit
+    // with results. If the commit is `null`, regression detection is
+    // skipped for the matching benchmark.
+    //
+    // "regressions_first_commits": {
+    //    "some_benchmark": "352cdf",  // Consider regressions only after this commit
+    //    "another_benchmark": null,   // Skip regression detection altogether
+    // },
+    // The thresholds for relative change in results, after which `asv
+    // publish` starts reporting regressions. Dictionary of the same
+    // form as in ``regressions_first_commits``, with values
+    // indicating the thresholds.  If multiple entries match, the
+    // maximum is taken. If no entry matches, the default is 5%.
+    //
+    // "regressions_thresholds": {
+    //    "some_benchmark": 0.01,     // Threshold of 1%
+    //    "another_benchmark": 0.5,   // Threshold of 50%
+    // },
+}
diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/benchmarks/benchmarks/preprocessing.py b/benchmarks/benchmarks/preprocessing.py
new file mode 100644
index 00000000..7d995902
--- /dev/null
+++ b/benchmarks/benchmarks/preprocessing.py
@@ -0,0 +1,107 @@
+"""
+Benchmarks for the preprocessing functions in ``rapids_singlecell.pp``.
+
+Every operation has a ``time_*`` benchmark and a ``track_peakmem_*`` benchmark
+that reports the value returned by the ``track_peakmem`` decorator.
+
+Scanpy API documentation: https://scanpy.readthedocs.io/en/stable/api/preprocessing.html
+"""
+
+from __future__ import annotations
+
+import scanpy as sc
+
+import rapids_singlecell as rsc
+
+from .utils import track_peakmem
+
+
+class PreprocessingSuite:
+    """Benchmarks of ``rsc.pp`` functions, parameterized by dataset name."""
+
+    # Datasets are loaded once, when the class body is executed.
+    _data_dict = dict(pbmc68k_reduced=sc.datasets.pbmc68k_reduced())
+    params = _data_dict.keys()
+    param_names = ["input_data"]
+
+    def setup(self, input_data: str):
+        """Store a GPU copy of the selected dataset on ``self.adata``."""
+        self.adata = rsc.get.anndata_to_GPU(
+            self._data_dict[input_data].copy(), copy=True
+        )
+
+    def time_calculate_qc_metrics(self, *_):
+        self.adata.var["mt"] = self.adata.var_names.str.startswith("MT-")
+        rsc.pp.calculate_qc_metrics(self.adata, qc_vars=["mt"], log1p=False)
+
+    @track_peakmem
+    def track_peakmem_calculate_qc_metrics(self, *_):
+        self.adata.var["mt"] = self.adata.var_names.str.startswith("MT-")
+        rsc.pp.calculate_qc_metrics(self.adata, qc_vars=["mt"], log1p=False)
+
+    def time_filter_cells(self, *_):
+        rsc.pp.filter_cells(self.adata, qc_var="n_counts", min_count=200)
+
+    @track_peakmem
+    def track_peakmem_filter_cells(self, *_):
+        rsc.pp.filter_cells(self.adata, qc_var="n_counts", min_count=200)
+
+    def time_filter_genes(self, *_):
+        rsc.pp.filter_genes(self.adata, qc_var="n_counts", min_count=3)
+
+    @track_peakmem
+    def track_peakmem_filter_genes(self, *_):
+        rsc.pp.filter_genes(self.adata, qc_var="n_counts", min_count=3)
+
+    def time_normalize_total(self, *_):
+        rsc.pp.normalize_total(self.adata, target_sum=1e4)
+
+    @track_peakmem
+    def track_peakmem_normalize_total(self, *_):
+        rsc.pp.normalize_total(self.adata, target_sum=1e4)
+
+    def time_log1p(self, *_):
+        rsc.pp.log1p(self.adata)
+
+    @track_peakmem
+    def track_peakmem_log1p(self, *_):
+        rsc.pp.log1p(self.adata)
+
+    def time_pca(self, *_):
+        rsc.pp.pca(self.adata)
+
+    @track_peakmem
+    def track_peakmem_pca(self, *_):
+        rsc.pp.pca(self.adata)
+
+    def time_highly_variable_genes(self, *_):
+        rsc.pp.highly_variable_genes(
+            self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5
+        )
+
+    @track_peakmem
+    def track_peakmem_highly_variable_genes(self, *_):
+        rsc.pp.highly_variable_genes(
+            self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5
+        )
+
+    def time_regress_out(self, *_):
+        rsc.pp.regress_out(self.adata, ["n_counts", "percent_mito"])
+
+    @track_peakmem
+    def track_peakmem_regress_out(self, *_):
+        rsc.pp.regress_out(self.adata, ["n_counts", "percent_mito"])
+
+    def time_scale(self, *_):
+        rsc.pp.scale(self.adata, max_value=10)
+
+    @track_peakmem
+    def track_peakmem_scale(self, *_):
+        rsc.pp.scale(self.adata, max_value=10)
+
+    def time_neighbors(self, *_):
+        rsc.pp.neighbors(self.adata, n_neighbors=15, n_pcs=50)
+
+    @track_peakmem
+    def track_peakmem_neighbors(self, *_):
+        rsc.pp.neighbors(self.adata, n_neighbors=15, n_pcs=50)
diff --git a/benchmarks/benchmarks/readwrite.py b/benchmarks/benchmarks/readwrite.py
new file mode 100644
index 00000000..113ac9ba
--- /dev/null
+++ b/benchmarks/benchmarks/readwrite.py
@@ -0,0 +1,53 @@
+"""
+Benchmarks for ``rapids_singlecell.get.anndata_to_GPU``.
+
+Things to test:
+
+* Read time, write time
+* Peak memory during io
+* File sizes
+
+Parameterized by:
+
+* What method is being used
+* What data is being included
+* Size of data being used
+
+Also interesting:
+
+* io for views
+* io for backed objects
+* Reading dense as sparse, writing sparse as dense
+"""
+
+from __future__ import annotations
+
+import scanpy as sc
+
+from rapids_singlecell.get import anndata_to_GPU
+
+from .utils import track_peakmem
+
+
+class ToGPUSuite:
+    """Benchmarks of ``anndata_to_GPU``, parameterized by dataset name."""
+
+    # Datasets are loaded once, when the class body is executed.
+    _data_dict = dict(pbmc68k_reduced=sc.datasets.pbmc68k_reduced())
+    params = _data_dict.keys()
+    param_names = ["input_data"]
+
+    def setup(self, input_data: str):
+        """Store a private copy of the selected dataset on ``self.adata``."""
+        # Copy so the shared entry in ``_data_dict`` is never handed to
+        # ``anndata_to_GPU``, matching the other suites in this package.
+        self.adata = self._data_dict[input_data].copy()
+
+    def time_to_gpu(self, *_):
+        # ``copy=True`` converts a copy and leaves ``self.adata`` as it was, so
+        # every timed call starts from the same input (the copy is timed too).
+        anndata_to_GPU(self.adata, copy=True)
+
+    @track_peakmem
+    def track_peakmem_to_gpu(self, *_):
+        anndata_to_GPU(self.adata, copy=True)
diff --git a/benchmarks/benchmarks/squidpy.py b/benchmarks/benchmarks/squidpy.py
new file mode 100644
index 00000000..b1ce6742
--- /dev/null
+++ b/benchmarks/benchmarks/squidpy.py
@@ -0,0 +1,67 @@
+"""
+Benchmarks for the graph functions in ``rapids_singlecell.gr``
+(``ligrec`` and ``spatial_autocorr``).
+"""
+
+from __future__ import annotations
+
+from itertools import product
+
+import scanpy as sc
+
+import rapids_singlecell as rsc
+
+from .utils import track_peakmem
+
+
+class ToolsSuite:
+    """Benchmarks of ``rsc.gr`` functions, parameterized by dataset name."""
+
+    # Datasets are loaded once, when the class body is executed.
+    _data_dict = {"pbmc68k_reduced": sc.datasets.pbmc68k_reduced()}
+    params = _data_dict.keys()
+    param_names = ["input_data"]
+
+    def setup(self, input_data):
+        """Store a CPU copy of the dataset and a GPU copy made from it."""
+        cpu_copy = self._data_dict[input_data].copy()
+        self.cpu_adata = cpu_copy
+        self.gpu_adata = rsc.get.anndata_to_GPU(cpu_copy, copy=True)
+
+    def _ligrec(self):
+        """Run ``ligrec`` on all ordered pairs of the first five genes."""
+        first_genes = self.cpu_adata.var.index[:5]
+        rsc.gr.ligrec(
+            self.cpu_adata,
+            "louvain",
+            interactions=tuple(product(first_genes, first_genes)),
+            n_perms=5,
+            use_raw=False,
+        )
+
+    def _autocorr(self, mode):
+        """Run ``spatial_autocorr`` in ``mode`` on the GPU copy."""
+        rsc.gr.spatial_autocorr(
+            self.gpu_adata, mode=mode, connectivity_key="connectivities"
+        )
+
+    def time_ligrec(self, *_):
+        self._ligrec()
+
+    @track_peakmem
+    def track_peakmem_ligrec(self, *_):
+        self._ligrec()
+
+    def time_autocorr_moran(self, *_):
+        self._autocorr("moran")
+
+    @track_peakmem
+    def track_peakmem_autocorr_moran(self, *_):
+        self._autocorr("moran")
+
+    def time_autocorr_geary(self, *_):
+        self._autocorr("geary")
+
+    @track_peakmem
+    def track_peakmem_autocorr_geary(self, *_):
+        self._autocorr("geary")
diff --git a/benchmarks/benchmarks/tools.py b/benchmarks/benchmarks/tools.py
new file mode 100644
index 00000000..cf65b23a
--- /dev/null
+++ b/benchmarks/benchmarks/tools.py
@@ -0,0 +1,58 @@
+"""
+Benchmarks for the tool functions in ``rapids_singlecell.tl``.
+
+Scanpy API documentation: https://scanpy.readthedocs.io/en/stable/api/tools.html
+"""
+
+from __future__ import annotations
+
+import scanpy as sc
+
+import rapids_singlecell as rsc
+
+from .utils import track_peakmem
+
+
+class ToolsSuite:
+    """Benchmarks of ``rsc.tl`` functions, parameterized by dataset name."""
+
+    # Datasets are loaded once, when the class body is executed.
+    _data_dict = dict(
+        pbmc68k_reduced=sc.datasets.pbmc68k_reduced(),
+    )
+    params = _data_dict.keys()
+    param_names = ["input_data"]
+
+    def setup(self, input_data):
+        """Store a GPU copy of the selected dataset on ``self.adata``."""
+        self.adata = rsc.get.anndata_to_GPU(
+            self._data_dict[input_data].copy(), copy=True
+        )
+
+    def time_umap(self, *_):
+        rsc.tl.umap(self.adata)
+
+    @track_peakmem
+    def track_peakmem_umap(self, *_):
+        rsc.tl.umap(self.adata)
+
+    def time_diffmap(self, *_):
+        rsc.tl.diffmap(self.adata)
+
+    @track_peakmem
+    def track_peakmem_diffmap(self, *_):
+        rsc.tl.diffmap(self.adata)
+
+    def time_leiden(self, *_):
+        rsc.tl.leiden(self.adata)
+
+    @track_peakmem
+    def track_peakmem_leiden(self, *_):
+        rsc.tl.leiden(self.adata)
+
+    def time_embedding_density(self, *_):
+        rsc.tl.embedding_density(self.adata, basis="umap")
+
+    @track_peakmem
+    def track_peakmem_embedding_density(self, *_):
+        rsc.tl.embedding_density(self.adata, basis="umap")
diff --git a/benchmarks/benchmarks/utils.py b/benchmarks/benchmarks/utils.py
new file mode 100644
index 00000000..e0314959
--- /dev/null
+++ b/benchmarks/benchmarks/utils.py
@@ -0,0 +1,101 @@
+# From https://github.com/rapidsai/benchmark/blob/570531ba4bc90c508245e943d2aaa11d68a24286/rapids_pytest_benchmark/rapids_pytest_benchmark/rmm_resource_analyzer.py#L29
+from __future__ import annotations
+
+import csv
+import os
+import tempfile
+from functools import wraps
+
+import rmm
+
+
+class RMMResourceAnalyzer:
+    """
+    Class to control enabling, disabling, & parsing RMM resource
+    logs.
+
+    After ``disable_logging`` has run, ``max_gpu_mem_usage`` holds the
+    highest running total of allocated sizes seen in the logs and
+    ``leaked_memory`` the total still allocated at the end of the logs.
+    """
+
+    def __init__(self, benchmark_name):
+        self.max_gpu_util = -1
+        self.max_gpu_mem_usage = 0
+        self.leaked_memory = 0
+        # Log files are written to the system temp directory, named after
+        # the benchmark.
+        self._log_file_prefix = os.path.join(tempfile.gettempdir(), benchmark_name)
+
+    def enable_logging(self):
+        """
+        Enable RMM logging. RMM creates a CSV output file derived from
+        provided file name that looks like: log_file_prefix + ".devX", where
+        X is the GPU number.
+        """
+        rmm.enable_logging(log_file_name=self._log_file_prefix)
+
+    def disable_logging(self):
+        """
+        Disable RMM logging, parse the log files and delete them.
+
+        The log files are deleted even when parsing them fails.
+        """
+        log_output_files = rmm.get_log_filenames()
+        rmm.mr._flush_logs()
+        rmm.disable_logging()
+        # FIXME: potential improvement here would be to only parse the log files for
+        # the gpu ID that's passed in via --benchmark-gpu-device
+        try:
+            self._parse_results(log_output_files)
+        finally:
+            for log_file in log_output_files.values():
+                os.remove(log_file)
+
+    def _parse_results(self, log_files):
+        """
+        Parse CSV results. CSV file has columns:
+        Thread,Time,Action,Pointer,Size,Stream
+
+        ``log_files`` is a mapping whose values are log file paths. The
+        running total is carried across all files.
+        """
+        current_mem_usage = 0
+        for log_file in log_files.values():
+            with open(log_file, newline="") as csv_file:
+                for row in csv.DictReader(csv_file):
+                    row_action = row["Action"]
+                    row_size = int(row["Size"])
+
+                    if row_action == "allocate":
+                        current_mem_usage += row_size
+                        if current_mem_usage > self.max_gpu_mem_usage:
+                            self.max_gpu_mem_usage = current_mem_usage
+                    elif row_action == "free":
+                        current_mem_usage -= row_size
+        self.leaked_memory = current_mem_usage
+
+
+def track_peakmem(fn):
+    """
+    Decorate a benchmark method so that it returns peak RMM memory usage.
+
+    The wrapped method is run with RMM logging enabled and its own return
+    value is discarded; the wrapper returns ``max_gpu_mem_usage`` parsed
+    from the logs instead. Logging is disabled and the log files are
+    cleaned up even when the wrapped method raises.
+    """
+
+    @wraps(fn)
+    def wrapper(self, *args, **kwargs):
+        resource_analyzer = RMMResourceAnalyzer(benchmark_name=fn.__name__)
+        resource_analyzer.enable_logging()
+        try:
+            fn(self, *args, **kwargs)
+        finally:
+            resource_analyzer.disable_logging()
+        return resource_analyzer.max_gpu_mem_usage
+
+    # The "Size" column of the RMM log is in bytes; asv displays this unit.
+    wrapper.unit = "bytes"
+    return wrapper
diff --git a/benchmarks/environment.yml b/benchmarks/environment.yml
new file mode 100644
index 00000000..b1acd42c
--- /dev/null
+++ b/benchmarks/environment.yml
@@ -0,0 +1,16 @@
+channels:
+  - rapidsai
+  - nvidia
+  - conda-forge
+dependencies:
+  - python=3.11
+  - cuda-version=12
+  - cudf=24.4
+  - cuml=24.4
+  - cugraph=24.4
+  - pandas
+  - memory_profiler
+  - anndata
+  - scanpy
+  - numpy
+  - scipy