Merge branch 'main' into unpin-anndata

scverse · Oct 30, 2024 · ab46782 · ab46782
2 parents cba2739 + d8efffa
commit ab46782
Show file tree

Hide file tree

Showing 5 changed files with 32 additions and 5 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -12,7 +12,7 @@ repos:
       - id: prettier
         exclude: '^\.conda'
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.6.9
+    rev: v0.7.1
     hooks:
       - id: ruff
         types_or: [python, pyi, jupyter]

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning][].
 ### Additions
 
 -   Add a `mask_obs` argument to `tl.clonotype_network` that allows to compute the clonotype networks on a subset of the cells ([#557](https://github.com/scverse/scirpy/pull/557)).
+-   Add `datasets.stephenson2021_5k`, an example dataset for the upcoming BCR tutorial ([#565](https://github.com/scverse/scirpy/pull/565)).
 
 ### Fixes
 

diff --git a/docs/api.rst b/docs/api.rst
@@ -246,6 +246,7 @@ Example datasets
    datasets.wu2020
    datasets.wu2020_3k
    datasets.maynard2020
+   datasets.stephenson2021_5k
 
 Reference databases
 ^^^^^^^^^^^^^^^^^^^

diff --git a/src/scirpy/datasets/__init__.py b/src/scirpy/datasets/__init__.py
@@ -28,14 +28,15 @@
 
 _FIGSHARE = pooch.create(
     path=pooch.os_cache("scirpy"),
-    base_url="doi:10.6084/m9.figshare.22249894.v1",
+    base_url="doi:10.6084/m9.figshare.22249894.v2",
     version=version("scirpy"),
     version_dev="main",
     env="SCIRPY_DATA_DIR",
     registry={
         "wu2020.h5mu": "md5:ed30d9c1c44cae544f4c080a2451118b",
         "wu2020_3k.h5mu": "md5:12c57c790f8a403751304c9de5a18cbf",
         "maynard2020.h5mu": "md5:da64ac62e3e92c80eaf0e8eef6537ac7",
+        "stephenson2021_5k.h5mu": "md5:6ea26f9d95525371ff9028f8e99ed474",
     },
 )
 _POOCH_INFO = dedent(
@@ -124,6 +125,29 @@ def maynard2020() -> MuData:
     return mudata.read_h5mu(fname)
 
 
+@_doc_params(
+    processing_code=indent(_read_to_str(HERE / "_processing_scripts/maynard2020.py"), " " * 8),
+    pooch_info=_POOCH_INFO,
+)
+def stephenson2021_5k() -> MuData:
+    """\
+    Return the dataset from :cite:`Stephenson2021` as MuData object, downsampled
+    to 5000 BCR-containing cells.
+
+    The original study sequenced 1,141,860 cells from 143 PBMC samples collected from patients with varying COVID-19 severity and from control groups.
+    Gene expression, TCR-enriched and BCR-enriched libraries were prepared for each sample according to the 10x Genomics protocol, and a NovaSeq 6000 was used for sequencing.
+
+    A preprocessed dataset for the transcriptome library was obtained from `ArrayExpress <https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-10026>`__.
+    A preprocessed dataset for the BCR-enriched library was obtained from `clatworthylab's GitHub <https://github.com/clatworthylab/COVID_analysis>`__.
+    Both datasets have already passed quality control, and all cells that didn't express a BCR were discarded.
+
+    To reduce computation time, we included only 5 samples from each of the COVID-19-positive groups and randomly subsampled down to a total of 5k cells.
+
+    """
+    fname = cast(PathLike, _FIGSHARE.fetch("stephenson2021_5k.h5mu", progressbar=True))
+    return mudata.read_h5mu(fname)
+
+
 def vdjdb(cached: bool = True, *, cache_path="data/vdjdb.h5ad") -> AnnData:
     """\
     Download VDJdb and process it into an AnnData object.

diff --git a/src/scirpy/get/__init__.py b/src/scirpy/get/__init__.py
@@ -203,6 +203,7 @@ def airr_context(
 
 def _has_ir(params: DataHandler):
     """Return a mask of all cells that have a valid IR configuration"""
-    return ak.to_numpy(
-        (ak.count(params.chain_indices["VJ"], axis=1) + ak.count(params.chain_indices["VDJ"], axis=1)) > 0
-    )
+    return (
+        ak.to_numpy(ak.count(params.chain_indices["VJ"], axis=1))
+        + ak.to_numpy(ak.count(params.chain_indices["VDJ"], axis=1))
+    ) > 0