Skip to content

Commit

Permalink
Merge branch 'main' into unpin-anndata
Browse files Browse the repository at this point in the history
  • Loading branch information
grst authored Oct 30, 2024
2 parents cba2739 + d8efffa commit ab46782
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ repos:
- id: prettier
exclude: '^\.conda'
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.9
rev: v0.7.1
hooks:
- id: ruff
types_or: [python, pyi, jupyter]
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning][].
### Additions

- Add a `mask_obs` argument to `tl.clonotype_network` that allows computing the clonotype networks on a subset of the cells ([#557](https://github.com/scverse/scirpy/pull/557)).
- Add `datasets.stephenson2021_5k`, an example dataset for the upcoming BCR tutorial ([#565](https://github.com/scverse/scirpy/pull/565)).

### Fixes

Expand Down
1 change: 1 addition & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ Example datasets
datasets.wu2020
datasets.wu2020_3k
datasets.maynard2020
datasets.stephenson2021_5k

Reference databases
^^^^^^^^^^^^^^^^^^^
Expand Down
26 changes: 25 additions & 1 deletion src/scirpy/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,15 @@

# Pooch downloader for the example datasets hosted on FigShare.
#
# * `path` is the per-user OS cache directory for "scirpy"; `env` names the
#   environment variable handed to pooch for the download location.
# * `base_url` must be given exactly once (a repeated keyword argument is a
#   SyntaxError). The ".v2" DOI is kept as the later of the two values.
#   NOTE(review): presumably v2 is the FigShare record version that contains
#   `stephenson2021_5k.h5mu` -- confirm against the FigShare record.
# * `registry` maps each downloadable file name to its expected MD5 checksum.
_FIGSHARE = pooch.create(
    path=pooch.os_cache("scirpy"),
    base_url="doi:10.6084/m9.figshare.22249894.v2",
    version=version("scirpy"),
    version_dev="main",
    env="SCIRPY_DATA_DIR",
    registry={
        "wu2020.h5mu": "md5:ed30d9c1c44cae544f4c080a2451118b",
        "wu2020_3k.h5mu": "md5:12c57c790f8a403751304c9de5a18cbf",
        "maynard2020.h5mu": "md5:da64ac62e3e92c80eaf0e8eef6537ac7",
        "stephenson2021_5k.h5mu": "md5:6ea26f9d95525371ff9028f8e99ed474",
    },
)
_POOCH_INFO = dedent(
Expand Down Expand Up @@ -124,6 +125,29 @@ def maynard2020() -> MuData:
return mudata.read_h5mu(fname)


# NOTE(review): `processing_code` is filled from the maynard2020 processing
# script, and the docstring below contains no placeholder that would consume
# it -- confirm whether a dedicated stephenson2021_5k script should be
# referenced instead.
@_doc_params(
    processing_code=indent(_read_to_str(HERE / "_processing_scripts/maynard2020.py"), " " * 8),
    pooch_info=_POOCH_INFO,
)
def stephenson2021_5k() -> MuData:
    """\
    Return the dataset from :cite:`Stephenson2021` as MuData object, downsampled
    to 5000 BCR-containing cells.

    The original study sequenced 1,141,860 cells from 143 PBMC samples collected
    from patients with different severity of COVID-19 and from control groups.
    Gene expression, TCR-enriched and BCR-enriched libraries were prepared for
    each sample according to the 10x Genomics protocol, and a NovaSeq 6000 was
    used for sequencing.

    A preprocessed dataset for the transcriptome library was obtained from
    `Array Express <https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-10026>`__.
    A preprocessed dataset for the BCR-enriched library was obtained from
    `clatworthylab's GitHub <https://github.com/clatworthylab/COVID_analysis>`__.

    Both datasets have already passed quality control, and all cells that didn't
    express BCR were discarded. To speed up computation time, we solely included
    5 samples from each of the COVID-19-positive groups and randomly subsampled
    down to a total of 5k cells.
    """
    # Fetch the file through the FigShare pooch registry, then load it as MuData.
    local_path = _FIGSHARE.fetch("stephenson2021_5k.h5mu", progressbar=True)
    return mudata.read_h5mu(cast(PathLike, local_path))


def vdjdb(cached: bool = True, *, cache_path="data/vdjdb.h5ad") -> AnnData:
"""\
Download VDJdb and process it into an AnnData object.
Expand Down
7 changes: 4 additions & 3 deletions src/scirpy/get/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ def airr_context(

def _has_ir(params: DataHandler):
    """Return a mask of all cells that have a valid IR configuration.

    A cell is flagged ``True`` when the number of entries counted along axis 1
    of its "VJ" and "VDJ" chain indices sums to more than zero.
    """
    # Each count is converted to numpy on its own and the two are added
    # afterwards. The earlier variant, which added the awkward arrays first and
    # converted the comparison result, is dropped: keeping both left a second,
    # unreachable `return` behind the first.
    n_vj = ak.to_numpy(ak.count(params.chain_indices["VJ"], axis=1))
    n_vdj = ak.to_numpy(ak.count(params.chain_indices["VDJ"], axis=1))
    return (n_vj + n_vdj) > 0

0 comments on commit ab46782

Please sign in to comment.