diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 4ecfbfe..4a9bc5c 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -18,11 +18,11 @@ "python.linting.flake8Path": "/opt/conda/bin/flake8", "python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle", "python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle", - "python.linting.pylintPath": "/opt/conda/bin/pylint" + "python.linting.pylintPath": "/opt/conda/bin/pylint", }, // Add the IDs of extensions you want installed when the container is created. - "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] - } - } + "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"], + }, + }, } diff --git a/.prettierignore b/.prettierignore index bd2d59c..dd856fc 100644 --- a/.prettierignore +++ b/.prettierignore @@ -12,3 +12,4 @@ testing* bin/ test-datasets/ .nf-test/ +*.scss diff --git a/CHANGELOG.md b/CHANGELOG.md index af81800..ee3eb02 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ compatible with further downstream analyses and/or exploration in _e.g._ ### `Added` +- Add a custom nf-core Quarto template for the downstream analysis reports [[#64](https://github.com/nf-core/spatialtranscriptomics/pull/64)] - Allow input directories `fastq_dir` and `spaceranger_dir` to be specified as tar archives (`.tar.gz`) - Add a check to make sure that there are spots left after filtering [[#46](https://github.com/nf-core/spatialtranscriptomics/issues/46)] - Implement tests with nf-test [[#42](https://github.com/nf-core/spatialtranscriptomics/pull/42)] diff --git a/assets/_extensions/nf-core/_extension.yml b/assets/_extensions/nf-core/_extension.yml new file mode 100644 index 0000000..2089122 --- /dev/null +++ b/assets/_extensions/nf-core/_extension.yml @@ -0,0 +1,21 @@ +title: nf-core Quarto Extension +author: Erik Fasterius +version: 1.0.0 +quarto-required: ">=1.2.0" +contributes: + formats: + html: + code-fold: true + df-print: paged + embed-resources: true + highlight-style: nf-core.theme + smooth-scroll: true + theme: [default, nf-core.scss] + toc-location: left + toc: true + revealjs: + code-line-numbers: false + embed-resources: true + slide-level: 2 + slide-number: false + theme: [default, nf-core.scss] diff --git a/assets/_extensions/nf-core/nf-core.scss b/assets/_extensions/nf-core/nf-core.scss new file mode 100644 index 0000000..788a029 --- /dev/null +++ b/assets/_extensions/nf-core/nf-core.scss @@ -0,0 +1,194 @@ +/*-- scss:defaults --*/ + +$theme: "nf-core" !default; + +// Colours +$green: #24B064 !default; +$blue: #3073AF !default; +$red: #E0191A !default; +$yellow: #DABC25 !default; + +// Greyscale +$white: #FFFFFF !default; +$grey-100: #F5F5F5 !default; +$grey-90: #E5E5E5 !default; +$grey-80: #CCCCCC !default; +$grey-70: #B2B2B2 !default; +$grey-60: #999999 !default; +$grey-50: #7F7F7F !default; +$grey-40: #666666 !default; +$grey-30: #4C4C4C !default; +$grey-20: #333333 !default; +$grey-10: #191919 !default; +$black: #000000 !default; + +// Theme +$primary: $green !default; +$secondary: $blue !default; +$tertiary: $red !default; +$light: $grey-100 !default; +$dark: $grey-30 !default; +$success: $green !default; +$info: $blue !default; +$warning: $yellow !default; +$danger: $red !default; + +// Code +$code-color: $primary !default; +$code-bg: $light !default; +$code-block-bg: $light !default; + +// Links +$link-color: $primary !default; + +// Popover +$popover: $light !default; + +// Dropdowns +$dropdown-link-color: $grey-30 !default; +$dropdown-link-hover-color: $white !default; +$dropdown-link-hover-bg: $primary !default; + +// Font +@import "https://fonts.googleapis.com/css2?family=Maven+Pro:wght@300;400;500;600"; +$font-family-sans-serif: "Maven Pro"; + +// Font size for headers +$h1-font-size: 1.75rem !default; +$h2-font-size: 1.50rem !default; +$h3-font-size: 1.25rem !default; + +// Font size base for reveal.js presentations +$presentation-font-size-root: 35px !default; + +// Tables +$table-bg-scale: 0 !default; + +// Navs +$nav-link-padding-y: .5rem !default !default; +$nav-link-padding-x: 2rem !default; +$nav-link-disabled-color: $grey-40 !default !default; +$nav-tabs-border-color: $grey-80 !default; + +// Navbar +$navbar-padding-y: 1rem !default; +$navbar-light-bg: $primary !default; +$navbar-light-color: $white !default; +$navbar-light-hover-color: $success !default; +$navbar-light-active-color: $success !default; +$navbar-light-brand-color: $white !default; +$navbar-light-brand-hover-color: $navbar-light-brand-color !default; +$navbar-dark-color: $white !default; +$navbar-dark-hover-color: $primary !default; +$navbar-dark-active-color: $primary !default; +$navbar-dark-brand-color: $white !default; +$navbar-dark-brand-hover-color: $navbar-dark-brand-color !default; + +// Pagination +$pagination-color: $white !default; +$pagination-bg: $success !default; +$pagination-border-width: 0 !default; +$pagination-border-color: transparent !default; +$pagination-hover-color: $white !default; +$pagination-hover-bg: darken($success, 15%) !default; +$pagination-hover-border-color: transparent !default; +$pagination-active-bg: $pagination-hover-bg !default; +$pagination-active-border-color: transparent !default; +$pagination-disabled-color: $grey-80 !default; +$pagination-disabled-bg: lighten($success, 15%) !default; +$pagination-disabled-border-color: transparent !default; + +// List group +$list-group-hover-bg: $grey-80 !default; +$list-group-disabled-bg: $grey-80 !default; + +// Close +$btn-close-color: $white !default; +$btn-close-opacity: .4 !default; +$btn-close-hover-opacity: 1 !default; + +/*-- scss:rules --*/ + +// Variables +$web-font-path: "https://fonts.googleapis.com/css2?family=Lato:ital,wght@0,400;0,700;1,400&display=swap" !default; +@if $web-font-path { + @import url($web-font-path); +} + +// Navbar +.bg-primary { + .navbar-nav .show > .nav-link, + .navbar-nav .nav-link.active, + .navbar-nav .nav-link:hover, + .navbar-nav .nav-link:focus { + color: $success !important; + } +} + +// Navs +.nav-tabs { + .nav-link.active, + .nav-link.active:focus, + .nav-link.active:hover, + .nav-item.open .nav-link, + .nav-item.open .nav-link:focus, + .nav-item.open .nav-link:hover { + color: $primary; + } +} + +// Pagination +.pagination { + a:hover { + text-decoration: none; + } +} + +// Blockquotes +.blockquote { + color: $primary; + border-left-color: $primary; +} + +// Cell Output +.cell-output-error > pre > code { + color: $red; +} +.cell-output-stderr > pre > code { + color: $yellow; +} + +// Horizontally center level 1 headers +.center h1 { + text-align: center +} + +// Text justification +.justify-right { + text-align: right +} +.justify-center { + text-align: center +} + +// Custom colours +.green { + color: $green; + font-weight: bold; +} +.blue { + color: $blue; + font-weight: bold; +} +.red { + color: $red; + font-weight: bold; +} +.yellow { + color: $yellow; + font-weight: bold; +} +.grey { + color: $grey-70; + font-weight: bold; +} diff --git a/assets/_extensions/nf-core/nf-core.theme b/assets/_extensions/nf-core/nf-core.theme new file mode 100644 index 0000000..1039cfc --- /dev/null +++ b/assets/_extensions/nf-core/nf-core.theme @@ -0,0 +1,211 @@ +{ + "text-color": null, + "background-color": null, + "line-number-color": "#aaaaaa", + "line-number-background-color": null, + "text-styles": { + "Other": { + "text-color": "#9e9370", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "Attribute": { + "text-color": "#000000", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "SpecialString": { + "text-color": "#bb6688", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "Annotation": { + "text-color": "#005c86", + "background-color": null, + "bold": true, + "italic": true, + "underline": false + }, + "Function": { + "text-color": "#3073AF", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "String": { + "text-color": "#24B064 ", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "ControlFlow": { + "text-color": "#9e9370", + "background-color": null, + "bold": true, + "italic": false, + "underline": false + }, + "Operator": { + "text-color": "#666666", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "Error": { + "text-color": "#BB5454 ", + "background-color": null, + "bold": true, + "italic": false, + "underline": false + }, + "BaseN": { + "text-color": "#DABC25", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "Alert": { + "text-color": "#BB5454 ", + "background-color": null, + "bold": true, + "italic": false, + "underline": false + }, + "Variable": { + "text-color": "#005c86", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "BuiltIn": { + "text-color": "#005c86", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "Extension": { + "text-color": null, + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "Preprocessor": { + "text-color": "#BBBB54 ", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "Information": { + "text-color": "#005c86", + "background-color": null, + "bold": true, + "italic": true, + "underline": false + }, + "VerbatimString": { + "text-color": "#005c86", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "Warning": { + "text-color": "#BBBB54 ", + "background-color": null, + "bold": true, + "italic": true, + "underline": false + }, + "Documentation": { + "text-color": "#BB5454 ", + "background-color": null, + "bold": false, + "italic": true, + "underline": false + }, + "Import": { + "text-color": "#BB5454 ", + "background-color": null, + "bold": true, + "italic": false, + "underline": false + }, + "Char": { + "text-color": "#af75a7 ", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "DataType": { + "text-color": "#9e9370", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "Float": { + "text-color": "#DABC25 ", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "Comment": { + "text-color": "#B2B2B2 ", + "background-color": null, + "bold": false, + "italic": true, + "underline": false + }, + "CommentVar": { + "text-color": "#B2B2B2 ", + "background-color": null, + "bold": true, + "italic": true, + "underline": false + }, + "Constant": { + "text-color": "#B2B2B2", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "SpecialChar": { + "text-color": "#B2B2B2", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "DecVal": { + "text-color": "#DABC25 ", + "background-color": null, + "bold": false, + "italic": false, + "underline": false + }, + "Keyword": { + "text-color": "#BB5454 ", + "background-color": null, + "bold": true, + "italic": false, + "underline": false + } + } +} diff --git a/bin/st_clustering.qmd b/bin/st_clustering.qmd index 8a6da8e..c37fbbe 100644 --- a/bin/st_clustering.qmd +++ b/bin/st_clustering.qmd @@ -1,10 +1,8 @@ --- -title: "Clustering Spatial Transcriptomics data" +title: "nf-core/spatialtranscriptomics" +subtitle: "Dimensionality reduction and clustering" format: - html: - embed-resources: true - page-layout: full - code-fold: true + nf-core-html: default execute: keep-ipynb: true jupyter: python3 @@ -14,100 +12,134 @@ jupyter: python3 #| tags: [parameters] #| echo: false -fileNameST = None -resolution = 1 -saveFileST = None +input_adata_filtered = "st_adata_filtered.h5ad" # Name of the input anndata file +cluster_resolution = 1 # Resolution for Leiden clustering +n_hvgs = 2000 # Number of HVGs to use for analyses +output_adata_processed = "st_adata_processed.h5ad" # Name of the output anndata file ``` +The data has already been filtered in the _quality controls_ reports and is +saved in the AnnData format: + ```{python} -#| echo: false -# Hide warnings in output html. +#| warning: false import scanpy as sc import numpy as np import pandas as pd from umap import UMAP from matplotlib import pyplot as plt import seaborn as sns -import warnings +from IPython.display import display, Markdown +``` + +```{python} +st_adata = sc.read("./" + input_adata_filtered) +print("Content of the AnnData object:") +print(st_adata) +``` + +# Normalization + +Before we can continue working on the data it needs to be normalized. We here +use the built-in `normalize_total` method from [Scanpy](https://scanpy.readthedocs.io/en/stable/) +followed by a log-transformation. -warnings.filterwarnings("ignore") -sc.settings.verbosity = 0 -sc.set_figure_params(dpi_save=300, facecolor="white") +```{python} +sc.pp.normalize_total(st_adata, inplace=True) +sc.pp.log1p(st_adata) ``` -## Reading the data -The data has already been filtered (see `st_qc_and_normalisation` report) and is saved in AnnData format. +# Feature selection + +Not all features (genes, in this case) are informative, and selecting for a +subset of the total features is commonly done prior to clustering. By selecting +the most variable genes in a dataset we can capture those most important in +regards to yielding a good separation of clusters. ```{python} -#| echo: true -#| code-fold: false -st_adata = sc.read("./" + fileNameST) -print (f"Loading {fileNameST}:") -st_adata +# layout-nrow: 1 +# Find top HVGs and print results +sc.pp.highly_variable_genes(st_adata, flavor="seurat", n_top_genes=n_hvgs) +var_genes_all = st_adata.var.highly_variable +print("Extracted highly variable genes: %d"%sum(var_genes_all)) + +# Plot the HVGs +plt.rcParams["figure.figsize"] = (4.5, 4.5) +sc.pl.highly_variable_genes(st_adata) ``` -## Manifold embedding and clustering based on transcriptional similarity +# Clustering -To uncover the underlying structure of the transcriptional landscape, we perform manifold embedding and clustering based on transcriptional similarity. Principal Component Analysis (PCA) is applied to reduce dimensionality, and UMAP (Uniform Manifold Approximation and Projection) is used for visualization. The Leiden algorithm is employed for clustering with a givent resolution. +To uncover the underlying structure of the transcriptional landscape, we perform +manifold embedding and clustering based on transcriptional similarity. Principal +Component Analysis (PCA) is applied to reduce dimensionality, and UMAP (Uniform +Manifold Approximation and Projection) is used for visualization. The Leiden +algorithm is employed for clustering with a given resolution. ```{python} sc.pp.pca(st_adata) sc.pp.neighbors(st_adata) sc.tl.umap(st_adata) -print (f"Resolution for Leiden clustering: {resolution}") -sc.tl.leiden(st_adata, key_added="clusters", resolution=resolution) +sc.tl.leiden(st_adata, key_added="clusters", resolution=cluster_resolution) +Markdown(f"Resolution for Leiden clustering: `{cluster_resolution}`") ``` -## Exploratory Data Analysis +## All clusters -We generate UMAP plots to visualize the spatial distribution of clusters and investigate potential associations with total counts and detected genes. +We then generate UMAP plots to visualize the distribution of clusters: ```{python} -# Make plots of UMAP of ST spots clusters -plt.rcParams["figure.figsize"] = (4, 4) -sc.pl.umap( - st_adata, color=["total_counts", "n_genes_by_counts", "clusters"], wspace=0.4 -) -sc.tl.embedding_density(st_adata, basis="umap", groupby="clusters") -sc.pl.embedding_density(st_adata, groupby="clusters", ncols=4) +#| warning: false +plt.rcParams["figure.figsize"] = (7, 7) +sc.pl.umap(st_adata, color="clusters") ``` -## Visualization in spatial coordinates +## Counts and genes -Next, we examine how total counts and the number of detected genes behave in spatial coordinates. We overlay the circular spots on the Hematoxylin and Eosin stain (H&E) image to provide a spatial context. +We can also visualise the total counts and the genes with at least 1 count in +the UMAP: ```{python} -plt.rcParams["figure.figsize"] = (10, 10) -sc.pl.spatial( - st_adata, img_key="hires", color=["total_counts", "n_genes_by_counts"] -) +# Make plots of UMAP of ST spots clusters +plt.rcParams["figure.figsize"] = (3.5, 3.5) +sc.pl.umap(st_adata, color=["total_counts", "n_genes_by_counts"]) ``` -## Spatial Clustering Visualization +## Individual clusters -To gain insights into tissue organization and potential inter-cellular communication, we visualize the spatial distribution of clusters on the H&E image. +An additional visualisation is to show where the various spots are in each +individual cluster while ignoring all other cluster: ```{python} -plt.rcParams["figure.figsize"] = (10, 10) -sc.pl.spatial( - st_adata, img_key="hires", color=["clusters"] -) +sc.tl.embedding_density(st_adata, basis="umap", groupby="clusters") +sc.pl.embedding_density(st_adata, groupby="clusters", ncols=2) ``` -Spots belonging to the same cluster in gene expression space often co-occur in spatial dimensions, providing valuable information about the spatial organization of cells. +# Spatial visualisation + +Next, we examine how total counts and the number of detected genes behave in +spatial coordinates by overlaying the spots on the tissue image itself. ```{python} -#| echo: false -# Fix for scanpy issue https://github.com/scverse/scanpy/issues/2181 -st_adata.uns['log1p']['base'] = None +#| layout-nrow: 2 +plt.rcParams["figure.figsize"] = (8, 8) +sc.pl.spatial(st_adata, img_key="hires", color="total_counts", size=1.25) +sc.pl.spatial(st_adata, img_key="hires", color="n_genes_by_counts", size=1.25) ``` -## Data Saving +To gain insights into tissue organization and potential inter-cellular +communication, we visualize the spatial distribution of clusters on the image. +Spots belonging to the same cluster in gene expression space often co-occur in +spatial dimensions, providing valuable information about the spatial +organization of cells. -To facilitate future analyses and reproducibility, the processed AnnData object is saved in the `data` directory. +```{python} +# TODO: Can the colour bar on this figure be fit to the figure? +plt.rcParams["figure.figsize"] = (7, 7) +sc.pl.spatial(st_adata, img_key="hires", color="clusters", size=1.25) +``` ```{python} -if saveFileST is not None: - st_adata.write(saveFileST) - print (f"Anndata object saved to data/{saveFileST}.") +#| echo: false +st_adata.write(output_adata_processed) ``` diff --git a/bin/st_qc_and_normalisation.qmd b/bin/st_qc_and_normalisation.qmd deleted file mode 100644 index d526d7d..0000000 --- a/bin/st_qc_and_normalisation.qmd +++ /dev/null @@ -1,257 +0,0 @@ ---- -title: "Pre-processing and quality control of Spatial Transcriptomics data" -format: - html: - embed-resources: true - page-layout: full - code-fold: true -jupyter: python3 ---- - -## Introduction - -Spatial Transcriptomics data analysis involves several steps, including quality control (QC) and pre-processing, to ensure the reliability of downstream analyses. This report outlines the QC and pre-processing steps for Visium Spatial Transcriptomics data using the (https://anndata.readthedocs.io/en/latest/tutorials/notebooks/getting-started.html)[anndata format]. - -Quality control is an essential step in spatial transcriptomics to identify and filter out spots and genes that may introduce noise or bias into the analysis. - - -```{python} -#| tags: [parameters] -#| echo: false - -fileNameST = None -resolution = 1 -saveFileST = None - -rawAdata = None # Name of the h5ad file -minCounts = 500 # Min counts per spot -minGenes = 250 # Min genes per spot -minCells = 1 # Min cells per gene -histplotQCmaxTotalCounts = 10000 # Max total counts -histplotQCminGeneCounts = 4000 # Min gene counts -histplotQCbins = 40 # Number of bins -nameDataPlain = "st_adata_plain.h5ad" # Name of the raw data save file -nameDataNorm = "st_adata_norm.h5ad" # Name of the normalized data save file -``` - -```{python} -#| echo: false -import scanpy as sc -import scipy -import pandas as pd -import matplotlib.pyplot as plt -import seaborn as sns -from IPython.display import display, Markdown - -plt.rcParams["figure.figsize"] = (6, 6) -``` - -## Anndata object - -The anndata format is utilized to organize and store the Spatial Transcriptomics data. It includes information about counts, features, observations, and additional metadata. The anndata format ensures compatibility with various analysis tools and facilitates seamless integration into existing workflows. - -```{python} -st_adata = sc.read("./" + rawAdata) - -# Convert X matrix from csr to csc dense matrix for output compatibility: -st_adata.X = scipy.sparse.csc_matrix(st_adata.X) - -# Get mitochondrial percentages -st_adata.var['mt'] = st_adata.var_names.str.startswith('MT-') -st_adata.var['hb'] = st_adata.var_names.str.contains(("^Hb.*-")) - -sc.pp.calculate_qc_metrics(st_adata, qc_vars=["mt", "hb"], inplace=True) - -st_adata_before_filtering = st_adata.copy() - -print ("Content of the anndata object:") -st_adata -``` - -## Quality Control - -The following plots show the distribution of the number of genes per counts and counts per spot, as well as the percentage of counts from mitochondrial genes and hemoglobin genes. - -```{python} -sc.pl.violin(st_adata, ['n_genes_by_counts', 'total_counts'], multi_panel=True, jitter=0.4, rotation= 45) -sc.pl.violin(st_adata, ['pct_counts_mt', 'pct_counts_hb'], multi_panel=True, jitter=0.4, rotation= 45) -``` - -The same can be plotted on top of the tissue: - -```{python} -sc.pl.spatial(st_adata, color = ["total_counts", "n_genes_by_counts"]) -sc.pl.spatial(st_adata, color = ["pct_counts_mt", "pct_counts_hb"]) -``` - -## Filtering - -### In-tissue Filtering - -Spots outside the tissue are removed: - -```{python} -# Create a string observation "obs/in_tissue_str" with "In tissue" and "Outside tissue": -st_adata.obs["in_tissue_str"] = ["In tissue" if x == 1 else "Outside tissue" for x in st_adata.obs["in_tissue"]] - -sc.pl.spatial(st_adata, color=["in_tissue_str"], title="Spots in tissue") -del st_adata.obs["in_tissue_str"] - -# Remove spots outside tissue -Number_spots = st_adata.shape[0] -st_adata = st_adata[st_adata.obs["in_tissue"] == 1] -Number_spots_in_tissue = st_adata.shape[0] -print (f"{Number_spots_in_tissue} spots in tissue on {Number_spots} spots in total.") -``` - -Distribution of counts per spots and genes per counts. - -```{python} -fig, axs = plt.subplots(2, 2, figsize=(8, 7)) -p = sns.histplot( - st_adata.obs["total_counts"], kde=True, ax=axs[0, 0] -).set(title=f"Total counts") -p = sns.histplot( - st_adata.obs["total_counts"][st_adata.obs["total_counts"] < histplotQCmaxTotalCounts], - kde=True, - bins=histplotQCbins, - ax=axs[0, 1] -).set(title=f"Total counts < {histplotQCmaxTotalCounts}") -p = sns.histplot( - st_adata.obs["n_genes_by_counts"], - kde=True, - bins=histplotQCbins, - ax=axs[1, 0] -).set(title=f"Genes by counts") -p = sns.histplot( - st_adata.obs["n_genes_by_counts"][st_adata.obs["n_genes_by_counts"] < histplotQCminGeneCounts], - kde=True, - bins=histplotQCbins, - ax=axs[1, 1] -).set(title=f"Genes by counts < {histplotQCminGeneCounts}") -fig.tight_layout() -``` - -_Top row: Distribution of the number of counts per spots._ -_Bottom row: Distribution of the number of genes with at least 1 count in a spot._ - -### Cell and Gene Filtering - -We filter cells based on minimum counts and genes, and filter genes based on minimum cells. - -```{python} -#| warning: false - -# Filter cells based on counts -Number_spots = st_adata.shape[0] -Number_genes = st_adata.shape[1] -sc.pp.filter_cells(st_adata, min_counts=minCounts) -Number_spots_filtered_minCounts = st_adata.shape[0] -print (f"Removed {Number_spots - Number_spots_filtered_minCounts} spots with less than {minCounts} total counts.") - -# Filter cells based on genes -sc.pp.filter_cells(st_adata, min_genes=minGenes) -Number_spots_filtered_minGenes = st_adata.shape[0] -print (f"Removed {Number_spots_filtered_minCounts - Number_spots_filtered_minGenes} spots with less than {minGenes} genes expressed.") - -# Filter genes based on cells -sc.pp.filter_genes(st_adata, min_cells=minCells) -Number_genes_filtered_minCells = st_adata.shape[1] -print (f"Removed {Number_genes - Number_genes_filtered_minCells} genes expressed in less than {minCells} spots.") - -print (f"{Number_spots_filtered_minGenes} out of {Number_spots} spots remaining after filtering.") -print (f"{Number_genes_filtered_minCells} out of {Number_genes} genes remaining after filtering.") - -if (Number_genes_filtered_minCells == 0 or Number_spots_filtered_minGenes == 0): - st_adata = st_adata_before_filtering - display( - Markdown(""" -::: {.callout-important .content-visible when-format="html"} -## Issue: No Spots Remain After Filtering - -An anomaly has been detected in the data: following the filtering process, all spots have been excluded. It is imperative to assess the data quality and carefully review the Analysis Options outlined in `docs/usage.md`. - -To ensure the smooth progression of downstream analysis, the exported AnnData will, for the time being, remain unfiltered. This precautionary measure is implemented to facilitate continued analysis while investigating and resolving the cause of the unexpected removal of all spots during filtering. -:::""" - ) - ) -``` - -Distributions after filtering: - -```{python} -fig, axs = plt.subplots(2, 2, figsize=(8, 7)) -p = sns.histplot( - st_adata.obs["total_counts"], kde=True, ax=axs[0, 0] -).set(title=f"Total counts") -p = sns.histplot( - st_adata.obs["total_counts"][st_adata.obs["total_counts"] < histplotQCmaxTotalCounts], - kde=True, - bins=histplotQCbins, - ax=axs[0, 1] -).set(title=f"Total counts < {histplotQCmaxTotalCounts}") -p = sns.histplot( - st_adata.obs["n_genes_by_counts"], - kde=True, - bins=histplotQCbins, - ax=axs[1, 0] -).set(title=f"Genes by counts") -p = sns.histplot( - st_adata.obs["n_genes_by_counts"][st_adata.obs["n_genes_by_counts"] < histplotQCminGeneCounts], - kde=True, - bins=histplotQCbins, - ax=axs[1, 1] -).set(title=f"Genes by counts < {histplotQCminGeneCounts}") -fig.tight_layout() -``` - -_Top row: Distribution of the number of counts per spots._ -_Bottom row: Distribution of the number of genes with at least 1 count in a spot._ - -```{python} -# Write pre-processed data to files -st_adata.write(nameDataPlain) -print (f"Non-normalized Anndata object saved to data/{nameDataPlain}.") -``` - -### Normalization - -We proceed to normalize Visium counts data using the built-in `normalize_total` method from Scanpy followed by a log-transformation. - -```{python} -sc.pp.normalize_total(st_adata, inplace=True) -sc.pp.log1p(st_adata) -``` - -### Top expressed genes - -```{python} -sc.pl.highest_expr_genes(st_adata, n_top=20) -``` - -### Highly variable genes - -Highly variable genes are detected for subsequent analysis. - -```{python} -sc.pp.highly_variable_genes(st_adata, flavor="seurat", n_top_genes=2000) -var_genes_all = st_adata.var.highly_variable -print("Extracted highly variable genes: %d"%sum(var_genes_all)) -``` - -Plot of highly variable genes dispersion: - -```{python} -# plot the table of highly variable genes -sc.pl.highly_variable_genes(st_adata) -``` - -### Anndata export - -All anndata files saved can be found in the `data` directory. - -```{python} -# Write normalized data to files -st_adata.write(nameDataNorm) -print (f"Normalized Anndata object saved to data/{nameDataPlain}.") -``` diff --git a/bin/st_quality_controls.qmd b/bin/st_quality_controls.qmd new file mode 100644 index 0000000..e209089 --- /dev/null +++ b/bin/st_quality_controls.qmd @@ -0,0 +1,258 @@ +--- +title: "nf-core/spatialtranscriptomics" +subtitle: "Pre-processing and quality controls" +format: + nf-core-html: default +jupyter: python3 +--- + +# Introduction + +Spatial Transcriptomics data analysis involves several steps, including quality +controls (QC) and pre-processing, to ensure the reliability of downstream +analyses. This is an essential step in spatial transcriptomics to +identify and filter out spots and genes that may introduce noise and/or bias +into the analysis. + +This report outlines the QC and pre-processing steps for Visium Spatial +Transcriptomics data using the [AnnData format](https://anndata.readthedocs.io/en/latest/tutorials/notebooks/getting-started.html) +and the [`scanpy` Python package](https://scanpy.readthedocs.io/en/stable/). +The anndata format is utilized to organize and store the Spatial Transcriptomics +data. It includes information about counts, features, observations, and +additional metadata. The anndata format ensures compatibility with various +analysis tools and facilitates seamless integration into existing workflows. + +```{python} +#| tags: [parameters] +#| echo: false +input_adata_raw = "st_adata_raw.h5ad" # Name of the input anndata file +min_counts = 500 # Min counts per spot +min_genes = 250 # Min genes per spot +min_spots = 1 # Min spots per gene +mito_threshold = 20 # Mitochondrial content threshold (%) +ribo_threshold = 0 # Ribosomal content threshold (%) +hb_threshold = 100 # content threshold (%) +output_adata_filtered = "st_adata_filtered.h5ad" # Name of the output anndata file +``` + +```{python} +import scanpy as sc +import scipy +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +from IPython.display import display, Markdown +from textwrap import dedent +plt.rcParams["figure.figsize"] = (6, 6) +``` + +```{python} +# Read the data +st_adata = sc.read("./" + input_adata_raw) + +# Convert X matrix from csr to csc dense matrix for output compatibility: +st_adata.X = scipy.sparse.csc_matrix(st_adata.X) + +# Store the raw data so that it can be used for analyses from scratch if desired +st_adata.layers['raw'] = st_adata.X.copy() + +# Print the anndata object for inspection +print("Content of the AnnData object:") +print(st_adata) +``` + +# Quality controls + +There are several different quality metrics that are normally computed for +spatial data. Common metrics include the number of genes with a least 1 count +(`n_genes_by_counts`), counts per spot (`total_counts`) as well as the +percentage of counts from mitochondrial, ribosomal and haemoglobin genes +(`pct_counts_[mt/ribo/hb]`). + +```{python} +# Calculate mitochondrial, ribosomal and haemoglobin percentages +st_adata.var['mt'] = st_adata.var_names.str.startswith('MT-') +st_adata.var['ribo'] = st_adata.var_names.str.contains(("^RP[LS]")) +st_adata.var['hb'] = st_adata.var_names.str.contains(("^HB[AB]")) +sc.pp.calculate_qc_metrics(st_adata, qc_vars=["mt", "ribo", "hb"], + inplace=True, log1p=False) + +# Save a copy of data as a restore-point if filtering results in 0 spots left +st_adata_before_filtering = st_adata.copy() +``` + +## Violin plots + +The following violin plots show the distribution of the number of genes per +counts and counts per spot, as well as the percentage of counts from +mitochondrial, ribosomal and haemoglobin genes: + +```{python} +#| layout-nrow: 2 +sc.pl.violin(st_adata, ['n_genes_by_counts', 'total_counts'], + multi_panel=True, jitter=0.4, rotation= 45) +sc.pl.violin(st_adata, ['pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb'], + multi_panel=True, jitter=0.4, rotation= 45) +``` + +## Spatial distributions + +The same quality metrics can also be plotted on top of the tissue so that +spatial patterns may be discerned: + +```{python} +#| layout-nrow: 2 +sc.pl.spatial(st_adata, color = ["total_counts", "n_genes_by_counts"], size=1.25) +sc.pl.spatial(st_adata, color = ["pct_counts_mt", "pct_counts_ribo", "pct_counts_hb"], size=1.25) +``` + +## Scatter plots + +It is also useful to compare some of these quality metrics against each other in +scatter plots, such as mitochondrial versus ribosomal content and the total +counts versus the number of genes: + +```{python} +#| layout-ncol: 2 +sc.pl.scatter(st_adata, x='pct_counts_ribo', y='pct_counts_mt') +sc.pl.scatter(st_adata, x='total_counts', y='n_genes_by_counts') +``` + +## Top expressed genes + +It can also be informative to see which genes are the most expressed in the +dataset; the following figure shows the top 20 most expressed genes. + +```{python} +sc.pl.highest_expr_genes(st_adata, n_top=20) +``` + +# Filtering + +## Non-tissue spots + +The following plot indicates which spots are outside of the tissue. These spots +are uninformative and are thus removed. + +```{python} +# Create a string observation "obs/in_tissue_str" with "In tissue" and "Outside tissue": +st_adata.obs["in_tissue_str"] = ["In tissue" if x == 1 else "Outside tissue" for x in st_adata.obs["in_tissue"]] + +# Plot spots inside tissue +sc.pl.spatial(st_adata, color=["in_tissue_str"], title="Spots in tissue", size=1.25) +del st_adata.obs["in_tissue_str"] + +# Remove spots outside tissue and print results +n_spots = st_adata.shape[0] +st_adata = st_adata[st_adata.obs["in_tissue"] == 1] +n_spots_in_tissue = st_adata.shape[0] +Markdown(f"""A total of `{n_spots_in_tissue}` spots are situated inside the +tissue, out of `{n_spots}` spots in total.""") +``` + +## Counts, genes and spots + +We filter spots based on minimum counts and genes, but also filter genes based +on minimum spots; exactly what filtering criteria is reasonable is up to you and +your knowledge of the specific tissue at hand. + +```{python} +#| warning: false +# Filter spots based on counts +n_spots = st_adata.shape[0] +n_genes = st_adata.shape[1] +sc.pp.filter_cells(st_adata, min_counts=min_counts) +n_spots_filtered_min_counts = st_adata.shape[0] + +# Filter spots based on genes +sc.pp.filter_cells(st_adata, min_genes=min_genes) +n_spots_filtered_min_genes = st_adata.shape[0] + +# Filter genes based on spots +sc.pp.filter_genes(st_adata, min_cells=min_spots) +n_genes_filtered_min_spots = st_adata.shape[1] + +# Print results +Markdown(f""" +- Removed `{n_spots - n_spots_filtered_min_counts}` spots with less than `{min_counts}` total counts. +- Removed `{n_spots_filtered_min_counts - n_spots_filtered_min_genes}` spots with less than `{min_genes}` genes expressed. +- Removed `{n_genes - n_genes_filtered_min_spots}` genes expressed in less than `{min_spots}` spots. +""") +``` + +## Mito, ribo and Hb + +We can also filter for mitochondrial, ribosomal and haemoglobin content of the +cells; exactly which filtering threshold should be used are, again, up to you +and your biological knowledge of the sample at hand. Please note that neither +ribosomal nor haemoglobin content is filtered by default. + +```{python} +# Filter spots +st_adata = st_adata[st_adata.obs["pct_counts_mt"] <= mito_threshold] +n_spots_filtered_mito = st_adata.shape[0] +st_adata = st_adata[st_adata.obs["pct_counts_ribo"] >= ribo_threshold] +n_spots_filtered_ribo = st_adata.shape[0] +st_adata = st_adata[st_adata.obs["pct_counts_hb"] <= hb_threshold] +n_spots_filtered_hb = st_adata.shape[0] + +# Print results +Markdown(f""" +- Removed `{st_adata.shape[0] - n_spots_filtered_mito}` spots with more than `{mito_threshold}%` mitochondrial content. +- Removed `{n_spots_filtered_mito - n_spots_filtered_ribo}` spots with less than `{ribo_threshold}%` ribosomal content. +- Removed `{n_spots_filtered_ribo - n_spots_filtered_hb}` spots with more than `{hb_threshold}%` haemoglobin content. +""") +``` + +```{python} +#| echo: false +# Restore non-filtered data if filtering results in 0 spots left +if (st_adata.shape[0] == 0 or st_adata.shape[1] == 0): + st_adata = st_adata_before_filtering + display( + Markdown(dedent( + """ + ::: {.callout-important .content-visible when-format="html"} + ## Issue: no spots remain after filtering + + An anomaly has been detected in the data: following the filtering + process, all spots have been excluded. It is imperative to assess + the data quality and carefully review the values of the filtering + parameters. + + To ensure the smooth progression of downstream analysis, the + exported AnnData will, for the time being, remain unfiltered. This + precautionary measure is implemented to facilitate continued + analysis while investigating and resolving the cause of the + unexpected removal of all spots during filtering. + ::: + """ + )) + ) +``` + +## Filtering results + +```{python} +# Print filtering results +Markdown(f""" +The final results of all the filtering is as follows: + +- A total of `{st_adata.shape[0]}` spots out of `{n_spots}` remain after filtering. +- A total of `{st_adata.shape[1]}` genes out of `{n_genes}` remain after filtering. +""") +``` + +```{python} +#| layout-nrow: 2 +sc.pl.violin(st_adata, ['n_genes_by_counts', 'total_counts'], + multi_panel=True, jitter=0.4, rotation= 45) +sc.pl.violin(st_adata, ['pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb'], + multi_panel=True, jitter=0.4, rotation= 45) +``` + +```{python} +#| echo: false +# Write filtered data to disk +st_adata.write(output_adata_filtered) +``` diff --git a/bin/st_spatial_de.qmd b/bin/st_spatial_de.qmd index d8a407a..82d8e81 100644 --- a/bin/st_spatial_de.qmd +++ b/bin/st_spatial_de.qmd @@ -1,30 +1,20 @@ - - --- -title: "Differential Gene Expression and spatially variable genes" +title: "nf-core/spatialtranscriptomics" +subtitle: "Differential gene expression" format: - html: - embed-resources: true - page-layout: full - code-fold: true + nf-core-html: default jupyter: python3 --- ```{python} #| tags: [parameters] #| echo: false - -fileNameST = "" -numberOfColumns = 5 -saveDEFileName = "" -saveSpatialDEFileName = "" -plotTopHVG = 15 +input_adata_processed = "st_adata_processed.h5ad" +output_spatial_degs = "st_spatial_de.csv" +n_top_spatial_degs = 14 ``` ```{python} -# Load packages import scanpy as sc import pandas as pd import SpatialDE @@ -32,54 +22,78 @@ from matplotlib import pyplot as plt ``` ```{python} -st_adata = sc.read(fileNameST) -st_adata +# Read data +st_adata = sc.read(input_adata_processed) +print("Content of the AnnData object:") +print(st_adata) + +# Fix for scanpy issue https://github.com/scverse/scanpy/issues/2181 +st_adata.uns['log1p']['base'] = None + +# Suppress scanpy-specific warnings +sc.settings.verbosity = 0 ``` -## Differential Gene Expression +# Differential gene expression -```{python} -plt.rcParams["figure.figsize"] = (5, 5) -st_adata.uns['log1p']['base'] = None +Before we look for spatially variable genes we first find differentially +expressed genes across the different clusters found in the data. We can +visualize the top DEGs in a heatmap: +```{python} +#| warning: false sc.tl.rank_genes_groups(st_adata, 'clusters', method='t-test') -sc.pl.rank_genes_groups(st_adata, n_genes=25, sharey=False, gene_symbols="gene_symbol") +sc.pl.rank_genes_groups_heatmap(st_adata, n_genes=5, groupby="clusters") ``` +A different but similar visualization of the DEGs is the dot plot, where we can +also include the gene names: + ```{python} -sc.tl.rank_genes_groups(st_adata, 'clusters', method='wilcoxon') -sc.pl.rank_genes_groups(st_adata, n_genes=25, sharey=False, gene_symbols="gene_symbol") +#| warning: false +sc.pl.rank_genes_groups_dotplot(st_adata, n_genes=5, groupby="clusters") ``` -## Spatially variable genes +::: {.callout-note} +Please note that you may need to scroll sidewise in these figures, as their +height and width depends on the number of clusters as well as the number and +intersection of the DEGs that are being plotted. +::: -Spatial transcriptomics allows researchers to investigate how gene expression trends varies in space, thus identifying spatial patterns of gene expression. For this purpose, we use SpatialDE [Svensson18](https://www.nature.com/articles/nmeth.4636) ([code](https://github.com/Teichlab/SpatialDE)), a Gaussian process-based statistical framework that aims to identify spatially variable genes. +# Spatial gene expression -First, we convert normalized counts and coordinates to pandas dataframe, needed for inputs to spatialDE. +Spatial transcriptomics data can give insight into how genes are expressed in +different areas in a tissue, allowing identification of spatial gene expression +patterns. Here we use [SpatialDE](https://www.nature.com/articles/nmeth.4636) to +identify such patterns. ```{python} +#| output: false results = SpatialDE.run(st_adata.obsm["spatial"], st_adata.to_df()) ``` -We concatenate the results with the DataFrame of annotations of variables: `st_adata.var`. +We can then inspect the spatial DEGs in a table: ```{python} results.set_index("g", inplace=True) # workaround for https://github.com/Teichlab/SpatialDE/issues/36 results = results.loc[~results.index.duplicated(keep="first")] +# Add annotations st_adata.var = pd.concat([st_adata.var, results.loc[st_adata.var.index.values, :]], axis=1) -``` -Then we can inspect significant genes that varies in space and visualize them with `sc.pl.spatial` function. - -```{python} +# Print results table results_tab = st_adata.var.sort_values("qval", ascending=True) -results_tab.to_csv(saveSpatialDEFileName) -results_tab.head(plotTopHVG) +results_tab.to_csv(output_spatial_degs) +results_tab.head(n_top_spatial_degs) ``` +We can also plot the top spatially variable genes on top of the tissue image +itself to visualize the patterns: + ```{python} -symbols = results_tab.iloc[: plotTopHVG]["gene_symbol"] -sc.pl.spatial(st_adata, img_key="hires", color=symbols.index, alpha=0.7, ncols=numberOfColumns, title=symbols) +symbols = results_tab.iloc[: n_top_spatial_degs]["gene_symbol"] +plt.rcParams["figure.figsize"] = (3.5, 4) +sc.pl.spatial(st_adata, img_key="hires", color=symbols.index, alpha=0.7, + ncols=2, title=symbols, size=1.25) ``` diff --git a/conf/modules.config b/conf/modules.config index 6cffaed..30a52b1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -68,7 +68,7 @@ process { ] } - withName: 'ST_READ_DATA|ST_QC_AND_NORMALISATION|ST_CLUSTERING|ST_SPATIAL_DE' { + withName: 'ST_READ_DATA|ST_QUALITY_CONTROLS|ST_CLUSTERING|ST_SPATIAL_DE' { publishDir = [ [ path: { "${params.outdir}/${meta.id}/reports" }, diff --git a/conf/test.config b/conf/test.config index b707385..ce4909c 100644 --- a/conf/test.config +++ b/conf/test.config @@ -25,7 +25,7 @@ params { spaceranger_reference = "https://raw.githubusercontent.com/nf-core/test-datasets/spatialtranscriptomics/testdata/homo_sapiens_chr22_reference.tar.gz" // Parameters - st_preprocess_min_counts = 5 - st_preprocess_min_genes = 3 + st_qc_min_counts = 5 + st_qc_min_genes = 3 outdir = 'results' } diff --git a/conf/test_downstream.config b/conf/test_downstream.config index 971f16b..89a56b2 100644 --- a/conf/test_downstream.config +++ b/conf/test_downstream.config @@ -25,7 +25,7 @@ params { spaceranger_reference = "https://raw.githubusercontent.com/nf-core/test-datasets/spatialtranscriptomics/testdata/homo_sapiens_chr22_reference.tar.gz" // Parameters - st_preprocess_min_counts = 5 - st_preprocess_min_genes = 3 + st_qc_min_counts = 5 + st_qc_min_genes = 3 outdir = 'results' } diff --git a/conf/test_spaceranger_v1.config b/conf/test_spaceranger_v1.config index 67cf02a..2fca10e 100644 --- a/conf/test_spaceranger_v1.config +++ b/conf/test_spaceranger_v1.config @@ -25,7 +25,7 @@ params { spaceranger_reference = "https://raw.githubusercontent.com/nf-core/test-datasets/spatialtranscriptomics/testdata/homo_sapiens_chr22_reference.tar.gz" // Parameters - st_preprocess_min_counts = 5 - st_preprocess_min_genes = 3 + st_qc_min_counts = 5 + st_qc_min_genes = 3 outdir = 'results' } diff --git a/docs/output.md b/docs/output.md index d5623ba..23a88d8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,10 +1,5 @@ # nf-core/spatialtranscriptomics: Output - - - - - ## Introduction This document describes the output produced by the pipeline. Most of the output @@ -70,21 +65,19 @@ the data in an interactive way. ## Reports -### Quality controls and normalisation +### Quality controls and filtering
Output files - `/reports/` - - `st_qc_and_normalisation.html`: HTML report. - - `st_qc_and_normalisation_files/`: Data needed for the HTML report. + - `st_quality_controls.html`: HTML report.
-Report containing analyses related to quality controls, filtering and -normalisation of spatial data. Spots are filtered based on total counts, -number of expressed genes, mitochondrial content as well as presence in tissue; -you can find more details in the report itself. +Report containing analyses related to quality controls and filtering of spatial +data. Spots are filtered based on total counts, number of expressed genes as +well as presence in tissue; you can find more details in the report itself. ### Clustering @@ -93,12 +86,11 @@ you can find more details in the report itself. - `/reports/` - `st_clustering.html`: HTML report. - - `st_clustering_files/`: Data needed for the HTML report. -Report containing analyses related to dimensionality reduction, clustering and -spatial visualisation. Leiden clustering is currently the only clustering +Report containing analyses related to normalisation, dimensionality reduction, +clustering and spatial visualisation. Leiden clustering is currently the only option; you can find more details in the report itself. ### Differential expression @@ -108,9 +100,8 @@ option; you can find more details in the report itself. - `/reports/` - `st_spatial_de.html`: HTML report. - - `st_spatial_de_files/`: Data needed for the HTML report. - `/degs/` - - `st_spatial_de.csv`: List of spatially varying genes. + - `st_spatial_de.csv`: List of spatially differentially expressed genes. diff --git a/docs/usage.md b/docs/usage.md index 13a6921..cdfc0ab 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -114,7 +114,7 @@ gene expression matrices as well as spatial information. ## Space Ranger The pipeline exposes several of Space Ranger's parameters when executing with -raw spatial data. Space Ranger requieres a lot of memory +raw spatial data. Space Ranger requires a lot of memory (64 GB) and several threads (8) to be able to run. You can find the Space Ranger documentation at the [10X website](https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/what-is-space-ranger). @@ -125,34 +125,11 @@ path to its directory (or another link from the 10X website above) using the default human reference for you automatically. > [!NOTE] -> For FFPE and Cytassist experiments, you need to manually supply the appropriate probset using the `--spaceranger_probeset` parameter -> Please refer to the [Spaceranger Downloads page](https://support.10xgenomics.com/spatial-gene-expression/software/downloads/latest) +> For FFPE and Cytassist experiments, you need to manually supply the +> appropriate probeset using the `--spaceranger_probeset` parameter Please refer +> to the [Space Ranger Downloads page](https://support.10xgenomics.com/spatial-gene-expression/software/downloads/latest) > to obtain the correct probeset. -## Analysis options - -The pipeline uses Python and the `scverse` tools to do the downstream analysis -(quality control, filtering, clustering, spatial differential equations). - -### Parameters for Quality Control and Filtering: - -The following parameters are exposed for preprocessing: - -- `--st_preprocess_min_counts`: Minimum number of counts for a spot to be considered in the analysis. -- `--st_preprocess_min_genes`: Minimum number of genes expressed in a spot for the spot to be considered. -- `--st_preprocess_min_cells`: Minimum number of spots expressing a gene for the gene to be considered. -- `--st_preprocess_hist_qc_max_total_counts`: Maximum total counts for the histogram plot in quality control. -- `--st_preprocess_hist_qc_min_gene_counts`: Minimum gene counts for the histogram plot in quality control. -- `--st_preprocess_hist_qc_bins`: Number of bins for the histogram plot in quality control. - -### Parameters for Clustering : - -- `--st_cluster_resolution`: Resolution parameter for the clustering algorithm, controlling granularity. - -### Parameters for Spatial Differential Expression : - -- `st_spatial_de_ncols`: Number of columns in the output figure. - ## Running the pipeline The typical command for running the pipeline is as follows: diff --git a/modules.json b/modules.json index 64ed110..f29c0dc 100644 --- a/modules.json +++ b/modules.json @@ -12,7 +12,7 @@ }, "fastqc": { "branch": "master", - "git_sha": "617777a807a1770f73deb38c80004bac06807eef", + "git_sha": "c9488585ce7bd35ccd2a30faa2371454c8112fb9", "installed_by": ["modules"] }, "multiqc": { @@ -27,7 +27,7 @@ }, "untar": { "branch": "master", - "git_sha": "8e6960947b6647952b97667173805b34b40f25d6", + "git_sha": "e719354ba77df0a1bd310836aa2039b45c29d620", "installed_by": ["modules"] } } diff --git a/modules/local/st_clustering.nf b/modules/local/st_clustering.nf index 0da7fbb..0819c7a 100644 --- a/modules/local/st_clustering.nf +++ b/modules/local/st_clustering.nf @@ -1,9 +1,8 @@ // -// Clustering etc. +// Dimensionality reduction and clustering // process ST_CLUSTERING { - // TODO: Add a better description // TODO: Update Conda directive when Quarto/Pandoc works on ARM64 tag "${meta.id}" @@ -22,7 +21,8 @@ process ST_CLUSTERING { input: path(report) - tuple val(meta), path(st_adata_norm, stageAs: "adata_norm.h5ad") + path(report_template) + tuple val(meta), path(st_adata_filtered) output: tuple val(meta), path("st_adata_processed.h5ad"), emit: st_adata_processed @@ -35,10 +35,10 @@ process ST_CLUSTERING { script: """ quarto render ${report} \ - --output "st_clustering.html" \ - -P fileNameST:${st_adata_norm} \ - -P resolution:${params.st_cluster_resolution} \ - -P saveFileST:st_adata_processed.h5ad + -P input_adata_filtered:${st_adata_filtered} \ + -P cluster_resolution:${params.st_cluster_resolution} \ + -P n_hvgs:${params.st_cluster_n_hvgs} \ + -P output_adata_processed:st_adata_processed.h5ad cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/st_qc_and_normalisation.nf b/modules/local/st_qc_and_normalisation.nf deleted file mode 100644 index e268ffb..0000000 --- a/modules/local/st_qc_and_normalisation.nf +++ /dev/null @@ -1,56 +0,0 @@ -// -// Spatial data pre-processing -// -process ST_QC_AND_NORMALISATION { - - // TODO: Add a better description - // TODO: Update Conda directive when Quarto/Pandoc works on ARM64 - - tag "${meta.id}" - label 'process_low' - - conda "conda-forge::quarto=1.3.353 conda-forge::scanpy=1.9.3 conda-forge::papermill=2.3.4 conda-forge::jupyter=1.0.0" - container "docker.io/erikfas/spatialtranscriptomics" - - // Exit if running this module with -profile conda / -profile mamba on ARM64 - if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - architecture = System.getProperty("os.arch") - if (architecture == "arm64" || architecture == "aarch64") { - exit 1, "The ST_QC_AND_NORMALISATION module does not support Conda on ARM64. Please use Docker / Singularity / Podman instead." - } - } - - input: - path(report) - tuple val(meta), path(st_raw, stageAs: "adata_raw.h5ad") - - output: - tuple val(meta), path("st_adata_norm.h5ad") , emit: st_data_norm - tuple val(meta), path("st_adata_plain.h5ad") , emit: st_data_plain - tuple val(meta), path("st_qc_and_normalisation.html") , emit: html - path("versions.yml") , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - """ - quarto render ${report} \ - --output st_qc_and_normalisation.html \ - -P rawAdata:${st_raw} \ - -P minCounts:${params.st_preprocess_min_counts} \ - -P minGenes:${params.st_preprocess_min_genes} \ - -P minCells:${params.st_preprocess_min_cells} \ - -P histplotQCmaxTotalCounts:${params.st_preprocess_hist_qc_max_total_counts} \ - -P histplotQCminGeneCounts:${params.st_preprocess_hist_qc_min_gene_counts} \ - -P histplotQCbins:${params.st_preprocess_hist_qc_bins} \ - -P nameDataPlain:st_adata_plain.h5ad \ - -P nameDataNorm:st_adata_norm.h5ad - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - quarto: \$(quarto -v) - scanpy: \$(python -c "import scanpy; print(scanpy.__version__)") - END_VERSIONS - """ -} diff --git a/modules/local/st_quality_controls.nf b/modules/local/st_quality_controls.nf new file mode 100644 index 0000000..f52b6bb --- /dev/null +++ b/modules/local/st_quality_controls.nf @@ -0,0 +1,53 @@ +// +// Quality controls and filtering +// +process ST_QUALITY_CONTROLS { + + // TODO: Update Conda directive when Quarto/Pandoc works on ARM64 + + tag "${meta.id}" + label 'process_low' + + conda "conda-forge::quarto=1.3.353 conda-forge::scanpy=1.9.3 conda-forge::papermill=2.3.4 conda-forge::jupyter=1.0.0" + container "docker.io/erikfas/spatialtranscriptomics" + + // Exit if running this module with -profile conda / -profile mamba on ARM64 + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + architecture = System.getProperty("os.arch") + if (architecture == "arm64" || architecture == "aarch64") { + exit 1, "The ST_QUALITY_CONTROLS module does not support Conda on ARM64. Please use Docker / Singularity / Podman instead." + } + } + + input: + path(report) + path(report_template) + tuple val(meta), path(st_adata_raw) + + output: + tuple val(meta), path("st_adata_filtered.h5ad") , emit: st_adata_filtered + tuple val(meta), path("st_quality_controls.html"), emit: html + path("versions.yml") , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + quarto render ${report} \ + -P input_adata_raw:${st_adata_raw} \ + -P min_counts:${params.st_qc_min_counts} \ + -P min_genes:${params.st_qc_min_genes} \ + -P min_spots:${params.st_qc_min_spots} \ + -P mito_threshold:${params.st_qc_mito_threshold} \ + -P ribo_threshold:${params.st_qc_ribo_threshold} \ + -P hb_threshold:${params.st_qc_hb_threshold} \ + -P output_adata_filtered:st_adata_filtered.h5ad + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + quarto: \$(quarto -v) + scanpy: \$(python -c "import scanpy; print(scanpy.__version__)") + END_VERSIONS + """ +} diff --git a/modules/local/st_read_data.nf b/modules/local/st_read_data.nf index d482cc5..a39c9c3 100644 --- a/modules/local/st_read_data.nf +++ b/modules/local/st_read_data.nf @@ -15,7 +15,7 @@ process ST_READ_DATA { tuple val (meta), path("${meta.id}/*") output: - tuple val(meta), path("st_adata_raw.h5ad"), emit: st_raw + tuple val(meta), path("st_adata_raw.h5ad"), emit: st_adata_raw path("versions.yml") , emit: versions when: diff --git a/modules/local/st_spatial_de.nf b/modules/local/st_spatial_de.nf index 6f0f617..249fda1 100644 --- a/modules/local/st_spatial_de.nf +++ b/modules/local/st_spatial_de.nf @@ -3,7 +3,6 @@ // process ST_SPATIAL_DE { - // TODO: Add a better description // TODO: Update Conda directive when Quarto/Pandoc works on ARM64 tag "${meta.id}" @@ -21,12 +20,13 @@ process ST_SPATIAL_DE { } input: path(report) - tuple val(meta), path(st_adata_norm, stageAs: "adata_norm.h5ad") + path(report_template) + tuple val(meta), path(st_adata_processed) output: - tuple val(meta), path("*.csv") , emit: degs - tuple val(meta), path("st_spatial_de.html") , emit: html - path("versions.yml") , emit: versions + tuple val(meta), path("*.csv") , emit: degs + tuple val(meta), path("st_spatial_de.html"), emit: html + path("versions.yml") , emit: versions when: task.ext.when == null || task.ext.when @@ -34,12 +34,9 @@ process ST_SPATIAL_DE { script: """ quarto render ${report} \ - --output "st_spatial_de.html" \ - -P fileNameST:${st_adata_norm} \ - -P numberOfColumns:${params.st_spatial_de_ncols} \ - -P plotTopHVG:${params.st_spatial_de_top_hvg} \ - -P saveDEFileName:st_gde.csv \ - -P saveSpatialDEFileName:st_spatial_de.csv + -P input_adata_processed:${st_adata_processed} \ + -P n_top_spatial_degs:${params.st_n_top_spatial_degs} \ + -P output_spatial_degs:st_spatial_de.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test index ad9bc54..1f21c66 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ b/modules/nf-core/fastqc/tests/main.nf.test @@ -13,12 +13,10 @@ nextflow_process { when { process { """ - input[0] = [ - [ id: 'test', single_end:true ], - [ - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) - ] - ] + input[0] = Channel.of([ + [ id: 'test', single_end:true ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) """ } } @@ -44,15 +42,13 @@ nextflow_process { when { process { - """ - input[0] = [ - [id: 'test', single_end: false], // meta map - [ - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) - ] - ] - """ + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) + """ } } @@ -76,11 +72,11 @@ nextflow_process { when { process { - """ - input[0] = [ - [id: 'test', single_end: false], // meta map - file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) - ] + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) + ]) """ } } @@ -102,12 +98,12 @@ nextflow_process { when { process { - """ - input[0] = [ - [id: 'test', single_end: false], // meta map - file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) - ] - """ + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ } } @@ -128,17 +124,15 @@ nextflow_process { when { process { - """ - input[0] = [ - [id: 'test', single_end: false], // meta map - [ - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) - ] - ] - """ + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true) ] + ]) + """ } } @@ -168,12 +162,12 @@ nextflow_process { when { process { - """ - input[0] = [ - [ id:'mysample', single_end:true ], // meta map - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) - ] - """ + """ + input[0] = Channel.of([ + [ id:'mysample', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ]) + """ } } @@ -197,12 +191,10 @@ nextflow_process { when { process { """ - input[0] = [ - [ id: 'test', single_end:true ], - [ - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) - ] - ] + input[0] = Channel.of([ + [ id: 'test', single_end:true ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) """ } } diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap index 5ef5afb..5d624bb 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test.snap +++ b/modules/nf-core/fastqc/tests/main.nf.test.snap @@ -7,7 +7,7 @@ "versions.yml:md5,e1cc25ca8af856014824abd842e93978" ] ], - "timestamp": "2023-12-29T02:48:05.126117287" + "timestamp": "2024-01-17T18:40:57.254299" }, "versions": { "content": [ @@ -15,6 +15,6 @@ "versions.yml:md5,e1cc25ca8af856014824abd842e93978" ] ], - "timestamp": "2023-12-29T02:46:49.507942667" + "timestamp": "2024-01-17T18:36:50.033627" } } \ No newline at end of file diff --git a/modules/nf-core/untar/tests/main.nf.test b/modules/nf-core/untar/tests/main.nf.test index f57234f..679e83c 100644 --- a/modules/nf-core/untar/tests/main.nf.test +++ b/modules/nf-core/untar/tests/main.nf.test @@ -16,7 +16,7 @@ nextflow_process { } process { """ - input[0] = [ [], file(params.testdata_base_path + 'modules/genomics/sarscov2/genome/db/kraken2.tar.gz', checkIfExists: true) ] + input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kraken2.tar.gz', checkIfExists: true) ] """ } } @@ -38,7 +38,7 @@ nextflow_process { } process { """ - input[0] = [ [], file(params.testdata_base_path + 'modules/generic/tar/hello.tar.gz', checkIfExists: true) ] + input[0] = [ [], file(params.modules_testdata_base_path + 'generic/tar/hello.tar.gz', checkIfExists: true) ] """ } } diff --git a/nextflow.config b/nextflow.config index 58fb589..a8212c9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,20 +17,20 @@ params { spaceranger_probeset = null spaceranger_save_reference = false - // Preprocessing, QC and normalisation - st_preprocess_min_counts = 500 - st_preprocess_min_genes = 250 - st_preprocess_min_cells = 1 - st_preprocess_hist_qc_max_total_counts = 10000 - st_preprocess_hist_qc_min_gene_counts = 4000 - st_preprocess_hist_qc_bins = 40 + // Quality controls and filtering + st_qc_min_counts = 500 + st_qc_min_genes = 250 + st_qc_min_spots = 1 + st_qc_mito_threshold = 20 + st_qc_ribo_threshold = 0 + st_qc_hb_threshold = 100 // Clustering - st_cluster_resolution = 1 + st_cluster_n_hvgs = 2000 + st_cluster_resolution = 1 // Spatial differential expression - st_spatial_de_top_hvg = 15 - st_spatial_de_ncols = 5 + st_n_top_spatial_degs = 14 // MultiQC options multiqc_config = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 8fce53e..5db2fe4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -95,41 +95,48 @@ "fa_icon": "fas fa-magnifying-glass-chart", "description": "Options related to the downstream analyses performed by the pipeline.", "properties": { - "st_preprocess_min_counts": { + "st_qc_min_counts": { "type": "integer", "default": 500, "description": "The minimum number of UMIs needed in a spot for that spot to pass the filtering.", "fa_icon": "fas fa-hashtag" }, - "st_preprocess_min_genes": { + "st_qc_min_genes": { "type": "integer", "default": 250, "description": "The minimum number of expressed genes in a spot needed for that spot to pass the filtering.", "fa_icon": "fas fa-hashtag" }, - "st_preprocess_min_cells": { + "st_qc_min_spots": { "type": "integer", "default": 1, "description": "The minimum number of spots in which a gene is expressed for that gene to pass the filtering.", "fa_icon": "fas fa-hashtag" }, - "st_preprocess_hist_qc_max_total_counts": { - "type": "integer", - "default": 10000, - "description": "Max total counts cutoff for histogram QC plot.", + "st_qc_mito_threshold": { + "type": "number", + "default": 20, + "description": "The maximum proportion of mitochondrial content that a spot is allowed to have to pass the filtering.", + "help_text": "If you do not wish to filter based on mitochondrial content, set this parameter to `100`.", "fa_icon": "fas fa-hashtag" }, - "st_preprocess_hist_qc_min_gene_counts": { - "type": "integer", - "default": 4000, - "description": "Min total gene counts cutoff for histogram QC plot.", + "st_qc_ribo_threshold": { + "type": "number", + "default": 0, + "description": "The minimum proportion of ribosomal content that a spot is needs to have to pass the filtering (no filtering is done by defeault).", "fa_icon": "fas fa-hashtag" }, - "st_preprocess_hist_qc_bins": { + "st_qc_hb_threshold": { + "type": "number", + "default": 100, + "description": "The maximum proportion of haemoglobin content that a spot is allowed to have to pass the filtering (no filtering is done by defeault).", + "fa_icon": "fas fa-hashtag" + }, + "st_cluster_n_hvgs": { "type": "integer", - "default": 40, - "description": "The number of bins for the QC histogram plots.", - "fa_icon": "fas fa-chart-simple" + "default": 2000, + "description": "The number of top highly variable genes to use for the analyses.", + "fa_icon": "fas fa-hashtag" }, "st_cluster_resolution": { "type": "number", @@ -138,17 +145,10 @@ "help_text": "The resolution controls the coarseness of the clustering, where a higher resolution leads to more clusters.", "fa_icon": "fas fa-circle-nodes" }, - "st_spatial_de_top_hvg": { - "type": "integer", - "default": 15, - "description": "The number of top spatially highly variable genes to plot.", - "fa_icon": "fas fa-hashtag" - }, - "st_spatial_de_ncols": { + "st_n_top_spatial_degs": { "type": "integer", - "default": 5, - "description": "Number of columns to group gene plots into.", - "help_text": "The default, 5, will plot the top spatially highly variable genes into groups of 5 plots per row. This, in combinationation with the default number of top HVGs to plot (15) will yield three rows with 5 plots each.", + "default": 14, + "description": "The number of top spatial differentially expressed genes to plot.", "fa_icon": "fas fa-hashtag" } } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 8d9630d..baff0e8 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -107,8 +107,6 @@ def create_channel_spaceranger(LinkedHashMap meta) { if(!fastq_files.size()) { error "No `fastq_dir` specified or no samples found in folder." - } else { - log.info "${fastq_files.size()} FASTQ files found for sample ${meta['id']}." } check_optional_files = ["manual_alignment", "slidefile", "image", "cytaimage", "colorizedimage", "darkimage"] diff --git a/subworkflows/local/st_downstream.nf b/subworkflows/local/st_downstream.nf new file mode 100644 index 0000000..91b5e74 --- /dev/null +++ b/subworkflows/local/st_downstream.nf @@ -0,0 +1,67 @@ +// +// Subworkflow for downstream analyses of ST data +// + +include { ST_QUALITY_CONTROLS } from '../../modules/local/st_quality_controls' +include { ST_SPATIAL_DE } from '../../modules/local/st_spatial_de' +include { ST_CLUSTERING } from '../../modules/local/st_clustering' + +workflow ST_DOWNSTREAM { + + take: + st_adata_raw + + main: + + ch_versions = Channel.empty() + + // + // Report files + // + report_quality_controls = file("${projectDir}/bin/st_quality_controls.qmd") + report_clustering = file("${projectDir}/bin/st_clustering.qmd") + report_spatial_de = file("${projectDir}/bin/st_spatial_de.qmd") + report_template = Channel.fromPath("${projectDir}/assets/_extensions").collect() + + // + // Quality controls and filtering + // + ST_QUALITY_CONTROLS ( + report_quality_controls, + report_template, + st_adata_raw + ) + ch_versions = ch_versions.mix(ST_QUALITY_CONTROLS.out.versions) + + // + // Normalisation, dimensionality reduction and clustering + // + ST_CLUSTERING ( + report_clustering, + report_template, + ST_QUALITY_CONTROLS.out.st_adata_filtered + ) + ch_versions = ch_versions.mix(ST_CLUSTERING.out.versions) + + // + // Spatial differential expression + // + ST_SPATIAL_DE ( + report_spatial_de, + report_template, + ST_CLUSTERING.out.st_adata_processed + ) + ch_versions = ch_versions.mix(ST_SPATIAL_DE.out.versions) + + emit: + st_data_norm = ST_QUALITY_CONTROLS.out.st_adata_filtered // channel: [ meta, h5ad ] + html = ST_QUALITY_CONTROLS.out.html // channel: [ html ] + + st_adata_processed = ST_CLUSTERING.out.st_adata_processed // channel: [ meta, h5ad] + html = ST_CLUSTERING.out.html // channel: [ html ] + + degs = ST_SPATIAL_DE.out.degs // channel: [ meta, csv ] + html = ST_SPATIAL_DE.out.html // channel: [ html ] + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/st_postprocess.nf b/subworkflows/local/st_postprocess.nf deleted file mode 100644 index defa6d6..0000000 --- a/subworkflows/local/st_postprocess.nf +++ /dev/null @@ -1,45 +0,0 @@ -// -// Spatial differential expression and reporting -// - -include { ST_SPATIAL_DE } from '../../modules/local/st_spatial_de' -include { ST_CLUSTERING } from '../../modules/local/st_clustering' - -workflow ST_POSTPROCESS { - - take: - st_adata_norm - - main: - - ch_versions = Channel.empty() - - // - // Report files - // - report_clustering = file("${projectDir}/bin/st_clustering.qmd") - report_spatial_de = file("${projectDir}/bin/st_spatial_de.qmd") - - // - // Clustering - // - ST_CLUSTERING ( - report_clustering, - st_adata_norm - ) - ch_versions = ch_versions.mix(ST_CLUSTERING.out.versions) - - // - // Spatial differential expression - // - ST_SPATIAL_DE ( - report_spatial_de, - ST_CLUSTERING.out.st_adata_processed - ) - ch_versions = ch_versions.mix(ST_SPATIAL_DE.out.versions) - - emit: - spatial_degs = ST_SPATIAL_DE.out.degs // channel: [ val(sample), csv ] - - versions = ch_versions // channel: [ versions.yml ] -} diff --git a/subworkflows/local/st_preprocess.nf b/subworkflows/local/st_preprocess.nf deleted file mode 100644 index 3827392..0000000 --- a/subworkflows/local/st_preprocess.nf +++ /dev/null @@ -1,35 +0,0 @@ -// -// Pre-processing of ST data -// - -include { ST_QC_AND_NORMALISATION } from '../../modules/local/st_qc_and_normalisation' - -workflow ST_PREPROCESS { - - take: - st_raw - - main: - - ch_versions = Channel.empty() - - // - // Report files - // - report = file("${projectDir}/bin/st_qc_and_normalisation.qmd") - - // - // Spatial pre-processing - // - ST_QC_AND_NORMALISATION ( - report, - st_raw - ) - ch_versions = ch_versions.mix(ST_QC_AND_NORMALISATION.out.versions) - - emit: - st_data_norm = ST_QC_AND_NORMALISATION.out.st_data_norm // channel: [ val(sample), h5ad ] - st_data_plain = ST_QC_AND_NORMALISATION.out.st_data_plain // channel: [ val(sample), h5ad ] - - versions = ch_versions // channel: [ version.yml ] -} diff --git a/tests/pipeline/test_downstream.nf.test b/tests/pipeline/test_downstream.nf.test index 11d9d3a..dbfc38a 100644 --- a/tests/pipeline/test_downstream.nf.test +++ b/tests/pipeline/test_downstream.nf.test @@ -12,8 +12,8 @@ nextflow_pipeline { spaceranger_reference = "https://raw.githubusercontent.com/nf-core/test-datasets/spatialtranscriptomics/testdata/homo_sapiens_chr22_reference.tar.gz" // Parameters - st_preprocess_min_counts = 5 - st_preprocess_min_genes = 3 + st_qc_min_counts = 5 + st_qc_min_genes = 3 outdir = "$outputDir" } } @@ -26,14 +26,19 @@ nextflow_pipeline { // Data { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2/data/st_adata_processed.h5ad").exists() }, + { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2.2/data/st_adata_processed.h5ad").exists() }, // Reports - { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2/reports/st_qc_and_normalisation.html").text.contains("Distributions after filtering") }, - { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2/reports/st_clustering.html").text.contains("AnnData object is saved in the") }, - { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2/reports/st_spatial_de.html").text.contains("Spatially variable genes") }, + { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2/reports/st_quality_controls.html").text.contains("final results of all the filtering") }, + { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2/reports/st_clustering.html").text.contains("spatial distribution of clusters") }, + { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2/reports/st_spatial_de.html").text.contains("plot the top spatially variable genes") }, + { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2.2/reports/st_quality_controls.html").text.contains("final results of all the filtering") }, + { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2.2/reports/st_clustering.html").text.contains("spatial distribution of clusters") }, + { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2.2/reports/st_spatial_de.html").text.contains("plot the top spatially variable genes") }, // DEGs { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2/degs/st_spatial_de.csv").exists() }, + { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2.2/degs/st_spatial_de.csv").exists() }, // MultiQC { assert file("$outputDir/multiqc/multiqc_report.html").exists() } diff --git a/tests/pipeline/test_downstream.nf.test.snap b/tests/pipeline/test_downstream.nf.test.snap index 45bd56e..75f12d8 100644 --- a/tests/pipeline/test_downstream.nf.test.snap +++ b/tests/pipeline/test_downstream.nf.test.snap @@ -1,8 +1,8 @@ { "software_versions": { "content": [ - "{CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.7, yaml=5.4.1}, SPACERANGER_UNTAR_REFERENCE={untar=1.30}, ST_CLUSTERING={quarto=1.3.302, scanpy=1.9.3}, ST_QC_AND_NORMALISATION={quarto=1.3.302, scanpy=1.9.3}, ST_READ_DATA={scanpy=1.7.2}, ST_SPATIAL_DE={SpatialDE=1.1.3, leidenalg=0.9.1, quarto=1.3.302, scanpy=1.9.3}, Workflow={nf-core/spatialtranscriptomics=1.0dev}}" + "{CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.7, yaml=5.4.1}, SPACERANGER_UNTAR_REFERENCE={untar=1.30}, ST_CLUSTERING={quarto=1.3.302, scanpy=1.9.3}, ST_QUALITY_CONTROLS={quarto=1.3.302, scanpy=1.9.3}, ST_READ_DATA={scanpy=1.7.2}, ST_SPATIAL_DE={SpatialDE=1.1.3, leidenalg=0.9.1, quarto=1.3.302, scanpy=1.9.3}, Workflow={nf-core/spatialtranscriptomics=1.0dev}}" ], "timestamp": "2024-01-15T16:00:03.485826" } -} \ No newline at end of file +} diff --git a/tests/pipeline/test_spaceranger_ffpe_v1.nf.test b/tests/pipeline/test_spaceranger_ffpe_v1.nf.test index 0421158..fc02eaf 100644 --- a/tests/pipeline/test_spaceranger_ffpe_v1.nf.test +++ b/tests/pipeline/test_spaceranger_ffpe_v1.nf.test @@ -9,8 +9,8 @@ nextflow_pipeline { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/spatialtranscriptomics/testdata/human-ovarian-cancer-1-standard_v1_ffpe/samplesheet_spaceranger.csv' spaceranger_probeset = 'https://raw.githubusercontent.com/nf-core/test-datasets/spatialtranscriptomics/testdata/human-ovarian-cancer-1-standard_v1_ffpe/Visium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv' spaceranger_reference = "https://raw.githubusercontent.com/nf-core/test-datasets/spatialtranscriptomics/testdata/homo_sapiens_chr22_reference.tar.gz" - st_preprocess_min_counts = 5 - st_preprocess_min_genes = 3 + st_qc_min_counts = 5 + st_qc_min_genes = 3 outdir = "$outputDir" } } @@ -26,9 +26,9 @@ nextflow_pipeline { { assert file("$outputDir/Visium_FFPE_Human_Ovarian_Cancer/data/st_adata_processed.h5ad").exists() }, // Reports - { assert file("$outputDir/Visium_FFPE_Human_Ovarian_Cancer/reports/st_qc_and_normalisation.html").text.contains("Distributions after filtering") }, - { assert file("$outputDir/Visium_FFPE_Human_Ovarian_Cancer/reports/st_clustering.html").text.contains("AnnData object is saved in the") }, - { assert file("$outputDir/Visium_FFPE_Human_Ovarian_Cancer/reports/st_spatial_de.html").text.contains("Spatially variable genes") }, + { assert file("$outputDir/Visium_FFPE_Human_Ovarian_Cancer/reports/st_quality_controls.html").text.contains("final results of all the filtering") }, + { assert file("$outputDir/Visium_FFPE_Human_Ovarian_Cancer/reports/st_clustering.html").text.contains("spatial distribution of clusters") }, + { assert file("$outputDir/Visium_FFPE_Human_Ovarian_Cancer/reports/st_spatial_de.html").text.contains("plot the top spatially variable genes") }, // DEGs { assert file("$outputDir/Visium_FFPE_Human_Ovarian_Cancer/degs/st_spatial_de.csv").exists() }, diff --git a/tests/pipeline/test_spaceranger_ffpe_v1.nf.test.snap b/tests/pipeline/test_spaceranger_ffpe_v1.nf.test.snap index 8be642f..441c711 100644 --- a/tests/pipeline/test_spaceranger_ffpe_v1.nf.test.snap +++ b/tests/pipeline/test_spaceranger_ffpe_v1.nf.test.snap @@ -1,8 +1,8 @@ { "software_versions": { "content": [ - "{CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.7, yaml=5.4.1}, FASTQC={fastqc=0.12.1}, SPACERANGER_COUNT={spaceranger=2.1.0}, SPACERANGER_UNTAR_REFERENCE={untar=1.30}, ST_CLUSTERING={quarto=1.3.302, scanpy=1.9.3}, ST_QC_AND_NORMALISATION={quarto=1.3.302, scanpy=1.9.3}, ST_READ_DATA={scanpy=1.7.2}, ST_SPATIAL_DE={SpatialDE=1.1.3, leidenalg=0.9.1, quarto=1.3.302, scanpy=1.9.3}, Workflow={nf-core/spatialtranscriptomics=1.0dev}}" + "{CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.7, yaml=5.4.1}, FASTQC={fastqc=0.12.1}, SPACERANGER_COUNT={spaceranger=2.1.0}, SPACERANGER_UNTAR_REFERENCE={untar=1.30}, ST_CLUSTERING={quarto=1.3.302, scanpy=1.9.3}, ST_QUALITY_CONTROLS={quarto=1.3.302, scanpy=1.9.3}, ST_READ_DATA={scanpy=1.7.2}, ST_SPATIAL_DE={SpatialDE=1.1.3, leidenalg=0.9.1, quarto=1.3.302, scanpy=1.9.3}, Workflow={nf-core/spatialtranscriptomics=1.0dev}}" ], "timestamp": "2024-01-15T13:44:40.789425" } -} \ No newline at end of file +} diff --git a/tests/pipeline/test_spaceranger_ffpe_v2_cytassist.nf.test b/tests/pipeline/test_spaceranger_ffpe_v2_cytassist.nf.test index e9ee204..ab7a967 100644 --- a/tests/pipeline/test_spaceranger_ffpe_v2_cytassist.nf.test +++ b/tests/pipeline/test_spaceranger_ffpe_v2_cytassist.nf.test @@ -22,9 +22,9 @@ nextflow_pipeline { { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2/data/st_adata_processed.h5ad").exists() }, // Reports - { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2/reports/st_qc_and_normalisation.html").text.contains("Distributions after filtering") }, - { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2/reports/st_clustering.html").text.contains("AnnData object is saved in the") }, - { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2/reports/st_spatial_de.html").text.contains("Spatially variable genes") }, + { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2/reports/st_quality_controls.html").text.contains("final results of all the filtering") }, + { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2/reports/st_clustering.html").text.contains("spatial distribution of clusters") }, + { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2/reports/st_spatial_de.html").text.contains("plot the top spatially variable genes") }, // DEGs { assert file("$outputDir/CytAssist_11mm_FFPE_Human_Glioblastoma_2/degs/st_spatial_de.csv").exists() }, diff --git a/tests/pipeline/test_spaceranger_ffpe_v2_cytassist.nf.test.snap b/tests/pipeline/test_spaceranger_ffpe_v2_cytassist.nf.test.snap index 25954a9..1dad690 100644 --- a/tests/pipeline/test_spaceranger_ffpe_v2_cytassist.nf.test.snap +++ b/tests/pipeline/test_spaceranger_ffpe_v2_cytassist.nf.test.snap @@ -1,8 +1,8 @@ { "software_versions": { "content": [ - "{CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.7, yaml=5.4.1}, FASTQC={fastqc=0.12.1}, SPACERANGER_COUNT={spaceranger=2.1.0}, SPACERANGER_UNTAR_REFERENCE={untar=1.30}, ST_CLUSTERING={quarto=1.3.302, scanpy=1.9.3}, ST_QC_AND_NORMALISATION={quarto=1.3.302, scanpy=1.9.3}, ST_READ_DATA={scanpy=1.7.2}, ST_SPATIAL_DE={SpatialDE=1.1.3, leidenalg=0.9.1, quarto=1.3.302, scanpy=1.9.3}, Workflow={nf-core/spatialtranscriptomics=1.0dev}}" + "{CUSTOM_DUMPSOFTWAREVERSIONS={python=3.11.7, yaml=5.4.1}, FASTQC={fastqc=0.12.1}, SPACERANGER_COUNT={spaceranger=2.1.0}, SPACERANGER_UNTAR_REFERENCE={untar=1.30}, ST_CLUSTERING={quarto=1.3.302, scanpy=1.9.3}, ST_QUALITY_CONTROLS={quarto=1.3.302, scanpy=1.9.3}, ST_READ_DATA={scanpy=1.7.2}, ST_SPATIAL_DE={SpatialDE=1.1.3, leidenalg=0.9.1, quarto=1.3.302, scanpy=1.9.3}, Workflow={nf-core/spatialtranscriptomics=1.0dev}}" ], "timestamp": "2024-01-15T15:42:52.651007" } -} \ No newline at end of file +} diff --git a/workflows/spatialtranscriptomics.nf b/workflows/spatialtranscriptomics.nf index 76757da..c3c34f9 100644 --- a/workflows/spatialtranscriptomics.nf +++ b/workflows/spatialtranscriptomics.nf @@ -58,8 +58,7 @@ include { ST_READ_DATA } from '../modules/local/st_read_data' // include { INPUT_CHECK } from '../subworkflows/local/input_check' include { SPACERANGER } from '../subworkflows/local/spaceranger' -include { ST_PREPROCESS } from '../subworkflows/local/st_preprocess' -include { ST_POSTPROCESS } from '../subworkflows/local/st_postprocess' +include { ST_DOWNSTREAM } from '../subworkflows/local/st_downstream' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -122,20 +121,12 @@ workflow ST { ch_versions = ch_versions.mix(ST_READ_DATA.out.versions) // - // SUBWORKFLOW: Pre-processing of ST data + // SUBWORKFLOW: Downstream analyses of ST data // - ST_PREPROCESS ( - ST_READ_DATA.out.st_raw + ST_DOWNSTREAM ( + ST_READ_DATA.out.st_adata_raw ) - ch_versions = ch_versions.mix(ST_PREPROCESS.out.versions) - - // - // SUBWORKFLOW: Post-processing and reporting - // - ST_POSTPROCESS ( - ST_PREPROCESS.out.st_data_norm - ) - ch_versions = ch_versions.mix(ST_POSTPROCESS.out.versions) + ch_versions = ch_versions.mix(ST_DOWNSTREAM.out.versions) // // MODULE: Pipeline reporting