Merge pull request #88 from nf-core/multiqc-custom-content

Add quality control metrics as MultiQC custom content
nf-core · Oct 11, 2024 · a4abaad · a4abaad
2 parents 7c25f5f + ae0a9a8
commit a4abaad
Show file tree

Hide file tree

Showing 9 changed files with 120 additions and 28 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,7 @@ compatible with further downstream analyses and/or exploration in _e.g._
 
 ### `Added`
 
+- Add quality control metrics as custom MultiQC content [[#88](https://github.com/nf-core/spatialvi/pull/88)]
 - Add MultiQC support for Space Ranger outputs [[#70](https://github.com/nf-core/spatialvi/pull/70)]
 - Use the QUARTONOTEBOOK nf-core module instead of local Quarto-based modules [[#68](https://github.com/nf-core/spatialvi/pull/68)]
 - Add support for SpatialData [[#67](https://github.com/nf-core/spatialvi/pull/67)]

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -5,11 +5,24 @@ report_comment: >
 report_section_order:
   "nf-core-spatialvi-methods-description":
     order: -1000
-  software_versions:
+  "custom_data":
     order: -1001
-  "nf-core-spatialvi-summary":
+  software_versions:
     order: -1002
+  "nf-core-spatialvi-summary":
+    order: -1003
 
 export_plots: true
 
 disable_version_detection: true
+
+custom_data:
+  quality_controls:
+    file_format: csv
+    section_name: Quality controls
+    description: Quality control metrics from the report analyses.
+    plot_type: table
+
+sp:
+  quality_controls:
+    fn: "mqc_*.csv"
diff --git a/bin/quality_controls.qmd b/bin/quality_controls.qmd
@@ -173,11 +173,13 @@ sc.pl.spatial(adata, color=["in_tissue_str"], title="Spots in tissue", size=1.25
 del adata.obs["in_tissue_str"]
 
 # Remove spots outside tissue and print results
-n_spots = adata.shape[0]
+n_total_spots = adata.shape[0]
 adata = adata[adata.obs["in_tissue"] == 1]
 n_spots_in_tissue = adata.shape[0]
-Markdown(f"""A total of `{n_spots_in_tissue}` spots are situated inside the
-tissue, out of `{n_spots}` spots in total.""")
+n_spots_outside_tissue = n_total_spots - n_spots_in_tissue
+Markdown(f"""
+- Removed `{n_spots_outside_tissue}` spots outside the tissue.
+""")
 ```
 
 ## Counts, genes and spots
@@ -189,24 +191,28 @@ your knowledge of the specific tissue at hand.
 ```{python}
 #| warning: false
 # Filter spots based on counts
-n_spots = adata.shape[0]
-n_genes = adata.shape[1]
+n_current_spots = adata.shape[0]
 sc.pp.filter_cells(adata, min_counts=min_counts)
-n_spots_filtered_min_counts = adata.shape[0]
+n_spots_filtered_min_counts = n_current_spots - adata.shape[0]
 
 # Filter spots based on genes
+n_current_spots = adata.shape[0]
 sc.pp.filter_cells(adata, min_genes=min_genes)
-n_spots_filtered_min_genes = adata.shape[0]
+n_spots_filtered_min_genes = n_current_spots - adata.shape[0]
 
 # Filter genes based on spots
+n_total_genes = adata.shape[1]
 sc.pp.filter_genes(adata, min_cells=min_spots)
-n_genes_filtered_min_spots = adata.shape[1]
+n_genes_filtered_min_spots = n_total_genes - adata.shape[1]
 
 # Print results
 Markdown(f"""
-- Removed `{n_spots - n_spots_filtered_min_counts}` spots with less than `{min_counts}` total counts.
-- Removed `{n_spots_filtered_min_counts - n_spots_filtered_min_genes}` spots with less than `{min_genes}` genes expressed.
-- Removed `{n_genes - n_genes_filtered_min_spots}` genes expressed in less than `{min_spots}` spots.
+- Removed `{n_spots_filtered_min_counts}` spots with less than `{min_counts}`
+total counts.
+- Removed `{n_spots_filtered_min_genes}` spots with less than `{min_genes}`
+genes expressed.
+- Removed `{n_genes_filtered_min_spots}` genes expressed in less than
+`{min_spots}` spots.
 """)
 ```
 
@@ -219,18 +225,21 @@ ribosomal nor haemoglobin content is filtered by default.
 
 ```{python}
 # Filter spots
+n_current_spots = adata.shape[0]
 adata = adata[adata.obs["pct_counts_mt"] <= mito_threshold]
-n_spots_filtered_mito = adata.shape[0]
+n_spots_filtered_mito = n_current_spots - adata.shape[0]
+n_current_spots = adata.shape[0]
 adata = adata[adata.obs["pct_counts_ribo"] >= ribo_threshold]
-n_spots_filtered_ribo = adata.shape[0]
+n_spots_filtered_ribo = n_current_spots - adata.shape[0]
+n_current_spots = adata.shape[0]
 adata = adata[adata.obs["pct_counts_hb"] <= hb_threshold]
-n_spots_filtered_hb = adata.shape[0]
+n_spots_filtered_hb = n_current_spots - adata.shape[0]
 
 # Print results
 Markdown(f"""
-- Removed `{adata.shape[0] - n_spots_filtered_mito}` spots with more than `{mito_threshold}%` mitochondrial content.
-- Removed `{n_spots_filtered_mito - n_spots_filtered_ribo}` spots with less than `{ribo_threshold}%` ribosomal content.
-- Removed `{n_spots_filtered_ribo - n_spots_filtered_hb}` spots with more than `{hb_threshold}%` haemoglobin content.
+- Removed `{n_spots_filtered_mito}` spots with more than `{mito_threshold}%` mitochondrial content.
+- Removed `{n_spots_filtered_ribo}` spots with less than `{ribo_threshold}%` ribosomal content.
+- Removed `{n_spots_filtered_hb}` spots with more than `{hb_threshold}%` haemoglobin content.
 """)
 ```
 
@@ -265,11 +274,15 @@ if (adata.shape[0] == 0 or adata.shape[1] == 0):
 
 ```{python}
 # Print filtering results
+n_remaining_spots = adata.shape[0]
+n_remaining_genes = adata.shape[1]
+n_spots_filtered = n_total_spots - n_remaining_spots
+n_genes_filtered = n_total_genes - n_remaining_genes
 Markdown(f"""
 The final results of all the filtering is as follows:
 
-- A total of `{adata.shape[0]}` spots out of `{n_spots}` remain after filtering.
-- A total of `{adata.shape[1]}` genes out of `{n_genes}` remain after filtering.
+- A total of `{n_remaining_spots}` spots out of `{n_total_spots}` remain after filtering.
+- A total of `{n_remaining_genes}` genes out of `{n_total_genes}` remain after filtering.
 """)
 ```
 
@@ -286,3 +299,26 @@ del sdata.tables["table"]
 sdata.tables["table"] = adata
 sdata.write(os.path.join(artifact_dir, output_sdata))
 ```
+
+```{python}
+#| echo: false
+# Write QC metrics to file for MultiQC aggregation
+mqc_dict = {
+    'sample': [meta['id']],
+    'total_spots': [n_total_spots],
+    'spots_filtered': [n_spots_filtered],
+    'spots_remaining': [n_remaining_spots],
+    'spots_filtered_outside_tissue': [n_spots_outside_tissue],
+    'spots_filtered_total_counts': [n_spots_filtered_min_counts],
+    'spots_filtered_genes_expressed': [n_spots_filtered_min_genes],
+    'spots_filtered_mito_content': [n_spots_filtered_mito],
+    'spots_filtered_ribo_content': [n_spots_filtered_ribo],
+    'spots_filtered_hb_content': [n_spots_filtered_hb],
+    'total_genes': [n_total_genes],
+    'genes_filtered': [n_genes_filtered],
+    'genes_remaining': [n_remaining_genes]
+}
+mqc_data = pd.DataFrame(mqc_dict)
+mqc_name = 'artifacts/mqc_quality_controls_' + meta['id'] + '.csv'
+mqc_data.to_csv(mqc_name, index=False)
+```
diff --git a/conf/modules.config b/conf/modules.config
@@ -82,6 +82,12 @@ process {
                 pattern: "artifacts/adata_processed.h5ad",
                 saveAs: { "adata_processed.h5ad" }
             ],
+            [
+                path: { "${params.outdir}/${meta.id}/data" },
+                mode: params.publish_dir_mode,
+                pattern: "artifacts/mqc_*.csv",
+                saveAs: { filename -> filename.split('/')[1] }
+            ],
             [
                 path: { "${params.outdir}/${meta.id}/data" },
                 mode: params.publish_dir_mode,

diff --git a/modules.json b/modules.json
@@ -13,7 +13,8 @@
                     "multiqc": {
                         "branch": "master",
                         "git_sha": "19ca321db5d8bd48923262c2eca6422359633491",
-                        "installed_by": ["modules"]
+                        "installed_by": ["modules"],
+                        "patch": "modules/nf-core/multiqc/multiqc.diff"
                     },
                     "quartonotebook": {
                         "branch": "master",

diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf
diff --git a/modules/nf-core/multiqc/multiqc.diff b/modules/nf-core/multiqc/multiqc.diff
diff --git a/subworkflows/local/downstream.nf b/subworkflows/local/downstream.nf
@@ -49,6 +49,19 @@ workflow DOWNSTREAM {
         extensions
     )
     ch_versions = ch_versions.mix(QUALITY_CONTROLS.out.versions)
+    ch_qc = QUALITY_CONTROLS.out.artifacts
+        | map { meta, artifacts -> [meta, artifacts[0], meta, artifacts[1]] }
+        | flatten
+        | collate ( 2 )
+        | branch {
+            sdata: it[1].name.endsWith('.zarr')
+            mqc: it[1].name.endsWith('.csv')
+        }
+    ch_qc_sdata = ch_qc.sdata
+    ch_qc_mqc   = ch_qc.mqc
+    ch_qc_html  = QUALITY_CONTROLS.out.html
+    ch_qc_nb    = QUALITY_CONTROLS.out.notebook
+    ch_qc_yml   = QUALITY_CONTROLS.out.params_yaml
 
     //
     // Normalisation, dimensionality reduction and clustering
@@ -98,10 +111,11 @@ workflow DOWNSTREAM {
     ch_versions = ch_versions.mix(SPATIALLY_VARIABLE_GENES.out.versions)
 
     emit:
-    qc_html           = QUALITY_CONTROLS.out.html                // channel: [ meta, html ]
-    qc_sdata          = QUALITY_CONTROLS.out.artifacts           // channel: [ meta, h5ad ]
-    qc_nb             = QUALITY_CONTROLS.out.notebook            // channel: [ meta, qmd ]
-    qc_params         = QUALITY_CONTROLS.out.params_yaml         // channel: [ meta, yml ]
+    qc_html           = ch_qc_html  // channel: [ meta, html ]
+    qc_sdata          = ch_qc_sdata // channel: [ meta, zarr ]
+    qc_mqc            = ch_qc_mqc   // channel: [ meta, csv ]
+    qc_nb             = ch_qc_nb    // channel: [ meta, qmd ]
+    qc_params         = ch_qc_yml   // channel: [ meta, yml ]
 
     clustering_html   = CLUSTERING.out.html                      // channel: [ html ]
     clustering_sdata  = CLUSTERING.out.artifacts                 // channel: [ meta, h5ad]

diff --git a/workflows/spatialvi.nf b/workflows/spatialvi.nf
@@ -116,6 +116,9 @@ workflow SPATIALVI {
     ch_methods_description                = Channel.value(
         methodsDescriptionText(ch_multiqc_custom_methods_description))
 
+    ch_multiqc_files = ch_multiqc_files.mix(
+        DOWNSTREAM.out.qc_mqc.map{it[1]}.collect()
+    )
     ch_multiqc_files = ch_multiqc_files.mix(
         ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
     ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions)