From e7b38592d868cee0751e4b2b810cca371e7cd061 Mon Sep 17 00:00:00 2001 From: Sai Nirmayi Yasa <92786623+sainirmayi@users.noreply.github.com> Date: Fri, 18 Oct 2024 18:18:56 +0200 Subject: [PATCH] Update dataset loaders (#909) * update to viash 0.9 and categorise datasets * group workflows * add api for spatial datasets * add more metadata * update publish dir path * update project config * update namespace * fix id * update example * fix example * update test resources * update helper resources * fix multiple separator --------- Co-authored-by: Robrecht Cannoodt --- _viash.yaml | 3 +- src/datasets/api/comp_dataset_loader.yaml | 31 ++- src/datasets/api/comp_normalization.yaml | 71 +++--- src/datasets/api/comp_processor_hvg.yaml | 79 ++++--- src/datasets/api/comp_processor_knn.yaml | 77 ++++--- src/datasets/api/comp_processor_pca.yaml | 95 ++++---- src/datasets/api/comp_processor_subset.yaml | 59 +++-- src/datasets/api/comp_processor_svd.yaml | 87 ++++---- src/datasets/api/file_common_dataset.yaml | 11 +- src/datasets/api/file_hvg.yaml | 7 +- src/datasets/api/file_knn.yaml | 7 +- src/datasets/api/file_multimodal_dataset.yaml | 17 +- src/datasets/api/file_normalized.yaml | 7 +- src/datasets/api/file_pca.yaml | 7 +- src/datasets/api/file_raw.yaml | 27 ++- src/datasets/api/file_spatial_dataset.yaml | 194 ++++++++++++++++ src/datasets/api/file_svd.yaml | 7 +- .../loaders/cellxgene_census/config.vsh.yaml | 167 -------------- .../config.vsh.yaml | 130 ----------- .../config.vsh.yaml | 75 +++++++ .../openproblems_neurips2021_bmmc/script.py | 2 +- .../openproblems_neurips2021_bmmc/test.py | 0 .../config.vsh.yaml | 81 +++++++ .../openproblems_neurips2022_pbmc/script.py | 2 +- .../openproblems_neurips2022_pbmc/test.py | 0 .../config.vsh.yaml | 95 ++++++++ .../openproblems_v1_multimodal/script.py | 0 .../openproblems_v1_multimodal/test.py | 0 .../config.vsh.yaml | 74 ------- .../config.vsh.yaml | 80 ------- .../loaders/openproblems_v1/config.vsh.yaml | 86 ------- 
.../config.vsh.yaml | 94 -------- .../scrnaseq/cellxgene_census/config.vsh.yaml | 176 +++++++++++++++ .../{ => scrnaseq}/cellxgene_census/script.py | 0 .../{ => scrnaseq}/cellxgene_census/test.py | 0 .../config.vsh.yaml | 132 +++++++++++ .../script.py | 0 .../cellxgene_census_from_source_h5ad/test.py | 0 .../scrnaseq/openproblems_v1/config.vsh.yaml | 91 ++++++++ .../{ => scrnaseq}/openproblems_v1/script.py | 0 .../{ => scrnaseq}/openproblems_v1/test.py | 0 .../spatial/tenx_visium/config.vsh.yaml | 96 ++++++++ .../{ => spatial}/tenx_visium/script.py | 2 +- .../loaders/{ => spatial}/tenx_visium/test.py | 0 .../loaders/spatial/zenodo/config.vsh.yaml | 88 ++++++++ .../zenodo}/script.py | 4 +- .../zenodo}/test.py | 2 +- .../spatial/zenodo_slidetags/config.vsh.yaml | 88 ++++++++ .../zenodo_slidetags}/script.py | 4 +- .../zenodo_slidetags}/test.py | 2 +- .../loaders/tenx_visium/config.vsh.yaml | 96 -------- .../loaders/zenodo_spatial/config.vsh.yaml | 87 -------- .../zenodo_spatial_slidetags/config.vsh.yaml | 88 -------- .../normalization/atac_tfidf/config.vsh.yaml | 23 +- .../normalization/atac_tfidf/script.py | 4 +- .../normalization/l1_sqrt/config.vsh.yaml | 29 +-- src/datasets/normalization/l1_sqrt/script.py | 4 +- .../normalization/log_cp/config.vsh.yaml | 28 +-- src/datasets/normalization/log_cp/script.py | 4 +- .../log_scran_pooling/config.vsh.yaml | 20 +- .../normalization/log_scran_pooling/script.R | 2 +- .../normalization/prot_clr/config.vsh.yaml | 29 +-- src/datasets/normalization/prot_clr/script.py | 4 +- .../normalization/sqrt_cp/config.vsh.yaml | 27 +-- src/datasets/normalization/sqrt_cp/script.py | 4 +- src/datasets/processors/hvg/config.vsh.yaml | 15 +- src/datasets/processors/knn/config.vsh.yaml | 15 +- src/datasets/processors/pca/config.vsh.yaml | 15 +- .../processors/subsample/config.vsh.yaml | 85 +++---- .../processors/subsample/test_script.py | 6 +- src/datasets/processors/svd/config.vsh.yaml | 15 +- .../resource_scripts/cellxgene_census.sh | 4 +- 
src/datasets/resource_scripts/dataset_info.sh | 6 +- .../openproblems_neurips2021_multimodal.sh | 4 +- ...penproblems_neurips2021_multimodal_test.sh | 4 +- .../openproblems_neurips2022_pbmc.sh | 4 +- .../resource_scripts/openproblems_v1.sh | 4 +- .../openproblems_v1_multimodal.sh | 4 +- .../openproblems_v1_multimodal_test.sh | 4 +- .../resource_scripts/openproblems_v1_test.sh | 4 +- src/datasets/resource_scripts/tenx_visium.sh | 8 +- .../{zenodo_spatial.sh => zenodo.sh} | 68 +++--- ...atial_slidetags.sh => zenodo_slidetags.sh} | 12 +- .../cxg_immune_cell_atlas.sh | 5 +- .../cxg_mouse_pancreas_atlas.sh | 4 +- .../mouse_brain_coronal.sh | 4 +- .../resource_test_scripts/neurips2021_bmmc.sh | 8 +- .../resource_test_scripts/neurips2022_pbmc.sh | 6 +- .../resource_test_scripts/pancreas.sh | 12 +- .../scicar_cell_lines.sh | 4 +- .../resource_test_scripts/slideseq_test.sh | 36 --- .../extract_dataset_info/config.vsh.yaml | 66 +++--- .../extract_dataset_meta/config.vsh.yaml | 47 ++-- .../config.vsh.yaml | 138 ++++++++++++ .../main.nf | 0 .../config.vsh.yaml | 144 ++++++++++++ .../main.nf | 0 .../config.vsh.yaml | 165 ++++++++++++++ .../main.nf | 0 .../process_cellxgene_census/config.vsh.yaml | 201 ----------------- .../config.vsh.yaml | 137 ------------ .../config.vsh.yaml | 143 ------------ .../process_openproblems_v1/config.vsh.yaml | 163 -------------- .../config.vsh.yaml | 161 -------------- .../process_tenx_visium/config.vsh.yaml | 142 ------------ .../process_zenodo_spatial/config.vsh.yaml | 138 ------------ .../config.vsh.yaml | 138 ------------ .../process_cellxgene_census/config.vsh.yaml | 209 ++++++++++++++++++ .../process_cellxgene_census/main.nf | 0 .../process_openproblems_v1/config.vsh.yaml | 167 ++++++++++++++ .../process_openproblems_v1/main.nf | 0 .../process_tenx_visium/config.vsh.yaml | 143 ++++++++++++ .../{ => spatial}/process_tenx_visium/main.nf | 0 .../spatial/process_zenodo/config.vsh.yaml | 139 ++++++++++++ .../process_zenodo}/main.nf | 2 +- 
.../process_zenodo_slidetags/config.vsh.yaml | 139 ++++++++++++ .../process_zenodo_slidetags}/main.nf | 2 +- 117 files changed, 2985 insertions(+), 2761 deletions(-) create mode 100644 src/datasets/api/file_spatial_dataset.yaml delete mode 100644 src/datasets/loaders/cellxgene_census/config.vsh.yaml delete mode 100644 src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml create mode 100644 src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/config.vsh.yaml rename src/datasets/loaders/{ => multimodal}/openproblems_neurips2021_bmmc/script.py (98%) rename src/datasets/loaders/{ => multimodal}/openproblems_neurips2021_bmmc/test.py (100%) create mode 100644 src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/config.vsh.yaml rename src/datasets/loaders/{ => multimodal}/openproblems_neurips2022_pbmc/script.py (98%) rename src/datasets/loaders/{ => multimodal}/openproblems_neurips2022_pbmc/test.py (100%) create mode 100644 src/datasets/loaders/multimodal/openproblems_v1_multimodal/config.vsh.yaml rename src/datasets/loaders/{ => multimodal}/openproblems_v1_multimodal/script.py (100%) rename src/datasets/loaders/{ => multimodal}/openproblems_v1_multimodal/test.py (100%) delete mode 100644 src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml delete mode 100644 src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml delete mode 100644 src/datasets/loaders/openproblems_v1/config.vsh.yaml delete mode 100644 src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml create mode 100644 src/datasets/loaders/scrnaseq/cellxgene_census/config.vsh.yaml rename src/datasets/loaders/{ => scrnaseq}/cellxgene_census/script.py (100%) rename src/datasets/loaders/{ => scrnaseq}/cellxgene_census/test.py (100%) create mode 100644 src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/config.vsh.yaml rename src/datasets/loaders/{ => scrnaseq}/cellxgene_census_from_source_h5ad/script.py (100%) rename 
src/datasets/loaders/{ => scrnaseq}/cellxgene_census_from_source_h5ad/test.py (100%) create mode 100644 src/datasets/loaders/scrnaseq/openproblems_v1/config.vsh.yaml rename src/datasets/loaders/{ => scrnaseq}/openproblems_v1/script.py (100%) rename src/datasets/loaders/{ => scrnaseq}/openproblems_v1/test.py (100%) create mode 100644 src/datasets/loaders/spatial/tenx_visium/config.vsh.yaml rename src/datasets/loaders/{ => spatial}/tenx_visium/script.py (98%) rename src/datasets/loaders/{ => spatial}/tenx_visium/test.py (100%) create mode 100644 src/datasets/loaders/spatial/zenodo/config.vsh.yaml rename src/datasets/loaders/{zenodo_spatial => spatial/zenodo}/script.py (96%) rename src/datasets/loaders/{zenodo_spatial => spatial/zenodo}/test.py (97%) create mode 100644 src/datasets/loaders/spatial/zenodo_slidetags/config.vsh.yaml rename src/datasets/loaders/{zenodo_spatial_slidetags => spatial/zenodo_slidetags}/script.py (96%) rename src/datasets/loaders/{zenodo_spatial_slidetags => spatial/zenodo_slidetags}/test.py (97%) delete mode 100644 src/datasets/loaders/tenx_visium/config.vsh.yaml delete mode 100644 src/datasets/loaders/zenodo_spatial/config.vsh.yaml delete mode 100644 src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml rename src/datasets/resource_scripts/{zenodo_spatial.sh => zenodo.sh} (92%) rename src/datasets/resource_scripts/{zenodo_spatial_slidetags.sh => zenodo_slidetags.sh} (89%) delete mode 100755 src/datasets/resource_test_scripts/slideseq_test.sh create mode 100644 src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/config.vsh.yaml rename src/datasets/workflows/{ => multimodal}/process_openproblems_neurips2021_bmmc/main.nf (100%) create mode 100644 src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/config.vsh.yaml rename src/datasets/workflows/{ => multimodal}/process_openproblems_neurips2022_pbmc/main.nf (100%) create mode 100644 
src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/config.vsh.yaml rename src/datasets/workflows/{ => multimodal}/process_openproblems_v1_multimodal/main.nf (100%) delete mode 100644 src/datasets/workflows/process_cellxgene_census/config.vsh.yaml delete mode 100644 src/datasets/workflows/process_openproblems_neurips2021_bmmc/config.vsh.yaml delete mode 100644 src/datasets/workflows/process_openproblems_neurips2022_pbmc/config.vsh.yaml delete mode 100644 src/datasets/workflows/process_openproblems_v1/config.vsh.yaml delete mode 100644 src/datasets/workflows/process_openproblems_v1_multimodal/config.vsh.yaml delete mode 100644 src/datasets/workflows/process_tenx_visium/config.vsh.yaml delete mode 100644 src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml delete mode 100644 src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml create mode 100644 src/datasets/workflows/scrnaseq/process_cellxgene_census/config.vsh.yaml rename src/datasets/workflows/{ => scrnaseq}/process_cellxgene_census/main.nf (100%) create mode 100644 src/datasets/workflows/scrnaseq/process_openproblems_v1/config.vsh.yaml rename src/datasets/workflows/{ => scrnaseq}/process_openproblems_v1/main.nf (100%) create mode 100644 src/datasets/workflows/spatial/process_tenx_visium/config.vsh.yaml rename src/datasets/workflows/{ => spatial}/process_tenx_visium/main.nf (100%) create mode 100644 src/datasets/workflows/spatial/process_zenodo/config.vsh.yaml rename src/datasets/workflows/{process_zenodo_spatial => spatial/process_zenodo}/main.nf (99%) create mode 100644 src/datasets/workflows/spatial/process_zenodo_slidetags/config.vsh.yaml rename src/datasets/workflows/{process_zenodo_spatial_slidetags => spatial/process_zenodo_slidetags}/main.nf (98%) diff --git a/_viash.yaml b/_viash.yaml index f262b4964b..0d06459772 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -6,7 +6,7 @@ viash_version: 0.9.0 description: | Open Problems is a living, extensible, community-guided 
benchmarking platform. license: MIT -keywords: [openproblems, benchmarking, single-cell] +keywords: [openproblems, benchmarking, single-cell omics] references: doi: @@ -24,6 +24,7 @@ config_mods: | .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } .runners[.type == "nextflow"].config.script := "process.errorStrategy = 'ignore'" + info: test_resources: - type: s3 diff --git a/src/datasets/api/comp_dataset_loader.yaml b/src/datasets/api/comp_dataset_loader.yaml index 75909b106a..f3ea6426bb 100644 --- a/src/datasets/api/comp_dataset_loader.yaml +++ b/src/datasets/api/comp_dataset_loader.yaml @@ -1,16 +1,15 @@ -functionality: - namespace: "datasets/loaders" - info: - type: dataset_loader - type_info: - label: Dataset loader - summary: A component which generates a "Common dataset". - description: | - A dataset loader will typically have an identifier (e.g. a GEO identifier) - or URL as input argument and additional arguments to define where the script needs to download a dataset from and how to process it. - arguments: - - name: "--output" - __merge__: file_raw.yaml - direction: "output" - required: true - test_resources: [] \ No newline at end of file +# namespace: "datasets/loaders" +info: + type: dataset_loader + type_info: + label: Dataset loader + summary: A component which generates a "Common dataset". + description: | + A dataset loader will typically have an identifier (e.g. a GEO identifier) + or URL as input argument and additional arguments to define where the script needs to download a dataset from and how to process it. 
+arguments: + - name: "--output" + __merge__: file_raw.yaml + direction: "output" + required: true +test_resources: [] \ No newline at end of file diff --git a/src/datasets/api/comp_normalization.yaml b/src/datasets/api/comp_normalization.yaml index 6f2c1ffa64..38cd4efe81 100644 --- a/src/datasets/api/comp_normalization.yaml +++ b/src/datasets/api/comp_normalization.yaml @@ -1,36 +1,35 @@ -functionality: - namespace: "datasets/normalization" - info: - type: dataset_normalization - type_info: - label: Dataset normalization - summary: | - A normalization method which processes the raw counts into a normalized dataset. - description: - A component for normalizing the raw counts as output by dataset loaders into a normalized dataset. - arguments: - - name: "--input" - __merge__: file_raw.yaml - direction: input - required: true - - name: "--output" - __merge__: file_normalized.yaml - direction: output - required: true - - name: "--normalization_id" - type: string - description: "The normalization id to store in the dataset metadata. If not specified, the functionality name will be used." - required: false - - name: "--layer_output" - type: string - default: "normalized" - description: The name of the layer in which to store the normalized data. - - name: "--obs_size_factors" - type: string - default: "size_factors" - description: In which .obs slot to store the size factors (if any). - test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/normalization" +info: + type: dataset_normalization + type_info: + label: Dataset normalization + summary: | + A normalization method which processes the raw counts into a normalized dataset. + description: + A component for normalizing the raw counts as output by dataset loaders into a normalized dataset. 
+arguments: + - name: "--input" + __merge__: file_raw.yaml + direction: input + required: true + - name: "--output" + __merge__: file_normalized.yaml + direction: output + required: true + - name: "--normalization_id" + type: string + description: "The normalization id to store in the dataset metadata. If not specified, the functionality name will be used." + required: false + - name: "--layer_output" + type: string + default: "normalized" + description: The name of the layer in which to store the normalized data. + - name: "--obs_size_factors" + type: string + default: "size_factors" + description: In which .obs slot to store the size factors (if any). +test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/comp_processor_hvg.yaml b/src/datasets/api/comp_processor_hvg.yaml index 2e24033aac..bfed255d02 100644 --- a/src/datasets/api/comp_processor_hvg.yaml +++ b/src/datasets/api/comp_processor_hvg.yaml @@ -1,40 +1,39 @@ -functionality: - namespace: "datasets/processors" - info: - type: dataset_processor - type_info: - label: HVG - summary: | - Computes the highly variable genes scores. - description: | - The resulting AnnData will contain both a boolean 'hvg' column in 'var', as well as a numerical 'hvg_score' in 'var'. - arguments: - - name: "--input" - __merge__: file_normalized.yaml - required: true - direction: input - - name: "--input_layer" - type: string - default: "normalized" - description: Which layer to use as input. - - name: "--output" - direction: output - __merge__: file_hvg.yaml - required: true - - name: "--var_hvg" - type: string - default: "hvg" - description: "In which .var slot to store whether a feature is considered to be hvg." - - name: "--var_hvg_score" - type: string - default: "hvg_score" - description: "In which .var slot to store the gene variance score (normalized dispersion)." 
- - name: "--num_features" - type: integer - default: 1000 - description: "The number of HVG to select" - test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/processors" +info: + type: dataset_processor + type_info: + label: HVG + summary: | + Computes the highly variable genes scores. + description: | + The resulting AnnData will contain both a boolean 'hvg' column in 'var', as well as a numerical 'hvg_score' in 'var'. +arguments: + - name: "--input" + __merge__: file_normalized.yaml + required: true + direction: input + - name: "--input_layer" + type: string + default: "normalized" + description: Which layer to use as input. + - name: "--output" + direction: output + __merge__: file_hvg.yaml + required: true + - name: "--var_hvg" + type: string + default: "hvg" + description: "In which .var slot to store whether a feature is considered to be hvg." + - name: "--var_hvg_score" + type: string + default: "hvg_score" + description: "In which .var slot to store the gene variance score (normalized dispersion)." + - name: "--num_features" + type: integer + default: 1000 + description: "The number of HVG to select" +test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/comp_processor_knn.yaml b/src/datasets/api/comp_processor_knn.yaml index b0e16f8fc4..be95b83e38 100644 --- a/src/datasets/api/comp_processor_knn.yaml +++ b/src/datasets/api/comp_processor_knn.yaml @@ -1,39 +1,38 @@ -functionality: - namespace: "datasets/processors" - info: - type: dataset_processor - type_info: - label: KNN - summary: | - Computes the k-nearest-neighbours for each cell. - description: | - The resulting AnnData will contain both the knn distances and the knn connectivities in 'obsp'. 
- arguments: - - name: "--input" - __merge__: file_pca.yaml - required: true - direction: input - - name: "--input_layer" - type: string - default: "normalized" - description: Which layer to use as input. - - name: "--output" - direction: output - __merge__: file_knn.yaml - required: true - - name: "--key_added" - type: string - default: "knn" - description: | - The neighbors data is added to `.uns[key_added]`, - distances are stored in `.obsp[key_added+'_distances']` and - connectivities in `.obsp[key_added+'_connectivities']`. - - name: "--num_neighbors" - type: integer - default: 15 - description: "The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation." - test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/processors" +info: + type: dataset_processor + type_info: + label: KNN + summary: | + Computes the k-nearest-neighbours for each cell. + description: | + The resulting AnnData will contain both the knn distances and the knn connectivities in 'obsp'. +arguments: + - name: "--input" + __merge__: file_pca.yaml + required: true + direction: input + - name: "--input_layer" + type: string + default: "normalized" + description: Which layer to use as input. + - name: "--output" + direction: output + __merge__: file_knn.yaml + required: true + - name: "--key_added" + type: string + default: "knn" + description: | + The neighbors data is added to `.uns[key_added]`, + distances are stored in `.obsp[key_added+'_distances']` and + connectivities in `.obsp[key_added+'_connectivities']`. + - name: "--num_neighbors" + type: integer + default: 15 + description: "The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation." 
+test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/comp_processor_pca.yaml b/src/datasets/api/comp_processor_pca.yaml index a7ca82bc07..051532cf1e 100644 --- a/src/datasets/api/comp_processor_pca.yaml +++ b/src/datasets/api/comp_processor_pca.yaml @@ -1,49 +1,48 @@ -functionality: - namespace: "datasets/processors" - info: - type: dataset_processor - type_info: - label: PCA - summary: | - Computes a PCA embedding of the normalized data. - description: - The resulting AnnData will contain an embedding in obsm, as well as optional loadings in 'varm'. - arguments: - - name: "--input" - __merge__: file_hvg.yaml - required: true - direction: input - - name: "--input_layer" - type: string - default: "normalized" - description: Which layer to use as input. - - name: "--input_var_features" - type: string - description: Column name in .var matrix that will be used to select which genes to run the PCA on. - default: hvg - - name: "--output" - direction: output - __merge__: file_pca.yaml - required: true - - name: "--obsm_embedding" - type: string - default: "X_pca" - description: "In which .obsm slot to store the resulting embedding." - - name: "--varm_loadings" - type: string - default: "pca_loadings" - description: "In which .varm slot to store the resulting loadings matrix." - - name: "--uns_variance" - type: string - default: "pca_variance" - description: "In which .uns slot to store the resulting variance objects." - - name: "--num_components" - type: integer - example: 25 - description: Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation. 
- test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/processors" +info: + type: dataset_processor + type_info: + label: PCA + summary: | + Computes a PCA embedding of the normalized data. + description: + The resulting AnnData will contain an embedding in obsm, as well as optional loadings in 'varm'. +arguments: + - name: "--input" + __merge__: file_hvg.yaml + required: true + direction: input + - name: "--input_layer" + type: string + default: "normalized" + description: Which layer to use as input. + - name: "--input_var_features" + type: string + description: Column name in .var matrix that will be used to select which genes to run the PCA on. + default: hvg + - name: "--output" + direction: output + __merge__: file_pca.yaml + required: true + - name: "--obsm_embedding" + type: string + default: "X_pca" + description: "In which .obsm slot to store the resulting embedding." + - name: "--varm_loadings" + type: string + default: "pca_loadings" + description: "In which .varm slot to store the resulting loadings matrix." + - name: "--uns_variance" + type: string + default: "pca_variance" + description: "In which .uns slot to store the resulting variance objects." + - name: "--num_components" + type: integer + example: 25 + description: Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation. 
+test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/comp_processor_subset.yaml b/src/datasets/api/comp_processor_subset.yaml index bad64a6762..c49e7f2ece 100644 --- a/src/datasets/api/comp_processor_subset.yaml +++ b/src/datasets/api/comp_processor_subset.yaml @@ -1,31 +1,30 @@ -functionality: - namespace: "datasets/processors" - info: - type: dataset_processor - type_info: - label: Subset - summary: Sample cells and genes randomly. - description: This component subsets the layers, obs and var to create smaller test datasets. - arguments: - - name: "--input" - __merge__: file_common_dataset.yaml - required: true - direction: input - - name: "--input_mod2" - __merge__: file_common_dataset.yaml - direction: input - required: false - - name: "--output" - __merge__: file_common_dataset.yaml - direction: output - required: true - - name: "--output_mod2" - __merge__: file_common_dataset.yaml - direction: output - required: false - test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/processors" +info: + type: dataset_processor + type_info: + label: Subset + summary: Sample cells and genes randomly. + description: This component subsets the layers, obs and var to create smaller test datasets. 
+arguments: + - name: "--input" + __merge__: file_common_dataset.yaml + required: true + direction: input + - name: "--input_mod2" + __merge__: file_common_dataset.yaml + direction: input + required: false + - name: "--output" + __merge__: file_common_dataset.yaml + direction: output + required: true + - name: "--output_mod2" + __merge__: file_common_dataset.yaml + direction: output + required: false +test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/comp_processor_svd.yaml b/src/datasets/api/comp_processor_svd.yaml index 91413c2624..d5c0ae8ba8 100644 --- a/src/datasets/api/comp_processor_svd.yaml +++ b/src/datasets/api/comp_processor_svd.yaml @@ -1,45 +1,44 @@ -functionality: - namespace: "datasets/processors" - info: - type: dataset_processor - type_info: - label: SVD - summary: | - Computes a SVD PCA embedding of the normalized data. - description: - The resulting AnnData will contain an embedding in obsm. - arguments: - - name: "--input" - __merge__: file_normalized.yaml - required: true - direction: input - - name: "--input_mod2" - __merge__: file_normalized.yaml - required: false - direction: input - - name: "--input_layer" - type: string - default: "normalized" - description: Which layer to use as input. - - name: "--output" - direction: output - __merge__: file_svd.yaml - required: true - - name: "--output_mod2" - direction: output - __merge__: file_svd.yaml - required: false - - name: "--obsm_embedding" - type: string - default: "X_svd" - description: "In which .obsm slot to store the resulting embedding." - - name: "--num_components" - type: integer - default: 100 - description: Number of principal components to compute. Defaults to 100, or 1 - minimum dimension size of selected representation. 
- test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/processors" +info: + type: dataset_processor + type_info: + label: SVD + summary: | + Computes a SVD PCA embedding of the normalized data. + description: + The resulting AnnData will contain an embedding in obsm. +arguments: + - name: "--input" + __merge__: file_normalized.yaml + required: true + direction: input + - name: "--input_mod2" + __merge__: file_normalized.yaml + required: false + direction: input + - name: "--input_layer" + type: string + default: "normalized" + description: Which layer to use as input. + - name: "--output" + direction: output + __merge__: file_svd.yaml + required: true + - name: "--output_mod2" + direction: output + __merge__: file_svd.yaml + required: false + - name: "--obsm_embedding" + type: string + default: "X_svd" + description: "In which .obsm slot to store the resulting embedding." + - name: "--num_components" + type: integer + default: 100 + description: Number of principal components to compute. Defaults to 100, or 1 - minimum dimension size of selected representation. +test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/file_common_dataset.yaml b/src/datasets/api/file_common_dataset.yaml index ed7836bf5c..4ca8722aa7 100644 --- a/src/datasets/api/file_common_dataset.yaml +++ b/src/datasets/api/file_common_dataset.yaml @@ -1,9 +1,8 @@ __merge__: file_knn.yaml type: file example: "resources_test/common/pancreas/dataset.h5ad" -info: - label: "Common dataset" - summary: A dataset processed by the common dataset processing pipeline. - description: | - This dataset contains both raw counts and normalized data matrices, - as well as a PCA embedding, HVG selection and a kNN graph. 
+label: "Common dataset" +summary: A dataset processed by the common dataset processing pipeline. +description: | + This dataset contains both raw counts and normalized data matrices, + as well as a PCA embedding, HVG selection and a kNN graph. \ No newline at end of file diff --git a/src/datasets/api/file_hvg.yaml b/src/datasets/api/file_hvg.yaml index 697be29e32..47b8f88922 100644 --- a/src/datasets/api/file_hvg.yaml +++ b/src/datasets/api/file_hvg.yaml @@ -1,10 +1,11 @@ __merge__: file_normalized.yaml type: file example: "resources_test/common/pancreas/hvg.h5ad" +label: "Dataset+HVG" +summary: "A normalised dataset with a PCA embedding and HVG selection." info: - label: "Dataset+HVG" - summary: "A normalised dataset with a PCA embedding and HVG selection." - slots: + format: + type: h5ad var: - type: boolean name: hvg diff --git a/src/datasets/api/file_knn.yaml b/src/datasets/api/file_knn.yaml index de7d2b8df5..c2f320e08e 100644 --- a/src/datasets/api/file_knn.yaml +++ b/src/datasets/api/file_knn.yaml @@ -1,10 +1,11 @@ __merge__: file_pca.yaml type: file example: "resources_test/common/pancreas/knn.h5ad" +label: "Dataset+HVG+PCA+kNN" +summary: "A normalised data with a PCA embedding, HVG selection and a kNN graph" info: - label: "Dataset+HVG+PCA+kNN" - summary: "A normalised data with a PCA embedding, HVG selection and a kNN graph" - slots: + format: + type: h5ad obsp: - type: double name: knn_distances diff --git a/src/datasets/api/file_multimodal_dataset.yaml b/src/datasets/api/file_multimodal_dataset.yaml index daac29d77b..b8ae760225 100644 --- a/src/datasets/api/file_multimodal_dataset.yaml +++ b/src/datasets/api/file_multimodal_dataset.yaml @@ -1,14 +1,15 @@ type: file example: "resources_test/common/pancreas/dataset.h5ad" +label: "Common dataset" +summary: A dataset processed by the common dataset processing pipeline. +description: | + This dataset contains both raw counts and normalized data matrices, + as well as a SVD embedding and a HVG selection. 
+ + The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md). info: - label: "Common dataset" - summary: A dataset processed by the common dataset processing pipeline. - description: | - This dataset contains both raw counts and normalized data matrices, - as well as a SVD embedding and a HVG selection. - - The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md). - slots: + format: + type: h5ad layers: - type: integer name: counts diff --git a/src/datasets/api/file_normalized.yaml b/src/datasets/api/file_normalized.yaml index ea6f14e9fb..f163e31db9 100644 --- a/src/datasets/api/file_normalized.yaml +++ b/src/datasets/api/file_normalized.yaml @@ -1,10 +1,11 @@ __merge__: file_raw.yaml type: file example: "resources_test/common/pancreas/normalized.h5ad" +label: "Normalized dataset" +summary: "A normalized dataset" info: - label: "Normalized dataset" - summary: "A normalized dataset" - slots: + format: + type: h5ad layers: - type: double name: normalized diff --git a/src/datasets/api/file_pca.yaml b/src/datasets/api/file_pca.yaml index daa26618e1..2d2e48f95b 100644 --- a/src/datasets/api/file_pca.yaml +++ b/src/datasets/api/file_pca.yaml @@ -1,10 +1,11 @@ __merge__: file_hvg.yaml type: file example: "resources_test/common/pancreas/pca.h5ad" +label: "Dataset+HVG+PCA" +summary: "A normalised dataset with a PCA embedding" info: - label: "Dataset+HVG+PCA" - summary: "A normalised dataset with a PCA embedding" - slots: + format: + type: h5ad obsm: - type: double name: X_pca diff --git a/src/datasets/api/file_raw.yaml b/src/datasets/api/file_raw.yaml index 7ffab3b43e..f42b022a38 100644 --- a/src/datasets/api/file_raw.yaml +++ b/src/datasets/api/file_raw.yaml @@ -1,13 +1,14 @@ type: file example: "resources_test/common/pancreas/raw.h5ad" -info: - label: "Raw dataset" - 
summary: An unprocessed dataset as output by a dataset loader. - description: | - This dataset contains raw counts and metadata as output by a dataset loader. +label: "Raw dataset" +summary: An unprocessed dataset as output by a dataset loader. +description: | + This dataset contains raw counts and metadata as output by a dataset loader. - The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md). - slots: + The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md). +info: + format: + type: h5ad layers: - type: integer name: counts @@ -203,3 +204,15 @@ info: description: The organism of the sample in the dataset. required: false multiple: true + - name: dataset_technology + type: string + description: The technology used to generate the dataset. + required: false + - name: dataset_organ + type: string + description: The organ of the sample in the dataset. + required: false + - name: dataset_tissue + type: string + description: The tissue of the sample in the dataset. + required: false diff --git a/src/datasets/api/file_spatial_dataset.yaml b/src/datasets/api/file_spatial_dataset.yaml new file mode 100644 index 0000000000..d7971c52ac --- /dev/null +++ b/src/datasets/api/file_spatial_dataset.yaml @@ -0,0 +1,194 @@ +type: file +example: "resources_test/common/mouse_brain_coronal/dataset.h5ad" +label: "Common dataset" +summary: An unprocessed dataset as output by the common dataset processing pipeline. +description: | + This dataset contains both raw counts and normalized data matrices. 
+info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalised expression values + required: true + obs: + - type: string + name: dataset_id + description: Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. + required: false + - type: string + name: assay + description: Type of assay used to generate the cell data, indicating the methodology or technique employed. + required: false + - type: string + name: assay_ontology_term_id + description: Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. + required: false + - type: string + name: cell_type + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: false + - type: string + name: cell_type_ontology_term_id + description: Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. + required: false + - type: string + name: development_stage + description: Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. + required: false + - type: string + name: development_stage_ontology_term_id + description: | + Ontology term identifier for the developmental stage, providing a standardized reference to the organism's developmental phase. + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. + If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. + Otherwise, the Uberon (`UBERON:`) ontology is used. 
+ required: false + - type: string + name: disease + description: Information on any disease or pathological condition associated with the cell or donor. + required: false + - type: string + name: disease_ontology_term_id + description: | + Ontology term identifier for the disease, enabling standardized disease classification and referencing. + Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). + required: false + - type: string + name: donor_id + description: Identifier for the donor from whom the cell sample is obtained. + required: false + - type: boolean + name: is_primary_data + description: Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. + required: false + - type: string + name: organism + description: Organism from which the cell sample is obtained. + required: false + - type: string + name: organism_ontology_term_id + description: | + Ontology term identifier for the organism, providing a standardized reference for the organism. + Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. + required: false + - type: string + name: self_reported_ethnicity + description: Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. + required: false + - type: string + name: self_reported_ethnicity_ontology_term_id + description: | + Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. + required: false + - type: string + name: sex + description: Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. 
+ required: false + - type: string + name: sex_ontology_term_id + description: Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. + required: false + - type: string + name: suspension_type + description: Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. + required: false + - type: string + name: tissue + description: Specific tissue from which the cells were derived, key for context and specificity in cell studies. + required: false + - type: string + name: tissue_ontology_term_id + description: | + Ontology term identifier for the tissue, providing a standardized reference for the tissue type. + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. + required: false + - type: string + name: tissue_general + description: General category or classification of the tissue, useful for broader grouping and comparison of cell data. + required: false + - type: string + name: tissue_general_ontology_term_id + description: | + Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. + required: false + - type: string + name: batch + description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. 
+ required: false + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. + required: false + obsm: + - type: double + name: spatial + description: Spatial coordinates of the cells in the format `x, y`. + required: true + var: + - type: string + name: feature_id + description: Unique identifier for the feature, usually a ENSEMBL gene id. + # TODO: make this required once openproblems_v1 dataloader supports it + required: false + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + # TODO: make this required once the dataloader supports it + required: true + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. + required: false + uns: + - type: string + name: dataset_id + description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. + required: true + - name: dataset_name + type: string + description: A human-readable name for the dataset. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + multiple: true + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + multiple: true + - name: dataset_technology + type: string + description: The technology used to generate the dataset. 
+ required: false + - name: dataset_organ + type: string + description: The organ of the sample in the dataset. + required: false + - name: dataset_tissue + type: string + description: The tissue of the sample in the dataset. + required: false \ No newline at end of file diff --git a/src/datasets/api/file_svd.yaml b/src/datasets/api/file_svd.yaml index 2a727369e3..c9f22b50f7 100644 --- a/src/datasets/api/file_svd.yaml +++ b/src/datasets/api/file_svd.yaml @@ -1,10 +1,11 @@ __merge__: file_normalized.yaml type: file example: "resources_test/common/pancreas/svd.h5ad" +label: "Dataset+SVD" +summary: "A normalised dataset with a SVD embedding" info: - label: "Dataset+SVD" - summary: "A normalised dataset with a SVD embedding" - slots: + format: + type: h5ad obsm: - type: double name: X_svd diff --git a/src/datasets/loaders/cellxgene_census/config.vsh.yaml b/src/datasets/loaders/cellxgene_census/config.vsh.yaml deleted file mode 100644 index 667e1c6a6b..0000000000 --- a/src/datasets/loaders/cellxgene_census/config.vsh.yaml +++ /dev/null @@ -1,167 +0,0 @@ -functionality: - name: cellxgene_census - namespace: datasets/loaders - description: | - Query cells from a CellxGene Census or custom TileDBSoma object. - Aside from fetching the cells' RNA counts (`.X`), cell metadata - (`.obs`) and gene metadata (`.var`), this component also fetches - the dataset metadata and joins it into the cell metadata. - argument_groups: - - name: Input database - description: "Open CellxGene Census by version or URI." - arguments: - - name: "--input_uri" - type: string - description: "If specified, a URI containing the Census SOMA objects. If specified, will take precedence over the `--census_version` argument." - required: false - example: "s3://bucket/path" - - name: "--census_version" - description: "Which release of CellxGene census to use. Possible values are \"latest\", \"stable\", or the date of one of the releases (e.g. \"2023-07-25\"). 
For more information, check the documentation on [Census data releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html)." - type: string - example: "stable" - required: false - - name: Cell query - description: Arguments related to the query. - arguments: - - name: "--species" - type: string - description: The organism to query, usually one of `Homo sapiens` or `Mus musculus`. - required: true - example: "homo_sapiens" - - name: "--obs_value_filter" - type: string - description: "Filter for selecting the `obs` metadata (i.e. cells). Value is a filter query written in the SOMA `value_filter` syntax." - required: true - example: "is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell'" - - name: Filter cells by grouping - description: - arguments: - - name: "--cell_filter_grouping" - type: string - description: | - A subset of 'obs' columns by which to group the cells for filtering. - Only groups surpassing or equal to the `--cell_filter_minimum_count` - threshold will be retained. Take care not to introduce a selection - bias against cells with more fine-grained ontology annotations. - required: false - example: ["dataset_id", "tissue", "assay", "disease", "cell_type"] - multiple: true - - name: "--cell_filter_minimum_count" - type: integer - description: | - A minimum number of cells per group to retain. If `--cell_filter_grouping` - is defined, this parameter should also be provided and vice versa. - required: false - example: 100 - - name: Count filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--cell_filter_min_genes" - type: integer - description: Remove cells with less than this number of genes. - required: false - default: 50 - - name: "--cell_filter_min_counts" - type: integer - description: Remove cells with less than this number of counts. 
- required: false - default: 0 - - name: "--gene_filter_min_cells" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - default: 5 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - default: 0 - - name: Cell metadata - description: Cell metadata arguments - arguments: - - name: "--obs_batch" - type: string - description: | - Location of where to find the observation batch IDs. - - * If not specified, the `.obs["batch"]` field will not be included. - * If one or more values are specified, the `.obs["batch"]` field will be - set to the concatenated values of the specified fields, separated by - the `obs_batch_separator`. - required: false - multiple: true - multiple_sep: "," - example: ["batch"] - - name: "--obs_batch_separator" - type: string - description: Separator to use when concatenating the values of the `--obs_batch` fields. - required: false - default: "+" - - name: Dataset metadata - description: Information about the dataset that will be stored in the `.uns` slot. - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: true - - name: Outputs - description: Output arguments. 
- arguments: - - name: "--output" - type: file - description: Output h5ad file. - direction: output - required: true - example: output.h5ad - - name: "--output_compression" - type: string - choices: ["gzip", "lzf"] - required: false - example: "gzip" - resources: - - type: python_script - path: script.py - - path: /src/common/helper_functions/setup_logger.py - test_resources: - - type: python_script - path: test.py -platforms: - - type: docker - #image: openproblems/base_python:1.0.0 - image: python:3.11 - setup: - - type: python - packages: - - cellxgene-census - - scanpy - test_setup: - - type: python - packages: - - viashpy - - type: nextflow - directives: - label: [highmem, midcpu] \ No newline at end of file diff --git a/src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml b/src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml deleted file mode 100644 index 7ee4166d9d..0000000000 --- a/src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml +++ /dev/null @@ -1,130 +0,0 @@ -functionality: - name: cellxgene_census_from_source_h5ad - namespace: datasets/loaders - description: | - Query cells from a CellxGene Census or custom TileDBSoma object. - Aside from fetching the cells' RNA counts (`.X`), cell metadata - (`.obs`) and gene metadata (`.var`), this component also fetches - the dataset metadata and joins it into the cell metadata. - argument_groups: - - name: Input - description: Input arguments - arguments: - - name: "--input_id" - type: string - description: | - The dataset ID of the CellxGene Census dataset to query. - required: true - example: "a93eab58-3d82-4b61-8a2f-d7666dcdb7c4" - - name: Count filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--cell_filter_min_genes" - type: integer - description: Remove cells with less than this number of genes. 
- required: false - default: 50 - - name: "--cell_filter_min_counts" - type: integer - description: Remove cells with less than this number of counts. - required: false - default: 0 - - name: "--gene_filter_min_cells" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - default: 5 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - default: 0 - - name: Cell metadata - description: Cell metadata arguments - arguments: - - name: "--obs_batch" - type: string - description: | - Location of where to find the observation batch IDs. - - * If not specified, the `.obs["batch"]` field will not be included. - * If one or more values are specified, the `.obs["batch"]` field will be - set to the concatenated values of the specified fields, separated by - the `obs_batch_separator`. - required: false - multiple: true - multiple_sep: "," - example: ["batch"] - - name: "--obs_batch_separator" - type: string - description: Separator to use when concatenating the values of the `--obs_batch` fields. - required: false - default: "+" - - name: Dataset metadata - description: Information about the dataset that will be stored in the `.uns` slot. - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. 
- required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: true - - name: Outputs - description: Output arguments. - arguments: - - name: "--output" - type: file - description: Output h5ad file. - direction: output - required: true - example: output.h5ad - - name: "--output_compression" - type: string - choices: ["gzip", "lzf"] - required: false - example: "gzip" - resources: - - type: python_script - path: script.py - - path: /src/common/helper_functions/setup_logger.py - test_resources: - - type: python_script - path: test.py -platforms: - - type: docker - #image: openproblems/base_python:1.0.0 - image: python:3.11 - setup: - - type: python - packages: - - cellxgene-census - - scanpy - test_setup: - - type: python - packages: - - viashpy - - type: nextflow - directives: - label: [highmem, midcpu] \ No newline at end of file diff --git a/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/config.vsh.yaml b/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/config.vsh.yaml new file mode 100644 index 0000000000..3a73fe0538 --- /dev/null +++ b/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/config.vsh.yaml @@ -0,0 +1,75 @@ +name: openproblems_neurips2021_bmmc +namespace: datasets/loaders/multimodal +description: Fetch a dataset from the OpenProblems NeurIPS2021 competition +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: Processed h5ad file published at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE194122. + required: true + example: GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad + - name: --mod1 + type: string + description: Name of the first modality. + required: true + example: GEX + - name: --mod2 + type: string + description: Name of the second modality. 
+ required: true + example: ADT + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: A unique identifier for the dataset + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Outputs + arguments: + - name: --output_mod1 + __merge__: ../../../api/file_raw.yaml + direction: output + - name: --output_mod2 + __merge__: ../../../api/file_raw.yaml + direction: output +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + # - type: file + # path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] diff --git a/src/datasets/loaders/openproblems_neurips2021_bmmc/script.py b/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/script.py similarity index 98% rename from src/datasets/loaders/openproblems_neurips2021_bmmc/script.py rename to src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/script.py index de62f039f6..eb62dd67e9 100644 --- a/src/datasets/loaders/openproblems_neurips2021_bmmc/script.py +++ b/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/script.py @@ -19,7 +19,7 @@ "output_mod2": "output/mod2.h5ad" } meta = { 
- "functionality_name": "openproblems_neurips2021_bmmc", + "name": "openproblems_neurips2021_bmmc", "resources_dir": "/tmp/viash_inject_openproblems_neurips2021_bmmc14365472827677740971", } ## VIASH END diff --git a/src/datasets/loaders/openproblems_neurips2021_bmmc/test.py b/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/test.py similarity index 100% rename from src/datasets/loaders/openproblems_neurips2021_bmmc/test.py rename to src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/test.py diff --git a/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/config.vsh.yaml b/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/config.vsh.yaml new file mode 100644 index 0000000000..5994e4ccc9 --- /dev/null +++ b/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/config.vsh.yaml @@ -0,0 +1,81 @@ +name: openproblems_neurips2022_pbmc +namespace: datasets/loaders/multimodal +description: Fetch a dataset from the OpenProblems NeurIPS2022 competition +argument_groups: + - name: Inputs + arguments: + - name: --input_mod1 + type: file + description: Processed RNA h5ad file + required: true + example: cite_rna_merged.h5ad + - name: --input_mod2 + type: file + description: Processed ADT or ATAC h5ad file + required: true + example: cite_prot_merged.h5ad + - name: --mod1 + type: string + description: Name of the first modality. + required: true + example: GEX + - name: --mod2 + type: string + description: Name of the second modality. + required: true + example: ADT + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: A unique identifier for the dataset + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. 
+ required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Outputs + arguments: + - name: --output_mod1 + __merge__: ../../../api/file_raw.yaml + direction: output + - name: --output_mod2 + __merge__: ../../../api/file_raw.yaml + direction: output +resources: + - type: python_script + path: script.py +# skip unit test until data is public +# test_resources: +# - type: python_script +# path: test.py +# - type: file +# path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] diff --git a/src/datasets/loaders/openproblems_neurips2022_pbmc/script.py b/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/script.py similarity index 98% rename from src/datasets/loaders/openproblems_neurips2022_pbmc/script.py rename to src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/script.py index d0dd855b55..8940afed26 100644 --- a/src/datasets/loaders/openproblems_neurips2022_pbmc/script.py +++ b/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/script.py @@ -18,7 +18,7 @@ "output_mod2": "output/mod2.h5ad" } meta = { - "functionality_name": "openproblems_neurips2022_pbmc", + "name": "openproblems_neurips2022_pbmc", } ## VIASH END diff --git a/src/datasets/loaders/openproblems_neurips2022_pbmc/test.py b/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/test.py similarity index 100% rename from 
src/datasets/loaders/openproblems_neurips2022_pbmc/test.py rename to src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/test.py diff --git a/src/datasets/loaders/multimodal/openproblems_v1_multimodal/config.vsh.yaml b/src/datasets/loaders/multimodal/openproblems_v1_multimodal/config.vsh.yaml new file mode 100644 index 0000000000..b0afb9311b --- /dev/null +++ b/src/datasets/loaders/multimodal/openproblems_v1_multimodal/config.vsh.yaml @@ -0,0 +1,95 @@ +name: openproblems_v1_multimodal +namespace: datasets/loaders/multimodal +description: Fetch a dataset from OpenProblems v1 +argument_groups: + - name: Inputs + arguments: + - name: --input_id + type: string + description: The ID of the dataset in OpenProblems v1 + required: true + - name: --obs_cell_type + type: string + description: Location of where to find the observation cell types. + - name: --obs_batch + type: string + description: Location of where to find the observation batch IDs. + - name: --obs_tissue + type: string + description: Location of where to find the observation tissue information. + - name: --layer_counts + type: string + description: In which layer to find the counts matrix. Leave undefined to use `.X`. + example: counts + - name: --sparse + type: boolean + default: true + description: Convert layers to a sparse CSR format. + - name: --var_feature_id + type: string + description: Location of where to find the feature IDs. Can be set to index if the feature IDs are the index. + example: gene_ids + - name: --var_feature_name + type: string + description: Location of where to find the feature names. Can be set to index if the feature names are the index. + default: index + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. 
+ required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Outputs + arguments: + - name: --output_mod1 + __merge__: ../../../api/file_raw.yaml + direction: output + - name: --output_mod2 + __merge__: ../../../api/file_raw.yaml + direction: output +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: apt + packages: git + - type: docker + run: | + git clone -b 'v0.8.0' --depth 1 https://github.com/openproblems-bio/openproblems.git /opt/openproblems && \ + pip install --no-cache-dir -r /opt/openproblems/docker/openproblems/requirements.txt && \ + pip install --no-cache-dir --editable /opt/openproblems +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] diff --git a/src/datasets/loaders/openproblems_v1_multimodal/script.py b/src/datasets/loaders/multimodal/openproblems_v1_multimodal/script.py similarity index 100% rename from src/datasets/loaders/openproblems_v1_multimodal/script.py rename to src/datasets/loaders/multimodal/openproblems_v1_multimodal/script.py diff --git a/src/datasets/loaders/openproblems_v1_multimodal/test.py b/src/datasets/loaders/multimodal/openproblems_v1_multimodal/test.py similarity index 100% rename from src/datasets/loaders/openproblems_v1_multimodal/test.py rename to 
src/datasets/loaders/multimodal/openproblems_v1_multimodal/test.py diff --git a/src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml b/src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml deleted file mode 100644 index 96dad30e76..0000000000 --- a/src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml +++ /dev/null @@ -1,74 +0,0 @@ -functionality: - name: "openproblems_neurips2021_bmmc" - namespace: "datasets/loaders" - description: "Fetch a dataset from the OpenProblems NeurIPS2021 competition" - argument_groups: - - name: Inputs - arguments: - - name: "--input" - type: file - description: Processed h5ad file published at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE194122. - required: true - example: GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad - - name: "--mod1" - type: string - description: Name of the first modality. - required: true - example: GEX - - name: "--mod2" - type: string - description: Name of the second modality. - required: true - example: ADT - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: "A unique identifier for the dataset" - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: false - - name: Outputs - arguments: - - name: "--output_mod1" - __merge__: ../../api/file_raw.yaml - direction: "output" - - name: "--output_mod2" - __merge__: ../../api/file_raw.yaml - direction: "output" - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py - # - type: file - # path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [highmem, midcpu, midtime] \ No newline at end of file diff --git a/src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml b/src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml deleted file mode 100644 index b2141482f1..0000000000 --- a/src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml +++ /dev/null @@ -1,80 +0,0 @@ -functionality: - name: "openproblems_neurips2022_pbmc" - namespace: "datasets/loaders" - description: "Fetch a dataset from the OpenProblems NeurIPS2022 competition" - argument_groups: - - name: Inputs - arguments: - - name: "--input_mod1" - type: file - description: "Processed RNA h5ad file" - required: true - example: cite_rna_merged.h5ad - - name: "--input_mod2" - type: file - description: "Processed ADT or ATAC h5ad file" - required: true - example: cite_prot_merged.h5ad - - name: "--mod1" - type: string - description: Name of the first modality. - required: true - example: GEX - - name: "--mod2" - type: string - description: Name of the second modality. - required: true - example: ADT - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: "A unique identifier for the dataset" - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. 
- required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Outputs - arguments: - - name: "--output_mod1" - __merge__: ../../api/file_raw.yaml - direction: "output" - - name: "--output_mod2" - __merge__: ../../api/file_raw.yaml - direction: "output" - resources: - - type: python_script - path: script.py - # skip unit test until data is public - # test_resources: - # - type: python_script - # path: test.py - # - type: file - # path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [ highmem, midcpu, midtime] \ No newline at end of file diff --git a/src/datasets/loaders/openproblems_v1/config.vsh.yaml b/src/datasets/loaders/openproblems_v1/config.vsh.yaml deleted file mode 100644 index d3a3ad846f..0000000000 --- a/src/datasets/loaders/openproblems_v1/config.vsh.yaml +++ /dev/null @@ -1,86 +0,0 @@ -__merge__: ../../api/comp_dataset_loader.yaml -functionality: - name: "openproblems_v1" - description: "Fetch a dataset from OpenProblems v1" - argument_groups: - - name: Inputs - arguments: - - name: "--input_id" - type: "string" - description: "The ID of the dataset in OpenProblems v1" - required: true - - name: "--obs_cell_type" - type: "string" - description: "Location of where to find the observation cell types." - - name: "--obs_batch" - type: "string" - description: "Location of where to find the observation batch IDs." 
- - name: "--obs_tissue" - type: "string" - description: "Location of where to find the observation tissue information." - - name: "--layer_counts" - type: "string" - description: "In which layer to find the counts matrix. Leave undefined to use `.X`." - example: counts - - name: "--sparse" - type: boolean - default: true - description: Convert layers to a sparse CSR format. - - name: "--var_feature_id" - type: "string" - description: "Location of where to find the feature IDs. Can be set to index if the feature IDs are the index." - example: gene_ids - - name: "--var_feature_name" - type: "string" - description: "Location of where to find the feature names. Can be set to index if the feature names are the index." - default: index - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: false - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: apt - packages: git - - type: docker - run: | - git clone -b 'v0.8.0' --depth 1 https://github.com/openproblems-bio/openproblems.git /opt/openproblems && \ - pip install --no-cache-dir -r /opt/openproblems/docker/openproblems/requirements.txt && \ - pip install --no-cache-dir --editable /opt/openproblems - - type: nextflow - directives: - label: [highmem, midcpu , midtime] diff --git a/src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml b/src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml deleted file mode 100644 index 6247ae3bf9..0000000000 --- a/src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml +++ /dev/null @@ -1,94 +0,0 @@ -functionality: - name: "openproblems_v1_multimodal" - namespace: "datasets/loaders" - description: "Fetch a dataset from OpenProblems v1" - argument_groups: - - name: Inputs - arguments: - - name: "--input_id" - type: "string" - description: "The ID of the dataset in OpenProblems v1" - required: true - - name: "--obs_cell_type" - type: "string" - description: "Location of where to find the observation cell types." - - name: "--obs_batch" - type: "string" - description: "Location of where to find the observation batch IDs." - - name: "--obs_tissue" - type: "string" - description: "Location of where to find the observation tissue information." - - name: "--layer_counts" - type: "string" - description: "In which layer to find the counts matrix. Leave undefined to use `.X`." - example: counts - - name: "--sparse" - type: boolean - default: true - description: Convert layers to a sparse CSR format. - - name: "--var_feature_id" - type: "string" - description: "Location of where to find the feature IDs. Can be set to index if the feature IDs are the index." 
- example: gene_ids - - name: "--var_feature_name" - type: "string" - description: "Location of where to find the feature names. Can be set to index if the feature names are the index." - default: index - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: false - - name: Outputs - arguments: - - name: "--output_mod1" - __merge__: ../../api/file_raw.yaml - direction: "output" - - name: "--output_mod2" - __merge__: ../../api/file_raw.yaml - direction: "output" - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: apt - packages: git - - type: docker - run: | - git clone -b 'v0.8.0' --depth 1 https://github.com/openproblems-bio/openproblems.git /opt/openproblems && \ - pip install --no-cache-dir -r /opt/openproblems/docker/openproblems/requirements.txt && \ - pip install --no-cache-dir --editable /opt/openproblems - - type: nextflow - directives: - label: [highmem, midcpu , midtime] diff --git a/src/datasets/loaders/scrnaseq/cellxgene_census/config.vsh.yaml b/src/datasets/loaders/scrnaseq/cellxgene_census/config.vsh.yaml new file mode 100644 index 0000000000..6ab9fe785d --- /dev/null +++ b/src/datasets/loaders/scrnaseq/cellxgene_census/config.vsh.yaml @@ -0,0 +1,176 @@ +name: cellxgene_census +namespace: datasets/loaders/scrnaseq +description: | + Query cells from a CellxGene Census or custom TileDBSoma object. + Aside from fetching the cells' RNA counts (`.X`), cell metadata + (`.obs`) and gene metadata (`.var`), this component also fetches + the dataset metadata and joins it into the cell metadata. +argument_groups: + - name: Input database + description: Open CellxGene Census by version or URI. + arguments: + - name: --input_uri + type: string + description: If specified, a URI containing the Census SOMA objects. If specified, + will take precedence over the `--census_version` argument. + required: false + example: s3://bucket/path + - name: --census_version + description: Which release of CellxGene census to use. Possible values are + "latest", "stable", or the date of one of the releases (e.g. "2023-07-25"). 
+ For more information, check the documentation on [Census data + releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html). + type: string + example: stable + required: false + - name: Cell query + description: Arguments related to the query. + arguments: + - name: --species + type: string + description: The organism to query, usually one of `Homo sapiens` or `Mus + musculus`. + required: true + example: homo_sapiens + - name: --obs_value_filter + type: string + description: Filter for selecting the `obs` metadata (i.e. cells). Value is + a filter query written in the SOMA `value_filter` syntax. + required: true + example: is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', + 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell' + - name: Filter cells by grouping + description: + arguments: + - name: --cell_filter_grouping + type: string + description: | + A subset of 'obs' columns by which to group the cells for filtering. + Only groups surpassing or equal to the `--cell_filter_minimum_count` + threshold will be retained. Take care not to introduce a selection + bias against cells with more fine-grained ontology annotations. + required: false + example: [dataset_id, tissue, assay, disease, cell_type] + multiple: true + - name: --cell_filter_minimum_count + type: integer + description: | + A minimum number of cells per group to retain. If `--cell_filter_grouping` + is defined, this parameter should also be provided and vice versa. + required: false + example: 100 + - name: Count filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --cell_filter_min_genes + type: integer + description: Remove cells with less than this number of genes. + required: false + default: 50 + - name: --cell_filter_min_counts + type: integer + description: Remove cells with less than this number of counts. 
+ required: false + default: 0 + - name: --gene_filter_min_cells + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + default: 5 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + default: 0 + - name: Cell metadata + description: Cell metadata arguments + arguments: + - name: --obs_batch + type: string + description: | + Location of where to find the observation batch IDs. + + * If not specified, the `.obs["batch"]` field will not be included. + * If one or more values are specified, the `.obs["batch"]` field will be + set to the concatenated values of the specified fields, separated by + the `obs_batch_separator`. + required: false + multiple: true + multiple_sep: ',' + example: [batch] + - name: --obs_batch_separator + type: string + description: Separator to use when concatenating the values of the `--obs_batch` + fields. + required: false + default: + + - name: Dataset metadata + description: Information about the dataset that will be stored in the `.uns` slot. + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: true + - name: Outputs + description: Output arguments. 
+ arguments: + - name: --output + type: file + description: Output h5ad file. + direction: output + required: true + example: output.h5ad + - name: --output_compression + type: string + choices: [gzip, lzf] + required: false + example: gzip +resources: + - type: python_script + path: script.py + - path: /src/common/helper_functions/setup_logger.py +test_resources: + - type: python_script + path: test.py +engines: + - type: docker + #image: openproblems/base_python:1.0.0 + image: python:3.11 + setup: + - type: python + packages: + - cellxgene-census + - scanpy + test_setup: + - type: python + packages: + - viashpy +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu] diff --git a/src/datasets/loaders/cellxgene_census/script.py b/src/datasets/loaders/scrnaseq/cellxgene_census/script.py similarity index 100% rename from src/datasets/loaders/cellxgene_census/script.py rename to src/datasets/loaders/scrnaseq/cellxgene_census/script.py diff --git a/src/datasets/loaders/cellxgene_census/test.py b/src/datasets/loaders/scrnaseq/cellxgene_census/test.py similarity index 100% rename from src/datasets/loaders/cellxgene_census/test.py rename to src/datasets/loaders/scrnaseq/cellxgene_census/test.py diff --git a/src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/config.vsh.yaml b/src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/config.vsh.yaml new file mode 100644 index 0000000000..99ae8929a5 --- /dev/null +++ b/src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/config.vsh.yaml @@ -0,0 +1,132 @@ +name: cellxgene_census_from_source_h5ad +namespace: datasets/loaders/scrnaseq +description: | + Query cells from a CellxGene Census or custom TileDBSoma object. + Aside from fetching the cells' RNA counts (`.X`), cell metadata + (`.obs`) and gene metadata (`.var`), this component also fetches + the dataset metadata and joins it into the cell metadata. 
+argument_groups: + - name: Input + description: Input arguments + arguments: + - name: --input_id + type: string + description: | + The dataset ID of the CellxGene Census dataset to query. + required: true + example: a93eab58-3d82-4b61-8a2f-d7666dcdb7c4 + - name: Count filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --cell_filter_min_genes + type: integer + description: Remove cells with less than this number of genes. + required: false + default: 50 + - name: --cell_filter_min_counts + type: integer + description: Remove cells with less than this number of counts. + required: false + default: 0 + - name: --gene_filter_min_cells + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + default: 5 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + default: 0 + - name: Cell metadata + description: Cell metadata arguments + arguments: + - name: --obs_batch + type: string + description: | + Location of where to find the observation batch IDs. + + * If not specified, the `.obs["batch"]` field will not be included. + * If one or more values are specified, the `.obs["batch"]` field will be + set to the concatenated values of the specified fields, separated by + the `obs_batch_separator`. + required: false + multiple: true + multiple_sep: ',' + example: [batch] + - name: --obs_batch_separator + type: string + description: Separator to use when concatenating the values of the `--obs_batch` + fields. + required: false + default: + + - name: Dataset metadata + description: Information about the dataset that will be stored in the `.uns` slot. + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. 
+ required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: true + - name: Outputs + description: Output arguments. + arguments: + - name: --output + type: file + description: Output h5ad file. + direction: output + required: true + example: output.h5ad + - name: --output_compression + type: string + choices: [gzip, lzf] + required: false + example: gzip +resources: + - type: python_script + path: script.py + - path: /src/common/helper_functions/setup_logger.py +test_resources: + - type: python_script + path: test.py +engines: + - type: docker + #image: openproblems/base_python:1.0.0 + image: python:3.11 + setup: + - type: python + packages: + - cellxgene-census + - scanpy + test_setup: + - type: python + packages: + - viashpy +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu] diff --git a/src/datasets/loaders/cellxgene_census_from_source_h5ad/script.py b/src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/script.py similarity index 100% rename from src/datasets/loaders/cellxgene_census_from_source_h5ad/script.py rename to src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/script.py diff --git a/src/datasets/loaders/cellxgene_census_from_source_h5ad/test.py b/src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/test.py similarity index 100% rename from src/datasets/loaders/cellxgene_census_from_source_h5ad/test.py rename to 
src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/test.py diff --git a/src/datasets/loaders/scrnaseq/openproblems_v1/config.vsh.yaml b/src/datasets/loaders/scrnaseq/openproblems_v1/config.vsh.yaml new file mode 100644 index 0000000000..6ebb63b410 --- /dev/null +++ b/src/datasets/loaders/scrnaseq/openproblems_v1/config.vsh.yaml @@ -0,0 +1,91 @@ +__merge__: ../../../api/comp_dataset_loader.yaml +name: openproblems_v1 +namespace: datasets/loaders/scrnaseq +description: Fetch a dataset from OpenProblems v1 +argument_groups: + - name: Inputs + arguments: + - name: --input_id + type: string + description: The ID of the dataset in OpenProblems v1 + required: true + - name: --obs_cell_type + type: string + description: Location of where to find the observation cell types. + - name: --obs_batch + type: string + description: Location of where to find the observation batch IDs. + - name: --obs_tissue + type: string + description: Location of where to find the observation tissue information. + - name: --layer_counts + type: string + description: In which layer to find the counts matrix. Leave undefined to + use `.X`. + example: counts + - name: --sparse + type: boolean + default: true + description: Convert layers to a sparse CSR format. + - name: --var_feature_id + type: string + description: Location of where to find the feature IDs. Can be set to index + if the feature IDs are the index. + example: gene_ids + - name: --var_feature_name + type: string + description: Location of where to find the feature names. Can be set to index + if the feature names are the index. + default: index + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. 
+ required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: apt + packages: git + - type: docker + run: | + git clone -b 'v0.8.0' --depth 1 https://github.com/openproblems-bio/openproblems.git /opt/openproblems && \ + pip install --no-cache-dir -r /opt/openproblems/docker/openproblems/requirements.txt && \ + pip install --no-cache-dir --editable /opt/openproblems +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] diff --git a/src/datasets/loaders/openproblems_v1/script.py b/src/datasets/loaders/scrnaseq/openproblems_v1/script.py similarity index 100% rename from src/datasets/loaders/openproblems_v1/script.py rename to src/datasets/loaders/scrnaseq/openproblems_v1/script.py diff --git a/src/datasets/loaders/openproblems_v1/test.py b/src/datasets/loaders/scrnaseq/openproblems_v1/test.py similarity index 100% rename from src/datasets/loaders/openproblems_v1/test.py rename to src/datasets/loaders/scrnaseq/openproblems_v1/test.py diff --git a/src/datasets/loaders/spatial/tenx_visium/config.vsh.yaml b/src/datasets/loaders/spatial/tenx_visium/config.vsh.yaml new file mode 100644 index 0000000000..b673826a91 --- /dev/null +++ b/src/datasets/loaders/spatial/tenx_visium/config.vsh.yaml @@ -0,0 +1,96 @@ +name: tenx_visium +namespace: datasets/loaders/spatial +description: | + Download a SpaceRanger h5 gene 
expression file and spatial imaging data from the 10x genomics website (or someplace else). +argument_groups: + - name: Inputs + arguments: + - name: --input_expression + type: string + description: URL to the feature / barcode matrix HDF5 of the 10x dataset. + required: true + - name: --input_spatial + type: string + description: URL to the Spatial imaging data of the 10x dataset. + required: true + - name: Outputs + arguments: + - name: --dataset + type: file + direction: output + description: Output h5ad file + required: true + example: dataset.h5ad + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --spot_filter_min_genes + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: --spot_filter_min_counts + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: --gene_filter_min_spots + type: integer + description: Remove genes expressed in less than this number of cells. 
+ required: false + example: 50 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: --remove_mitochondrial + type: boolean + description: Remove mitochondrial genes? + required: false + +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + +engines: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: + - squidpy +runners: + - type: executable + - type: nextflow diff --git a/src/datasets/loaders/tenx_visium/script.py b/src/datasets/loaders/spatial/tenx_visium/script.py similarity index 98% rename from src/datasets/loaders/tenx_visium/script.py rename to src/datasets/loaders/spatial/tenx_visium/script.py index 100bfde555..2cfa3c9054 100644 --- a/src/datasets/loaders/tenx_visium/script.py +++ b/src/datasets/loaders/spatial/tenx_visium/script.py @@ -18,7 +18,7 @@ "remove_mitochondrial": True } meta = { - "functionality_name": "tenx_visium" + "name": "tenx_visium" } ## VIASH END diff --git a/src/datasets/loaders/tenx_visium/test.py b/src/datasets/loaders/spatial/tenx_visium/test.py similarity index 100% rename from src/datasets/loaders/tenx_visium/test.py rename to src/datasets/loaders/spatial/tenx_visium/test.py diff --git a/src/datasets/loaders/spatial/zenodo/config.vsh.yaml b/src/datasets/loaders/spatial/zenodo/config.vsh.yaml new file mode 100644 index 0000000000..b4e06238a8 --- /dev/null +++ b/src/datasets/loaders/spatial/zenodo/config.vsh.yaml @@ -0,0 +1,88 @@ +name: zenodo +namespace: datasets/loaders/spatial +description: | + Download an Anndata file containing DBiT seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo. +argument_groups: + - name: Inputs + arguments: + - name: --input_data + type: string + description: URL to the Anndata file. 
+ required: true + - name: Outputs + arguments: + - name: --dataset + type: file + direction: output + description: Output h5ad file + required: true + example: dataset.h5ad + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --spot_filter_min_genes + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: --spot_filter_min_counts + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: --gene_filter_min_spots + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + example: 50 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: --remove_mitochondrial + type: boolean + description: Remove mitochondrial genes? 
+ required: false + +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow diff --git a/src/datasets/loaders/zenodo_spatial/script.py b/src/datasets/loaders/spatial/zenodo/script.py similarity index 96% rename from src/datasets/loaders/zenodo_spatial/script.py rename to src/datasets/loaders/spatial/zenodo/script.py index 83aeb86056..7392274a42 100644 --- a/src/datasets/loaders/zenodo_spatial/script.py +++ b/src/datasets/loaders/spatial/zenodo/script.py @@ -5,7 +5,7 @@ # VIASH START par = { "input_data": "ps://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1", - "dataset_id": "zenodo_spatial/mouse_olfactory_bulb_puck_slideseqv2", + "dataset_id": "zenodo/mouse_olfactory_bulb_puck_slideseqv2", "dataset_name": "Mouse Olfactory Bulk Puck", "dataset_url": "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary", "dataset_summary": "Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2", @@ -16,7 +16,7 @@ "remove_mitochondrial": True } meta = { - "functionality_name": "zenodo_spatial" + "name": "zenodo" } # VIASH END diff --git a/src/datasets/loaders/zenodo_spatial/test.py b/src/datasets/loaders/spatial/zenodo/test.py similarity index 97% rename from src/datasets/loaders/zenodo_spatial/test.py rename to src/datasets/loaders/spatial/zenodo/test.py index 07dcd953a8..17a87366ec 100644 --- a/src/datasets/loaders/zenodo_spatial/test.py +++ b/src/datasets/loaders/spatial/zenodo/test.py @@ -3,7 +3,7 @@ import anndata as ad input_data 
="https://zenodo.org/records/12784832/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1" -dataset_id = "zenodo_spatial/mouse_olfactory_bulb_puck" +dataset_id = "zenodo/mouse_olfactory_bulb_puck" dataset_name = "mouse_olfactory_bulb_puck" dataset_url = "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" dataset_summary = "Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2" diff --git a/src/datasets/loaders/spatial/zenodo_slidetags/config.vsh.yaml b/src/datasets/loaders/spatial/zenodo_slidetags/config.vsh.yaml new file mode 100644 index 0000000000..0355c8bb64 --- /dev/null +++ b/src/datasets/loaders/spatial/zenodo_slidetags/config.vsh.yaml @@ -0,0 +1,88 @@ +name: zenodo_slidetags +namespace: datasets/loaders/spatial +description: | + Download a compressed file containing gene expression matrix and spatial locations from zenodo. +argument_groups: + - name: Inputs + arguments: + - name: --input_data + type: string + description: URL to the file. + required: true + - name: Outputs + arguments: + - name: --dataset + type: file + direction: output + description: Output h5ad file + required: true + example: dataset.h5ad + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. 
+ required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --spot_filter_min_genes + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: --spot_filter_min_counts + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: --gene_filter_min_spots + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + example: 50 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: --remove_mitochondrial + type: boolean + description: Remove mitochondrial genes? 
+ required: false + +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow diff --git a/src/datasets/loaders/zenodo_spatial_slidetags/script.py b/src/datasets/loaders/spatial/zenodo_slidetags/script.py similarity index 96% rename from src/datasets/loaders/zenodo_spatial_slidetags/script.py rename to src/datasets/loaders/spatial/zenodo_slidetags/script.py index 5a8cf212fa..777f7e9e45 100644 --- a/src/datasets/loaders/zenodo_spatial_slidetags/script.py +++ b/src/datasets/loaders/spatial/zenodo_slidetags/script.py @@ -6,7 +6,7 @@ # VIASH START par = { "input_data": "https://zenodo.org/records/12785822/files/slidetag_human_cortex.tar.gz?download=1", - "dataset_id": "zenodo_spatial_slidetags/human_cortex_slidetags", + "dataset_id": "zenodo_slidetags/human_cortex_slidetags", "dataset_name": "slidetag_human_cortex", "dataset_url": "https://www.nature.com/articles/s41586-023-06837-4", "dataset_summary": "Slide-tags enables single-nucleus barcoding for multimodal spatial genomics", @@ -17,7 +17,7 @@ "remove_mitochondrial": True } meta = { - "functionality_name": "zenodo_spatial_slidetags" + "name": "zenodo_slidetags" } # VIASH END diff --git a/src/datasets/loaders/zenodo_spatial_slidetags/test.py b/src/datasets/loaders/spatial/zenodo_slidetags/test.py similarity index 97% rename from src/datasets/loaders/zenodo_spatial_slidetags/test.py rename to src/datasets/loaders/spatial/zenodo_slidetags/test.py index 9f859ebea6..c97203735b 100644 --- a/src/datasets/loaders/zenodo_spatial_slidetags/test.py +++ b/src/datasets/loaders/spatial/zenodo_slidetags/test.py @@ -3,7 +3,7 @@ import anndata as ad input_data ="https://zenodo.org/records/12785822/files/slidetag_human_cortex.tar.gz?download=1" -dataset_id = "zenodo_spatial_slidetags/human_cortex" +dataset_id = "zenodo_slidetags/human_cortex" dataset_name = 
"slidetag_human_cortex" dataset_url = "https://www.nature.com/articles/s41586-023-06837-4" dataset_summary = "Slide-tags enables single-nucleus barcoding for multimodal spatial genomics" diff --git a/src/datasets/loaders/tenx_visium/config.vsh.yaml b/src/datasets/loaders/tenx_visium/config.vsh.yaml deleted file mode 100644 index ba28b32b89..0000000000 --- a/src/datasets/loaders/tenx_visium/config.vsh.yaml +++ /dev/null @@ -1,96 +0,0 @@ -functionality: - name: tenx_visium - namespace: datasets/loaders - description: | - Download a SpaceRanger h5 gene expression file and spatial imaging data from the 10x genomics website (or someplace else). - - argument_groups: - - name: Inputs - arguments: - - name: "--input_expression" - type: string - description: URL to the feature / barcode matrix HDF5 of the 10x dataset. - required: true - - name: "--input_spatial" - type: string - description: URL to the Spatial imaging data of the 10x dataset. - required: true - - name: Outputs - arguments: - - name: "--dataset" - type: file - direction: output - description: Output h5ad file - required: true - example: dataset.h5ad - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitochondrial genes? - required: false - - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py - -platforms: - - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 - setup: - - type: python - packages: - - squidpy - - type: nextflow diff --git a/src/datasets/loaders/zenodo_spatial/config.vsh.yaml b/src/datasets/loaders/zenodo_spatial/config.vsh.yaml deleted file mode 100644 index 776b177481..0000000000 --- a/src/datasets/loaders/zenodo_spatial/config.vsh.yaml +++ /dev/null @@ -1,87 +0,0 @@ -functionality: - name: zenodo_spatial - namespace: datasets/loaders - description: | - Download an Anndata file containing DBiT seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo. - argument_groups: - - name: Inputs - arguments: - - name: "--input_data" - type: string - description: URL to the Anndata file. - required: true - - name: Outputs - arguments: - - name: "--dataset" - type: file - direction: output - description: Output h5ad file - required: true - example: dataset.h5ad - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. 
- required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitochondrial genes? 
- required: false - - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py - -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow diff --git a/src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml b/src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml deleted file mode 100644 index 905be3514c..0000000000 --- a/src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml +++ /dev/null @@ -1,88 +0,0 @@ -functionality: - name: zenodo_spatial_slidetags - namespace: datasets/loaders - description: | - Download a compressed file containing gene expression matrix and spatial locations from zenodo. - - argument_groups: - - name: Inputs - arguments: - - name: "--input_data" - type: string - description: URL to the file. - required: true - - name: Outputs - arguments: - - name: "--dataset" - type: file - direction: output - description: Output h5ad file - required: true - example: dataset.h5ad - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. 
- arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitochondrial genes? - required: false - - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py - -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow diff --git a/src/datasets/normalization/atac_tfidf/config.vsh.yaml b/src/datasets/normalization/atac_tfidf/config.vsh.yaml index 31319f0958..850b49363b 100644 --- a/src/datasets/normalization/atac_tfidf/config.vsh.yaml +++ b/src/datasets/normalization/atac_tfidf/config.vsh.yaml @@ -1,16 +1,15 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "atac_tfidf" - description: | - Transform peak counts with TF-IDF (Term Frequency - Inverse Document Frequency). +name: atac_tfidf +description: | + Transform peak counts with TF-IDF (Term Frequency - Inverse Document Frequency). - TF: peak counts are normalised by total number of counts per cell DF: total number of counts for each peak IDF: number of cells divided by DF + TF: peak counts are normalised by total number of counts per cell DF: total number of counts for each peak IDF: number of cells divided by DF - By default, log(TF) * log(IDF) is returned. - resources: - - type: python_script - path: script.py -platforms: + By default, log(TF) * log(IDF) is returned. 
+resources: + - type: python_script + path: script.py +engines: - type: docker image: openproblems/base_python:1.0.0 setup: @@ -18,6 +17,8 @@ platforms: packages: - muon - numpy<2 +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/atac_tfidf/script.py b/src/datasets/normalization/atac_tfidf/script.py index ecb772bd64..1d38a8fcc8 100644 --- a/src/datasets/normalization/atac_tfidf/script.py +++ b/src/datasets/normalization/atac_tfidf/script.py @@ -7,7 +7,7 @@ 'output': "output_norm.h5ad" } meta = { - 'functionality_name': "tfidf" + 'name': "tfidf" } ## VIASH END @@ -20,7 +20,7 @@ print("Store output in adata", flush=True) adata.layers[par["layer_output"]] = normalized_counts -adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] +adata.uns["normalization_id"] = par["normalization_id"] or meta['name'] print("Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/normalization/l1_sqrt/config.vsh.yaml b/src/datasets/normalization/l1_sqrt/config.vsh.yaml index 212eadc968..3a3aca8ed0 100644 --- a/src/datasets/normalization/l1_sqrt/config.vsh.yaml +++ b/src/datasets/normalization/l1_sqrt/config.vsh.yaml @@ -1,20 +1,19 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "l1_sqrt" - description: | - Scaled L1 sqrt normalization. +name: l1_sqrt +description: | + Scaled L1 sqrt normalization. - This normalization method causes all cells to have the same sum of values. + This normalization method causes all cells to have the same sum of values. - Steps: + Steps: - * Compute the square root of the counts. - * Apply L1 normalization (rescaled such that the sum of the values of each cell sum to 1). - * Multiply by the median UMI count per cell, causing all cells to have the sum of values. 
- resources: - - type: python_script - path: script.py -platforms: + * Compute the square root of the counts. + * Apply L1 normalization (rescaled such that the sum of the values of each cell sum to 1). + * Multiply by the median UMI count per cell, causing all cells to have the sum of values. +resources: + - type: python_script + path: script.py +engines: - type: docker image: openproblems/base_python:1.0.0 setup: @@ -22,6 +21,8 @@ platforms: packages: - scprep - numpy<2 +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/l1_sqrt/script.py b/src/datasets/normalization/l1_sqrt/script.py index 76c69cf897..9dd5c96505 100644 --- a/src/datasets/normalization/l1_sqrt/script.py +++ b/src/datasets/normalization/l1_sqrt/script.py @@ -8,7 +8,7 @@ 'output': "output_norm.h5ad" } meta = { - 'functionality_name': "l1_sqrt" + 'name': "l1_sqrt" } ## VIASH END @@ -23,7 +23,7 @@ print("Store output in adata", flush=True) adata.layers[par["layer_output"]] = l1_sqrt -adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] +adata.uns["normalization_id"] = par["normalization_id"] or meta['name'] print("Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/normalization/log_cp/config.vsh.yaml b/src/datasets/normalization/log_cp/config.vsh.yaml index 89b2a283f9..d686c54147 100644 --- a/src/datasets/normalization/log_cp/config.vsh.yaml +++ b/src/datasets/normalization/log_cp/config.vsh.yaml @@ -1,18 +1,20 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "log_cp" - description: "Normalize data using Log CP" - resources: - - type: python_script - path: script.py - arguments: - - name: "--n_cp" - type: integer - default: 1e4 - description: "Number of counts per cell. When set to -1, will use None." 
-platforms: +name: log_cp +links: {} +description: Normalize data using Log CP +resources: + - type: python_script + path: script.py +arguments: + - name: --n_cp + type: integer + default: 1e4 + description: Number of counts per cell. When set to -1, will use None. +engines: - type: docker image: openproblems/base_python:1.0.0 +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/log_cp/script.py b/src/datasets/normalization/log_cp/script.py index 39ddf61636..d537ee4a7c 100644 --- a/src/datasets/normalization/log_cp/script.py +++ b/src/datasets/normalization/log_cp/script.py @@ -9,7 +9,7 @@ 'n_cp': 1e6, } meta = { - "functionality_name": "normalize_log_cp10k" + "name": "normalize_log_cp10k" } ## VIASH END @@ -36,7 +36,7 @@ print(">> Store output in adata", flush=True) adata.layers[par["layer_output"]] = lognorm adata.obs[par["obs_size_factors"]] = norm["norm_factor"] -adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] +adata.uns["normalization_id"] = par["normalization_id"] or meta['name'] print(">> Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/normalization/log_scran_pooling/config.vsh.yaml b/src/datasets/normalization/log_scran_pooling/config.vsh.yaml index 4cbf81ff5a..e010c1032b 100644 --- a/src/datasets/normalization/log_scran_pooling/config.vsh.yaml +++ b/src/datasets/normalization/log_scran_pooling/config.vsh.yaml @@ -1,18 +1,20 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "log_scran_pooling" - description: "Normalize data using scran pooling" - resources: - - type: r_script - path: script.R -platforms: +name: log_scran_pooling +links: {} +description: Normalize data using scran pooling +resources: + - type: r_script + path: script.R +engines: - type: docker image: openproblems/base_r:1.0.0 setup: - type: r - cran: [ Matrix, rlang, scran, 
BiocParallel ] + cran: [Matrix, rlang, scran, BiocParallel] - type: python pip: scanpy +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/log_scran_pooling/script.R b/src/datasets/normalization/log_scran_pooling/script.R index be51e21f38..33bb13e8eb 100644 --- a/src/datasets/normalization/log_scran_pooling/script.R +++ b/src/datasets/normalization/log_scran_pooling/script.R @@ -30,7 +30,7 @@ adata$obs[[par$obs_size_factors]] <- size_factors adata$layers[[par$layer_output]] <- lognorm norm_id <- par[["normalization_id"]] if (is.null(norm_id)) { - norm_id <- meta[["functionality_name"]] + norm_id <- meta[["name"]] } adata$uns[["normalization_id"]] <- norm_id diff --git a/src/datasets/normalization/prot_clr/config.vsh.yaml b/src/datasets/normalization/prot_clr/config.vsh.yaml index 8f6bbe269f..3262ca73b7 100644 --- a/src/datasets/normalization/prot_clr/config.vsh.yaml +++ b/src/datasets/normalization/prot_clr/config.vsh.yaml @@ -1,26 +1,27 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "prot_clr" - description: | - Perform center log ratio (CLR) normalization on input CITE-seq data (Stoeckius et al. 2017). +name: prot_clr +description: | + Perform center log ratio (CLR) normalization on input CITE-seq data (Stoeckius et al. 2017). - The CLR transformation is defined as: + The CLR transformation is defined as: - $$ - x_{\text{clr}} = \log\left(\frac{x}{g(x)}\right) - $$ + $$ + x_{\text{clr}} = \log\left(\frac{x}{g(x)}\right) + $$ - where $\(g(x)\)$ is the geometric mean of the row $\(x\)$. - resources: - - type: python_script - path: script.py -platforms: + where $\(g(x)\)$ is the geometric mean of the row $\(x\)$. 
+resources: + - type: python_script + path: script.py +engines: - type: docker image: openproblems/base_python:1.0.0 setup: - type: python packages: - muon +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/prot_clr/script.py b/src/datasets/normalization/prot_clr/script.py index 3f0a2fb3fd..4741625935 100644 --- a/src/datasets/normalization/prot_clr/script.py +++ b/src/datasets/normalization/prot_clr/script.py @@ -7,7 +7,7 @@ 'output': "output_norm.h5ad" } meta = { - 'functionality_name': "clr" + 'name': "clr" } ## VIASH END @@ -22,7 +22,7 @@ print("Store output in adata", flush=True) adata.layers[par["layer_output"]] = normalized_counts.X -adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] +adata.uns["normalization_id"] = par["normalization_id"] or meta['name'] print("Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/normalization/sqrt_cp/config.vsh.yaml b/src/datasets/normalization/sqrt_cp/config.vsh.yaml index 4d95636f4c..1e241e89d0 100644 --- a/src/datasets/normalization/sqrt_cp/config.vsh.yaml +++ b/src/datasets/normalization/sqrt_cp/config.vsh.yaml @@ -1,18 +1,19 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "sqrt_cp" - description: "Normalize data using Log Sqrt" - resources: - - type: python_script - path: script.py - arguments: - - name: "--n_cp" - type: integer - default: 1e4 - description: "Number of counts per cell" -platforms: +name: sqrt_cp +description: Normalize data using Log Sqrt +resources: + - type: python_script + path: script.py +arguments: + - name: --n_cp + type: integer + default: 1e4 + description: Number of counts per cell +engines: - type: docker image: openproblems/base_python:1.0.0 +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git 
a/src/datasets/normalization/sqrt_cp/script.py b/src/datasets/normalization/sqrt_cp/script.py index 84afdaa19d..d2540a519a 100644 --- a/src/datasets/normalization/sqrt_cp/script.py +++ b/src/datasets/normalization/sqrt_cp/script.py @@ -10,7 +10,7 @@ 'n_cp': 1e6, } meta = { - "functionality_name": "normalize_sqrt_cpm" + "name": "normalize_sqrt_cpm" } ## VIASH END @@ -29,7 +29,7 @@ print(">> Store output in adata", flush=True) adata.layers[par["layer_output"]] = lognorm adata.obs[par["obs_size_factors"]] = norm["norm_factor"] -adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] +adata.uns["normalization_id"] = par["normalization_id"] or meta['name'] print(">> Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/processors/hvg/config.vsh.yaml b/src/datasets/processors/hvg/config.vsh.yaml index aed18c6d38..886fe92f94 100644 --- a/src/datasets/processors/hvg/config.vsh.yaml +++ b/src/datasets/processors/hvg/config.vsh.yaml @@ -1,13 +1,14 @@ __merge__: ../../api/comp_processor_hvg.yaml -functionality: - name: "hvg" - description: "Compute HVG" - resources: - - type: python_script - path: script.py -platforms: +name: hvg +description: Compute HVG +resources: + - type: python_script + path: script.py +engines: - type: docker image: openproblems/base_python:1.0.0 +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/processors/knn/config.vsh.yaml b/src/datasets/processors/knn/config.vsh.yaml index 9908fe9086..b3cf894420 100644 --- a/src/datasets/processors/knn/config.vsh.yaml +++ b/src/datasets/processors/knn/config.vsh.yaml @@ -1,13 +1,14 @@ __merge__: ../../api/comp_processor_knn.yaml -functionality: - name: "knn" - description: "Compute KNN" - resources: - - type: python_script - path: script.py -platforms: +name: knn +description: Compute KNN +resources: + - type: python_script + path: script.py +engines: - type: docker 
image: openproblems/base_python:1.0.0 +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/processors/pca/config.vsh.yaml b/src/datasets/processors/pca/config.vsh.yaml index 7f0213b922..b1c1a7ba14 100644 --- a/src/datasets/processors/pca/config.vsh.yaml +++ b/src/datasets/processors/pca/config.vsh.yaml @@ -1,17 +1,18 @@ __merge__: ../../api/comp_processor_pca.yaml -functionality: - name: "pca" - description: "Compute PCA" - resources: - - type: python_script - path: script.py +name: pca +description: Compute PCA +resources: + - type: python_script + path: script.py # test_resources: # - type: python_script # path: test_script.py # - path: "../../../resources_test/common/pancreas" -platforms: +engines: - type: docker image: openproblems/base_python:1.0.0 +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/processors/subsample/config.vsh.yaml b/src/datasets/processors/subsample/config.vsh.yaml index 4e52e93db5..9cc62c3602 100644 --- a/src/datasets/processors/subsample/config.vsh.yaml +++ b/src/datasets/processors/subsample/config.vsh.yaml @@ -1,51 +1,54 @@ __merge__: ../../api/comp_processor_subset.yaml -functionality: - name: "subsample" - description: "Subsample an h5ad file" - arguments: - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. 
- - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Cell type indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test_script.py - - path: /resources_test/common/pancreas -platforms: +name: subsample +description: Subsample an h5ad file +arguments: + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being less + because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. + - name: --keep_cell_type_categories + type: string + multiple: true + description: Cell type indexes to be selected + required: false + - name: --keep_batch_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --even + type: boolean_true + description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. 
+ example: 123 +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test_script.py + - path: /resources_test/common/pancreas +engines: - type: docker image: openproblems/base_python:1.0.0 test_setup: - type: python packages: - viashpy +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/processors/subsample/test_script.py b/src/datasets/processors/subsample/test_script.py index 80dde5d383..cb7f90189a 100644 --- a/src/datasets/processors/subsample/test_script.py +++ b/src/datasets/processors/subsample/test_script.py @@ -42,9 +42,9 @@ def test_keep_functionality(run_component): run_component([ "--input", input_path, - "--keep_cell_type_categories", "acinar:beta", - "--keep_batch_categories", "celseq:inDrop4:smarter", - "--keep_features", ":".join(keep_features), + "--keep_cell_type_categories", "acinar;beta", + "--keep_batch_categories", "celseq;inDrop4;smarter", + "--keep_features", ";".join(keep_features), "--output", output_path, "--seed", "123" ]) diff --git a/src/datasets/processors/svd/config.vsh.yaml b/src/datasets/processors/svd/config.vsh.yaml index bbad17f58c..bd71cae4c8 100644 --- a/src/datasets/processors/svd/config.vsh.yaml +++ b/src/datasets/processors/svd/config.vsh.yaml @@ -1,16 +1,17 @@ __merge__: ../../api/comp_processor_svd.yaml -functionality: - name: "svd" - description: "Compute SVD pca reduction" - resources: - - type: python_script - path: script.py -platforms: +name: svd +description: Compute SVD pca reduction +resources: + - type: python_script + path: script.py +engines: - type: docker image: openproblems/base_python:1.0.0 setup: - type: python pypi: [scikit-learn] +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/resource_scripts/cellxgene_census.sh b/src/datasets/resource_scripts/cellxgene_census.sh index 5d6181f91e..62eaff1f34 100755 --- 
a/src/datasets/resource_scripts/cellxgene_census.sh +++ b/src/datasets/resource_scripts/cellxgene_census.sh @@ -126,7 +126,7 @@ output_normalized: force_null output_pca: force_null output_hvg: force_null output_knn: force_null -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/scrnaseq HERE cat > /tmp/nextflow.config << HERE @@ -145,7 +145,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_cellxgene_census/main.nf \ + --main-script target/nextflow/datasets/workflows/scrnaseq/process_cellxgene_census/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "/tmp/params.yaml" \ diff --git a/src/datasets/resource_scripts/dataset_info.sh b/src/datasets/resource_scripts/dataset_info.sh index 04c032916f..ead3d45506 100755 --- a/src/datasets/resource_scripts/dataset_info.sh +++ b/src/datasets/resource_scripts/dataset_info.sh @@ -5,13 +5,13 @@ DATASETS_DIR="s3://openproblems-data/resources/datasets" cat > "/tmp/params.yaml" << HERE param_list: - id: openproblems_v1 - input_states: "$DATASETS_DIR/openproblems_v1/**/log_cp10k/state.yaml" + input_states: "$DATASETS_DIR/scrnaseq/openproblems_v1/**/log_cp10k/state.yaml" rename_keys: 'input:output_dataset' - id: openproblems_v1_multimodal - input_states: "$DATASETS_DIR/openproblems_v1_multimodal/**/log_cp10k/state.yaml" + input_states: "$DATASETS_DIR/multimodal/openproblems_v1_multimodal/**/log_cp10k/state.yaml" rename_keys: 'input:output_mod1' - id: cellxgene_census - input_states: "$DATASETS_DIR/cellxgene_census/**/log_cp10k/state.yaml" + input_states: "$DATASETS_DIR/scrnaseq/cellxgene_census/**/log_cp10k/state.yaml" rename_keys: 'input:output_dataset' settings: '{"output": "dataset_info.yaml"}' output_state: state.yaml diff --git a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh 
b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh index a306ba2ef8..42c3456b1b 100755 --- a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh +++ b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh @@ -32,13 +32,13 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/multimodal HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf \ + --main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal_test.sh b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal_test.sh index be8444371b..652d39aa0a 100755 --- a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal_test.sh +++ b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal_test.sh @@ -32,12 +32,12 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -publish_dir: resources/datasets/openproblems_neurips2021 +publish_dir: resources/datasets/multimodal/openproblems_neurips2021 HERE export NXF_VER=23.10.1 nextflow run . 
\ - -main-script target/nextflow/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf \ + -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/main.nf \ -profile docker \ -resume \ -params-file "$params_file" diff --git a/src/datasets/resource_scripts/openproblems_neurips2022_pbmc.sh b/src/datasets/resource_scripts/openproblems_neurips2022_pbmc.sh index e3e6783a8e..681d8f3d36 100755 --- a/src/datasets/resource_scripts/openproblems_neurips2022_pbmc.sh +++ b/src/datasets/resource_scripts/openproblems_neurips2022_pbmc.sh @@ -34,7 +34,7 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/multimodal HERE cat > /tmp/nextflow.config << HERE @@ -49,7 +49,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf \ + --main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/main.nf \ --workspace 53907369739130 \ --compute-env 1pK56PjjzeraOOC2LDZvN2 \ --params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/openproblems_v1.sh b/src/datasets/resource_scripts/openproblems_v1.sh index 8d40e57c46..7e5b12c348 100755 --- a/src/datasets/resource_scripts/openproblems_v1.sh +++ b/src/datasets/resource_scripts/openproblems_v1.sh @@ -162,7 +162,7 @@ output_normalized: force_null output_pca: force_null output_hvg: force_null output_knn: force_null -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/scrnaseq HERE cat > /tmp/nextflow.config << HERE @@ -174,7 +174,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ 
--revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_openproblems_v1/main.nf \ + --main-script target/nextflow/datasets/workflows/scrnaseq/process_openproblems_v1/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/openproblems_v1_multimodal.sh b/src/datasets/resource_scripts/openproblems_v1_multimodal.sh index 2d516a8ccb..f8e83f3582 100755 --- a/src/datasets/resource_scripts/openproblems_v1_multimodal.sh +++ b/src/datasets/resource_scripts/openproblems_v1_multimodal.sh @@ -60,7 +60,7 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/multimodal HERE @@ -77,7 +77,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_openproblems_v1_multimodal/main.nf \ + --main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_v1_multimodal/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/openproblems_v1_multimodal_test.sh b/src/datasets/resource_scripts/openproblems_v1_multimodal_test.sh index 268a17cf7d..e3d012e3c5 100755 --- a/src/datasets/resource_scripts/openproblems_v1_multimodal_test.sh +++ b/src/datasets/resource_scripts/openproblems_v1_multimodal_test.sh @@ -8,7 +8,7 @@ cd "$REPO_ROOT" export TOWER_WORKSPACE_ID=53907369739130 -OUTPUT_DIR="resources/datasets" +OUTPUT_DIR="resources/datasets/multimodal" if [ ! -d "$OUTPUT_DIR" ]; then mkdir -p "$OUTPUT_DIR" @@ -38,7 +38,7 @@ HERE export NXF_VER=22.04.5 nextflow \ run . 
\ - -main-script target/nextflow/datasets/workflows/process_openproblems_v1_multimodal/main.nf \ + -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_v1_multimodal/main.nf \ -profile docker \ -resume \ -params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/openproblems_v1_test.sh b/src/datasets/resource_scripts/openproblems_v1_test.sh index a79545f052..dab792fd13 100755 --- a/src/datasets/resource_scripts/openproblems_v1_test.sh +++ b/src/datasets/resource_scripts/openproblems_v1_test.sh @@ -8,7 +8,7 @@ cd "$REPO_ROOT" export TOWER_WORKSPACE_ID=53907369739130 -OUTPUT_DIR="resources/datasets" +OUTPUT_DIR="resources/datasets/scrnaseq" if [ ! -d "$OUTPUT_DIR" ]; then mkdir -p "$OUTPUT_DIR" @@ -42,7 +42,7 @@ HERE export NXF_VER=23.04.2 nextflow run . \ - -main-script target/nextflow/datasets/workflows/process_openproblems_v1/main.nf \ + -main-script target/nextflow/datasets/workflows/scrnaseq/process_openproblems_v1/main.nf \ -profile docker \ -resume \ -params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/tenx_visium.sh b/src/datasets/resource_scripts/tenx_visium.sh index 3e2fb68a61..cc7199c81f 100755 --- a/src/datasets/resource_scripts/tenx_visium.sh +++ b/src/datasets/resource_scripts/tenx_visium.sh @@ -125,7 +125,7 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE # cat > "/tmp/params.yaml" << 'HERE' @@ -253,7 +253,7 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE # cat > "/tmp/params.yaml" << 'HERE' @@ -290,13 +290,13 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: 
s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_tenx_visium/main.nf \ + --main-script target/nextflow/datasets/workflows/spatial/process_tenx_visium/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "/tmp/params.yaml" \ diff --git a/src/datasets/resource_scripts/zenodo_spatial.sh b/src/datasets/resource_scripts/zenodo.sh similarity index 92% rename from src/datasets/resource_scripts/zenodo_spatial.sh rename to src/datasets/resource_scripts/zenodo.sh index c1386aeb84..dc297492c2 100755 --- a/src/datasets/resource_scripts/zenodo_spatial.sh +++ b/src/datasets/resource_scripts/zenodo.sh @@ -2,7 +2,7 @@ # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: zenodo_spatial/visium/human_heart_myocardial_infarction_1 +# - id: zenodo/visium/human_heart_myocardial_infarction_1 # input_data: "https://zenodo.org/records/13328275/files/10X0018.h5ad?download=1" # dataset_name: 10X Visium - Human Heart MI 1 # dataset_url: "https://www.nature.com/articles/s41586-022-05060-x" @@ -14,7 +14,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: true -# - id: zenodo_spatial/visium/human_heart_myocardial_infarction_2 +# - id: zenodo/visium/human_heart_myocardial_infarction_2 # input_data: "https://zenodo.org/records/13328275/files/10X009.h5ad?download=1" # dataset_name: 10X Visium - Human Heart MI 2 # dataset_url: "https://www.nature.com/articles/s41586-022-05060-x" @@ -32,13 +32,13 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # remove_mitochondrial: true # HERE # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: 
zenodo_spatial/dbitseq/mouse_e10_brain +# - id: zenodo/dbitseq/mouse_e10_brain # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E10_brain_gene_25um_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Brain (E10) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -50,7 +50,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: true -# - id: zenodo_spatial/dbitseq/mouse_e10_eye +# - id: zenodo/dbitseq/mouse_e10_eye # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E10_eye_and_nearby_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Eye (E10) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -62,7 +62,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: true -# - id: zenodo_spatial/dbitseq/mouse_e10_whole_body +# - id: zenodo/dbitseq/mouse_e10_whole_body # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E10_whole_gene_best_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Whole Body (E10) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -74,7 +74,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: true -# - id: zenodo_spatial/dbitseq/mouse_e11_lower_body +# - id: zenodo/dbitseq/mouse_e11_lower_body # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E11_lower_body_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Lower Body (E11) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -86,7 +86,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: true -# - id: zenodo_spatial/dbitseq/mouse_e11_1 +# - id: zenodo/dbitseq/mouse_e11_1 # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_GSM4364244_E11-FL-1L_gene_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Whole Body 1 (E11) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -98,7 +98,7 @@ # gene_filter_min_spots: 50 # 
remove_mitochondrial: true -# - id: zenodo_spatial/dbitseq/mouse_e11_2 +# - id: zenodo/dbitseq/mouse_e11_2 # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_GSM4364245_E11-FL-2L_gene_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Whole Body 2 (E11) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -116,12 +116,12 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: zenodo_spatial/merfish/human_cortex_1 +# - id: zenodo/merfish/human_cortex_1 # input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.250.expand.rep1_data.h5ad?download=1" # dataset_name: MERFISH - Human Cortex 1 # dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" @@ -133,7 +133,7 @@ # gene_filter_min_spots: 100 # remove_mitochondrial: false -# - id: zenodo_spatial/merfish/human_cortex_2 +# - id: zenodo/merfish/human_cortex_2 # input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.4000.expand.rep1_data.h5ad?download=1" # dataset_name: MERFISH - Human Cortex 2 # dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" @@ -145,7 +145,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: false -# - id: zenodo_spatial/merfish/human_cortex_3 +# - id: zenodo/merfish/human_cortex_3 # input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.4000.expand.rep2_data.h5ad?download=1" # dataset_name: MERFISH - Human Cortex 3 # dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" @@ -157,7 +157,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: false -# - id: zenodo_spatial/merfish/human_cortex_4 +# - id: zenodo/merfish/human_cortex_4 # input_data: 
"https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.4000.expand.rep3_data.h5ad?download=1" # dataset_name: MERFISH - Human Cortex 4 # dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" @@ -169,7 +169,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: false -# - id: zenodo_spatial/merfish/mouse_cortex +# - id: zenodo/merfish/mouse_cortex # input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_mouse1.AUD_TEA_VIS.242.unexpand_data.h5ad?download=1" # dataset_name: MERFISH - Mouse Cortex # dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" @@ -187,12 +187,12 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: zenodo_spatial/seqfish/mouse_organogenesis_seqfish +# - id: zenodo/seqfish/mouse_organogenesis # input_data: "https://zenodo.org/records/12785822/files/seqfish.h5ad?download=1" # dataset_name: Seqfish - Mouse Organogenesis # dataset_url: "https://www.nature.com/articles/s41587-021-01006-2" @@ -210,13 +210,13 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # remove_mitochondrial: true # HERE # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: zenodo_spatial/slideseqv2/mouse_olfactory_bulb_puck +# - id: zenodo/slideseqv2/mouse_olfactory_bulb_puck # input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1" # dataset_name: Slide-seqV2 - Mouse Olfactory Bulb Puck # dataset_url: 
"https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" @@ -228,7 +228,7 @@ # gene_filter_min_spots: 500 # remove_mitochondrial: true -# - id: zenodo_spatial/slideseqv2/mouse_cortex +# - id: zenodo/slideseqv2/mouse_cortex # input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_palla2021squidpy_Slide-seqV2_Mouse_Cortex_data_whole.h5ad?download=1" # dataset_name: Slide-seqV2 - Mouse Cortex # dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" @@ -240,7 +240,7 @@ # gene_filter_min_spots: 500 # remove_mitochondrial: true -# - id: zenodo_spatial/slideseqv2/mouse_cerebellum +# - id: zenodo/slideseqv2/mouse_cerebellum # input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_Slide-seqV2_Mouse_Cerebellum_SCP948_data_whole.h5ad?download=1" # dataset_name: Slide-seqV2 - Mouse Cerebellum # dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" @@ -252,7 +252,7 @@ # gene_filter_min_spots: 500 # remove_mitochondrial: true -# - id: zenodo_spatial/slideseqv2/mouse_hippocampus_puck +# - id: zenodo/slideseqv2/mouse_hippocampus_puck # input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_Slide-seqV2_Mouse_Hippocampus_Puck_200115_08_data_whole.h5ad?download=1" # dataset_name: Slide-seqV2 - Mouse Hippocampus Puck # dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" @@ -264,7 +264,7 @@ # gene_filter_min_spots: 500 # remove_mitochondrial: true -# - id: zenodo_spatial/slideseqv2/mouse_somatosensory_cortex_puck +# - id: 
zenodo/slideseqv2/mouse_somatosensory_cortex_puck # input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_Slide-seqV2_Mouse_SomatosensoryCortex_Puck_200306_03_data_whole.h5ad?download=1" # dataset_name: Slide-seqV2 - Mouse Somatosensory Cortex Puck # dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" @@ -282,12 +282,12 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: zenodo_spatial/starmap/mouse_brain_2d_zstep10_0 +# - id: zenodo/starmap/mouse_brain_2d_zstep10_0 # input_data: "https://zenodo.org/records/12785822/files/STARmap_Wang2018three_data_2D_zstep10_0_data.h5ad?download=1" # dataset_name: STARmap - Mouse Brain 1 # dataset_url: "https://www.science.org/doi/10.1126/science.aat5691" @@ -299,7 +299,7 @@ # gene_filter_min_spots: 1 # remove_mitochondrial: true -# - id: zenodo_spatial/starmap/mouse_brain_2d_zstep15_0 +# - id: zenodo/starmap/mouse_brain_2d_zstep15_0 # input_data: "https://zenodo.org/records/12785822/files/STARmap_Wang2018three_data_2D_zstep15_0_data.h5ad?download=1" # dataset_name: STARmap - Mouse Brain 2 # dataset_url: "https://www.science.org/doi/10.1126/science.aat5691" @@ -317,12 +317,12 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE cat > "/tmp/params.yaml" << 'HERE' param_list: - - id: zenodo_spatial/stereoseq/drosophila_embryo_e5_6 + - id: zenodo/stereoseq/drosophila_embryo_e5_6 input_data: 
"https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_5.6.h5ad?download=1" dataset_name: Stereo-seq - Drosophila embryo E5_6 dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" @@ -334,7 +334,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial/stereoseq/drosophila_embryo_e6_3 + - id: zenodo/stereoseq/drosophila_embryo_e6_3 input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_6.3.h5ad?download=1" dataset_name: Stereo-seq - Drosophila embryo E6_3 dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" @@ -346,7 +346,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial/stereoseq/drosophila_embryo_e7 + - id: zenodo/stereoseq/drosophila_embryo_e7 input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_7.h5ad?download=1" dataset_name: Stereo-seq - Drosophila embryo E7 dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" @@ -358,7 +358,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial/stereoseq/drosophila_embryo_e9_1 + - id: zenodo/stereoseq/drosophila_embryo_e9_1 input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_9.1.h5ad?download=1" dataset_name: Stereo-seq - Drosophila embryo E9_1 dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" @@ -370,7 +370,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial/stereoseq/drosophila_embryo_e10 + - id: zenodo/stereoseq/drosophila_embryo_e10 input_data: 
"https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_10.5.h5ad?download=1" dataset_name: Stereo-seq - Drosophila embryo E10 dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" @@ -388,7 +388,7 @@ output_meta: '$id/dataset_metadata.yaml' output_state: '$id/state.yaml' output_raw: force_null output_normalized: force_null -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/spatial HERE cat > /tmp/nextflow.config << HERE @@ -407,7 +407,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_zenodo_spatial/main.nf \ + --main-script target/nextflow/datasets/workflows/spatial/process_zenodo/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "/tmp/params.yaml" \ diff --git a/src/datasets/resource_scripts/zenodo_spatial_slidetags.sh b/src/datasets/resource_scripts/zenodo_slidetags.sh similarity index 89% rename from src/datasets/resource_scripts/zenodo_spatial_slidetags.sh rename to src/datasets/resource_scripts/zenodo_slidetags.sh index aa4e7e094b..e35df8edc3 100755 --- a/src/datasets/resource_scripts/zenodo_spatial_slidetags.sh +++ b/src/datasets/resource_scripts/zenodo_slidetags.sh @@ -2,7 +2,7 @@ cat > "/tmp/params.yaml" << 'HERE' param_list: - - id: zenodo_spatial_slidetags/slidetags/human_cortex + - id: zenodo_slidetags/slidetags/human_cortex input_data: "https://zenodo.org/records/12785822/files/slidetag_human_cortex.tar.gz?download=1" dataset_name: Slide-tags - Human Cortex dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" @@ -14,7 +14,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial_slidetags/slidetags/human_skin_melanoma + - id: zenodo_slidetags/slidetags/human_skin_melanoma input_data: 
"https://zenodo.org/records/12785822/files/slidetag_human_skin_melanoma.tar.gz?download=1" dataset_name: Slide-tags - Human Skin Melanoma dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" @@ -26,7 +26,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial_slidetags/slidetags/human_tonsil + - id: zenodo_slidetags/slidetags/human_tonsil input_data: "https://zenodo.org/records/12785822/files/slidetag_human_tonsil.tar.gz?download=1" dataset_name: Slide-tags - Human Tonsil dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" @@ -38,7 +38,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial_slidetags/slidetags/mouse_embryo + - id: zenodo_slidetags/slidetags/mouse_embryo input_data: "https://zenodo.org/records/12785822/files/slidetag_mouse_embryo.tar.gz?download=1" dataset_name: Slide-tags - Mouse Embryo dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" @@ -56,7 +56,7 @@ output_meta: '$id/dataset_metadata.yaml' output_state: '$id/state.yaml' output_raw: force_null output_normalized: force_null -publish_dir: resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/spatial HERE cat > /tmp/nextflow.config << HERE @@ -75,7 +75,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_zenodo_spatial_slidetags/main.nf \ + --main-script target/nextflow/datasets/workflows/spatial/process_zenodo_slidetags/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "/tmp/params.yaml" \ diff --git a/src/datasets/resource_test_scripts/cxg_immune_cell_atlas.sh b/src/datasets/resource_test_scripts/cxg_immune_cell_atlas.sh index 285dc55ec4..8a9d7de486 100755 --- a/src/datasets/resource_test_scripts/cxg_immune_cell_atlas.sh +++ b/src/datasets/resource_test_scripts/cxg_immune_cell_atlas.sh @@ -1,7 +1,6 @@ 
#!/bin/bash -DATASET_DIR=resources_test/common - +DATASET_DIR=resources_test/common/scrnaseq mkdir -p $DATASET_DIR @@ -40,7 +39,7 @@ keep_features: '$KEEP_FEATURES' HERE nextflow run . \ - -main-script target/nextflow/datasets/workflows/process_cellxgene_census/main.nf \ + -main-script target/nextflow/datasets/workflows/scrnaseq/process_cellxgene_census/main.nf \ -c src/wf_utils/labels_ci.config \ -profile docker \ -params-file "/tmp/params.yaml" diff --git a/src/datasets/resource_test_scripts/cxg_mouse_pancreas_atlas.sh b/src/datasets/resource_test_scripts/cxg_mouse_pancreas_atlas.sh index 3b5d35ee5c..c90ae39ac7 100755 --- a/src/datasets/resource_test_scripts/cxg_mouse_pancreas_atlas.sh +++ b/src/datasets/resource_test_scripts/cxg_mouse_pancreas_atlas.sh @@ -1,6 +1,6 @@ #!/bin/bash -DATASET_DIR=resources_test/common +DATASET_DIR=resources_test/common/scrnaseq mkdir -p $DATASET_DIR @@ -40,7 +40,7 @@ keep_features: '$KEEP_FEATURES' HERE nextflow run . \ - -main-script target/nextflow/datasets/workflows/process_cellxgene_census/main.nf \ + -main-script target/nextflow/datasets/workflows/scrnaseq/process_cellxgene_census/main.nf \ -c src/wf_utils/labels_ci.config \ -profile docker \ -params-file "/tmp/params.yaml" diff --git a/src/datasets/resource_test_scripts/mouse_brain_coronal.sh b/src/datasets/resource_test_scripts/mouse_brain_coronal.sh index 962c4c067d..ed65408dc6 100755 --- a/src/datasets/resource_test_scripts/mouse_brain_coronal.sh +++ b/src/datasets/resource_test_scripts/mouse_brain_coronal.sh @@ -22,7 +22,7 @@ output_meta: '$id/dataset_metadata.yaml' output_state: '$id/state.yaml' output_raw: force_null output_normalized: force_null -publish_dir: resources_test/common +publish_dir: resources_test/common/spatial do_subsample: true spot_filter_min_genes: 200 gene_filter_min_spots: 50 @@ -30,7 +30,7 @@ remove_mitochondrial: true HERE nextflow run . 
\ - -main-script target/nextflow/datasets/workflows/process_tenx_visium/main.nf \ + -main-script target/nextflow/datasets/workflows/spatial/process_tenx_visium/main.nf \ -c src/wf_utils/labels_ci.config \ -profile docker \ -params-file "/tmp/params.yaml" diff --git a/src/datasets/resource_test_scripts/neurips2021_bmmc.sh b/src/datasets/resource_test_scripts/neurips2021_bmmc.sh index 98644d9dbf..ebd8faf892 100755 --- a/src/datasets/resource_test_scripts/neurips2021_bmmc.sh +++ b/src/datasets/resource_test_scripts/neurips2021_bmmc.sh @@ -38,7 +38,7 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -# publish_dir: s3://openproblems-data/resources_test/common +# publish_dir: s3://openproblems-data/resources_test/common/multimodal HERE # cat > /tmp/nextflow.config << HERE @@ -51,10 +51,10 @@ HERE # HERE nextflow run . \ - -main-script target/nextflow/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf \ + -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/main.nf \ -profile docker \ -resume \ - --publish_dir resources_test/common \ + --publish_dir resources_test/common/multimodal \ -params-file "$params_file" \ -c src/wf_utils/labels.config @@ -68,4 +68,4 @@ nextflow run . 
\ # --labels predict_modality # run task process dataset components -src/tasks/predict_modality/resources_test_scripts/neurips2021_bmmc.sh \ No newline at end of file +# src/tasks/predict_modality/resources_test_scripts/neurips2021_bmmc.sh \ No newline at end of file diff --git a/src/datasets/resource_test_scripts/neurips2022_pbmc.sh b/src/datasets/resource_test_scripts/neurips2022_pbmc.sh index b62e6f40e1..b3a11eb67e 100755 --- a/src/datasets/resource_test_scripts/neurips2022_pbmc.sh +++ b/src/datasets/resource_test_scripts/neurips2022_pbmc.sh @@ -38,11 +38,11 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -publish_dir: s3://openproblems-data/resources_test/common +publish_dir: s3://openproblems-data/resources_test/common/multimodal HERE # nextflow run . \ -# -main-script target/nextflow/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf \ +# -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/main.nf \ # -profile docker \ # -resume \ # --publish_dir resources_test/common \ @@ -63,7 +63,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf \ + --main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/main.nf \ --workspace 53907369739130 \ --compute-env 1pK56PjjzeraOOC2LDZvN2 \ --params-file "$params_file" \ diff --git a/src/datasets/resource_test_scripts/pancreas.sh b/src/datasets/resource_test_scripts/pancreas.sh index fb26f7ef30..3857449636 100755 --- a/src/datasets/resource_test_scripts/pancreas.sh +++ b/src/datasets/resource_test_scripts/pancreas.sh @@ -6,7 +6,7 @@ REPO_ROOT=$(git rev-parse --show-toplevel) # ensure that the command below is run from the root of the repository cd 
"$REPO_ROOT" -DATASET_DIR=resources_test/common +DATASET_DIR=resources_test/common/scrnaseq set -e @@ -18,7 +18,7 @@ KEEP_FEATURES=`cat $DATASET_DIR/temp_g2m_genes_tirosh_hm.txt $DATASET_DIR/temp_s # download dataset nextflow run . \ - -main-script target/nextflow/datasets/workflows/process_openproblems_v1/main.nf \ + -main-script target/nextflow/datasets/workflows/scrnaseq/process_openproblems_v1/main.nf \ -profile docker \ -c src/wf_utils/labels_ci.config \ -resume \ @@ -55,7 +55,7 @@ nextflow run . \ rm -r $DATASET_DIR/temp_* # run task process dataset components -src/tasks/batch_integration/resources_test_scripts/process.sh -src/tasks/denoising/resources_test_scripts/pancreas.sh -src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh -src/tasks/label_projection/resources_test_scripts/pancreas.sh \ No newline at end of file +# src/tasks/batch_integration/resources_test_scripts/process.sh +# src/tasks/denoising/resources_test_scripts/pancreas.sh +# src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh +# src/tasks/label_projection/resources_test_scripts/pancreas.sh \ No newline at end of file diff --git a/src/datasets/resource_test_scripts/scicar_cell_lines.sh b/src/datasets/resource_test_scripts/scicar_cell_lines.sh index f765744136..f9c9a7b842 100755 --- a/src/datasets/resource_test_scripts/scicar_cell_lines.sh +++ b/src/datasets/resource_test_scripts/scicar_cell_lines.sh @@ -6,7 +6,7 @@ REPO_ROOT=$(git rev-parse --show-toplevel) # ensure that the command below is run from the root of the repository cd "$REPO_ROOT" -DATASET_DIR=resources_test/common +DATASET_DIR=resources_test/common/multimodal set -e @@ -14,7 +14,7 @@ mkdir -p $DATASET_DIR # download dataset nextflow run . 
\ - -main-script target/nextflow/datasets/workflows/process_openproblems_v1_multimodal/main.nf \ + -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_v1_multimodal/main.nf \ -profile docker \ -resume \ --id scicar_cell_lines \ diff --git a/src/datasets/resource_test_scripts/slideseq_test.sh b/src/datasets/resource_test_scripts/slideseq_test.sh deleted file mode 100755 index a9050be40a..0000000000 --- a/src/datasets/resource_test_scripts/slideseq_test.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -set -e - -cat > /tmp/params.yaml << 'HERE' -param_list: - - id: mouse_cerebellum - input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1" - dataset_name: Mouse cerebellum - dataset_url: "..." - dataset_summary: ... - dataset_description: "..." - dataset_reference: ref - dataset_organism: Mus musculus - -normalization_methods: [log_cp10k] -n_obs: 600 -n_vars: 500 -output_dataset: '$id/dataset.h5ad' -output_meta: '$id/dataset_metadata.yaml' -output_state: '$id/state.yaml' -output_raw: force_null -output_normalized: force_null -publish_dir: resources_test/common -do_subsample: true -spot_filter_min_genes: 200 -gene_filter_min_spots: 50 -remove_mitochondrial: true -HERE - -nextflow run . 
\ - -main-script target/nextflow/datasets/workflows/process_spatial_from_zenodo/main.nf \ - -c src/wf_utils/labels_ci.config \ - -profile docker \ - -params-file "/tmp/params.yaml" - diff --git a/src/datasets/workflows/extract_dataset_info/config.vsh.yaml b/src/datasets/workflows/extract_dataset_info/config.vsh.yaml index 58433db567..0ebe6f06df 100644 --- a/src/datasets/workflows/extract_dataset_info/config.vsh.yaml +++ b/src/datasets/workflows/extract_dataset_info/config.vsh.yaml @@ -1,34 +1,34 @@ -functionality: - name: "extract_dataset_info" - namespace: "datasets/workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input" - __merge__: /src/datasets/api/file_raw.yaml - required: true - direction: input - - name: Filter arguments - arguments: - - name: "--filter_normalization_id" - type: string - required: false - direction: input - description: If defined, only the normalization with this ID will be included in the output. - multiple: true - example: [ log_cp10k ] - - name: Outputs - arguments: - - name: "--output" - type: file - required: true - direction: output - example: dataset_uns.yaml - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - dependencies: - - name: common/extract_metadata -platforms: +name: extract_dataset_info +namespace: datasets/workflows +argument_groups: + - name: Inputs + arguments: + - name: --input + __merge__: /src/datasets/api/file_raw.yaml + required: true + direction: input + - name: Filter arguments + arguments: + - name: --filter_normalization_id + type: string + required: false + direction: input + description: If defined, only the normalization with this ID will be included + in the output. 
+ multiple: true + example: [log_cp10k] + - name: Outputs + arguments: + - name: --output + type: file + required: true + direction: output + example: dataset_uns.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf +dependencies: + - name: common/extract_metadata +runners: - type: nextflow diff --git a/src/datasets/workflows/extract_dataset_meta/config.vsh.yaml b/src/datasets/workflows/extract_dataset_meta/config.vsh.yaml index 26041b1039..a138ea895c 100644 --- a/src/datasets/workflows/extract_dataset_meta/config.vsh.yaml +++ b/src/datasets/workflows/extract_dataset_meta/config.vsh.yaml @@ -1,25 +1,24 @@ -functionality: - name: "extract_dataset_meta" - namespace: "datasets/workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input" - __merge__: /src/datasets/api/file_raw.yaml - required: true - direction: input - - name: Outputs - arguments: - - name: "--output" - type: file - required: true - direction: output - example: meta.yaml - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - dependencies: - - name: common/extract_metadata -platforms: +name: extract_dataset_meta +namespace: datasets/workflows +argument_groups: + - name: Inputs + arguments: + - name: --input + __merge__: /src/datasets/api/file_raw.yaml + required: true + direction: input + - name: Outputs + arguments: + - name: --output + type: file + required: true + direction: output + example: meta.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf +dependencies: + - name: common/extract_metadata +runners: - type: nextflow diff --git a/src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/config.vsh.yaml b/src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/config.vsh.yaml new file mode 100644 index 0000000000..91f0bbcfc1 --- /dev/null +++ b/src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/config.vsh.yaml @@ -0,0 +1,138 @@ +name: 
process_openproblems_neurips2021_bmmc +namespace: datasets/workflows/multimodal +description: | + Fetch and process Neurips 2021 multimodal datasets +argument_groups: + - name: Inputs + arguments: + - name: --id + type: string + description: The ID of the dataset + required: true + - name: --input + type: file + description: Path to the input dataset + required: true + - name: --mod1 + type: string + description: Name of the first modality. + required: true + example: GEX + - name: --mod2 + type: string + description: Name of the second modality. + required: true + example: ADT + - name: Metadata + arguments: + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. 
+ - name: --keep_cell_type_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --keep_batch_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --even + type: boolean_true + description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run. + - name: Outputs + arguments: + - name: --output_mod1 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_mod2 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_meta_mod1 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod1.yaml + - name: --output_meta_mod2 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod2.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/multimodal/openproblems_neurips2021_bmmc + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/normalization/prot_clr + - name: datasets/normalization/atac_tfidf + - name: datasets/processors/subsample + - name: datasets/processors/svd + - name: datasets/processors/hvg + - name: common/extract_metadata + - name: common/decompress_gzip + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +runners: + - 
type: nextflow diff --git a/src/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf b/src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/main.nf similarity index 100% rename from src/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf rename to src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/main.nf diff --git a/src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/config.vsh.yaml b/src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/config.vsh.yaml new file mode 100644 index 0000000000..88f341aae4 --- /dev/null +++ b/src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/config.vsh.yaml @@ -0,0 +1,144 @@ +name: process_openproblems_neurips2022_pbmc +namespace: datasets/workflows/multimodal +description: | + Fetch and process Neurips 2022 multimodal datasets +argument_groups: + - name: Inputs + arguments: + - name: --id + type: string + description: The ID of the dataset + required: true + - name: --input_mod1 + type: file + description: Processed RNA h5ad file + required: true + example: cite_rna_merged.h5ad + - name: --input_mod2 + type: file + description: Processed ADT or ATAC h5ad file + required: true + example: cite_prot_merged.h5ad + - name: --mod1 + type: string + description: Name of the first modality. + required: true + example: GEX + - name: --mod2 + type: string + description: Name of the second modality. + required: true + example: ADT + - name: Metadata + arguments: + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. 
+ required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. + - name: --keep_cell_type_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --keep_batch_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --even + type: boolean_true + description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run. 
+ - name: Outputs + arguments: + - name: --output_mod1 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_mod2 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_meta_mod1 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod1.yaml + - name: --output_meta_mod2 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod2.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/multimodal/openproblems_neurips2022_pbmc + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/normalization/prot_clr + - name: datasets/normalization/atac_tfidf + - name: datasets/processors/subsample + - name: datasets/processors/svd + - name: datasets/processors/hvg + - name: common/extract_metadata + - name: common/decompress_gzip + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf b/src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/main.nf similarity index 100% rename from src/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf rename to src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/main.nf diff --git a/src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/config.vsh.yaml b/src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/config.vsh.yaml new file mode 100644 index 0000000000..f70e3ab19e --- /dev/null +++ b/src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/config.vsh.yaml @@ -0,0 +1,165 @@ +name: 
process_openproblems_v1_multimodal +namespace: datasets/workflows/multimodal +description: | + Fetch and process legacy OpenProblems v1 multimodal datasets +argument_groups: + - name: Inputs + arguments: + - name: --id + type: string + description: Unique identifier of the dataset. + required: true + - name: --input_id + type: string + description: The ID of the dataset in OpenProblems v1 + required: true + - name: --obs_cell_type + type: string + description: Location of where to find the observation cell types. + - name: --obs_batch + type: string + description: Location of where to find the observation batch IDs. + - name: --obs_tissue + type: string + description: Location of where to find the observation tissue information. + - name: --layer_counts + type: string + description: In which layer to find the counts matrix. Leave undefined to + use `.X`. + example: counts + - name: --sparse + type: boolean + default: true + description: Convert layers to a sparse CSR format. + - name: --var_feature_id + type: string + description: Location of where to find the feature IDs. Can be set to index + if the feature IDs are the index. + example: gene_ids + - name: --var_feature_name + type: string + description: Location of where to find the feature names. Can be set to index + if the feature names are the index. + default: index + - name: --mod1 + type: string + description: Name of the first modality. + required: true + example: GEX + - name: --mod2 + type: string + description: Name of the second modality. + required: true + example: ADT + - name: Metadata + arguments: + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. 
+ required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. + - name: --keep_cell_type_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --keep_batch_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --even + type: boolean_true + description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run. 
+ - name: Outputs + arguments: + - name: --output_mod1 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_mod2 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_meta_mod1 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod1.yaml + - name: --output_meta_mod2 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod2.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/multimodal/openproblems_v1_multimodal + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/normalization/prot_clr + - name: datasets/normalization/atac_tfidf + - name: datasets/processors/subsample + - name: datasets/processors/svd + - name: datasets/processors/hvg + - name: common/extract_metadata + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_v1_multimodal/main.nf b/src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/main.nf similarity index 100% rename from src/datasets/workflows/process_openproblems_v1_multimodal/main.nf rename to src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/main.nf diff --git a/src/datasets/workflows/process_cellxgene_census/config.vsh.yaml b/src/datasets/workflows/process_cellxgene_census/config.vsh.yaml deleted file mode 100644 index 3e1fd5263b..0000000000 --- a/src/datasets/workflows/process_cellxgene_census/config.vsh.yaml +++ /dev/null @@ -1,201 +0,0 @@ -functionality: - name: process_cellxgene_census - namespace: datasets/workflows - description: | - Fetch and process datasets 
originating from the CELLxGENE census. - argument_groups: - - name: Input database - description: "Open CellxGene Census by version or URI." - arguments: - - name: "--input_uri" - type: string - description: "If specified, a URI containing the Census SOMA objects. If specified, will take precedence over the `--census_version` argument." - required: false - example: "s3://bucket/path" - - name: "--census_version" - description: "Which release of CellxGene census to use. Possible values are \"latest\", \"stable\", or the date of one of the releases (e.g. \"2023-07-25\"). For more information, check the documentation on [Census data releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html)." - type: string - example: "stable" - required: false - - name: Cell query - description: Arguments related to the query. - arguments: - - name: "--species" - type: string - description: The organism to query, usually one of `Homo sapiens` or `Mus musculus`. - required: false - default: "homo_sapiens" - multiple: false - - name: "--obs_value_filter" - type: string - description: "Filter for selecting the `obs` metadata (i.e. cells). Value is a filter query written in the SOMA `value_filter` syntax." - required: false - example: "is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell'" - - name: Cell filter - description: Filter the cells based on a minimum cell count per specified group - arguments: - - name: "--cell_filter_grouping" - type: string - description: | - A subset of 'obs' columns by which to group the cells for filtering. - Only groups surpassing or equal to the `--cell_filter_minimum_count` - threshold will be retained. Take care not to introduce a selection - bias against cells with more fine-grained ontology annotations. 
- required: false - example: ["dataset_id", "tissue", "assay", "disease", "cell_type"] - multiple: true - - name: "--cell_filter_minimum_count" - type: double - description: | - A minimum number of cells per group to retain. If `--cell_filter_grouping` - is defined, this parameter should also be provided and vice versa. - required: false - example: 100 - - name: Cell metadata - description: Cell metadata arguments - arguments: - - name: "--obs_batch" - type: string - description: | - Location of where to find the observation batch IDs. - - * If not specified, the `.obs["batch"]` field will not be included. - * If one or more values are specified, the `.obs["batch"]` field will be - set to the concatenated values of the specified fields, separated by - the `obs_batch_separator`. - required: false - multiple: true - multiple_sep: "," - example: ["batch"] - - name: "--obs_batch_separator" - type: string - description: Separator to use when concatenating the values of the `--obs_batch` fields. - required: false - default: "+" - - name: Dataset metadata - description: Information about the dataset that will be stored in the `.uns` slot. - arguments: - - name: "--id" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: true - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. - - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." 
- - name: Outputs - arguments: - - name: "--output_dataset" - __merge__: /src/datasets/api/file_common_dataset.yaml - direction: "output" - required: true - - name: "--output_meta" - direction: "output" - type: file - description: "Dataset metadata" - default: "dataset_metadata.yaml" - - name: "--output_raw" - __merge__: /src/datasets/api/file_raw.yaml - direction: "output" - required: false - - name: "--output_normalized" - __merge__: /src/datasets/api/file_normalized.yaml - direction: "output" - required: false - - name: "--output_pca" - __merge__: /src/datasets/api/file_pca.yaml - direction: "output" - required: false - - name: "--output_hvg" - __merge__: /src/datasets/api/file_hvg.yaml - direction: "output" - required: false - - name: "--output_knn" - __merge__: /src/datasets/api/file_knn.yaml - direction: "output" - required: false - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/cellxgene_census - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/processors/subsample - - name: datasets/processors/pca - - name: datasets/processors/hvg - - name: datasets/processors/knn - - name: common/extract_metadata - # test_resources: - # - type: nextflow_script - # path: main.nf - # entrypoint: test_wf -platforms: - - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_neurips2021_bmmc/config.vsh.yaml b/src/datasets/workflows/process_openproblems_neurips2021_bmmc/config.vsh.yaml deleted file mode 100644 index 8d3ca51d0b..0000000000 --- a/src/datasets/workflows/process_openproblems_neurips2021_bmmc/config.vsh.yaml +++ /dev/null @@ -1,137 +0,0 @@ -functionality: - name: process_openproblems_neurips2021_bmmc - namespace: datasets/workflows - description: | - Fetch and process Neurips 2021 multimodal datasets - 
argument_groups: - - name: Inputs - arguments: - - name: "--id" - type: "string" - description: "The ID of the dataset" - required: true - - name: "--input" - type: "file" - description: "Path to the input dataset" - required: true - - name: "--mod1" - type: string - description: Name of the first modality. - required: true - example: GEX - - name: "--mod2" - type: string - description: Name of the second modality. - required: true - example: ADT - - name: Metadata - arguments: - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. 
- - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." - - name: Outputs - arguments: - - name: "--output_mod1" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_mod2" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_meta_mod1" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod1.yaml" - - name: "--output_meta_mod2" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod2.yaml" - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/openproblems_neurips2021_bmmc - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/normalization/prot_clr - - name: datasets/normalization/atac_tfidf - - name: datasets/processors/subsample - - name: datasets/processors/svd - - name: datasets/processors/hvg - - name: common/extract_metadata - - name: common/decompress_gzip - # test_resources: - # - type: nextflow_script - # 
path: main.nf - # entrypoint: test_wf -platforms: - - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_neurips2022_pbmc/config.vsh.yaml b/src/datasets/workflows/process_openproblems_neurips2022_pbmc/config.vsh.yaml deleted file mode 100644 index 96bcc3ee2c..0000000000 --- a/src/datasets/workflows/process_openproblems_neurips2022_pbmc/config.vsh.yaml +++ /dev/null @@ -1,143 +0,0 @@ -functionality: - name: process_openproblems_neurips2022_pbmc - namespace: datasets/workflows - description: | - Fetch and process Neurips 2022 multimodal datasets - argument_groups: - - name: Inputs - arguments: - - name: "--id" - type: "string" - description: "The ID of the dataset" - required: true - - name: "--input_mod1" - type: file - description: "Processed RNA h5ad file" - required: true - example: cite_rna_merged.h5ad - - name: "--input_mod2" - type: file - description: "Processed ADT or ATAC h5ad file" - required: true - example: cite_prot_merged.h5ad - - name: "--mod1" - type: string - description: Name of the first modality. - required: true - example: GEX - - name: "--mod2" - type: string - description: Name of the second modality. - required: true - example: ADT - - name: Metadata - arguments: - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. - - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." 
- - name: Outputs - arguments: - - name: "--output_mod1" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_mod2" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_meta_mod1" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod1.yaml" - - name: "--output_meta_mod2" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod2.yaml" - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/openproblems_neurips2022_pbmc - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/normalization/prot_clr - - name: datasets/normalization/atac_tfidf - - name: datasets/processors/subsample - - name: datasets/processors/svd - - name: datasets/processors/hvg - - name: common/extract_metadata - - name: common/decompress_gzip - # test_resources: - # - type: nextflow_script - # path: main.nf - # entrypoint: test_wf -platforms: - - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_v1/config.vsh.yaml b/src/datasets/workflows/process_openproblems_v1/config.vsh.yaml deleted file mode 100644 index fb0cd73a65..0000000000 --- a/src/datasets/workflows/process_openproblems_v1/config.vsh.yaml +++ /dev/null @@ -1,163 +0,0 @@ -functionality: - name: process_openproblems_v1 - namespace: datasets/workflows - description: | - Fetch and process legacy OpenProblems v1 datasets - argument_groups: - - name: Inputs - arguments: - - name: "--id" - type: string - description: Unique identifier of the dataset. 
- required: true - - name: "--input_id" - type: "string" - description: "The ID of the dataset in OpenProblems v1" - required: true - - name: "--obs_cell_type" - type: "string" - description: "Location of where to find the observation cell types." - - name: "--obs_batch" - type: "string" - description: "Location of where to find the observation batch IDs." - - name: "--obs_tissue" - type: "string" - description: "Location of where to find the observation tissue information." - - name: "--layer_counts" - type: "string" - description: "In which layer to find the counts matrix. Leave undefined to use `.X`." - example: counts - - name: "--sparse" - type: boolean - default: true - description: Convert layers to a sparse CSR format. - - name: "--var_feature_id" - type: "string" - description: "Location of where to find the feature IDs. Can be set to index if the feature IDs are the index." - example: gene_ids - - name: "--var_feature_name" - type: "string" - description: "Location of where to find the feature names. Can be set to index if the feature names are the index." - default: index - - name: Metadata - arguments: - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. - - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." 
- - name: Outputs - arguments: - - name: "--output_dataset" - __merge__: /src/datasets/api/file_common_dataset.yaml - direction: "output" - required: true - - name: "--output_meta" - direction: "output" - type: file - description: "Dataset metadata" - default: "dataset_metadata.yaml" - - name: "--output_raw" - __merge__: /src/datasets/api/file_raw.yaml - direction: "output" - required: false - - name: "--output_normalized" - __merge__: /src/datasets/api/file_normalized.yaml - direction: "output" - required: false - - name: "--output_pca" - __merge__: /src/datasets/api/file_pca.yaml - direction: "output" - required: false - - name: "--output_hvg" - __merge__: /src/datasets/api/file_hvg.yaml - direction: "output" - required: false - - name: "--output_knn" - __merge__: /src/datasets/api/file_knn.yaml - direction: "output" - required: false - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/openproblems_v1 - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/processors/subsample - - name: datasets/processors/pca - - name: datasets/processors/hvg - - name: datasets/processors/knn - - name: common/extract_metadata - # test_resources: - # - type: nextflow_script - # path: main.nf - # entrypoint: test_wf -platforms: - - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_v1_multimodal/config.vsh.yaml b/src/datasets/workflows/process_openproblems_v1_multimodal/config.vsh.yaml deleted file mode 100644 index 58b045cc3b..0000000000 --- a/src/datasets/workflows/process_openproblems_v1_multimodal/config.vsh.yaml +++ /dev/null @@ -1,161 +0,0 @@ -functionality: - name: process_openproblems_v1_multimodal - namespace: datasets/workflows - description: | - Fetch and process legacy OpenProblems v1 multimodal datasets - 
argument_groups: - - name: Inputs - arguments: - - name: "--id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--input_id" - type: "string" - description: "The ID of the dataset in OpenProblems v1" - required: true - - name: "--obs_cell_type" - type: "string" - description: "Location of where to find the observation cell types." - - name: "--obs_batch" - type: "string" - description: "Location of where to find the observation batch IDs." - - name: "--obs_tissue" - type: "string" - description: "Location of where to find the observation tissue information." - - name: "--layer_counts" - type: "string" - description: "In which layer to find the counts matrix. Leave undefined to use `.X`." - example: counts - - name: "--sparse" - type: boolean - default: true - description: Convert layers to a sparse CSR format. - - name: "--var_feature_id" - type: "string" - description: "Location of where to find the feature IDs. Can be set to index if the feature IDs are the index." - example: gene_ids - - name: "--var_feature_name" - type: "string" - description: "Location of where to find the feature names. Can be set to index if the feature names are the index." - default: index - - name: "--mod1" - type: string - description: Name of the first modality. - required: true - example: GEX - - name: "--mod2" - type: string - description: Name of the second modality. - required: true - example: ADT - - name: Metadata - arguments: - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. 
- required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. - - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." 
- - name: Outputs - arguments: - - name: "--output_mod1" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_mod2" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_meta_mod1" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod1.yaml" - - name: "--output_meta_mod2" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod2.yaml" - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/openproblems_v1_multimodal - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/normalization/prot_clr - - name: datasets/normalization/atac_tfidf - - name: datasets/processors/subsample - - name: datasets/processors/svd - - name: datasets/processors/hvg - - name: common/extract_metadata - # test_resources: - # - type: nextflow_script - # path: main.nf - # entrypoint: test_wf -platforms: - - type: nextflow diff --git a/src/datasets/workflows/process_tenx_visium/config.vsh.yaml b/src/datasets/workflows/process_tenx_visium/config.vsh.yaml deleted file mode 100644 index 91a2867820..0000000000 --- a/src/datasets/workflows/process_tenx_visium/config.vsh.yaml +++ /dev/null @@ -1,142 +0,0 @@ -functionality: - name: process_tenx_visium - namespace: datasets/workflows - description: | - Download and process datasets originating from 10x Genomics. - argument_groups: - - name: Input - arguments: - - name: "--input_expression" - type: string - description: URL to the feature / barcode matrix HDF5. - required: true - - name: "--input_spatial" - type: string - description: URL to the Spatial imaging data. 
- required: true - - name: Outputs - arguments: - - name: "--output_dataset" - type: file - direction: output - description: Output h5ad file - required: true - __merge__: /src/datasets/api/file_raw.yaml - - name: "--output_meta" - direction: "output" - type: file - description: "Dataset metadata" - default: "dataset_metadata.yaml" - - name: Metadata - arguments: - - name: "--id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitovhondrial genes? 
- required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - # - name: "--keep_features" - # type: string - # multiple: true - # description: A list of genes to keep. - # - name: "--keep_cell_type_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--keep_batch_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--even" - # type: "boolean_true" - # description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." 
- resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/tenx_visium - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/processors/subsample - - name: common/extract_metadata -platforms: - - type: nextflow \ No newline at end of file diff --git a/src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml b/src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml deleted file mode 100644 index 45b938b716..0000000000 --- a/src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml +++ /dev/null @@ -1,138 +0,0 @@ -functionality: - name: process_zenodo_spatial - namespace: datasets/workflows - description: | - Download and process DBiT seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo. - argument_groups: - - name: Input - arguments: - - name: "--input_data" - type: string - description: URL to the Anndata file. - required: true - - name: Outputs - arguments: - - name: "--output_dataset" - type: file - direction: output - description: Output h5ad file - required: true - __merge__: /src/datasets/api/file_raw.yaml - - name: "--output_meta" - direction: "output" - type: file - description: "Dataset metadata" - default: "dataset_metadata.yaml" - - name: Metadata - arguments: - - name: "--id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. 
- required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitovhondrial genes? - required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 600 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - # - name: "--keep_features" - # type: string - # multiple: true - # description: A list of genes to keep. 
- # - name: "--keep_cell_type_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--keep_batch_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--even" - # type: "boolean_true" - # description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/zenodo_spatial - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/processors/subsample - - name: common/extract_metadata -platforms: - - type: nextflow \ No newline at end of file diff --git a/src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml b/src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml deleted file mode 100644 index 23934fe161..0000000000 --- a/src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml +++ /dev/null @@ -1,138 +0,0 @@ -functionality: - name: process_zenodo_spatial_slidetags - namespace: datasets/workflows - description: | - Download and process slide tags datasets originating from Zenodo. - argument_groups: - - name: Input - arguments: - - name: "--input_data" - type: string - description: URL to the Anndata file. 
- required: true - - name: Outputs - arguments: - - name: "--output_dataset" - type: file - direction: output - description: Output h5ad file - required: true - __merge__: /src/datasets/api/file_raw.yaml - - name: "--output_meta" - direction: "output" - type: file - description: "Dataset metadata" - default: "dataset_metadata.yaml" - - name: Metadata - arguments: - - name: "--id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitovhondrial genes? 
- required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 600 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - # - name: "--keep_features" - # type: string - # multiple: true - # description: A list of genes to keep. - # - name: "--keep_cell_type_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--keep_batch_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--even" - # type: "boolean_true" - # description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." 
- resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/zenodo_spatial_slidetags - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/processors/subsample - - name: common/extract_metadata -platforms: - - type: nextflow \ No newline at end of file diff --git a/src/datasets/workflows/scrnaseq/process_cellxgene_census/config.vsh.yaml b/src/datasets/workflows/scrnaseq/process_cellxgene_census/config.vsh.yaml new file mode 100644 index 0000000000..e379077261 --- /dev/null +++ b/src/datasets/workflows/scrnaseq/process_cellxgene_census/config.vsh.yaml @@ -0,0 +1,209 @@ +name: process_cellxgene_census +namespace: datasets/workflows/scrnaseq +description: | + Fetch and process datasets originating from the CELLxGENE census. +argument_groups: + - name: Input database + description: Open CellxGene Census by version or URI. + arguments: + - name: --input_uri + type: string + description: If specified, a URI containing the Census SOMA objects. If specified, + will take precedence over the `--census_version` argument. + required: false + example: s3://bucket/path + - name: --census_version + description: Which release of CellxGene census to use. Possible values are + "latest", "stable", or the date of one of the releases (e.g. "2023-07-25"). + For more information, check the documentation on [Census data + releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html). + type: string + example: stable + required: false + - name: Cell query + description: Arguments related to the query. + arguments: + - name: --species + type: string + description: The organism to query, usually one of `Homo sapiens` or `Mus + musculus`. 
+ required: false + default: homo_sapiens + multiple: false + - name: --obs_value_filter + type: string + description: Filter for selecting the `obs` metadata (i.e. cells). Value is + a filter query written in the SOMA `value_filter` syntax. + required: false + example: is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell' + - name: Cell filter + description: Filter the cells based on a minimum cell count per specified group + arguments: + - name: --cell_filter_grouping + type: string + description: | + A subset of 'obs' columns by which to group the cells for filtering. + Only groups surpassing or equal to the `--cell_filter_minimum_count` + threshold will be retained. Take care not to introduce a selection + bias against cells with more fine-grained ontology annotations. + required: false + example: [dataset_id, tissue, assay, disease, cell_type] + multiple: true + - name: --cell_filter_minimum_count + type: double + description: | + A minimum number of cells per group to retain. If `--cell_filter_grouping` + is defined, this parameter should also be provided and vice versa. + required: false + example: 100 + - name: Cell metadata + description: Cell metadata arguments + arguments: + - name: --obs_batch + type: string + description: | + Location of where to find the observation batch IDs. + + * If not specified, the `.obs["batch"]` field will not be included. + * If one or more values are specified, the `.obs["batch"]` field will be + set to the concatenated values of the specified fields, separated by + the `obs_batch_separator`. + required: false + multiple: true + multiple_sep: ',' + example: [batch] + - name: --obs_batch_separator + type: string + description: Separator to use when concatenating the values of the `--obs_batch` + fields. + required: false + default: + + - name: Dataset metadata + description: Information about the dataset that will be stored in the `.uns` slot. 
+ arguments: + - name: --id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: true + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. + - name: --keep_cell_type_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --keep_batch_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --even + type: boolean_true + description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. 
+ example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run. + - name: Outputs + arguments: + - name: --output_dataset + __merge__: /src/datasets/api/file_common_dataset.yaml + direction: output + required: true + - name: --output_meta + direction: output + type: file + description: Dataset metadata + default: dataset_metadata.yaml + - name: --output_raw + __merge__: /src/datasets/api/file_raw.yaml + direction: output + required: false + - name: --output_normalized + __merge__: /src/datasets/api/file_normalized.yaml + direction: output + required: false + - name: --output_pca + __merge__: /src/datasets/api/file_pca.yaml + direction: output + required: false + - name: --output_hvg + __merge__: /src/datasets/api/file_hvg.yaml + direction: output + required: false + - name: --output_knn + __merge__: /src/datasets/api/file_knn.yaml + direction: output + required: false +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/scrnaseq/cellxgene_census + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: datasets/processors/pca + - name: datasets/processors/hvg + - name: datasets/processors/knn + - name: common/extract_metadata + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_cellxgene_census/main.nf b/src/datasets/workflows/scrnaseq/process_cellxgene_census/main.nf similarity index 100% rename from 
src/datasets/workflows/process_cellxgene_census/main.nf rename to src/datasets/workflows/scrnaseq/process_cellxgene_census/main.nf diff --git a/src/datasets/workflows/scrnaseq/process_openproblems_v1/config.vsh.yaml b/src/datasets/workflows/scrnaseq/process_openproblems_v1/config.vsh.yaml new file mode 100644 index 0000000000..fe96bf166d --- /dev/null +++ b/src/datasets/workflows/scrnaseq/process_openproblems_v1/config.vsh.yaml @@ -0,0 +1,167 @@ +name: process_openproblems_v1 +namespace: datasets/workflows/scrnaseq +description: | + Fetch and process legacy OpenProblems v1 datasets +argument_groups: + - name: Inputs + arguments: + - name: --id + type: string + description: Unique identifier of the dataset. + required: true + - name: --input_id + type: string + description: The ID of the dataset in OpenProblems v1 + required: true + - name: --obs_cell_type + type: string + description: Location of where to find the observation cell types. + - name: --obs_batch + type: string + description: Location of where to find the observation batch IDs. + - name: --obs_tissue + type: string + description: Location of where to find the observation tissue information. + - name: --layer_counts + type: string + description: In which layer to find the counts matrix. Leave undefined to + use `.X`. + example: counts + - name: --sparse + type: boolean + default: true + description: Convert layers to a sparse CSR format. + - name: --var_feature_id + type: string + description: Location of where to find the feature IDs. Can be set to index + if the feature IDs are the index. + example: gene_ids + - name: --var_feature_name + type: string + description: Location of where to find the feature names. Can be set to index + if the feature names are the index. + default: index + - name: Metadata + arguments: + - name: --dataset_name + type: string + description: Nicely formatted name. 
+ required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. + - name: --keep_cell_type_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --keep_batch_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --even + type: boolean_true + description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run. 
+ - name: Outputs + arguments: + - name: --output_dataset + __merge__: /src/datasets/api/file_common_dataset.yaml + direction: output + required: true + - name: --output_meta + direction: output + type: file + description: Dataset metadata + default: dataset_metadata.yaml + - name: --output_raw + __merge__: /src/datasets/api/file_raw.yaml + direction: output + required: false + - name: --output_normalized + __merge__: /src/datasets/api/file_normalized.yaml + direction: output + required: false + - name: --output_pca + __merge__: /src/datasets/api/file_pca.yaml + direction: output + required: false + - name: --output_hvg + __merge__: /src/datasets/api/file_hvg.yaml + direction: output + required: false + - name: --output_knn + __merge__: /src/datasets/api/file_knn.yaml + direction: output + required: false +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/scrnaseq/openproblems_v1 + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: datasets/processors/pca + - name: datasets/processors/hvg + - name: datasets/processors/knn + - name: common/extract_metadata + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_v1/main.nf b/src/datasets/workflows/scrnaseq/process_openproblems_v1/main.nf similarity index 100% rename from src/datasets/workflows/process_openproblems_v1/main.nf rename to src/datasets/workflows/scrnaseq/process_openproblems_v1/main.nf diff --git a/src/datasets/workflows/spatial/process_tenx_visium/config.vsh.yaml b/src/datasets/workflows/spatial/process_tenx_visium/config.vsh.yaml new file mode 100644 index 0000000000..fac91adc72 --- /dev/null +++ 
b/src/datasets/workflows/spatial/process_tenx_visium/config.vsh.yaml @@ -0,0 +1,143 @@ +name: process_tenx_visium +namespace: datasets/workflows/spatial +description: | + Download and process datasets originating from 10x Genomics. +argument_groups: + - name: Input + arguments: + - name: --input_expression + type: string + description: URL to the feature / barcode matrix HDF5. + required: true + - name: --input_spatial + type: string + description: URL to the Spatial imaging data. + required: true + - name: Outputs + arguments: + - name: --output_dataset + type: file + direction: output + description: Output h5ad file + required: true + __merge__: /src/datasets/api/file_spatial_dataset.yaml + - name: --output_meta + direction: output + type: file + description: Dataset metadata + default: dataset_metadata.yaml + - name: Metadata + arguments: + - name: --id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --spot_filter_min_genes + type: integer + description: Remove spots with less than this number of genes. 
+ required: false + example: 200 + - name: --spot_filter_min_counts + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: --gene_filter_min_spots + type: integer + description: Remove genes expressed in less than this number of spots. + required: false + example: 50 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: --remove_mitochondrial + type: boolean + description: Remove mitochondrial genes? + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + # - name: "--keep_features" + # type: string + # multiple: true + # description: A list of genes to keep. + # - name: "--keep_cell_type_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--keep_batch_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--even" + # type: "boolean_true" + # description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run.
+resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/spatial/tenx_visium + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: common/extract_metadata +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_tenx_visium/main.nf b/src/datasets/workflows/spatial/process_tenx_visium/main.nf similarity index 100% rename from src/datasets/workflows/process_tenx_visium/main.nf rename to src/datasets/workflows/spatial/process_tenx_visium/main.nf diff --git a/src/datasets/workflows/spatial/process_zenodo/config.vsh.yaml b/src/datasets/workflows/spatial/process_zenodo/config.vsh.yaml new file mode 100644 index 0000000000..b2feb4bcb5 --- /dev/null +++ b/src/datasets/workflows/spatial/process_zenodo/config.vsh.yaml @@ -0,0 +1,139 @@ +name: process_zenodo +namespace: datasets/workflows/spatial +description: | + Download and process DBiT seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo. +argument_groups: + - name: Input + arguments: + - name: --input_data + type: string + description: URL to the Anndata file. + required: true + - name: Outputs + arguments: + - name: --output_dataset + type: file + direction: output + description: Output h5ad file + required: true + __merge__: /src/datasets/api/file_spatial_dataset.yaml + - name: --output_meta + direction: output + type: file + description: Dataset metadata + default: dataset_metadata.yaml + - name: Metadata + arguments: + - name: --id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. 
+ required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --spot_filter_min_genes + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: --spot_filter_min_counts + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: --gene_filter_min_spots + type: integer + description: Remove genes expressed in less than this number of spots. + required: false + example: 50 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: --remove_mitochondrial + type: boolean + description: Remove mitochondrial genes? + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 600 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + # - name: "--keep_features" + # type: string + # multiple: true + # description: A list of genes to keep.
+ # - name: "--keep_cell_type_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--keep_batch_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--even" + # type: "boolean_true" + # description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run. +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/spatial/zenodo + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: common/extract_metadata +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_zenodo_spatial/main.nf b/src/datasets/workflows/spatial/process_zenodo/main.nf similarity index 99% rename from src/datasets/workflows/process_zenodo_spatial/main.nf rename to src/datasets/workflows/spatial/process_zenodo/main.nf index a5893c0ab4..6343cdc277 100644 --- a/src/datasets/workflows/process_zenodo_spatial/main.nf +++ b/src/datasets/workflows/spatial/process_zenodo/main.nf @@ -49,7 +49,7 @@ workflow run_wf { } // fetch data from legacy openproblems - | zenodo_spatial.run( + | zenodo.run( fromState: [ "input_data": "input_data", "dataset_id": "id", diff --git a/src/datasets/workflows/spatial/process_zenodo_slidetags/config.vsh.yaml 
b/src/datasets/workflows/spatial/process_zenodo_slidetags/config.vsh.yaml new file mode 100644 index 0000000000..2907477c9a --- /dev/null +++ b/src/datasets/workflows/spatial/process_zenodo_slidetags/config.vsh.yaml @@ -0,0 +1,139 @@ +name: process_zenodo_slidetags +namespace: datasets/workflows/spatial +description: | + Download and process slide tags datasets originating from Zenodo. +argument_groups: + - name: Input + arguments: + - name: --input_data + type: string + description: URL to the Anndata file. + required: true + - name: Outputs + arguments: + - name: --output_dataset + type: file + direction: output + description: Output h5ad file + required: true + __merge__: /src/datasets/api/file_spatial_dataset.yaml + - name: --output_meta + direction: output + type: file + description: Dataset metadata + default: dataset_metadata.yaml + - name: Metadata + arguments: + - name: --id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --spot_filter_min_genes + type: integer + description: Remove spots with less than this number of genes. 
+ required: false + example: 200 + - name: --spot_filter_min_counts + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: --gene_filter_min_spots + type: integer + description: Remove genes expressed in less than this number of spots. + required: false + example: 50 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: --remove_mitochondrial + type: boolean + description: Remove mitochondrial genes? + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 600 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + # - name: "--keep_features" + # type: string + # multiple: true + # description: A list of genes to keep. + # - name: "--keep_cell_type_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--keep_batch_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--even" + # type: "boolean_true" + # description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run.
+resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/spatial/zenodo_slidetags + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: common/extract_metadata +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_zenodo_spatial_slidetags/main.nf b/src/datasets/workflows/spatial/process_zenodo_slidetags/main.nf similarity index 98% rename from src/datasets/workflows/process_zenodo_spatial_slidetags/main.nf rename to src/datasets/workflows/spatial/process_zenodo_slidetags/main.nf index 2bb6b9300a..e2f43188a9 100644 --- a/src/datasets/workflows/process_zenodo_spatial_slidetags/main.nf +++ b/src/datasets/workflows/spatial/process_zenodo_slidetags/main.nf @@ -49,7 +49,7 @@ workflow run_wf { } // fetch data from legacy openproblems - | zenodo_spatial_slidetags.run( + | zenodo_slidetags.run( fromState: [ "input_data": "input_data", "dataset_id": "id",