Update dataset loaders (#909)

* update to viash 0.9 and categorise datasets
* group workflows
* add api for spatial datasets
* add more metadata
* update publish dir path
* update project config
* update namespace
* fix id
* update example
* fix example
* update test resources
* update helper resources
* fix multiple separator

---------

Co-authored-by: Robrecht Cannoodt <[email protected]>
openproblems-bio · Oct 18, 2024 · e7b3859 · e7b3859
1 parent 8f49337
commit e7b3859
Show file tree

Hide file tree

Showing 117 changed files with 2,985 additions and 2,761 deletions.
diff --git a/_viash.yaml b/_viash.yaml
@@ -6,7 +6,7 @@ viash_version: 0.9.0
 description: |
   Open Problems is a living, extensible, community-guided benchmarking platform.
 license: MIT
-keywords: [openproblems, benchmarking, single-cell]
+keywords: [openproblems, benchmarking, single-cell omics]
 
 references:
   doi:
@@ -24,6 +24,7 @@ config_mods: |
   .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
   .runners[.type == "nextflow"].config.script := "process.errorStrategy = 'ignore'"
 
+
 info:
   test_resources:
     - type: s3

diff --git a/src/datasets/api/comp_dataset_loader.yaml b/src/datasets/api/comp_dataset_loader.yaml
@@ -1,16 +1,15 @@
-functionality:
-  namespace: "datasets/loaders"
-  info:
-    type: dataset_loader
-    type_info:
-      label: Dataset loader
-      summary: A component which generates a "Common dataset". 
-      description: |
-        A dataset loader will typically have an identifier (e.g. a GEO identifier)
-        or URL as input argument and additional arguments to define where the script needs to download a dataset from and how to process it.
-  arguments:
-    - name: "--output"
-      __merge__: file_raw.yaml
-      direction: "output"
-      required: true
-  test_resources: []
+# namespace: "datasets/loaders"
+info:
+  type: dataset_loader
+  type_info:
+    label: Dataset loader
+    summary: A component which generates a "Common dataset". 
+    description: |
+      A dataset loader will typically have an identifier (e.g. a GEO identifier)
+      or URL as input argument and additional arguments to define where the script needs to download a dataset from and how to process it.
+arguments:
+  - name: "--output"
+    __merge__: file_raw.yaml
+    direction: "output"
+    required: true
+test_resources: []
diff --git a/src/datasets/api/comp_normalization.yaml b/src/datasets/api/comp_normalization.yaml
@@ -1,36 +1,35 @@
-functionality:
-  namespace: "datasets/normalization"
-  info:
-    type: dataset_normalization
-    type_info:
-      label: Dataset normalization
-      summary: |
-        A normalization method which processes the raw counts into a normalized dataset.
-      description:
-        A component for normalizing the raw counts as output by dataset loaders into a normalized dataset.
-  arguments:
-    - name: "--input"
-      __merge__: file_raw.yaml
-      direction: input
-      required: true
-    - name: "--output"
-      __merge__: file_normalized.yaml
-      direction: output
-      required: true
-    - name: "--normalization_id"
-      type: string
-      description: "The normalization id to store in the dataset metadata. If not specified, the functionality name will be used."
-      required: false
-    - name: "--layer_output"
-      type: string
-      default: "normalized"
-      description: The name of the layer in which to store the normalized data.
-    - name: "--obs_size_factors"
-      type: string
-      default: "size_factors"
-      description: In which .obs slot to store the size factors (if any).
-  test_resources:
-    - path: /resources_test/common/pancreas
-      dest: resources_test/common/pancreas
-    - type: python_script
-      path: /src/common/comp_tests/run_and_check_adata.py
+namespace: "datasets/normalization"
+info:
+  type: dataset_normalization
+  type_info:
+    label: Dataset normalization
+    summary: |
+      A normalization method which processes the raw counts into a normalized dataset.
+    description:
+      A component for normalizing the raw counts as output by dataset loaders into a normalized dataset.
+arguments:
+  - name: "--input"
+    __merge__: file_raw.yaml
+    direction: input
+    required: true
+  - name: "--output"
+    __merge__: file_normalized.yaml
+    direction: output
+    required: true
+  - name: "--normalization_id"
+    type: string
+    description: "The normalization id to store in the dataset metadata. If not specified, the functionality name will be used."
+    required: false
+  - name: "--layer_output"
+    type: string
+    default: "normalized"
+    description: The name of the layer in which to store the normalized data.
+  - name: "--obs_size_factors"
+    type: string
+    default: "size_factors"
+    description: In which .obs slot to store the size factors (if any).
+test_resources:
+  - path: /resources_test/common/pancreas
+    dest: resources_test/common/pancreas
+  - type: python_script
+    path: /common/component_tests/run_and_check_output.py
diff --git a/src/datasets/api/comp_processor_hvg.yaml b/src/datasets/api/comp_processor_hvg.yaml
@@ -1,40 +1,39 @@
-functionality:
-  namespace: "datasets/processors"
-  info:
-    type: dataset_processor
-    type_info:
-      label: HVG
-      summary: |
-        Computes the highly variable genes scores.
-      description: |
-        The resulting AnnData will contain both a boolean 'hvg' column in 'var', as well as a numerical 'hvg_score' in 'var'.
-  arguments:
-    - name: "--input"
-      __merge__: file_normalized.yaml
-      required: true
-      direction: input
-    - name: "--input_layer"
-      type: string
-      default: "normalized"
-      description: Which layer to use as input.
-    - name: "--output"
-      direction: output
-      __merge__: file_hvg.yaml
-      required: true
-    - name: "--var_hvg"
-      type: string
-      default: "hvg"
-      description: "In which .var slot to store whether a feature is considered to be hvg."
-    - name: "--var_hvg_score"
-      type: string
-      default: "hvg_score"
-      description: "In which .var slot to store the gene variance score (normalized dispersion)."
-    - name: "--num_features"
-      type: integer
-      default: 1000
-      description: "The number of HVG to select"
-  test_resources:
-    - path: /resources_test/common/pancreas
-      dest: resources_test/common/pancreas
-    - type: python_script
-      path: /src/common/comp_tests/run_and_check_adata.py
+namespace: "datasets/processors"
+info:
+  type: dataset_processor
+  type_info:
+    label: HVG
+    summary: |
+      Computes the highly variable genes scores.
+    description: |
+      The resulting AnnData will contain both a boolean 'hvg' column in 'var', as well as a numerical 'hvg_score' in 'var'.
+arguments:
+  - name: "--input"
+    __merge__: file_normalized.yaml
+    required: true
+    direction: input
+  - name: "--input_layer"
+    type: string
+    default: "normalized"
+    description: Which layer to use as input.
+  - name: "--output"
+    direction: output
+    __merge__: file_hvg.yaml
+    required: true
+  - name: "--var_hvg"
+    type: string
+    default: "hvg"
+    description: "In which .var slot to store whether a feature is considered to be hvg."
+  - name: "--var_hvg_score"
+    type: string
+    default: "hvg_score"
+    description: "In which .var slot to store the gene variance score (normalized dispersion)."
+  - name: "--num_features"
+    type: integer
+    default: 1000
+    description: "The number of HVG to select"
+test_resources:
+  - path: /resources_test/common/pancreas
+    dest: resources_test/common/pancreas
+  - type: python_script
+    path: /common/component_tests/run_and_check_output.py
diff --git a/src/datasets/api/comp_processor_knn.yaml b/src/datasets/api/comp_processor_knn.yaml
@@ -1,39 +1,38 @@
-functionality:
-  namespace: "datasets/processors"
-  info:
-    type: dataset_processor
-    type_info:
-      label: KNN
-      summary: |
-        Computes the k-nearest-neighbours for each cell.
-      description: |
-        The resulting AnnData will contain both the knn distances and the knn connectivities in 'obsp'.
-  arguments:
-    - name: "--input"
-      __merge__: file_pca.yaml
-      required: true
-      direction: input
-    - name: "--input_layer"
-      type: string
-      default: "normalized"
-      description: Which layer to use as input.
-    - name: "--output"
-      direction: output
-      __merge__: file_knn.yaml
-      required: true
-    - name: "--key_added"
-      type: string
-      default: "knn"
-      description: |
-        The neighbors data is added to `.uns[key_added]`, 
-        distances are stored in `.obsp[key_added+'_distances']` and 
-        connectivities in `.obsp[key_added+'_connectivities']`.
-    - name: "--num_neighbors"
-      type: integer
-      default: 15
-      description: "The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation."
-  test_resources:
-    - path: /resources_test/common/pancreas
-      dest: resources_test/common/pancreas
-    - type: python_script
-      path: /src/common/comp_tests/run_and_check_adata.py
+namespace: "datasets/processors"
+info:
+  type: dataset_processor
+  type_info:
+    label: KNN
+    summary: |
+      Computes the k-nearest-neighbours for each cell.
+    description: |
+      The resulting AnnData will contain both the knn distances and the knn connectivities in 'obsp'.
+arguments:
+  - name: "--input"
+    __merge__: file_pca.yaml
+    required: true
+    direction: input
+  - name: "--input_layer"
+    type: string
+    default: "normalized"
+    description: Which layer to use as input.
+  - name: "--output"
+    direction: output
+    __merge__: file_knn.yaml
+    required: true
+  - name: "--key_added"
+    type: string
+    default: "knn"
+    description: |
+      The neighbors data is added to `.uns[key_added]`, 
+      distances are stored in `.obsp[key_added+'_distances']` and 
+      connectivities in `.obsp[key_added+'_connectivities']`.
+  - name: "--num_neighbors"
+    type: integer
+    default: 15
+    description: "The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation."
+test_resources:
+  - path: /resources_test/common/pancreas
+    dest: resources_test/common/pancreas
+  - type: python_script
+    path: /common/component_tests/run_and_check_output.py
diff --git a/src/datasets/api/comp_processor_pca.yaml b/src/datasets/api/comp_processor_pca.yaml
@@ -1,49 +1,48 @@
-functionality:
-  namespace: "datasets/processors"
-  info:
-    type: dataset_processor
-    type_info:
-      label: PCA
-      summary: |
-        Computes a PCA embedding of the normalized data.
-      description:
-        The resulting AnnData will contain an embedding in obsm, as well as optional loadings in 'varm'.
-  arguments:
-    - name: "--input"
-      __merge__: file_hvg.yaml
-      required: true
-      direction: input
-    - name: "--input_layer"
-      type: string
-      default: "normalized"
-      description: Which layer to use as input.
-    - name: "--input_var_features"
-      type: string
-      description: Column name in .var matrix that will be used to select which genes to run the PCA on.
-      default: hvg
-    - name: "--output"
-      direction: output
-      __merge__: file_pca.yaml
-      required: true
-    - name: "--obsm_embedding"
-      type: string
-      default: "X_pca"
-      description: "In which .obsm slot to store the resulting embedding."
-    - name: "--varm_loadings"
-      type: string
-      default: "pca_loadings"
-      description: "In which .varm slot to store the resulting loadings matrix."
-    - name: "--uns_variance"
-      type: string
-      default: "pca_variance"
-      description: "In which .uns slot to store the resulting variance objects."
-    - name: "--num_components"
-      type: integer
-      example: 25
-      description: Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation.
-  test_resources:
-    - path: /resources_test/common/pancreas
-      dest: resources_test/common/pancreas
-    - type: python_script
-      path: /src/common/comp_tests/run_and_check_adata.py
+namespace: "datasets/processors"
+info:
+  type: dataset_processor
+  type_info:
+    label: PCA
+    summary: |
+      Computes a PCA embedding of the normalized data.
+    description:
+      The resulting AnnData will contain an embedding in obsm, as well as optional loadings in 'varm'.
+arguments:
+  - name: "--input"
+    __merge__: file_hvg.yaml
+    required: true
+    direction: input
+  - name: "--input_layer"
+    type: string
+    default: "normalized"
+    description: Which layer to use as input.
+  - name: "--input_var_features"
+    type: string
+    description: Column name in .var matrix that will be used to select which genes to run the PCA on.
+    default: hvg
+  - name: "--output"
+    direction: output
+    __merge__: file_pca.yaml
+    required: true
+  - name: "--obsm_embedding"
+    type: string
+    default: "X_pca"
+    description: "In which .obsm slot to store the resulting embedding."
+  - name: "--varm_loadings"
+    type: string
+    default: "pca_loadings"
+    description: "In which .varm slot to store the resulting loadings matrix."
+  - name: "--uns_variance"
+    type: string
+    default: "pca_variance"
+    description: "In which .uns slot to store the resulting variance objects."
+  - name: "--num_components"
+    type: integer
+    example: 25
+    description: Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation.
+test_resources:
+  - path: /resources_test/common/pancreas
+    dest: resources_test/common/pancreas
+  - type: python_script
+    path: /common/component_tests/run_and_check_output.py