From e7b38592d868cee0751e4b2b810cca371e7cd061 Mon Sep 17 00:00:00 2001 From: Sai Nirmayi Yasa <92786623+sainirmayi@users.noreply.github.com> Date: Fri, 18 Oct 2024 18:18:56 +0200 Subject: [PATCH] Update dataset loaders (#909) * update to viash 0.9 and categorise datasets * group workflows * add api for spatial datasets * add more metadata * update publish dir path * update project config * update namespace * fix id * update example * fix example * update test resources * update helper resources * fix multiple separator --------- Co-authored-by: Robrecht Cannoodt --- _viash.yaml | 3 +- src/datasets/api/comp_dataset_loader.yaml | 31 ++- src/datasets/api/comp_normalization.yaml | 71 +++--- src/datasets/api/comp_processor_hvg.yaml | 79 ++++--- src/datasets/api/comp_processor_knn.yaml | 77 ++++--- src/datasets/api/comp_processor_pca.yaml | 95 ++++---- src/datasets/api/comp_processor_subset.yaml | 59 +++-- src/datasets/api/comp_processor_svd.yaml | 87 ++++---- src/datasets/api/file_common_dataset.yaml | 11 +- src/datasets/api/file_hvg.yaml | 7 +- src/datasets/api/file_knn.yaml | 7 +- src/datasets/api/file_multimodal_dataset.yaml | 17 +- src/datasets/api/file_normalized.yaml | 7 +- src/datasets/api/file_pca.yaml | 7 +- src/datasets/api/file_raw.yaml | 27 ++- src/datasets/api/file_spatial_dataset.yaml | 194 ++++++++++++++++ src/datasets/api/file_svd.yaml | 7 +- .../loaders/cellxgene_census/config.vsh.yaml | 167 -------------- .../config.vsh.yaml | 130 ----------- .../config.vsh.yaml | 75 +++++++ .../openproblems_neurips2021_bmmc/script.py | 2 +- .../openproblems_neurips2021_bmmc/test.py | 0 .../config.vsh.yaml | 81 +++++++ .../openproblems_neurips2022_pbmc/script.py | 2 +- .../openproblems_neurips2022_pbmc/test.py | 0 .../config.vsh.yaml | 95 ++++++++ .../openproblems_v1_multimodal/script.py | 0 .../openproblems_v1_multimodal/test.py | 0 .../config.vsh.yaml | 74 ------- .../config.vsh.yaml | 80 ------- .../loaders/openproblems_v1/config.vsh.yaml | 86 ------- 
.../config.vsh.yaml | 94 -------- .../scrnaseq/cellxgene_census/config.vsh.yaml | 176 +++++++++++++++ .../{ => scrnaseq}/cellxgene_census/script.py | 0 .../{ => scrnaseq}/cellxgene_census/test.py | 0 .../config.vsh.yaml | 132 +++++++++++ .../script.py | 0 .../cellxgene_census_from_source_h5ad/test.py | 0 .../scrnaseq/openproblems_v1/config.vsh.yaml | 91 ++++++++ .../{ => scrnaseq}/openproblems_v1/script.py | 0 .../{ => scrnaseq}/openproblems_v1/test.py | 0 .../spatial/tenx_visium/config.vsh.yaml | 96 ++++++++ .../{ => spatial}/tenx_visium/script.py | 2 +- .../loaders/{ => spatial}/tenx_visium/test.py | 0 .../loaders/spatial/zenodo/config.vsh.yaml | 88 ++++++++ .../zenodo}/script.py | 4 +- .../zenodo}/test.py | 2 +- .../spatial/zenodo_slidetags/config.vsh.yaml | 88 ++++++++ .../zenodo_slidetags}/script.py | 4 +- .../zenodo_slidetags}/test.py | 2 +- .../loaders/tenx_visium/config.vsh.yaml | 96 -------- .../loaders/zenodo_spatial/config.vsh.yaml | 87 -------- .../zenodo_spatial_slidetags/config.vsh.yaml | 88 -------- .../normalization/atac_tfidf/config.vsh.yaml | 23 +- .../normalization/atac_tfidf/script.py | 4 +- .../normalization/l1_sqrt/config.vsh.yaml | 29 +-- src/datasets/normalization/l1_sqrt/script.py | 4 +- .../normalization/log_cp/config.vsh.yaml | 28 +-- src/datasets/normalization/log_cp/script.py | 4 +- .../log_scran_pooling/config.vsh.yaml | 20 +- .../normalization/log_scran_pooling/script.R | 2 +- .../normalization/prot_clr/config.vsh.yaml | 29 +-- src/datasets/normalization/prot_clr/script.py | 4 +- .../normalization/sqrt_cp/config.vsh.yaml | 27 +-- src/datasets/normalization/sqrt_cp/script.py | 4 +- src/datasets/processors/hvg/config.vsh.yaml | 15 +- src/datasets/processors/knn/config.vsh.yaml | 15 +- src/datasets/processors/pca/config.vsh.yaml | 15 +- .../processors/subsample/config.vsh.yaml | 85 +++---- .../processors/subsample/test_script.py | 6 +- src/datasets/processors/svd/config.vsh.yaml | 15 +- .../resource_scripts/cellxgene_census.sh | 4 +- 
src/datasets/resource_scripts/dataset_info.sh | 6 +- .../openproblems_neurips2021_multimodal.sh | 4 +- ...penproblems_neurips2021_multimodal_test.sh | 4 +- .../openproblems_neurips2022_pbmc.sh | 4 +- .../resource_scripts/openproblems_v1.sh | 4 +- .../openproblems_v1_multimodal.sh | 4 +- .../openproblems_v1_multimodal_test.sh | 4 +- .../resource_scripts/openproblems_v1_test.sh | 4 +- src/datasets/resource_scripts/tenx_visium.sh | 8 +- .../{zenodo_spatial.sh => zenodo.sh} | 68 +++--- ...atial_slidetags.sh => zenodo_slidetags.sh} | 12 +- .../cxg_immune_cell_atlas.sh | 5 +- .../cxg_mouse_pancreas_atlas.sh | 4 +- .../mouse_brain_coronal.sh | 4 +- .../resource_test_scripts/neurips2021_bmmc.sh | 8 +- .../resource_test_scripts/neurips2022_pbmc.sh | 6 +- .../resource_test_scripts/pancreas.sh | 12 +- .../scicar_cell_lines.sh | 4 +- .../resource_test_scripts/slideseq_test.sh | 36 --- .../extract_dataset_info/config.vsh.yaml | 66 +++--- .../extract_dataset_meta/config.vsh.yaml | 47 ++-- .../config.vsh.yaml | 138 ++++++++++++ .../main.nf | 0 .../config.vsh.yaml | 144 ++++++++++++ .../main.nf | 0 .../config.vsh.yaml | 165 ++++++++++++++ .../main.nf | 0 .../process_cellxgene_census/config.vsh.yaml | 201 ----------------- .../config.vsh.yaml | 137 ------------ .../config.vsh.yaml | 143 ------------ .../process_openproblems_v1/config.vsh.yaml | 163 -------------- .../config.vsh.yaml | 161 -------------- .../process_tenx_visium/config.vsh.yaml | 142 ------------ .../process_zenodo_spatial/config.vsh.yaml | 138 ------------ .../config.vsh.yaml | 138 ------------ .../process_cellxgene_census/config.vsh.yaml | 209 ++++++++++++++++++ .../process_cellxgene_census/main.nf | 0 .../process_openproblems_v1/config.vsh.yaml | 167 ++++++++++++++ .../process_openproblems_v1/main.nf | 0 .../process_tenx_visium/config.vsh.yaml | 143 ++++++++++++ .../{ => spatial}/process_tenx_visium/main.nf | 0 .../spatial/process_zenodo/config.vsh.yaml | 139 ++++++++++++ .../process_zenodo}/main.nf | 2 +- 
.../process_zenodo_slidetags/config.vsh.yaml | 139 ++++++++++++ .../process_zenodo_slidetags}/main.nf | 2 +- 117 files changed, 2985 insertions(+), 2761 deletions(-) create mode 100644 src/datasets/api/file_spatial_dataset.yaml delete mode 100644 src/datasets/loaders/cellxgene_census/config.vsh.yaml delete mode 100644 src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml create mode 100644 src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/config.vsh.yaml rename src/datasets/loaders/{ => multimodal}/openproblems_neurips2021_bmmc/script.py (98%) rename src/datasets/loaders/{ => multimodal}/openproblems_neurips2021_bmmc/test.py (100%) create mode 100644 src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/config.vsh.yaml rename src/datasets/loaders/{ => multimodal}/openproblems_neurips2022_pbmc/script.py (98%) rename src/datasets/loaders/{ => multimodal}/openproblems_neurips2022_pbmc/test.py (100%) create mode 100644 src/datasets/loaders/multimodal/openproblems_v1_multimodal/config.vsh.yaml rename src/datasets/loaders/{ => multimodal}/openproblems_v1_multimodal/script.py (100%) rename src/datasets/loaders/{ => multimodal}/openproblems_v1_multimodal/test.py (100%) delete mode 100644 src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml delete mode 100644 src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml delete mode 100644 src/datasets/loaders/openproblems_v1/config.vsh.yaml delete mode 100644 src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml create mode 100644 src/datasets/loaders/scrnaseq/cellxgene_census/config.vsh.yaml rename src/datasets/loaders/{ => scrnaseq}/cellxgene_census/script.py (100%) rename src/datasets/loaders/{ => scrnaseq}/cellxgene_census/test.py (100%) create mode 100644 src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/config.vsh.yaml rename src/datasets/loaders/{ => scrnaseq}/cellxgene_census_from_source_h5ad/script.py (100%) rename 
src/datasets/loaders/{ => scrnaseq}/cellxgene_census_from_source_h5ad/test.py (100%) create mode 100644 src/datasets/loaders/scrnaseq/openproblems_v1/config.vsh.yaml rename src/datasets/loaders/{ => scrnaseq}/openproblems_v1/script.py (100%) rename src/datasets/loaders/{ => scrnaseq}/openproblems_v1/test.py (100%) create mode 100644 src/datasets/loaders/spatial/tenx_visium/config.vsh.yaml rename src/datasets/loaders/{ => spatial}/tenx_visium/script.py (98%) rename src/datasets/loaders/{ => spatial}/tenx_visium/test.py (100%) create mode 100644 src/datasets/loaders/spatial/zenodo/config.vsh.yaml rename src/datasets/loaders/{zenodo_spatial => spatial/zenodo}/script.py (96%) rename src/datasets/loaders/{zenodo_spatial => spatial/zenodo}/test.py (97%) create mode 100644 src/datasets/loaders/spatial/zenodo_slidetags/config.vsh.yaml rename src/datasets/loaders/{zenodo_spatial_slidetags => spatial/zenodo_slidetags}/script.py (96%) rename src/datasets/loaders/{zenodo_spatial_slidetags => spatial/zenodo_slidetags}/test.py (97%) delete mode 100644 src/datasets/loaders/tenx_visium/config.vsh.yaml delete mode 100644 src/datasets/loaders/zenodo_spatial/config.vsh.yaml delete mode 100644 src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml rename src/datasets/resource_scripts/{zenodo_spatial.sh => zenodo.sh} (92%) rename src/datasets/resource_scripts/{zenodo_spatial_slidetags.sh => zenodo_slidetags.sh} (89%) delete mode 100755 src/datasets/resource_test_scripts/slideseq_test.sh create mode 100644 src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/config.vsh.yaml rename src/datasets/workflows/{ => multimodal}/process_openproblems_neurips2021_bmmc/main.nf (100%) create mode 100644 src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/config.vsh.yaml rename src/datasets/workflows/{ => multimodal}/process_openproblems_neurips2022_pbmc/main.nf (100%) create mode 100644 
src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/config.vsh.yaml rename src/datasets/workflows/{ => multimodal}/process_openproblems_v1_multimodal/main.nf (100%) delete mode 100644 src/datasets/workflows/process_cellxgene_census/config.vsh.yaml delete mode 100644 src/datasets/workflows/process_openproblems_neurips2021_bmmc/config.vsh.yaml delete mode 100644 src/datasets/workflows/process_openproblems_neurips2022_pbmc/config.vsh.yaml delete mode 100644 src/datasets/workflows/process_openproblems_v1/config.vsh.yaml delete mode 100644 src/datasets/workflows/process_openproblems_v1_multimodal/config.vsh.yaml delete mode 100644 src/datasets/workflows/process_tenx_visium/config.vsh.yaml delete mode 100644 src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml delete mode 100644 src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml create mode 100644 src/datasets/workflows/scrnaseq/process_cellxgene_census/config.vsh.yaml rename src/datasets/workflows/{ => scrnaseq}/process_cellxgene_census/main.nf (100%) create mode 100644 src/datasets/workflows/scrnaseq/process_openproblems_v1/config.vsh.yaml rename src/datasets/workflows/{ => scrnaseq}/process_openproblems_v1/main.nf (100%) create mode 100644 src/datasets/workflows/spatial/process_tenx_visium/config.vsh.yaml rename src/datasets/workflows/{ => spatial}/process_tenx_visium/main.nf (100%) create mode 100644 src/datasets/workflows/spatial/process_zenodo/config.vsh.yaml rename src/datasets/workflows/{process_zenodo_spatial => spatial/process_zenodo}/main.nf (99%) create mode 100644 src/datasets/workflows/spatial/process_zenodo_slidetags/config.vsh.yaml rename src/datasets/workflows/{process_zenodo_spatial_slidetags => spatial/process_zenodo_slidetags}/main.nf (98%) diff --git a/_viash.yaml b/_viash.yaml index f262b4964b..0d06459772 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -6,7 +6,7 @@ viash_version: 0.9.0 description: | Open Problems is a living, extensible, community-guided 
benchmarking platform. license: MIT -keywords: [openproblems, benchmarking, single-cell] +keywords: [openproblems, benchmarking, single-cell omics] references: doi: @@ -24,6 +24,7 @@ config_mods: | .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } .runners[.type == "nextflow"].config.script := "process.errorStrategy = 'ignore'" + info: test_resources: - type: s3 diff --git a/src/datasets/api/comp_dataset_loader.yaml b/src/datasets/api/comp_dataset_loader.yaml index 75909b106a..f3ea6426bb 100644 --- a/src/datasets/api/comp_dataset_loader.yaml +++ b/src/datasets/api/comp_dataset_loader.yaml @@ -1,16 +1,15 @@ -functionality: - namespace: "datasets/loaders" - info: - type: dataset_loader - type_info: - label: Dataset loader - summary: A component which generates a "Common dataset". - description: | - A dataset loader will typically have an identifier (e.g. a GEO identifier) - or URL as input argument and additional arguments to define where the script needs to download a dataset from and how to process it. - arguments: - - name: "--output" - __merge__: file_raw.yaml - direction: "output" - required: true - test_resources: [] \ No newline at end of file +# namespace: "datasets/loaders" +info: + type: dataset_loader + type_info: + label: Dataset loader + summary: A component which generates a "Common dataset". + description: | + A dataset loader will typically have an identifier (e.g. a GEO identifier) + or URL as input argument and additional arguments to define where the script needs to download a dataset from and how to process it. 
+arguments: + - name: "--output" + __merge__: file_raw.yaml + direction: "output" + required: true +test_resources: [] \ No newline at end of file diff --git a/src/datasets/api/comp_normalization.yaml b/src/datasets/api/comp_normalization.yaml index 6f2c1ffa64..38cd4efe81 100644 --- a/src/datasets/api/comp_normalization.yaml +++ b/src/datasets/api/comp_normalization.yaml @@ -1,36 +1,35 @@ -functionality: - namespace: "datasets/normalization" - info: - type: dataset_normalization - type_info: - label: Dataset normalization - summary: | - A normalization method which processes the raw counts into a normalized dataset. - description: - A component for normalizing the raw counts as output by dataset loaders into a normalized dataset. - arguments: - - name: "--input" - __merge__: file_raw.yaml - direction: input - required: true - - name: "--output" - __merge__: file_normalized.yaml - direction: output - required: true - - name: "--normalization_id" - type: string - description: "The normalization id to store in the dataset metadata. If not specified, the functionality name will be used." - required: false - - name: "--layer_output" - type: string - default: "normalized" - description: The name of the layer in which to store the normalized data. - - name: "--obs_size_factors" - type: string - default: "size_factors" - description: In which .obs slot to store the size factors (if any). - test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/normalization" +info: + type: dataset_normalization + type_info: + label: Dataset normalization + summary: | + A normalization method which processes the raw counts into a normalized dataset. + description: + A component for normalizing the raw counts as output by dataset loaders into a normalized dataset. 
+arguments: + - name: "--input" + __merge__: file_raw.yaml + direction: input + required: true + - name: "--output" + __merge__: file_normalized.yaml + direction: output + required: true + - name: "--normalization_id" + type: string + description: "The normalization id to store in the dataset metadata. If not specified, the functionality name will be used." + required: false + - name: "--layer_output" + type: string + default: "normalized" + description: The name of the layer in which to store the normalized data. + - name: "--obs_size_factors" + type: string + default: "size_factors" + description: In which .obs slot to store the size factors (if any). +test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/comp_processor_hvg.yaml b/src/datasets/api/comp_processor_hvg.yaml index 2e24033aac..bfed255d02 100644 --- a/src/datasets/api/comp_processor_hvg.yaml +++ b/src/datasets/api/comp_processor_hvg.yaml @@ -1,40 +1,39 @@ -functionality: - namespace: "datasets/processors" - info: - type: dataset_processor - type_info: - label: HVG - summary: | - Computes the highly variable genes scores. - description: | - The resulting AnnData will contain both a boolean 'hvg' column in 'var', as well as a numerical 'hvg_score' in 'var'. - arguments: - - name: "--input" - __merge__: file_normalized.yaml - required: true - direction: input - - name: "--input_layer" - type: string - default: "normalized" - description: Which layer to use as input. - - name: "--output" - direction: output - __merge__: file_hvg.yaml - required: true - - name: "--var_hvg" - type: string - default: "hvg" - description: "In which .var slot to store whether a feature is considered to be hvg." - - name: "--var_hvg_score" - type: string - default: "hvg_score" - description: "In which .var slot to store the gene variance score (normalized dispersion)." 
- - name: "--num_features" - type: integer - default: 1000 - description: "The number of HVG to select" - test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/processors" +info: + type: dataset_processor + type_info: + label: HVG + summary: | + Computes the highly variable genes scores. + description: | + The resulting AnnData will contain both a boolean 'hvg' column in 'var', as well as a numerical 'hvg_score' in 'var'. +arguments: + - name: "--input" + __merge__: file_normalized.yaml + required: true + direction: input + - name: "--input_layer" + type: string + default: "normalized" + description: Which layer to use as input. + - name: "--output" + direction: output + __merge__: file_hvg.yaml + required: true + - name: "--var_hvg" + type: string + default: "hvg" + description: "In which .var slot to store whether a feature is considered to be hvg." + - name: "--var_hvg_score" + type: string + default: "hvg_score" + description: "In which .var slot to store the gene variance score (normalized dispersion)." + - name: "--num_features" + type: integer + default: 1000 + description: "The number of HVG to select" +test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/comp_processor_knn.yaml b/src/datasets/api/comp_processor_knn.yaml index b0e16f8fc4..be95b83e38 100644 --- a/src/datasets/api/comp_processor_knn.yaml +++ b/src/datasets/api/comp_processor_knn.yaml @@ -1,39 +1,38 @@ -functionality: - namespace: "datasets/processors" - info: - type: dataset_processor - type_info: - label: KNN - summary: | - Computes the k-nearest-neighbours for each cell. - description: | - The resulting AnnData will contain both the knn distances and the knn connectivities in 'obsp'. 
- arguments: - - name: "--input" - __merge__: file_pca.yaml - required: true - direction: input - - name: "--input_layer" - type: string - default: "normalized" - description: Which layer to use as input. - - name: "--output" - direction: output - __merge__: file_knn.yaml - required: true - - name: "--key_added" - type: string - default: "knn" - description: | - The neighbors data is added to `.uns[key_added]`, - distances are stored in `.obsp[key_added+'_distances']` and - connectivities in `.obsp[key_added+'_connectivities']`. - - name: "--num_neighbors" - type: integer - default: 15 - description: "The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation." - test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/processors" +info: + type: dataset_processor + type_info: + label: KNN + summary: | + Computes the k-nearest-neighbours for each cell. + description: | + The resulting AnnData will contain both the knn distances and the knn connectivities in 'obsp'. +arguments: + - name: "--input" + __merge__: file_pca.yaml + required: true + direction: input + - name: "--input_layer" + type: string + default: "normalized" + description: Which layer to use as input. + - name: "--output" + direction: output + __merge__: file_knn.yaml + required: true + - name: "--key_added" + type: string + default: "knn" + description: | + The neighbors data is added to `.uns[key_added]`, + distances are stored in `.obsp[key_added+'_distances']` and + connectivities in `.obsp[key_added+'_connectivities']`. + - name: "--num_neighbors" + type: integer + default: 15 + description: "The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation." 
+test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/comp_processor_pca.yaml b/src/datasets/api/comp_processor_pca.yaml index a7ca82bc07..051532cf1e 100644 --- a/src/datasets/api/comp_processor_pca.yaml +++ b/src/datasets/api/comp_processor_pca.yaml @@ -1,49 +1,48 @@ -functionality: - namespace: "datasets/processors" - info: - type: dataset_processor - type_info: - label: PCA - summary: | - Computes a PCA embedding of the normalized data. - description: - The resulting AnnData will contain an embedding in obsm, as well as optional loadings in 'varm'. - arguments: - - name: "--input" - __merge__: file_hvg.yaml - required: true - direction: input - - name: "--input_layer" - type: string - default: "normalized" - description: Which layer to use as input. - - name: "--input_var_features" - type: string - description: Column name in .var matrix that will be used to select which genes to run the PCA on. - default: hvg - - name: "--output" - direction: output - __merge__: file_pca.yaml - required: true - - name: "--obsm_embedding" - type: string - default: "X_pca" - description: "In which .obsm slot to store the resulting embedding." - - name: "--varm_loadings" - type: string - default: "pca_loadings" - description: "In which .varm slot to store the resulting loadings matrix." - - name: "--uns_variance" - type: string - default: "pca_variance" - description: "In which .uns slot to store the resulting variance objects." - - name: "--num_components" - type: integer - example: 25 - description: Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation. 
- test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/processors" +info: + type: dataset_processor + type_info: + label: PCA + summary: | + Computes a PCA embedding of the normalized data. + description: + The resulting AnnData will contain an embedding in obsm, as well as optional loadings in 'varm'. +arguments: + - name: "--input" + __merge__: file_hvg.yaml + required: true + direction: input + - name: "--input_layer" + type: string + default: "normalized" + description: Which layer to use as input. + - name: "--input_var_features" + type: string + description: Column name in .var matrix that will be used to select which genes to run the PCA on. + default: hvg + - name: "--output" + direction: output + __merge__: file_pca.yaml + required: true + - name: "--obsm_embedding" + type: string + default: "X_pca" + description: "In which .obsm slot to store the resulting embedding." + - name: "--varm_loadings" + type: string + default: "pca_loadings" + description: "In which .varm slot to store the resulting loadings matrix." + - name: "--uns_variance" + type: string + default: "pca_variance" + description: "In which .uns slot to store the resulting variance objects." + - name: "--num_components" + type: integer + example: 25 + description: Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation. 
+test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/comp_processor_subset.yaml b/src/datasets/api/comp_processor_subset.yaml index bad64a6762..c49e7f2ece 100644 --- a/src/datasets/api/comp_processor_subset.yaml +++ b/src/datasets/api/comp_processor_subset.yaml @@ -1,31 +1,30 @@ -functionality: - namespace: "datasets/processors" - info: - type: dataset_processor - type_info: - label: Subset - summary: Sample cells and genes randomly. - description: This component subsets the layers, obs and var to create smaller test datasets. - arguments: - - name: "--input" - __merge__: file_common_dataset.yaml - required: true - direction: input - - name: "--input_mod2" - __merge__: file_common_dataset.yaml - direction: input - required: false - - name: "--output" - __merge__: file_common_dataset.yaml - direction: output - required: true - - name: "--output_mod2" - __merge__: file_common_dataset.yaml - direction: output - required: false - test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/processors" +info: + type: dataset_processor + type_info: + label: Subset + summary: Sample cells and genes randomly. + description: This component subsets the layers, obs and var to create smaller test datasets. 
+arguments: + - name: "--input" + __merge__: file_common_dataset.yaml + required: true + direction: input + - name: "--input_mod2" + __merge__: file_common_dataset.yaml + direction: input + required: false + - name: "--output" + __merge__: file_common_dataset.yaml + direction: output + required: true + - name: "--output_mod2" + __merge__: file_common_dataset.yaml + direction: output + required: false +test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/comp_processor_svd.yaml b/src/datasets/api/comp_processor_svd.yaml index 91413c2624..d5c0ae8ba8 100644 --- a/src/datasets/api/comp_processor_svd.yaml +++ b/src/datasets/api/comp_processor_svd.yaml @@ -1,45 +1,44 @@ -functionality: - namespace: "datasets/processors" - info: - type: dataset_processor - type_info: - label: SVD - summary: | - Computes a SVD PCA embedding of the normalized data. - description: - The resulting AnnData will contain an embedding in obsm. - arguments: - - name: "--input" - __merge__: file_normalized.yaml - required: true - direction: input - - name: "--input_mod2" - __merge__: file_normalized.yaml - required: false - direction: input - - name: "--input_layer" - type: string - default: "normalized" - description: Which layer to use as input. - - name: "--output" - direction: output - __merge__: file_svd.yaml - required: true - - name: "--output_mod2" - direction: output - __merge__: file_svd.yaml - required: false - - name: "--obsm_embedding" - type: string - default: "X_svd" - description: "In which .obsm slot to store the resulting embedding." - - name: "--num_components" - type: integer - default: 100 - description: Number of principal components to compute. Defaults to 100, or 1 - minimum dimension size of selected representation. 
- test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/processors" +info: + type: dataset_processor + type_info: + label: SVD + summary: | + Computes a SVD PCA embedding of the normalized data. + description: + The resulting AnnData will contain an embedding in obsm. +arguments: + - name: "--input" + __merge__: file_normalized.yaml + required: true + direction: input + - name: "--input_mod2" + __merge__: file_normalized.yaml + required: false + direction: input + - name: "--input_layer" + type: string + default: "normalized" + description: Which layer to use as input. + - name: "--output" + direction: output + __merge__: file_svd.yaml + required: true + - name: "--output_mod2" + direction: output + __merge__: file_svd.yaml + required: false + - name: "--obsm_embedding" + type: string + default: "X_svd" + description: "In which .obsm slot to store the resulting embedding." + - name: "--num_components" + type: integer + default: 100 + description: Number of principal components to compute. Defaults to 100, or 1 - minimum dimension size of selected representation. +test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/file_common_dataset.yaml b/src/datasets/api/file_common_dataset.yaml index ed7836bf5c..4ca8722aa7 100644 --- a/src/datasets/api/file_common_dataset.yaml +++ b/src/datasets/api/file_common_dataset.yaml @@ -1,9 +1,8 @@ __merge__: file_knn.yaml type: file example: "resources_test/common/pancreas/dataset.h5ad" -info: - label: "Common dataset" - summary: A dataset processed by the common dataset processing pipeline. - description: | - This dataset contains both raw counts and normalized data matrices, - as well as a PCA embedding, HVG selection and a kNN graph. 
+label: "Common dataset" +summary: A dataset processed by the common dataset processing pipeline. +description: | + This dataset contains both raw counts and normalized data matrices, + as well as a PCA embedding, HVG selection and a kNN graph. \ No newline at end of file diff --git a/src/datasets/api/file_hvg.yaml b/src/datasets/api/file_hvg.yaml index 697be29e32..47b8f88922 100644 --- a/src/datasets/api/file_hvg.yaml +++ b/src/datasets/api/file_hvg.yaml @@ -1,10 +1,11 @@ __merge__: file_normalized.yaml type: file example: "resources_test/common/pancreas/hvg.h5ad" +label: "Dataset+HVG" +summary: "A normalised dataset with a PCA embedding and HVG selection." info: - label: "Dataset+HVG" - summary: "A normalised dataset with a PCA embedding and HVG selection." - slots: + format: + type: h5ad var: - type: boolean name: hvg diff --git a/src/datasets/api/file_knn.yaml b/src/datasets/api/file_knn.yaml index de7d2b8df5..c2f320e08e 100644 --- a/src/datasets/api/file_knn.yaml +++ b/src/datasets/api/file_knn.yaml @@ -1,10 +1,11 @@ __merge__: file_pca.yaml type: file example: "resources_test/common/pancreas/knn.h5ad" +label: "Dataset+HVG+PCA+kNN" +summary: "A normalised data with a PCA embedding, HVG selection and a kNN graph" info: - label: "Dataset+HVG+PCA+kNN" - summary: "A normalised data with a PCA embedding, HVG selection and a kNN graph" - slots: + format: + type: h5ad obsp: - type: double name: knn_distances diff --git a/src/datasets/api/file_multimodal_dataset.yaml b/src/datasets/api/file_multimodal_dataset.yaml index daac29d77b..b8ae760225 100644 --- a/src/datasets/api/file_multimodal_dataset.yaml +++ b/src/datasets/api/file_multimodal_dataset.yaml @@ -1,14 +1,15 @@ type: file example: "resources_test/common/pancreas/dataset.h5ad" +label: "Common dataset" +summary: A dataset processed by the common dataset processing pipeline. +description: | + This dataset contains both raw counts and normalized data matrices, + as well as a SVD embedding and a HVG selection. 
+ + The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md). info: - label: "Common dataset" - summary: A dataset processed by the common dataset processing pipeline. - description: | - This dataset contains both raw counts and normalized data matrices, - as well as a SVD embedding and a HVG selection. - - The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md). - slots: + format: + type: h5ad layers: - type: integer name: counts diff --git a/src/datasets/api/file_normalized.yaml b/src/datasets/api/file_normalized.yaml index ea6f14e9fb..f163e31db9 100644 --- a/src/datasets/api/file_normalized.yaml +++ b/src/datasets/api/file_normalized.yaml @@ -1,10 +1,11 @@ __merge__: file_raw.yaml type: file example: "resources_test/common/pancreas/normalized.h5ad" +label: "Normalized dataset" +summary: "A normalized dataset" info: - label: "Normalized dataset" - summary: "A normalized dataset" - slots: + format: + type: h5ad layers: - type: double name: normalized diff --git a/src/datasets/api/file_pca.yaml b/src/datasets/api/file_pca.yaml index daa26618e1..2d2e48f95b 100644 --- a/src/datasets/api/file_pca.yaml +++ b/src/datasets/api/file_pca.yaml @@ -1,10 +1,11 @@ __merge__: file_hvg.yaml type: file example: "resources_test/common/pancreas/pca.h5ad" +label: "Dataset+HVG+PCA" +summary: "A normalised dataset with a PCA embedding" info: - label: "Dataset+HVG+PCA" - summary: "A normalised dataset with a PCA embedding" - slots: + format: + type: h5ad obsm: - type: double name: X_pca diff --git a/src/datasets/api/file_raw.yaml b/src/datasets/api/file_raw.yaml index 7ffab3b43e..f42b022a38 100644 --- a/src/datasets/api/file_raw.yaml +++ b/src/datasets/api/file_raw.yaml @@ -1,13 +1,14 @@ type: file example: "resources_test/common/pancreas/raw.h5ad" -info: - label: "Raw dataset" - 
summary: An unprocessed dataset as output by a dataset loader. - description: | - This dataset contains raw counts and metadata as output by a dataset loader. +label: "Raw dataset" +summary: An unprocessed dataset as output by a dataset loader. +description: | + This dataset contains raw counts and metadata as output by a dataset loader. - The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md). - slots: + The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md). +info: + format: + type: h5ad layers: - type: integer name: counts @@ -203,3 +204,15 @@ info: description: The organism of the sample in the dataset. required: false multiple: true + - name: dataset_technology + type: string + description: The technology used to generate the dataset. + required: false + - name: dataset_organ + type: string + description: The organ of the sample in the dataset. + required: false + - name: dataset_tissue + type: string + description: The tissue of the sample in the dataset. + required: false diff --git a/src/datasets/api/file_spatial_dataset.yaml b/src/datasets/api/file_spatial_dataset.yaml new file mode 100644 index 0000000000..d7971c52ac --- /dev/null +++ b/src/datasets/api/file_spatial_dataset.yaml @@ -0,0 +1,194 @@ +type: file +example: "resources_test/common/mouse_brain_coronal/dataset.h5ad" +label: "Common dataset" +summary: An unprocessed dataset as output by the common dataset processing pipeline. +description: | + This dataset contains both raw counts and normalized data matrices. 
+info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalised expression values + required: true + obs: + - type: string + name: dataset_id + description: Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. + required: false + - type: string + name: assay + description: Type of assay used to generate the cell data, indicating the methodology or technique employed. + required: false + - type: string + name: assay_ontology_term_id + description: Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. + required: false + - type: string + name: cell_type + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: false + - type: string + name: cell_type_ontology_term_id + description: Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. + required: false + - type: string + name: development_stage + description: Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. + required: false + - type: string + name: development_stage_ontology_term_id + description: | + Ontology term identifier for the developmental stage, providing a standardized reference to the organism's developmental phase. + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. + If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. + Otherwise, the Uberon (`UBERON:`) ontology is used. 
+ required: false + - type: string + name: disease + description: Information on any disease or pathological condition associated with the cell or donor. + required: false + - type: string + name: disease_ontology_term_id + description: | + Ontology term identifier for the disease, enabling standardized disease classification and referencing. + Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). + required: false + - type: string + name: donor_id + description: Identifier for the donor from whom the cell sample is obtained. + required: false + - type: boolean + name: is_primary_data + description: Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. + required: false + - type: string + name: organism + description: Organism from which the cell sample is obtained. + required: false + - type: string + name: organism_ontology_term_id + description: | + Ontology term identifier for the organism, providing a standardized reference for the organism. + Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. + required: false + - type: string + name: self_reported_ethnicity + description: Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. + required: false + - type: string + name: self_reported_ethnicity_ontology_term_id + description: | + Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. + required: false + - type: string + name: sex + description: Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. 
+ required: false + - type: string + name: sex_ontology_term_id + description: Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. + required: false + - type: string + name: suspension_type + description: Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. + required: false + - type: string + name: tissue + description: Specific tissue from which the cells were derived, key for context and specificity in cell studies. + required: false + - type: string + name: tissue_ontology_term_id + description: | + Ontology term identifier for the tissue, providing a standardized reference for the tissue type. + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. + required: false + - type: string + name: tissue_general + description: General category or classification of the tissue, useful for broader grouping and comparison of cell data. + required: false + - type: string + name: tissue_general_ontology_term_id + description: | + Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. + required: false + - type: string + name: batch + description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. 
+ required: false + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. + required: false + obsm: + - type: double + name: spatial + description: Spatial coordinates of the cells in the format `x, y`. + required: true + var: + - type: string + name: feature_id + description: Unique identifier for the feature, usually a ENSEMBL gene id. + # TODO: make this required once openproblems_v1 dataloader supports it + required: false + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + # TODO: make this required once the dataloader supports it + required: true + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. + required: false + uns: + - type: string + name: dataset_id + description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. + required: true + - name: dataset_name + type: string + description: A human-readable name for the dataset. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + multiple: true + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + multiple: true + - name: dataset_technology + type: string + description: The technology used to generate the dataset. 
+ required: false + - name: dataset_organ + type: string + description: The organ of the sample in the dataset. + required: false + - name: dataset_tissue + type: string + description: The tissue of the sample in the dataset. + required: false \ No newline at end of file diff --git a/src/datasets/api/file_svd.yaml b/src/datasets/api/file_svd.yaml index 2a727369e3..c9f22b50f7 100644 --- a/src/datasets/api/file_svd.yaml +++ b/src/datasets/api/file_svd.yaml @@ -1,10 +1,11 @@ __merge__: file_normalized.yaml type: file example: "resources_test/common/pancreas/svd.h5ad" +label: "Dataset+SVD" +summary: "A normalised dataset with a SVD embedding" info: - label: "Dataset+SVD" - summary: "A normalised dataset with a SVD embedding" - slots: + format: + type: h5ad obsm: - type: double name: X_svd diff --git a/src/datasets/loaders/cellxgene_census/config.vsh.yaml b/src/datasets/loaders/cellxgene_census/config.vsh.yaml deleted file mode 100644 index 667e1c6a6b..0000000000 --- a/src/datasets/loaders/cellxgene_census/config.vsh.yaml +++ /dev/null @@ -1,167 +0,0 @@ -functionality: - name: cellxgene_census - namespace: datasets/loaders - description: | - Query cells from a CellxGene Census or custom TileDBSoma object. - Aside from fetching the cells' RNA counts (`.X`), cell metadata - (`.obs`) and gene metadata (`.var`), this component also fetches - the dataset metadata and joins it into the cell metadata. - argument_groups: - - name: Input database - description: "Open CellxGene Census by version or URI." - arguments: - - name: "--input_uri" - type: string - description: "If specified, a URI containing the Census SOMA objects. If specified, will take precedence over the `--census_version` argument." - required: false - example: "s3://bucket/path" - - name: "--census_version" - description: "Which release of CellxGene census to use. Possible values are \"latest\", \"stable\", or the date of one of the releases (e.g. \"2023-07-25\"). 
For more information, check the documentation on [Census data releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html)." - type: string - example: "stable" - required: false - - name: Cell query - description: Arguments related to the query. - arguments: - - name: "--species" - type: string - description: The organism to query, usually one of `Homo sapiens` or `Mus musculus`. - required: true - example: "homo_sapiens" - - name: "--obs_value_filter" - type: string - description: "Filter for selecting the `obs` metadata (i.e. cells). Value is a filter query written in the SOMA `value_filter` syntax." - required: true - example: "is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell'" - - name: Filter cells by grouping - description: - arguments: - - name: "--cell_filter_grouping" - type: string - description: | - A subset of 'obs' columns by which to group the cells for filtering. - Only groups surpassing or equal to the `--cell_filter_minimum_count` - threshold will be retained. Take care not to introduce a selection - bias against cells with more fine-grained ontology annotations. - required: false - example: ["dataset_id", "tissue", "assay", "disease", "cell_type"] - multiple: true - - name: "--cell_filter_minimum_count" - type: integer - description: | - A minimum number of cells per group to retain. If `--cell_filter_grouping` - is defined, this parameter should also be provided and vice versa. - required: false - example: 100 - - name: Count filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--cell_filter_min_genes" - type: integer - description: Remove cells with less than this number of genes. - required: false - default: 50 - - name: "--cell_filter_min_counts" - type: integer - description: Remove cells with less than this number of counts. 
- required: false - default: 0 - - name: "--gene_filter_min_cells" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - default: 5 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - default: 0 - - name: Cell metadata - description: Cell metadata arguments - arguments: - - name: "--obs_batch" - type: string - description: | - Location of where to find the observation batch IDs. - - * If not specified, the `.obs["batch"]` field will not be included. - * If one or more values are specified, the `.obs["batch"]` field will be - set to the concatenated values of the specified fields, separated by - the `obs_batch_separator`. - required: false - multiple: true - multiple_sep: "," - example: ["batch"] - - name: "--obs_batch_separator" - type: string - description: Separator to use when concatenating the values of the `--obs_batch` fields. - required: false - default: "+" - - name: Dataset metadata - description: Information about the dataset that will be stored in the `.uns` slot. - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: true - - name: Outputs - description: Output arguments. 
- arguments: - - name: "--output" - type: file - description: Output h5ad file. - direction: output - required: true - example: output.h5ad - - name: "--output_compression" - type: string - choices: ["gzip", "lzf"] - required: false - example: "gzip" - resources: - - type: python_script - path: script.py - - path: /src/common/helper_functions/setup_logger.py - test_resources: - - type: python_script - path: test.py -platforms: - - type: docker - #image: openproblems/base_python:1.0.0 - image: python:3.11 - setup: - - type: python - packages: - - cellxgene-census - - scanpy - test_setup: - - type: python - packages: - - viashpy - - type: nextflow - directives: - label: [highmem, midcpu] \ No newline at end of file diff --git a/src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml b/src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml deleted file mode 100644 index 7ee4166d9d..0000000000 --- a/src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml +++ /dev/null @@ -1,130 +0,0 @@ -functionality: - name: cellxgene_census_from_source_h5ad - namespace: datasets/loaders - description: | - Query cells from a CellxGene Census or custom TileDBSoma object. - Aside from fetching the cells' RNA counts (`.X`), cell metadata - (`.obs`) and gene metadata (`.var`), this component also fetches - the dataset metadata and joins it into the cell metadata. - argument_groups: - - name: Input - description: Input arguments - arguments: - - name: "--input_id" - type: string - description: | - The dataset ID of the CellxGene Census dataset to query. - required: true - example: "a93eab58-3d82-4b61-8a2f-d7666dcdb7c4" - - name: Count filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--cell_filter_min_genes" - type: integer - description: Remove cells with less than this number of genes. 
- required: false - default: 50 - - name: "--cell_filter_min_counts" - type: integer - description: Remove cells with less than this number of counts. - required: false - default: 0 - - name: "--gene_filter_min_cells" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - default: 5 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - default: 0 - - name: Cell metadata - description: Cell metadata arguments - arguments: - - name: "--obs_batch" - type: string - description: | - Location of where to find the observation batch IDs. - - * If not specified, the `.obs["batch"]` field will not be included. - * If one or more values are specified, the `.obs["batch"]` field will be - set to the concatenated values of the specified fields, separated by - the `obs_batch_separator`. - required: false - multiple: true - multiple_sep: "," - example: ["batch"] - - name: "--obs_batch_separator" - type: string - description: Separator to use when concatenating the values of the `--obs_batch` fields. - required: false - default: "+" - - name: Dataset metadata - description: Information about the dataset that will be stored in the `.uns` slot. - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. 
- required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: true - - name: Outputs - description: Output arguments. - arguments: - - name: "--output" - type: file - description: Output h5ad file. - direction: output - required: true - example: output.h5ad - - name: "--output_compression" - type: string - choices: ["gzip", "lzf"] - required: false - example: "gzip" - resources: - - type: python_script - path: script.py - - path: /src/common/helper_functions/setup_logger.py - test_resources: - - type: python_script - path: test.py -platforms: - - type: docker - #image: openproblems/base_python:1.0.0 - image: python:3.11 - setup: - - type: python - packages: - - cellxgene-census - - scanpy - test_setup: - - type: python - packages: - - viashpy - - type: nextflow - directives: - label: [highmem, midcpu] \ No newline at end of file diff --git a/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/config.vsh.yaml b/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/config.vsh.yaml new file mode 100644 index 0000000000..3a73fe0538 --- /dev/null +++ b/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/config.vsh.yaml @@ -0,0 +1,75 @@ +name: openproblems_neurips2021_bmmc +namespace: datasets/loaders/multimodal +description: Fetch a dataset from the OpenProblems NeurIPS2021 competition +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: Processed h5ad file published at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE194122. + required: true + example: GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad + - name: --mod1 + type: string + description: Name of the first modality. + required: true + example: GEX + - name: --mod2 + type: string + description: Name of the second modality. 
+ required: true + example: ADT + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: A unique identifier for the dataset + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Outputs + arguments: + - name: --output_mod1 + __merge__: ../../../api/file_raw.yaml + direction: output + - name: --output_mod2 + __merge__: ../../../api/file_raw.yaml + direction: output +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + # - type: file + # path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] diff --git a/src/datasets/loaders/openproblems_neurips2021_bmmc/script.py b/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/script.py similarity index 98% rename from src/datasets/loaders/openproblems_neurips2021_bmmc/script.py rename to src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/script.py index de62f039f6..eb62dd67e9 100644 --- a/src/datasets/loaders/openproblems_neurips2021_bmmc/script.py +++ b/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/script.py @@ -19,7 +19,7 @@ "output_mod2": "output/mod2.h5ad" } meta = { 
- "functionality_name": "openproblems_neurips2021_bmmc", + "name": "openproblems_neurips2021_bmmc", "resources_dir": "/tmp/viash_inject_openproblems_neurips2021_bmmc14365472827677740971", } ## VIASH END diff --git a/src/datasets/loaders/openproblems_neurips2021_bmmc/test.py b/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/test.py similarity index 100% rename from src/datasets/loaders/openproblems_neurips2021_bmmc/test.py rename to src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/test.py diff --git a/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/config.vsh.yaml b/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/config.vsh.yaml new file mode 100644 index 0000000000..5994e4ccc9 --- /dev/null +++ b/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/config.vsh.yaml @@ -0,0 +1,81 @@ +name: openproblems_neurips2022_pbmc +namespace: datasets/loaders/multimodal +description: Fetch a dataset from the OpenProblems NeurIPS2022 competition +argument_groups: + - name: Inputs + arguments: + - name: --input_mod1 + type: file + description: Processed RNA h5ad file + required: true + example: cite_rna_merged.h5ad + - name: --input_mod2 + type: file + description: Processed ADT or ATAC h5ad file + required: true + example: cite_prot_merged.h5ad + - name: --mod1 + type: string + description: Name of the first modality. + required: true + example: GEX + - name: --mod2 + type: string + description: Name of the second modality. + required: true + example: ADT + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: A unique identifier for the dataset + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. 
+ required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Outputs + arguments: + - name: --output_mod1 + __merge__: ../../../api/file_raw.yaml + direction: output + - name: --output_mod2 + __merge__: ../../../api/file_raw.yaml + direction: output +resources: + - type: python_script + path: script.py +# skip unit test until data is public +# test_resources: +# - type: python_script +# path: test.py +# - type: file +# path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] diff --git a/src/datasets/loaders/openproblems_neurips2022_pbmc/script.py b/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/script.py similarity index 98% rename from src/datasets/loaders/openproblems_neurips2022_pbmc/script.py rename to src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/script.py index d0dd855b55..8940afed26 100644 --- a/src/datasets/loaders/openproblems_neurips2022_pbmc/script.py +++ b/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/script.py @@ -18,7 +18,7 @@ "output_mod2": "output/mod2.h5ad" } meta = { - "functionality_name": "openproblems_neurips2022_pbmc", + "name": "openproblems_neurips2022_pbmc", } ## VIASH END diff --git a/src/datasets/loaders/openproblems_neurips2022_pbmc/test.py b/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/test.py similarity index 100% rename from 
src/datasets/loaders/openproblems_neurips2022_pbmc/test.py rename to src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/test.py diff --git a/src/datasets/loaders/multimodal/openproblems_v1_multimodal/config.vsh.yaml b/src/datasets/loaders/multimodal/openproblems_v1_multimodal/config.vsh.yaml new file mode 100644 index 0000000000..b0afb9311b --- /dev/null +++ b/src/datasets/loaders/multimodal/openproblems_v1_multimodal/config.vsh.yaml @@ -0,0 +1,95 @@ +name: openproblems_v1_multimodal +namespace: datasets/loaders/multimodal +description: Fetch a dataset from OpenProblems v1 +argument_groups: + - name: Inputs + arguments: + - name: --input_id + type: string + description: The ID of the dataset in OpenProblems v1 + required: true + - name: --obs_cell_type + type: string + description: Location of where to find the observation cell types. + - name: --obs_batch + type: string + description: Location of where to find the observation batch IDs. + - name: --obs_tissue + type: string + description: Location of where to find the observation tissue information. + - name: --layer_counts + type: string + description: In which layer to find the counts matrix. Leave undefined to use `.X`. + example: counts + - name: --sparse + type: boolean + default: true + description: Convert layers to a sparse CSR format. + - name: --var_feature_id + type: string + description: Location of where to find the feature IDs. Can be set to index if the feature IDs are the index. + example: gene_ids + - name: --var_feature_name + type: string + description: Location of where to find the feature names. Can be set to index if the feature names are the index. + default: index + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. 
+ required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Outputs + arguments: + - name: --output_mod1 + __merge__: ../../../api/file_raw.yaml + direction: output + - name: --output_mod2 + __merge__: ../../../api/file_raw.yaml + direction: output +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: apt + packages: git + - type: docker + run: | + git clone -b 'v0.8.0' --depth 1 https://github.com/openproblems-bio/openproblems.git /opt/openproblems && \ + pip install --no-cache-dir -r /opt/openproblems/docker/openproblems/requirements.txt && \ + pip install --no-cache-dir --editable /opt/openproblems +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] diff --git a/src/datasets/loaders/openproblems_v1_multimodal/script.py b/src/datasets/loaders/multimodal/openproblems_v1_multimodal/script.py similarity index 100% rename from src/datasets/loaders/openproblems_v1_multimodal/script.py rename to src/datasets/loaders/multimodal/openproblems_v1_multimodal/script.py diff --git a/src/datasets/loaders/openproblems_v1_multimodal/test.py b/src/datasets/loaders/multimodal/openproblems_v1_multimodal/test.py similarity index 100% rename from src/datasets/loaders/openproblems_v1_multimodal/test.py rename to 
src/datasets/loaders/multimodal/openproblems_v1_multimodal/test.py diff --git a/src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml b/src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml deleted file mode 100644 index 96dad30e76..0000000000 --- a/src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml +++ /dev/null @@ -1,74 +0,0 @@ -functionality: - name: "openproblems_neurips2021_bmmc" - namespace: "datasets/loaders" - description: "Fetch a dataset from the OpenProblems NeurIPS2021 competition" - argument_groups: - - name: Inputs - arguments: - - name: "--input" - type: file - description: Processed h5ad file published at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE194122. - required: true - example: GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad - - name: "--mod1" - type: string - description: Name of the first modality. - required: true - example: GEX - - name: "--mod2" - type: string - description: Name of the second modality. - required: true - example: ADT - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: "A unique identifier for the dataset" - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: false - - name: Outputs - arguments: - - name: "--output_mod1" - __merge__: ../../api/file_raw.yaml - direction: "output" - - name: "--output_mod2" - __merge__: ../../api/file_raw.yaml - direction: "output" - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py - # - type: file - # path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [highmem, midcpu, midtime] \ No newline at end of file diff --git a/src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml b/src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml deleted file mode 100644 index b2141482f1..0000000000 --- a/src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml +++ /dev/null @@ -1,80 +0,0 @@ -functionality: - name: "openproblems_neurips2022_pbmc" - namespace: "datasets/loaders" - description: "Fetch a dataset from the OpenProblems NeurIPS2022 competition" - argument_groups: - - name: Inputs - arguments: - - name: "--input_mod1" - type: file - description: "Processed RNA h5ad file" - required: true - example: cite_rna_merged.h5ad - - name: "--input_mod2" - type: file - description: "Processed ADT or ATAC h5ad file" - required: true - example: cite_prot_merged.h5ad - - name: "--mod1" - type: string - description: Name of the first modality. - required: true - example: GEX - - name: "--mod2" - type: string - description: Name of the second modality. - required: true - example: ADT - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: "A unique identifier for the dataset" - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. 
- required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Outputs - arguments: - - name: "--output_mod1" - __merge__: ../../api/file_raw.yaml - direction: "output" - - name: "--output_mod2" - __merge__: ../../api/file_raw.yaml - direction: "output" - resources: - - type: python_script - path: script.py - # skip unit test until data is public - # test_resources: - # - type: python_script - # path: test.py - # - type: file - # path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [ highmem, midcpu, midtime] \ No newline at end of file diff --git a/src/datasets/loaders/openproblems_v1/config.vsh.yaml b/src/datasets/loaders/openproblems_v1/config.vsh.yaml deleted file mode 100644 index d3a3ad846f..0000000000 --- a/src/datasets/loaders/openproblems_v1/config.vsh.yaml +++ /dev/null @@ -1,86 +0,0 @@ -__merge__: ../../api/comp_dataset_loader.yaml -functionality: - name: "openproblems_v1" - description: "Fetch a dataset from OpenProblems v1" - argument_groups: - - name: Inputs - arguments: - - name: "--input_id" - type: "string" - description: "The ID of the dataset in OpenProblems v1" - required: true - - name: "--obs_cell_type" - type: "string" - description: "Location of where to find the observation cell types." - - name: "--obs_batch" - type: "string" - description: "Location of where to find the observation batch IDs." 
- - name: "--obs_tissue" - type: "string" - description: "Location of where to find the observation tissue information." - - name: "--layer_counts" - type: "string" - description: "In which layer to find the counts matrix. Leave undefined to use `.X`." - example: counts - - name: "--sparse" - type: boolean - default: true - description: Convert layers to a sparse CSR format. - - name: "--var_feature_id" - type: "string" - description: "Location of where to find the feature IDs. Can be set to index if the feature IDs are the index." - example: gene_ids - - name: "--var_feature_name" - type: "string" - description: "Location of where to find the feature names. Can be set to index if the feature names are the index." - default: index - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: false - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: apt - packages: git - - type: docker - run: | - git clone -b 'v0.8.0' --depth 1 https://github.com/openproblems-bio/openproblems.git /opt/openproblems && \ - pip install --no-cache-dir -r /opt/openproblems/docker/openproblems/requirements.txt && \ - pip install --no-cache-dir --editable /opt/openproblems - - type: nextflow - directives: - label: [highmem, midcpu , midtime] diff --git a/src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml b/src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml deleted file mode 100644 index 6247ae3bf9..0000000000 --- a/src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml +++ /dev/null @@ -1,94 +0,0 @@ -functionality: - name: "openproblems_v1_multimodal" - namespace: "datasets/loaders" - description: "Fetch a dataset from OpenProblems v1" - argument_groups: - - name: Inputs - arguments: - - name: "--input_id" - type: "string" - description: "The ID of the dataset in OpenProblems v1" - required: true - - name: "--obs_cell_type" - type: "string" - description: "Location of where to find the observation cell types." - - name: "--obs_batch" - type: "string" - description: "Location of where to find the observation batch IDs." - - name: "--obs_tissue" - type: "string" - description: "Location of where to find the observation tissue information." - - name: "--layer_counts" - type: "string" - description: "In which layer to find the counts matrix. Leave undefined to use `.X`." - example: counts - - name: "--sparse" - type: boolean - default: true - description: Convert layers to a sparse CSR format. - - name: "--var_feature_id" - type: "string" - description: "Location of where to find the feature IDs. Can be set to index if the feature IDs are the index." 
- example: gene_ids - - name: "--var_feature_name" - type: "string" - description: "Location of where to find the feature names. Can be set to index if the feature names are the index." - default: index - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: false - - name: Outputs - arguments: - - name: "--output_mod1" - __merge__: ../../api/file_raw.yaml - direction: "output" - - name: "--output_mod2" - __merge__: ../../api/file_raw.yaml - direction: "output" - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: apt - packages: git - - type: docker - run: | - git clone -b 'v0.8.0' --depth 1 https://github.com/openproblems-bio/openproblems.git /opt/openproblems && \ - pip install --no-cache-dir -r /opt/openproblems/docker/openproblems/requirements.txt && \ - pip install --no-cache-dir --editable /opt/openproblems - - type: nextflow - directives: - label: [highmem, midcpu , midtime] diff --git a/src/datasets/loaders/scrnaseq/cellxgene_census/config.vsh.yaml b/src/datasets/loaders/scrnaseq/cellxgene_census/config.vsh.yaml new file mode 100644 index 0000000000..6ab9fe785d --- /dev/null +++ b/src/datasets/loaders/scrnaseq/cellxgene_census/config.vsh.yaml @@ -0,0 +1,176 @@ +name: cellxgene_census +namespace: datasets/loaders/scrnaseq +description: | + Query cells from a CellxGene Census or custom TileDBSoma object. + Aside from fetching the cells' RNA counts (`.X`), cell metadata + (`.obs`) and gene metadata (`.var`), this component also fetches + the dataset metadata and joins it into the cell metadata. +argument_groups: + - name: Input database + description: Open CellxGene Census by version or URI. + arguments: + - name: --input_uri + type: string + description: If specified, a URI containing the Census SOMA objects. If specified, + will take precedence over the `--census_version` argument. + required: false + example: s3://bucket/path + - name: --census_version + description: Which release of CellxGene census to use. Possible values are + "latest", "stable", or the date of one of the releases (e.g. "2023-07-25"). 
+ For more information, check the documentation on [Census data + releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html). + type: string + example: stable + required: false + - name: Cell query + description: Arguments related to the query. + arguments: + - name: --species + type: string + description: The organism to query, usually one of `Homo sapiens` or `Mus + musculus`. + required: true + example: homo_sapiens + - name: --obs_value_filter + type: string + description: Filter for selecting the `obs` metadata (i.e. cells). Value is + a filter query written in the SOMA `value_filter` syntax. + required: true + example: is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', + 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell' + - name: Filter cells by grouping + description: + arguments: + - name: --cell_filter_grouping + type: string + description: | + A subset of 'obs' columns by which to group the cells for filtering. + Only groups surpassing or equal to the `--cell_filter_minimum_count` + threshold will be retained. Take care not to introduce a selection + bias against cells with more fine-grained ontology annotations. + required: false + example: [dataset_id, tissue, assay, disease, cell_type] + multiple: true + - name: --cell_filter_minimum_count + type: integer + description: | + A minimum number of cells per group to retain. If `--cell_filter_grouping` + is defined, this parameter should also be provided and vice versa. + required: false + example: 100 + - name: Count filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --cell_filter_min_genes + type: integer + description: Remove cells with less than this number of genes. + required: false + default: 50 + - name: --cell_filter_min_counts + type: integer + description: Remove cells with less than this number of counts. 
+ required: false + default: 0 + - name: --gene_filter_min_cells + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + default: 5 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + default: 0 + - name: Cell metadata + description: Cell metadata arguments + arguments: + - name: --obs_batch + type: string + description: | + Location of where to find the observation batch IDs. + + * If not specified, the `.obs["batch"]` field will not be included. + * If one or more values are specified, the `.obs["batch"]` field will be + set to the concatenated values of the specified fields, separated by + the `obs_batch_separator`. + required: false + multiple: true + multiple_sep: ',' + example: [batch] + - name: --obs_batch_separator + type: string + description: Separator to use when concatenating the values of the `--obs_batch` + fields. + required: false + default: + + - name: Dataset metadata + description: Information about the dataset that will be stored in the `.uns` slot. + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: true + - name: Outputs + description: Output arguments. 
+ arguments: + - name: --output + type: file + description: Output h5ad file. + direction: output + required: true + example: output.h5ad + - name: --output_compression + type: string + choices: [gzip, lzf] + required: false + example: gzip +resources: + - type: python_script + path: script.py + - path: /src/common/helper_functions/setup_logger.py +test_resources: + - type: python_script + path: test.py +engines: + - type: docker + #image: openproblems/base_python:1.0.0 + image: python:3.11 + setup: + - type: python + packages: + - cellxgene-census + - scanpy + test_setup: + - type: python + packages: + - viashpy +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu] diff --git a/src/datasets/loaders/cellxgene_census/script.py b/src/datasets/loaders/scrnaseq/cellxgene_census/script.py similarity index 100% rename from src/datasets/loaders/cellxgene_census/script.py rename to src/datasets/loaders/scrnaseq/cellxgene_census/script.py diff --git a/src/datasets/loaders/cellxgene_census/test.py b/src/datasets/loaders/scrnaseq/cellxgene_census/test.py similarity index 100% rename from src/datasets/loaders/cellxgene_census/test.py rename to src/datasets/loaders/scrnaseq/cellxgene_census/test.py diff --git a/src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/config.vsh.yaml b/src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/config.vsh.yaml new file mode 100644 index 0000000000..99ae8929a5 --- /dev/null +++ b/src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/config.vsh.yaml @@ -0,0 +1,132 @@ +name: cellxgene_census_from_source_h5ad +namespace: datasets/loaders/scrnaseq +description: | + Query cells from a CellxGene Census or custom TileDBSoma object. + Aside from fetching the cells' RNA counts (`.X`), cell metadata + (`.obs`) and gene metadata (`.var`), this component also fetches + the dataset metadata and joins it into the cell metadata. 
+argument_groups: + - name: Input + description: Input arguments + arguments: + - name: --input_id + type: string + description: | + The dataset ID of the CellxGene Census dataset to query. + required: true + example: a93eab58-3d82-4b61-8a2f-d7666dcdb7c4 + - name: Count filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --cell_filter_min_genes + type: integer + description: Remove cells with less than this number of genes. + required: false + default: 50 + - name: --cell_filter_min_counts + type: integer + description: Remove cells with less than this number of counts. + required: false + default: 0 + - name: --gene_filter_min_cells + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + default: 5 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + default: 0 + - name: Cell metadata + description: Cell metadata arguments + arguments: + - name: --obs_batch + type: string + description: | + Location of where to find the observation batch IDs. + + * If not specified, the `.obs["batch"]` field will not be included. + * If one or more values are specified, the `.obs["batch"]` field will be + set to the concatenated values of the specified fields, separated by + the `obs_batch_separator`. + required: false + multiple: true + multiple_sep: ',' + example: [batch] + - name: --obs_batch_separator + type: string + description: Separator to use when concatenating the values of the `--obs_batch` + fields. + required: false + default: + + - name: Dataset metadata + description: Information about the dataset that will be stored in the `.uns` slot. + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. 
+ required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: true + - name: Outputs + description: Output arguments. + arguments: + - name: --output + type: file + description: Output h5ad file. + direction: output + required: true + example: output.h5ad + - name: --output_compression + type: string + choices: [gzip, lzf] + required: false + example: gzip +resources: + - type: python_script + path: script.py + - path: /src/common/helper_functions/setup_logger.py +test_resources: + - type: python_script + path: test.py +engines: + - type: docker + #image: openproblems/base_python:1.0.0 + image: python:3.11 + setup: + - type: python + packages: + - cellxgene-census + - scanpy + test_setup: + - type: python + packages: + - viashpy +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu] diff --git a/src/datasets/loaders/cellxgene_census_from_source_h5ad/script.py b/src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/script.py similarity index 100% rename from src/datasets/loaders/cellxgene_census_from_source_h5ad/script.py rename to src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/script.py diff --git a/src/datasets/loaders/cellxgene_census_from_source_h5ad/test.py b/src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/test.py similarity index 100% rename from src/datasets/loaders/cellxgene_census_from_source_h5ad/test.py rename to 
src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/test.py diff --git a/src/datasets/loaders/scrnaseq/openproblems_v1/config.vsh.yaml b/src/datasets/loaders/scrnaseq/openproblems_v1/config.vsh.yaml new file mode 100644 index 0000000000..6ebb63b410 --- /dev/null +++ b/src/datasets/loaders/scrnaseq/openproblems_v1/config.vsh.yaml @@ -0,0 +1,91 @@ +__merge__: ../../../api/comp_dataset_loader.yaml +name: openproblems_v1 +namespace: datasets/loaders/scrnaseq +description: Fetch a dataset from OpenProblems v1 +argument_groups: + - name: Inputs + arguments: + - name: --input_id + type: string + description: The ID of the dataset in OpenProblems v1 + required: true + - name: --obs_cell_type + type: string + description: Location of where to find the observation cell types. + - name: --obs_batch + type: string + description: Location of where to find the observation batch IDs. + - name: --obs_tissue + type: string + description: Location of where to find the observation tissue information. + - name: --layer_counts + type: string + description: In which layer to find the counts matrix. Leave undefined to + use `.X`. + example: counts + - name: --sparse + type: boolean + default: true + description: Convert layers to a sparse CSR format. + - name: --var_feature_id + type: string + description: Location of where to find the feature IDs. Can be set to index + if the feature IDs are the index. + example: gene_ids + - name: --var_feature_name + type: string + description: Location of where to find the feature names. Can be set to index + if the feature names are the index. + default: index + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. 
+ required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: apt + packages: git + - type: docker + run: | + git clone -b 'v0.8.0' --depth 1 https://github.com/openproblems-bio/openproblems.git /opt/openproblems && \ + pip install --no-cache-dir -r /opt/openproblems/docker/openproblems/requirements.txt && \ + pip install --no-cache-dir --editable /opt/openproblems +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] diff --git a/src/datasets/loaders/openproblems_v1/script.py b/src/datasets/loaders/scrnaseq/openproblems_v1/script.py similarity index 100% rename from src/datasets/loaders/openproblems_v1/script.py rename to src/datasets/loaders/scrnaseq/openproblems_v1/script.py diff --git a/src/datasets/loaders/openproblems_v1/test.py b/src/datasets/loaders/scrnaseq/openproblems_v1/test.py similarity index 100% rename from src/datasets/loaders/openproblems_v1/test.py rename to src/datasets/loaders/scrnaseq/openproblems_v1/test.py diff --git a/src/datasets/loaders/spatial/tenx_visium/config.vsh.yaml b/src/datasets/loaders/spatial/tenx_visium/config.vsh.yaml new file mode 100644 index 0000000000..b673826a91 --- /dev/null +++ b/src/datasets/loaders/spatial/tenx_visium/config.vsh.yaml @@ -0,0 +1,96 @@ +name: tenx_visium +namespace: datasets/loaders/spatial +description: | + Download a SpaceRanger h5 gene 
expression file and spatial imaging data from the 10x genomics website (or someplace else). +argument_groups: + - name: Inputs + arguments: + - name: --input_expression + type: string + description: URL to the feature / barcode matrix HDF5 of the 10x dataset. + required: true + - name: --input_spatial + type: string + description: URL to the Spatial imaging data of the 10x dataset. + required: true + - name: Outputs + arguments: + - name: --dataset + type: file + direction: output + description: Output h5ad file + required: true + example: dataset.h5ad + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --spot_filter_min_genes + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: --spot_filter_min_counts + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: --gene_filter_min_spots + type: integer + description: Remove genes expressed in less than this number of cells. 
+ required: false + example: 50 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: --remove_mitochondrial + type: boolean + description: Remove mitochondrial genes? + required: false + +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + +engines: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: + - squidpy +runners: + - type: executable + - type: nextflow diff --git a/src/datasets/loaders/tenx_visium/script.py b/src/datasets/loaders/spatial/tenx_visium/script.py similarity index 98% rename from src/datasets/loaders/tenx_visium/script.py rename to src/datasets/loaders/spatial/tenx_visium/script.py index 100bfde555..2cfa3c9054 100644 --- a/src/datasets/loaders/tenx_visium/script.py +++ b/src/datasets/loaders/spatial/tenx_visium/script.py @@ -18,7 +18,7 @@ "remove_mitochondrial": True } meta = { - "functionality_name": "tenx_visium" + "name": "tenx_visium" } ## VIASH END diff --git a/src/datasets/loaders/tenx_visium/test.py b/src/datasets/loaders/spatial/tenx_visium/test.py similarity index 100% rename from src/datasets/loaders/tenx_visium/test.py rename to src/datasets/loaders/spatial/tenx_visium/test.py diff --git a/src/datasets/loaders/spatial/zenodo/config.vsh.yaml b/src/datasets/loaders/spatial/zenodo/config.vsh.yaml new file mode 100644 index 0000000000..b4e06238a8 --- /dev/null +++ b/src/datasets/loaders/spatial/zenodo/config.vsh.yaml @@ -0,0 +1,88 @@ +name: zenodo +namespace: datasets/loaders/spatial +description: | + Download an Anndata file containing DBiT seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo. +argument_groups: + - name: Inputs + arguments: + - name: --input_data + type: string + description: URL to the Anndata file. 
+ required: true + - name: Outputs + arguments: + - name: --dataset + type: file + direction: output + description: Output h5ad file + required: true + example: dataset.h5ad + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --spot_filter_min_genes + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: --spot_filter_min_counts + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: --gene_filter_min_spots + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + example: 50 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: --remove_mitochondrial + type: boolean + description: Remove mitochondrial genes? 
+ required: false + +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow diff --git a/src/datasets/loaders/zenodo_spatial/script.py b/src/datasets/loaders/spatial/zenodo/script.py similarity index 96% rename from src/datasets/loaders/zenodo_spatial/script.py rename to src/datasets/loaders/spatial/zenodo/script.py index 83aeb86056..7392274a42 100644 --- a/src/datasets/loaders/zenodo_spatial/script.py +++ b/src/datasets/loaders/spatial/zenodo/script.py @@ -5,7 +5,7 @@ # VIASH START par = { "input_data": "ps://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1", - "dataset_id": "zenodo_spatial/mouse_olfactory_bulb_puck_slideseqv2", + "dataset_id": "zenodo/mouse_olfactory_bulb_puck_slideseqv2", "dataset_name": "Mouse Olfactory Bulk Puck", "dataset_url": "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary", "dataset_summary": "Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2", @@ -16,7 +16,7 @@ "remove_mitochondrial": True } meta = { - "functionality_name": "zenodo_spatial" + "name": "zenodo" } # VIASH END diff --git a/src/datasets/loaders/zenodo_spatial/test.py b/src/datasets/loaders/spatial/zenodo/test.py similarity index 97% rename from src/datasets/loaders/zenodo_spatial/test.py rename to src/datasets/loaders/spatial/zenodo/test.py index 07dcd953a8..17a87366ec 100644 --- a/src/datasets/loaders/zenodo_spatial/test.py +++ b/src/datasets/loaders/spatial/zenodo/test.py @@ -3,7 +3,7 @@ import anndata as ad input_data 
="https://zenodo.org/records/12784832/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1" -dataset_id = "zenodo_spatial/mouse_olfactory_bulb_puck" +dataset_id = "zenodo/mouse_olfactory_bulb_puck" dataset_name = "mouse_olfactory_bulb_puck" dataset_url = "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" dataset_summary = "Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2" diff --git a/src/datasets/loaders/spatial/zenodo_slidetags/config.vsh.yaml b/src/datasets/loaders/spatial/zenodo_slidetags/config.vsh.yaml new file mode 100644 index 0000000000..0355c8bb64 --- /dev/null +++ b/src/datasets/loaders/spatial/zenodo_slidetags/config.vsh.yaml @@ -0,0 +1,88 @@ +name: zenodo_slidetags +namespace: datasets/loaders/spatial +description: | + Download a compressed file containing gene expression matrix and spatial locations from zenodo. +argument_groups: + - name: Inputs + arguments: + - name: --input_data + type: string + description: URL to the file. + required: true + - name: Outputs + arguments: + - name: --dataset + type: file + direction: output + description: Output h5ad file + required: true + example: dataset.h5ad + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. 
+ required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --spot_filter_min_genes + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: --spot_filter_min_counts + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: --gene_filter_min_spots + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + example: 50 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: --remove_mitochondrial + type: boolean + description: Remove mitochondrial genes? 
+ required: false + +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow diff --git a/src/datasets/loaders/zenodo_spatial_slidetags/script.py b/src/datasets/loaders/spatial/zenodo_slidetags/script.py similarity index 96% rename from src/datasets/loaders/zenodo_spatial_slidetags/script.py rename to src/datasets/loaders/spatial/zenodo_slidetags/script.py index 5a8cf212fa..777f7e9e45 100644 --- a/src/datasets/loaders/zenodo_spatial_slidetags/script.py +++ b/src/datasets/loaders/spatial/zenodo_slidetags/script.py @@ -6,7 +6,7 @@ # VIASH START par = { "input_data": "https://zenodo.org/records/12785822/files/slidetag_human_cortex.tar.gz?download=1", - "dataset_id": "zenodo_spatial_slidetags/human_cortex_slidetags", + "dataset_id": "zenodo_slidetags/human_cortex_slidetags", "dataset_name": "slidetag_human_cortex", "dataset_url": "https://www.nature.com/articles/s41586-023-06837-4", "dataset_summary": "Slide-tags enables single-nucleus barcoding for multimodal spatial genomics", @@ -17,7 +17,7 @@ "remove_mitochondrial": True } meta = { - "functionality_name": "zenodo_spatial_slidetags" + "name": "zenodo_slidetags" } # VIASH END diff --git a/src/datasets/loaders/zenodo_spatial_slidetags/test.py b/src/datasets/loaders/spatial/zenodo_slidetags/test.py similarity index 97% rename from src/datasets/loaders/zenodo_spatial_slidetags/test.py rename to src/datasets/loaders/spatial/zenodo_slidetags/test.py index 9f859ebea6..c97203735b 100644 --- a/src/datasets/loaders/zenodo_spatial_slidetags/test.py +++ b/src/datasets/loaders/spatial/zenodo_slidetags/test.py @@ -3,7 +3,7 @@ import anndata as ad input_data ="https://zenodo.org/records/12785822/files/slidetag_human_cortex.tar.gz?download=1" -dataset_id = "zenodo_spatial_slidetags/human_cortex" +dataset_id = "zenodo_slidetags/human_cortex" dataset_name = 
"slidetag_human_cortex" dataset_url = "https://www.nature.com/articles/s41586-023-06837-4" dataset_summary = "Slide-tags enables single-nucleus barcoding for multimodal spatial genomics" diff --git a/src/datasets/loaders/tenx_visium/config.vsh.yaml b/src/datasets/loaders/tenx_visium/config.vsh.yaml deleted file mode 100644 index ba28b32b89..0000000000 --- a/src/datasets/loaders/tenx_visium/config.vsh.yaml +++ /dev/null @@ -1,96 +0,0 @@ -functionality: - name: tenx_visium - namespace: datasets/loaders - description: | - Download a SpaceRanger h5 gene expression file and spatial imaging data from the 10x genomics website (or someplace else). - - argument_groups: - - name: Inputs - arguments: - - name: "--input_expression" - type: string - description: URL to the feature / barcode matrix HDF5 of the 10x dataset. - required: true - - name: "--input_spatial" - type: string - description: URL to the Spatial imaging data of the 10x dataset. - required: true - - name: Outputs - arguments: - - name: "--dataset" - type: file - direction: output - description: Output h5ad file - required: true - example: dataset.h5ad - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitochondrial genes? - required: false - - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py - -platforms: - - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 - setup: - - type: python - packages: - - squidpy - - type: nextflow diff --git a/src/datasets/loaders/zenodo_spatial/config.vsh.yaml b/src/datasets/loaders/zenodo_spatial/config.vsh.yaml deleted file mode 100644 index 776b177481..0000000000 --- a/src/datasets/loaders/zenodo_spatial/config.vsh.yaml +++ /dev/null @@ -1,87 +0,0 @@ -functionality: - name: zenodo_spatial - namespace: datasets/loaders - description: | - Download an Anndata file containing DBiT seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo. - argument_groups: - - name: Inputs - arguments: - - name: "--input_data" - type: string - description: URL to the Anndata file. - required: true - - name: Outputs - arguments: - - name: "--dataset" - type: file - direction: output - description: Output h5ad file - required: true - example: dataset.h5ad - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. 
- required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitochondrial genes? 
- required: false - - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py - -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow diff --git a/src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml b/src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml deleted file mode 100644 index 905be3514c..0000000000 --- a/src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml +++ /dev/null @@ -1,88 +0,0 @@ -functionality: - name: zenodo_spatial_slidetags - namespace: datasets/loaders - description: | - Download a compressed file containing gene expression matrix and spatial locations from zenodo. - - argument_groups: - - name: Inputs - arguments: - - name: "--input_data" - type: string - description: URL to the file. - required: true - - name: Outputs - arguments: - - name: "--dataset" - type: file - direction: output - description: Output h5ad file - required: true - example: dataset.h5ad - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. 
- arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitochondrial genes? - required: false - - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py - -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow diff --git a/src/datasets/normalization/atac_tfidf/config.vsh.yaml b/src/datasets/normalization/atac_tfidf/config.vsh.yaml index 31319f0958..850b49363b 100644 --- a/src/datasets/normalization/atac_tfidf/config.vsh.yaml +++ b/src/datasets/normalization/atac_tfidf/config.vsh.yaml @@ -1,16 +1,15 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "atac_tfidf" - description: | - Transform peak counts with TF-IDF (Term Frequency - Inverse Document Frequency). +name: atac_tfidf +description: | + Transform peak counts with TF-IDF (Term Frequency - Inverse Document Frequency). - TF: peak counts are normalised by total number of counts per cell DF: total number of counts for each peak IDF: number of cells divided by DF + TF: peak counts are normalised by total number of counts per cell DF: total number of counts for each peak IDF: number of cells divided by DF - By default, log(TF) * log(IDF) is returned. - resources: - - type: python_script - path: script.py -platforms: + By default, log(TF) * log(IDF) is returned. 
+resources: + - type: python_script + path: script.py +engines: - type: docker image: openproblems/base_python:1.0.0 setup: @@ -18,6 +17,8 @@ platforms: packages: - muon - numpy<2 +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/atac_tfidf/script.py b/src/datasets/normalization/atac_tfidf/script.py index ecb772bd64..1d38a8fcc8 100644 --- a/src/datasets/normalization/atac_tfidf/script.py +++ b/src/datasets/normalization/atac_tfidf/script.py @@ -7,7 +7,7 @@ 'output': "output_norm.h5ad" } meta = { - 'functionality_name': "tfidf" + 'name': "tfidf" } ## VIASH END @@ -20,7 +20,7 @@ print("Store output in adata", flush=True) adata.layers[par["layer_output"]] = normalized_counts -adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] +adata.uns["normalization_id"] = par["normalization_id"] or meta['name'] print("Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/normalization/l1_sqrt/config.vsh.yaml b/src/datasets/normalization/l1_sqrt/config.vsh.yaml index 212eadc968..3a3aca8ed0 100644 --- a/src/datasets/normalization/l1_sqrt/config.vsh.yaml +++ b/src/datasets/normalization/l1_sqrt/config.vsh.yaml @@ -1,20 +1,19 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "l1_sqrt" - description: | - Scaled L1 sqrt normalization. +name: l1_sqrt +description: | + Scaled L1 sqrt normalization. - This normalization method causes all cells to have the same sum of values. + This normalization method causes all cells to have the same sum of values. - Steps: + Steps: - * Compute the square root of the counts. - * Apply L1 normalization (rescaled such that the sum of the values of each cell sum to 1). - * Multiply by the median UMI count per cell, causing all cells to have the sum of values. 
- resources: - - type: python_script - path: script.py -platforms: + * Compute the square root of the counts. + * Apply L1 normalization (rescaled such that the sum of the values of each cell sum to 1). + * Multiply by the median UMI count per cell, causing all cells to have the sum of values. +resources: + - type: python_script + path: script.py +engines: - type: docker image: openproblems/base_python:1.0.0 setup: @@ -22,6 +21,8 @@ platforms: packages: - scprep - numpy<2 +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/l1_sqrt/script.py b/src/datasets/normalization/l1_sqrt/script.py index 76c69cf897..9dd5c96505 100644 --- a/src/datasets/normalization/l1_sqrt/script.py +++ b/src/datasets/normalization/l1_sqrt/script.py @@ -8,7 +8,7 @@ 'output': "output_norm.h5ad" } meta = { - 'functionality_name': "l1_sqrt" + 'name': "l1_sqrt" } ## VIASH END @@ -23,7 +23,7 @@ print("Store output in adata", flush=True) adata.layers[par["layer_output"]] = l1_sqrt -adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] +adata.uns["normalization_id"] = par["normalization_id"] or meta['name'] print("Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/normalization/log_cp/config.vsh.yaml b/src/datasets/normalization/log_cp/config.vsh.yaml index 89b2a283f9..d686c54147 100644 --- a/src/datasets/normalization/log_cp/config.vsh.yaml +++ b/src/datasets/normalization/log_cp/config.vsh.yaml @@ -1,18 +1,20 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "log_cp" - description: "Normalize data using Log CP" - resources: - - type: python_script - path: script.py - arguments: - - name: "--n_cp" - type: integer - default: 1e4 - description: "Number of counts per cell. When set to -1, will use None." 
-platforms: +name: log_cp +links: {} +description: Normalize data using Log CP +resources: + - type: python_script + path: script.py +arguments: + - name: --n_cp + type: integer + default: 1e4 + description: Number of counts per cell. When set to -1, will use None. +engines: - type: docker image: openproblems/base_python:1.0.0 +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/log_cp/script.py b/src/datasets/normalization/log_cp/script.py index 39ddf61636..d537ee4a7c 100644 --- a/src/datasets/normalization/log_cp/script.py +++ b/src/datasets/normalization/log_cp/script.py @@ -9,7 +9,7 @@ 'n_cp': 1e6, } meta = { - "functionality_name": "normalize_log_cp10k" + "name": "normalize_log_cp10k" } ## VIASH END @@ -36,7 +36,7 @@ print(">> Store output in adata", flush=True) adata.layers[par["layer_output"]] = lognorm adata.obs[par["obs_size_factors"]] = norm["norm_factor"] -adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] +adata.uns["normalization_id"] = par["normalization_id"] or meta['name'] print(">> Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/normalization/log_scran_pooling/config.vsh.yaml b/src/datasets/normalization/log_scran_pooling/config.vsh.yaml index 4cbf81ff5a..e010c1032b 100644 --- a/src/datasets/normalization/log_scran_pooling/config.vsh.yaml +++ b/src/datasets/normalization/log_scran_pooling/config.vsh.yaml @@ -1,18 +1,20 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "log_scran_pooling" - description: "Normalize data using scran pooling" - resources: - - type: r_script - path: script.R -platforms: +name: log_scran_pooling +links: {} +description: Normalize data using scran pooling +resources: + - type: r_script + path: script.R +engines: - type: docker image: openproblems/base_r:1.0.0 setup: - type: r - cran: [ Matrix, rlang, scran, 
BiocParallel ] + cran: [Matrix, rlang, scran, BiocParallel] - type: python pip: scanpy +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/log_scran_pooling/script.R b/src/datasets/normalization/log_scran_pooling/script.R index be51e21f38..33bb13e8eb 100644 --- a/src/datasets/normalization/log_scran_pooling/script.R +++ b/src/datasets/normalization/log_scran_pooling/script.R @@ -30,7 +30,7 @@ adata$obs[[par$obs_size_factors]] <- size_factors adata$layers[[par$layer_output]] <- lognorm norm_id <- par[["normalization_id"]] if (is.null(norm_id)) { - norm_id <- meta[["functionality_name"]] + norm_id <- meta[["name"]] } adata$uns[["normalization_id"]] <- norm_id diff --git a/src/datasets/normalization/prot_clr/config.vsh.yaml b/src/datasets/normalization/prot_clr/config.vsh.yaml index 8f6bbe269f..3262ca73b7 100644 --- a/src/datasets/normalization/prot_clr/config.vsh.yaml +++ b/src/datasets/normalization/prot_clr/config.vsh.yaml @@ -1,26 +1,27 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "prot_clr" - description: | - Perform center log ratio (CLR) normalization on input CITE-seq data (Stoeckius et al. 2017). +name: prot_clr +description: | + Perform center log ratio (CLR) normalization on input CITE-seq data (Stoeckius et al. 2017). - The CLR transformation is defined as: + The CLR transformation is defined as: - $$ - x_{\text{clr}} = \log\left(\frac{x}{g(x)}\right) - $$ + $$ + x_{\text{clr}} = \log\left(\frac{x}{g(x)}\right) + $$ - where $\(g(x)\)$ is the geometric mean of the row $\(x\)$. - resources: - - type: python_script - path: script.py -platforms: + where $\(g(x)\)$ is the geometric mean of the row $\(x\)$. 
+resources: + - type: python_script + path: script.py +engines: - type: docker image: openproblems/base_python:1.0.0 setup: - type: python packages: - muon +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/prot_clr/script.py b/src/datasets/normalization/prot_clr/script.py index 3f0a2fb3fd..4741625935 100644 --- a/src/datasets/normalization/prot_clr/script.py +++ b/src/datasets/normalization/prot_clr/script.py @@ -7,7 +7,7 @@ 'output': "output_norm.h5ad" } meta = { - 'functionality_name': "clr" + 'name': "clr" } ## VIASH END @@ -22,7 +22,7 @@ print("Store output in adata", flush=True) adata.layers[par["layer_output"]] = normalized_counts.X -adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] +adata.uns["normalization_id"] = par["normalization_id"] or meta['name'] print("Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/normalization/sqrt_cp/config.vsh.yaml b/src/datasets/normalization/sqrt_cp/config.vsh.yaml index 4d95636f4c..1e241e89d0 100644 --- a/src/datasets/normalization/sqrt_cp/config.vsh.yaml +++ b/src/datasets/normalization/sqrt_cp/config.vsh.yaml @@ -1,18 +1,19 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "sqrt_cp" - description: "Normalize data using Log Sqrt" - resources: - - type: python_script - path: script.py - arguments: - - name: "--n_cp" - type: integer - default: 1e4 - description: "Number of counts per cell" -platforms: +name: sqrt_cp +description: Normalize data using Log Sqrt +resources: + - type: python_script + path: script.py +arguments: + - name: --n_cp + type: integer + default: 1e4 + description: Number of counts per cell +engines: - type: docker image: openproblems/base_python:1.0.0 +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git 
a/src/datasets/normalization/sqrt_cp/script.py b/src/datasets/normalization/sqrt_cp/script.py index 84afdaa19d..d2540a519a 100644 --- a/src/datasets/normalization/sqrt_cp/script.py +++ b/src/datasets/normalization/sqrt_cp/script.py @@ -10,7 +10,7 @@ 'n_cp': 1e6, } meta = { - "functionality_name": "normalize_sqrt_cpm" + "name": "normalize_sqrt_cpm" } ## VIASH END @@ -29,7 +29,7 @@ print(">> Store output in adata", flush=True) adata.layers[par["layer_output"]] = lognorm adata.obs[par["obs_size_factors"]] = norm["norm_factor"] -adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] +adata.uns["normalization_id"] = par["normalization_id"] or meta['name'] print(">> Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/processors/hvg/config.vsh.yaml b/src/datasets/processors/hvg/config.vsh.yaml index aed18c6d38..886fe92f94 100644 --- a/src/datasets/processors/hvg/config.vsh.yaml +++ b/src/datasets/processors/hvg/config.vsh.yaml @@ -1,13 +1,14 @@ __merge__: ../../api/comp_processor_hvg.yaml -functionality: - name: "hvg" - description: "Compute HVG" - resources: - - type: python_script - path: script.py -platforms: +name: hvg +description: Compute HVG +resources: + - type: python_script + path: script.py +engines: - type: docker image: openproblems/base_python:1.0.0 +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/processors/knn/config.vsh.yaml b/src/datasets/processors/knn/config.vsh.yaml index 9908fe9086..b3cf894420 100644 --- a/src/datasets/processors/knn/config.vsh.yaml +++ b/src/datasets/processors/knn/config.vsh.yaml @@ -1,13 +1,14 @@ __merge__: ../../api/comp_processor_knn.yaml -functionality: - name: "knn" - description: "Compute KNN" - resources: - - type: python_script - path: script.py -platforms: +name: knn +description: Compute KNN +resources: + - type: python_script + path: script.py +engines: - type: docker 
image: openproblems/base_python:1.0.0 +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/processors/pca/config.vsh.yaml b/src/datasets/processors/pca/config.vsh.yaml index 7f0213b922..b1c1a7ba14 100644 --- a/src/datasets/processors/pca/config.vsh.yaml +++ b/src/datasets/processors/pca/config.vsh.yaml @@ -1,17 +1,18 @@ __merge__: ../../api/comp_processor_pca.yaml -functionality: - name: "pca" - description: "Compute PCA" - resources: - - type: python_script - path: script.py +name: pca +description: Compute PCA +resources: + - type: python_script + path: script.py # test_resources: # - type: python_script # path: test_script.py # - path: "../../../resources_test/common/pancreas" -platforms: +engines: - type: docker image: openproblems/base_python:1.0.0 +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/processors/subsample/config.vsh.yaml b/src/datasets/processors/subsample/config.vsh.yaml index 4e52e93db5..9cc62c3602 100644 --- a/src/datasets/processors/subsample/config.vsh.yaml +++ b/src/datasets/processors/subsample/config.vsh.yaml @@ -1,51 +1,54 @@ __merge__: ../../api/comp_processor_subset.yaml -functionality: - name: "subsample" - description: "Subsample an h5ad file" - arguments: - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. 
- - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Cell type indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test_script.py - - path: /resources_test/common/pancreas -platforms: +name: subsample +description: Subsample an h5ad file +arguments: + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being less + because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. + - name: --keep_cell_type_categories + type: string + multiple: true + description: Cell type indexes to be selected + required: false + - name: --keep_batch_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --even + type: boolean_true + description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. 
+ example: 123 +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test_script.py + - path: /resources_test/common/pancreas +engines: - type: docker image: openproblems/base_python:1.0.0 test_setup: - type: python packages: - viashpy +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/processors/subsample/test_script.py b/src/datasets/processors/subsample/test_script.py index 80dde5d383..cb7f90189a 100644 --- a/src/datasets/processors/subsample/test_script.py +++ b/src/datasets/processors/subsample/test_script.py @@ -42,9 +42,9 @@ def test_keep_functionality(run_component): run_component([ "--input", input_path, - "--keep_cell_type_categories", "acinar:beta", - "--keep_batch_categories", "celseq:inDrop4:smarter", - "--keep_features", ":".join(keep_features), + "--keep_cell_type_categories", "acinar;beta", + "--keep_batch_categories", "celseq;inDrop4;smarter", + "--keep_features", ";".join(keep_features), "--output", output_path, "--seed", "123" ]) diff --git a/src/datasets/processors/svd/config.vsh.yaml b/src/datasets/processors/svd/config.vsh.yaml index bbad17f58c..bd71cae4c8 100644 --- a/src/datasets/processors/svd/config.vsh.yaml +++ b/src/datasets/processors/svd/config.vsh.yaml @@ -1,16 +1,17 @@ __merge__: ../../api/comp_processor_svd.yaml -functionality: - name: "svd" - description: "Compute SVD pca reduction" - resources: - - type: python_script - path: script.py -platforms: +name: svd +description: Compute SVD pca reduction +resources: + - type: python_script + path: script.py +engines: - type: docker image: openproblems/base_python:1.0.0 setup: - type: python pypi: [scikit-learn] +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/resource_scripts/cellxgene_census.sh b/src/datasets/resource_scripts/cellxgene_census.sh index 5d6181f91e..62eaff1f34 100755 --- 
a/src/datasets/resource_scripts/cellxgene_census.sh +++ b/src/datasets/resource_scripts/cellxgene_census.sh @@ -126,7 +126,7 @@ output_normalized: force_null output_pca: force_null output_hvg: force_null output_knn: force_null -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/scrnaseq HERE cat > /tmp/nextflow.config << HERE @@ -145,7 +145,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_cellxgene_census/main.nf \ + --main-script target/nextflow/datasets/workflows/scrnaseq/process_cellxgene_census/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "/tmp/params.yaml" \ diff --git a/src/datasets/resource_scripts/dataset_info.sh b/src/datasets/resource_scripts/dataset_info.sh index 04c032916f..ead3d45506 100755 --- a/src/datasets/resource_scripts/dataset_info.sh +++ b/src/datasets/resource_scripts/dataset_info.sh @@ -5,13 +5,13 @@ DATASETS_DIR="s3://openproblems-data/resources/datasets" cat > "/tmp/params.yaml" << HERE param_list: - id: openproblems_v1 - input_states: "$DATASETS_DIR/openproblems_v1/**/log_cp10k/state.yaml" + input_states: "$DATASETS_DIR/scrnaseq/openproblems_v1/**/log_cp10k/state.yaml" rename_keys: 'input:output_dataset' - id: openproblems_v1_multimodal - input_states: "$DATASETS_DIR/openproblems_v1_multimodal/**/log_cp10k/state.yaml" + input_states: "$DATASETS_DIR/multimodal/openproblems_v1_multimodal/**/log_cp10k/state.yaml" rename_keys: 'input:output_mod1' - id: cellxgene_census - input_states: "$DATASETS_DIR/cellxgene_census/**/log_cp10k/state.yaml" + input_states: "$DATASETS_DIR/scrnaseq/cellxgene_census/**/log_cp10k/state.yaml" rename_keys: 'input:output_dataset' settings: '{"output": "dataset_info.yaml"}' output_state: state.yaml diff --git a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh 
b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh index a306ba2ef8..42c3456b1b 100755 --- a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh +++ b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh @@ -32,13 +32,13 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/multimodal HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf \ + --main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal_test.sh b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal_test.sh index be8444371b..652d39aa0a 100755 --- a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal_test.sh +++ b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal_test.sh @@ -32,12 +32,12 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -publish_dir: resources/datasets/openproblems_neurips2021 +publish_dir: resources/datasets/multimodal/openproblems_neurips2021 HERE export NXF_VER=23.10.1 nextflow run . 
\ - -main-script target/nextflow/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf \ + -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/main.nf \ -profile docker \ -resume \ -params-file "$params_file" diff --git a/src/datasets/resource_scripts/openproblems_neurips2022_pbmc.sh b/src/datasets/resource_scripts/openproblems_neurips2022_pbmc.sh index e3e6783a8e..681d8f3d36 100755 --- a/src/datasets/resource_scripts/openproblems_neurips2022_pbmc.sh +++ b/src/datasets/resource_scripts/openproblems_neurips2022_pbmc.sh @@ -34,7 +34,7 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/multimodal HERE cat > /tmp/nextflow.config << HERE @@ -49,7 +49,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf \ + --main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/main.nf \ --workspace 53907369739130 \ --compute-env 1pK56PjjzeraOOC2LDZvN2 \ --params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/openproblems_v1.sh b/src/datasets/resource_scripts/openproblems_v1.sh index 8d40e57c46..7e5b12c348 100755 --- a/src/datasets/resource_scripts/openproblems_v1.sh +++ b/src/datasets/resource_scripts/openproblems_v1.sh @@ -162,7 +162,7 @@ output_normalized: force_null output_pca: force_null output_hvg: force_null output_knn: force_null -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/scrnaseq HERE cat > /tmp/nextflow.config << HERE @@ -174,7 +174,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ 
--revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_openproblems_v1/main.nf \ + --main-script target/nextflow/datasets/workflows/scrnaseq/process_openproblems_v1/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/openproblems_v1_multimodal.sh b/src/datasets/resource_scripts/openproblems_v1_multimodal.sh index 2d516a8ccb..f8e83f3582 100755 --- a/src/datasets/resource_scripts/openproblems_v1_multimodal.sh +++ b/src/datasets/resource_scripts/openproblems_v1_multimodal.sh @@ -60,7 +60,7 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/multimodal HERE @@ -77,7 +77,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_openproblems_v1_multimodal/main.nf \ + --main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_v1_multimodal/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/openproblems_v1_multimodal_test.sh b/src/datasets/resource_scripts/openproblems_v1_multimodal_test.sh index 268a17cf7d..e3d012e3c5 100755 --- a/src/datasets/resource_scripts/openproblems_v1_multimodal_test.sh +++ b/src/datasets/resource_scripts/openproblems_v1_multimodal_test.sh @@ -8,7 +8,7 @@ cd "$REPO_ROOT" export TOWER_WORKSPACE_ID=53907369739130 -OUTPUT_DIR="resources/datasets" +OUTPUT_DIR="resources/datasets/multimodal" if [ ! -d "$OUTPUT_DIR" ]; then mkdir -p "$OUTPUT_DIR" @@ -38,7 +38,7 @@ HERE export NXF_VER=22.04.5 nextflow \ run . 
\ - -main-script target/nextflow/datasets/workflows/process_openproblems_v1_multimodal/main.nf \ + -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_v1_multimodal/main.nf \ -profile docker \ -resume \ -params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/openproblems_v1_test.sh b/src/datasets/resource_scripts/openproblems_v1_test.sh index a79545f052..dab792fd13 100755 --- a/src/datasets/resource_scripts/openproblems_v1_test.sh +++ b/src/datasets/resource_scripts/openproblems_v1_test.sh @@ -8,7 +8,7 @@ cd "$REPO_ROOT" export TOWER_WORKSPACE_ID=53907369739130 -OUTPUT_DIR="resources/datasets" +OUTPUT_DIR="resources/datasets/scrnaseq" if [ ! -d "$OUTPUT_DIR" ]; then mkdir -p "$OUTPUT_DIR" @@ -42,7 +42,7 @@ HERE export NXF_VER=23.04.2 nextflow run . \ - -main-script target/nextflow/datasets/workflows/process_openproblems_v1/main.nf \ + -main-script target/nextflow/datasets/workflows/scrnaseq/process_openproblems_v1/main.nf \ -profile docker \ -resume \ -params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/tenx_visium.sh b/src/datasets/resource_scripts/tenx_visium.sh index 3e2fb68a61..cc7199c81f 100755 --- a/src/datasets/resource_scripts/tenx_visium.sh +++ b/src/datasets/resource_scripts/tenx_visium.sh @@ -125,7 +125,7 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE # cat > "/tmp/params.yaml" << 'HERE' @@ -253,7 +253,7 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE # cat > "/tmp/params.yaml" << 'HERE' @@ -290,13 +290,13 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: 
s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_tenx_visium/main.nf \ + --main-script target/nextflow/datasets/workflows/spatial/process_tenx_visium/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "/tmp/params.yaml" \ diff --git a/src/datasets/resource_scripts/zenodo_spatial.sh b/src/datasets/resource_scripts/zenodo.sh similarity index 92% rename from src/datasets/resource_scripts/zenodo_spatial.sh rename to src/datasets/resource_scripts/zenodo.sh index c1386aeb84..dc297492c2 100755 --- a/src/datasets/resource_scripts/zenodo_spatial.sh +++ b/src/datasets/resource_scripts/zenodo.sh @@ -2,7 +2,7 @@ # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: zenodo_spatial/visium/human_heart_myocardial_infarction_1 +# - id: zenodo/visium/human_heart_myocardial_infarction_1 # input_data: "https://zenodo.org/records/13328275/files/10X0018.h5ad?download=1" # dataset_name: 10X Visium - Human Heart MI 1 # dataset_url: "https://www.nature.com/articles/s41586-022-05060-x" @@ -14,7 +14,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: true -# - id: zenodo_spatial/visium/human_heart_myocardial_infarction_2 +# - id: zenodo/visium/human_heart_myocardial_infarction_2 # input_data: "https://zenodo.org/records/13328275/files/10X009.h5ad?download=1" # dataset_name: 10X Visium - Human Heart MI 2 # dataset_url: "https://www.nature.com/articles/s41586-022-05060-x" @@ -32,13 +32,13 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # remove_mitochondrial: true # HERE # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: 
zenodo_spatial/dbitseq/mouse_e10_brain +# - id: zenodo/dbitseq/mouse_e10_brain # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E10_brain_gene_25um_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Brain (E10) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -50,7 +50,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: true -# - id: zenodo_spatial/dbitseq/mouse_e10_eye +# - id: zenodo/dbitseq/mouse_e10_eye # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E10_eye_and_nearby_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Eye (E10) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -62,7 +62,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: true -# - id: zenodo_spatial/dbitseq/mouse_e10_whole_body +# - id: zenodo/dbitseq/mouse_e10_whole_body # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E10_whole_gene_best_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Whole Body (E10) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -74,7 +74,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: true -# - id: zenodo_spatial/dbitseq/mouse_e11_lower_body +# - id: zenodo/dbitseq/mouse_e11_lower_body # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E11_lower_body_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Lower Body (E11) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -86,7 +86,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: true -# - id: zenodo_spatial/dbitseq/mouse_e11_1 +# - id: zenodo/dbitseq/mouse_e11_1 # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_GSM4364244_E11-FL-1L_gene_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Whole Body 1 (E11) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -98,7 +98,7 @@ # gene_filter_min_spots: 50 # 
remove_mitochondrial: true -# - id: zenodo_spatial/dbitseq/mouse_e11_2 +# - id: zenodo/dbitseq/mouse_e11_2 # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_GSM4364245_E11-FL-2L_gene_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Whole Body 2 (E11) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -116,12 +116,12 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: zenodo_spatial/merfish/human_cortex_1 +# - id: zenodo/merfish/human_cortex_1 # input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.250.expand.rep1_data.h5ad?download=1" # dataset_name: MERFISH - Human Cortex 1 # dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" @@ -133,7 +133,7 @@ # gene_filter_min_spots: 100 # remove_mitochondrial: false -# - id: zenodo_spatial/merfish/human_cortex_2 +# - id: zenodo/merfish/human_cortex_2 # input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.4000.expand.rep1_data.h5ad?download=1" # dataset_name: MERFISH - Human Cortex 2 # dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" @@ -145,7 +145,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: false -# - id: zenodo_spatial/merfish/human_cortex_3 +# - id: zenodo/merfish/human_cortex_3 # input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.4000.expand.rep2_data.h5ad?download=1" # dataset_name: MERFISH - Human Cortex 3 # dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" @@ -157,7 +157,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: false -# - id: zenodo_spatial/merfish/human_cortex_4 +# - id: zenodo/merfish/human_cortex_4 # input_data: 
"https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.4000.expand.rep3_data.h5ad?download=1" # dataset_name: MERFISH - Human Cortex 4 # dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" @@ -169,7 +169,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: false -# - id: zenodo_spatial/merfish/mouse_cortex +# - id: zenodo/merfish/mouse_cortex # input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_mouse1.AUD_TEA_VIS.242.unexpand_data.h5ad?download=1" # dataset_name: MERFISH - Mouse Cortex # dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" @@ -187,12 +187,12 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: zenodo_spatial/seqfish/mouse_organogenesis_seqfish +# - id: zenodo/seqfish/mouse_organogenesis # input_data: "https://zenodo.org/records/12785822/files/seqfish.h5ad?download=1" # dataset_name: Seqfish - Mouse Organogenesis # dataset_url: "https://www.nature.com/articles/s41587-021-01006-2" @@ -210,13 +210,13 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # remove_mitochondrial: true # HERE # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: zenodo_spatial/slideseqv2/mouse_olfactory_bulb_puck +# - id: zenodo/slideseqv2/mouse_olfactory_bulb_puck # input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1" # dataset_name: Slide-seqV2 - Mouse Olfactory Bulb Puck # dataset_url: 
"https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" @@ -228,7 +228,7 @@ # gene_filter_min_spots: 500 # remove_mitochondrial: true -# - id: zenodo_spatial/slideseqv2/mouse_cortex +# - id: zenodo/slideseqv2/mouse_cortex # input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_palla2021squidpy_Slide-seqV2_Mouse_Cortex_data_whole.h5ad?download=1" # dataset_name: Slide-seqV2 - Mouse Cortex # dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" @@ -240,7 +240,7 @@ # gene_filter_min_spots: 500 # remove_mitochondrial: true -# - id: zenodo_spatial/slideseqv2/mouse_cerebellum +# - id: zenodo/slideseqv2/mouse_cerebellum # input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_Slide-seqV2_Mouse_Cerebellum_SCP948_data_whole.h5ad?download=1" # dataset_name: Slide-seqV2 - Mouse Cerebellum # dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" @@ -252,7 +252,7 @@ # gene_filter_min_spots: 500 # remove_mitochondrial: true -# - id: zenodo_spatial/slideseqv2/mouse_hippocampus_puck +# - id: zenodo/slideseqv2/mouse_hippocampus_puck # input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_Slide-seqV2_Mouse_Hippocampus_Puck_200115_08_data_whole.h5ad?download=1" # dataset_name: Slide-seqV2 - Mouse Hippocampus Puck # dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" @@ -264,7 +264,7 @@ # gene_filter_min_spots: 500 # remove_mitochondrial: true -# - id: zenodo_spatial/slideseqv2/mouse_somatosensory_cortex_puck +# - id: 
zenodo/slideseqv2/mouse_somatosensory_cortex_puck # input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_Slide-seqV2_Mouse_SomatosensoryCortex_Puck_200306_03_data_whole.h5ad?download=1" # dataset_name: Slide-seqV2 - Mouse Somatosensory Cortex Puck # dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" @@ -282,12 +282,12 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: zenodo_spatial/starmap/mouse_brain_2d_zstep10_0 +# - id: zenodo/starmap/mouse_brain_2d_zstep10_0 # input_data: "https://zenodo.org/records/12785822/files/STARmap_Wang2018three_data_2D_zstep10_0_data.h5ad?download=1" # dataset_name: STARmap - Mouse Brain 1 # dataset_url: "https://www.science.org/doi/10.1126/science.aat5691" @@ -299,7 +299,7 @@ # gene_filter_min_spots: 1 # remove_mitochondrial: true -# - id: zenodo_spatial/starmap/mouse_brain_2d_zstep15_0 +# - id: zenodo/starmap/mouse_brain_2d_zstep15_0 # input_data: "https://zenodo.org/records/12785822/files/STARmap_Wang2018three_data_2D_zstep15_0_data.h5ad?download=1" # dataset_name: STARmap - Mouse Brain 2 # dataset_url: "https://www.science.org/doi/10.1126/science.aat5691" @@ -317,12 +317,12 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE cat > "/tmp/params.yaml" << 'HERE' param_list: - - id: zenodo_spatial/stereoseq/drosophila_embryo_e5_6 + - id: zenodo/stereoseq/drosophila_embryo_e5_6 input_data: 
"https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_5.6.h5ad?download=1" dataset_name: Stereo-seq - Drosophila embryo E5_6 dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" @@ -334,7 +334,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial/stereoseq/drosophila_embryo_e6_3 + - id: zenodo/stereoseq/drosophila_embryo_e6_3 input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_6.3.h5ad?download=1" dataset_name: Stereo-seq - Drosophila embryo E6_3 dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" @@ -346,7 +346,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial/stereoseq/drosophila_embryo_e7 + - id: zenodo/stereoseq/drosophila_embryo_e7 input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_7.h5ad?download=1" dataset_name: Stereo-seq - Drosophila embryo E7 dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" @@ -358,7 +358,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial/stereoseq/drosophila_embryo_e9_1 + - id: zenodo/stereoseq/drosophila_embryo_e9_1 input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_9.1.h5ad?download=1" dataset_name: Stereo-seq - Drosophila embryo E9_1 dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" @@ -370,7 +370,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial/stereoseq/drosophila_embryo_e10 + - id: zenodo/stereoseq/drosophila_embryo_e10 input_data: 
"https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_10.5.h5ad?download=1" dataset_name: Stereo-seq - Drosophila embryo E10 dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" @@ -388,7 +388,7 @@ output_meta: '$id/dataset_metadata.yaml' output_state: '$id/state.yaml' output_raw: force_null output_normalized: force_null -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/spatial HERE cat > /tmp/nextflow.config << HERE @@ -407,7 +407,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_zenodo_spatial/main.nf \ + --main-script target/nextflow/datasets/workflows/spatial/process_zenodo/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "/tmp/params.yaml" \ diff --git a/src/datasets/resource_scripts/zenodo_spatial_slidetags.sh b/src/datasets/resource_scripts/zenodo_slidetags.sh similarity index 89% rename from src/datasets/resource_scripts/zenodo_spatial_slidetags.sh rename to src/datasets/resource_scripts/zenodo_slidetags.sh index aa4e7e094b..e35df8edc3 100755 --- a/src/datasets/resource_scripts/zenodo_spatial_slidetags.sh +++ b/src/datasets/resource_scripts/zenodo_slidetags.sh @@ -2,7 +2,7 @@ cat > "/tmp/params.yaml" << 'HERE' param_list: - - id: zenodo_spatial_slidetags/slidetags/human_cortex + - id: zenodo_slidetags/slidetags/human_cortex input_data: "https://zenodo.org/records/12785822/files/slidetag_human_cortex.tar.gz?download=1" dataset_name: Slide-tags - Human Cortex dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" @@ -14,7 +14,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial_slidetags/slidetags/human_skin_melanoma + - id: zenodo_slidetags/slidetags/human_skin_melanoma input_data: 
"https://zenodo.org/records/12785822/files/slidetag_human_skin_melanoma.tar.gz?download=1" dataset_name: Slide-tags - Human Skin Melanoma dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" @@ -26,7 +26,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial_slidetags/slidetags/human_tonsil + - id: zenodo_slidetags/slidetags/human_tonsil input_data: "https://zenodo.org/records/12785822/files/slidetag_human_tonsil.tar.gz?download=1" dataset_name: Slide-tags - Human Tonsil dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" @@ -38,7 +38,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial_slidetags/slidetags/mouse_embryo + - id: zenodo_slidetags/slidetags/mouse_embryo input_data: "https://zenodo.org/records/12785822/files/slidetag_mouse_embryo.tar.gz?download=1" dataset_name: Slide-tags - Mouse Embryo dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" @@ -56,7 +56,7 @@ output_meta: '$id/dataset_metadata.yaml' output_state: '$id/state.yaml' output_raw: force_null output_normalized: force_null -publish_dir: resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/spatial HERE cat > /tmp/nextflow.config << HERE @@ -75,7 +75,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_zenodo_spatial_slidetags/main.nf \ + --main-script target/nextflow/datasets/workflows/spatial/process_zenodo_slidetags/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "/tmp/params.yaml" \ diff --git a/src/datasets/resource_test_scripts/cxg_immune_cell_atlas.sh b/src/datasets/resource_test_scripts/cxg_immune_cell_atlas.sh index 285dc55ec4..8a9d7de486 100755 --- a/src/datasets/resource_test_scripts/cxg_immune_cell_atlas.sh +++ b/src/datasets/resource_test_scripts/cxg_immune_cell_atlas.sh @@ -1,7 +1,6 @@ 
#!/bin/bash -DATASET_DIR=resources_test/common - +DATASET_DIR=resources_test/common/scrnaseq mkdir -p $DATASET_DIR @@ -40,7 +39,7 @@ keep_features: '$KEEP_FEATURES' HERE nextflow run . \ - -main-script target/nextflow/datasets/workflows/process_cellxgene_census/main.nf \ + -main-script target/nextflow/datasets/workflows/scrnaseq/process_cellxgene_census/main.nf \ -c src/wf_utils/labels_ci.config \ -profile docker \ -params-file "/tmp/params.yaml" diff --git a/src/datasets/resource_test_scripts/cxg_mouse_pancreas_atlas.sh b/src/datasets/resource_test_scripts/cxg_mouse_pancreas_atlas.sh index 3b5d35ee5c..c90ae39ac7 100755 --- a/src/datasets/resource_test_scripts/cxg_mouse_pancreas_atlas.sh +++ b/src/datasets/resource_test_scripts/cxg_mouse_pancreas_atlas.sh @@ -1,6 +1,6 @@ #!/bin/bash -DATASET_DIR=resources_test/common +DATASET_DIR=resources_test/common/scrnaseq mkdir -p $DATASET_DIR @@ -40,7 +40,7 @@ keep_features: '$KEEP_FEATURES' HERE nextflow run . \ - -main-script target/nextflow/datasets/workflows/process_cellxgene_census/main.nf \ + -main-script target/nextflow/datasets/workflows/scrnaseq/process_cellxgene_census/main.nf \ -c src/wf_utils/labels_ci.config \ -profile docker \ -params-file "/tmp/params.yaml" diff --git a/src/datasets/resource_test_scripts/mouse_brain_coronal.sh b/src/datasets/resource_test_scripts/mouse_brain_coronal.sh index 962c4c067d..ed65408dc6 100755 --- a/src/datasets/resource_test_scripts/mouse_brain_coronal.sh +++ b/src/datasets/resource_test_scripts/mouse_brain_coronal.sh @@ -22,7 +22,7 @@ output_meta: '$id/dataset_metadata.yaml' output_state: '$id/state.yaml' output_raw: force_null output_normalized: force_null -publish_dir: resources_test/common +publish_dir: resources_test/common/spatial do_subsample: true spot_filter_min_genes: 200 gene_filter_min_spots: 50 @@ -30,7 +30,7 @@ remove_mitochondrial: true HERE nextflow run . 
\ - -main-script target/nextflow/datasets/workflows/process_tenx_visium/main.nf \ + -main-script target/nextflow/datasets/workflows/spatial/process_tenx_visium/main.nf \ -c src/wf_utils/labels_ci.config \ -profile docker \ -params-file "/tmp/params.yaml" diff --git a/src/datasets/resource_test_scripts/neurips2021_bmmc.sh b/src/datasets/resource_test_scripts/neurips2021_bmmc.sh index 98644d9dbf..ebd8faf892 100755 --- a/src/datasets/resource_test_scripts/neurips2021_bmmc.sh +++ b/src/datasets/resource_test_scripts/neurips2021_bmmc.sh @@ -38,7 +38,7 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -# publish_dir: s3://openproblems-data/resources_test/common +# publish_dir: s3://openproblems-data/resources_test/common/multimodal HERE # cat > /tmp/nextflow.config << HERE @@ -51,10 +51,10 @@ HERE # HERE nextflow run . \ - -main-script target/nextflow/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf \ + -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/main.nf \ -profile docker \ -resume \ - --publish_dir resources_test/common \ + --publish_dir resources_test/common/multimodal \ -params-file "$params_file" \ -c src/wf_utils/labels.config @@ -68,4 +68,4 @@ nextflow run . 
\ # --labels predict_modality # run task process dataset components -src/tasks/predict_modality/resources_test_scripts/neurips2021_bmmc.sh \ No newline at end of file +# src/tasks/predict_modality/resources_test_scripts/neurips2021_bmmc.sh \ No newline at end of file diff --git a/src/datasets/resource_test_scripts/neurips2022_pbmc.sh b/src/datasets/resource_test_scripts/neurips2022_pbmc.sh index b62e6f40e1..b3a11eb67e 100755 --- a/src/datasets/resource_test_scripts/neurips2022_pbmc.sh +++ b/src/datasets/resource_test_scripts/neurips2022_pbmc.sh @@ -38,11 +38,11 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -publish_dir: s3://openproblems-data/resources_test/common +publish_dir: s3://openproblems-data/resources_test/common/multimodal HERE # nextflow run . \ -# -main-script target/nextflow/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf \ +# -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/main.nf \ # -profile docker \ # -resume \ # --publish_dir resources_test/common \ @@ -63,7 +63,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf \ + --main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/main.nf \ --workspace 53907369739130 \ --compute-env 1pK56PjjzeraOOC2LDZvN2 \ --params-file "$params_file" \ diff --git a/src/datasets/resource_test_scripts/pancreas.sh b/src/datasets/resource_test_scripts/pancreas.sh index fb26f7ef30..3857449636 100755 --- a/src/datasets/resource_test_scripts/pancreas.sh +++ b/src/datasets/resource_test_scripts/pancreas.sh @@ -6,7 +6,7 @@ REPO_ROOT=$(git rev-parse --show-toplevel) # ensure that the command below is run from the root of the repository cd 
"$REPO_ROOT" -DATASET_DIR=resources_test/common +DATASET_DIR=resources_test/common/scrnaseq set -e @@ -18,7 +18,7 @@ KEEP_FEATURES=`cat $DATASET_DIR/temp_g2m_genes_tirosh_hm.txt $DATASET_DIR/temp_s # download dataset nextflow run . \ - -main-script target/nextflow/datasets/workflows/process_openproblems_v1/main.nf \ + -main-script target/nextflow/datasets/workflows/scrnaseq/process_openproblems_v1/main.nf \ -profile docker \ -c src/wf_utils/labels_ci.config \ -resume \ @@ -55,7 +55,7 @@ nextflow run . \ rm -r $DATASET_DIR/temp_* # run task process dataset components -src/tasks/batch_integration/resources_test_scripts/process.sh -src/tasks/denoising/resources_test_scripts/pancreas.sh -src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh -src/tasks/label_projection/resources_test_scripts/pancreas.sh \ No newline at end of file +# src/tasks/batch_integration/resources_test_scripts/process.sh +# src/tasks/denoising/resources_test_scripts/pancreas.sh +# src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh +# src/tasks/label_projection/resources_test_scripts/pancreas.sh \ No newline at end of file diff --git a/src/datasets/resource_test_scripts/scicar_cell_lines.sh b/src/datasets/resource_test_scripts/scicar_cell_lines.sh index f765744136..f9c9a7b842 100755 --- a/src/datasets/resource_test_scripts/scicar_cell_lines.sh +++ b/src/datasets/resource_test_scripts/scicar_cell_lines.sh @@ -6,7 +6,7 @@ REPO_ROOT=$(git rev-parse --show-toplevel) # ensure that the command below is run from the root of the repository cd "$REPO_ROOT" -DATASET_DIR=resources_test/common +DATASET_DIR=resources_test/common/multimodal set -e @@ -14,7 +14,7 @@ mkdir -p $DATASET_DIR # download dataset nextflow run . 
\ - -main-script target/nextflow/datasets/workflows/process_openproblems_v1_multimodal/main.nf \ + -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_v1_multimodal/main.nf \ -profile docker \ -resume \ --id scicar_cell_lines \ diff --git a/src/datasets/resource_test_scripts/slideseq_test.sh b/src/datasets/resource_test_scripts/slideseq_test.sh deleted file mode 100755 index a9050be40a..0000000000 --- a/src/datasets/resource_test_scripts/slideseq_test.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -set -e - -cat > /tmp/params.yaml << 'HERE' -param_list: - - id: mouse_cerebellum - input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1" - dataset_name: Mouse cerebellum - dataset_url: "..." - dataset_summary: ... - dataset_description: "..." - dataset_reference: ref - dataset_organism: Mus musculus - -normalization_methods: [log_cp10k] -n_obs: 600 -n_vars: 500 -output_dataset: '$id/dataset.h5ad' -output_meta: '$id/dataset_metadata.yaml' -output_state: '$id/state.yaml' -output_raw: force_null -output_normalized: force_null -publish_dir: resources_test/common -do_subsample: true -spot_filter_min_genes: 200 -gene_filter_min_spots: 50 -remove_mitochondrial: true -HERE - -nextflow run . 
\ - -main-script target/nextflow/datasets/workflows/process_spatial_from_zenodo/main.nf \ - -c src/wf_utils/labels_ci.config \ - -profile docker \ - -params-file "/tmp/params.yaml" - diff --git a/src/datasets/workflows/extract_dataset_info/config.vsh.yaml b/src/datasets/workflows/extract_dataset_info/config.vsh.yaml index 58433db567..0ebe6f06df 100644 --- a/src/datasets/workflows/extract_dataset_info/config.vsh.yaml +++ b/src/datasets/workflows/extract_dataset_info/config.vsh.yaml @@ -1,34 +1,34 @@ -functionality: - name: "extract_dataset_info" - namespace: "datasets/workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input" - __merge__: /src/datasets/api/file_raw.yaml - required: true - direction: input - - name: Filter arguments - arguments: - - name: "--filter_normalization_id" - type: string - required: false - direction: input - description: If defined, only the normalization with this ID will be included in the output. - multiple: true - example: [ log_cp10k ] - - name: Outputs - arguments: - - name: "--output" - type: file - required: true - direction: output - example: dataset_uns.yaml - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - dependencies: - - name: common/extract_metadata -platforms: +name: extract_dataset_info +namespace: datasets/workflows +argument_groups: + - name: Inputs + arguments: + - name: --input + __merge__: /src/datasets/api/file_raw.yaml + required: true + direction: input + - name: Filter arguments + arguments: + - name: --filter_normalization_id + type: string + required: false + direction: input + description: If defined, only the normalization with this ID will be included + in the output. 
+ multiple: true + example: [log_cp10k] + - name: Outputs + arguments: + - name: --output + type: file + required: true + direction: output + example: dataset_uns.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf +dependencies: + - name: common/extract_metadata +runners: - type: nextflow diff --git a/src/datasets/workflows/extract_dataset_meta/config.vsh.yaml b/src/datasets/workflows/extract_dataset_meta/config.vsh.yaml index 26041b1039..a138ea895c 100644 --- a/src/datasets/workflows/extract_dataset_meta/config.vsh.yaml +++ b/src/datasets/workflows/extract_dataset_meta/config.vsh.yaml @@ -1,25 +1,24 @@ -functionality: - name: "extract_dataset_meta" - namespace: "datasets/workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input" - __merge__: /src/datasets/api/file_raw.yaml - required: true - direction: input - - name: Outputs - arguments: - - name: "--output" - type: file - required: true - direction: output - example: meta.yaml - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - dependencies: - - name: common/extract_metadata -platforms: +name: extract_dataset_meta +namespace: datasets/workflows +argument_groups: + - name: Inputs + arguments: + - name: --input + __merge__: /src/datasets/api/file_raw.yaml + required: true + direction: input + - name: Outputs + arguments: + - name: --output + type: file + required: true + direction: output + example: meta.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf +dependencies: + - name: common/extract_metadata +runners: - type: nextflow diff --git a/src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/config.vsh.yaml b/src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/config.vsh.yaml new file mode 100644 index 0000000000..91f0bbcfc1 --- /dev/null +++ b/src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/config.vsh.yaml @@ -0,0 +1,138 @@ +name: 
process_openproblems_neurips2021_bmmc +namespace: datasets/workflows/multimodal +description: | + Fetch and process Neurips 2021 multimodal datasets +argument_groups: + - name: Inputs + arguments: + - name: --id + type: string + description: The ID of the dataset + required: true + - name: --input + type: file + description: Path to the input dataset + required: true + - name: --mod1 + type: string + description: Name of the first modality. + required: true + example: GEX + - name: --mod2 + type: string + description: Name of the second modality. + required: true + example: ADT + - name: Metadata + arguments: + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. 
+ - name: --keep_cell_type_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --keep_batch_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --even + type: boolean_true + description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run. + - name: Outputs + arguments: + - name: --output_mod1 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_mod2 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_meta_mod1 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod1.yaml + - name: --output_meta_mod2 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod2.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/multimodal/openproblems_neurips2021_bmmc + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/normalization/prot_clr + - name: datasets/normalization/atac_tfidf + - name: datasets/processors/subsample + - name: datasets/processors/svd + - name: datasets/processors/hvg + - name: common/extract_metadata + - name: common/decompress_gzip + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +runners: + - 
type: nextflow diff --git a/src/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf b/src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/main.nf similarity index 100% rename from src/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf rename to src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/main.nf diff --git a/src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/config.vsh.yaml b/src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/config.vsh.yaml new file mode 100644 index 0000000000..88f341aae4 --- /dev/null +++ b/src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/config.vsh.yaml @@ -0,0 +1,144 @@ +name: process_openproblems_neurips2022_pbmc +namespace: datasets/workflows/multimodal +description: | + Fetch and process Neurips 2022 multimodal datasets +argument_groups: + - name: Inputs + arguments: + - name: --id + type: string + description: The ID of the dataset + required: true + - name: --input_mod1 + type: file + description: Processed RNA h5ad file + required: true + example: cite_rna_merged.h5ad + - name: --input_mod2 + type: file + description: Processed ADT or ATAC h5ad file + required: true + example: cite_prot_merged.h5ad + - name: --mod1 + type: string + description: Name of the first modality. + required: true + example: GEX + - name: --mod2 + type: string + description: Name of the second modality. + required: true + example: ADT + - name: Metadata + arguments: + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. 
+ required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. + - name: --keep_cell_type_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --keep_batch_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --even + type: boolean_true + description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run. 
+ - name: Outputs + arguments: + - name: --output_mod1 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_mod2 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_meta_mod1 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod1.yaml + - name: --output_meta_mod2 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod2.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/multimodal/openproblems_neurips2022_pbmc + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/normalization/prot_clr + - name: datasets/normalization/atac_tfidf + - name: datasets/processors/subsample + - name: datasets/processors/svd + - name: datasets/processors/hvg + - name: common/extract_metadata + - name: common/decompress_gzip + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf b/src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/main.nf similarity index 100% rename from src/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf rename to src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/main.nf diff --git a/src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/config.vsh.yaml b/src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/config.vsh.yaml new file mode 100644 index 0000000000..f70e3ab19e --- /dev/null +++ b/src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/config.vsh.yaml @@ -0,0 +1,165 @@ +name: 
process_openproblems_v1_multimodal +namespace: datasets/workflows/multimodal +description: | + Fetch and process legacy OpenProblems v1 multimodal datasets +argument_groups: + - name: Inputs + arguments: + - name: --id + type: string + description: Unique identifier of the dataset. + required: true + - name: --input_id + type: string + description: The ID of the dataset in OpenProblems v1 + required: true + - name: --obs_cell_type + type: string + description: Location of where to find the observation cell types. + - name: --obs_batch + type: string + description: Location of where to find the observation batch IDs. + - name: --obs_tissue + type: string + description: Location of where to find the observation tissue information. + - name: --layer_counts + type: string + description: In which layer to find the counts matrix. Leave undefined to + use `.X`. + example: counts + - name: --sparse + type: boolean + default: true + description: Convert layers to a sparse CSR format. + - name: --var_feature_id + type: string + description: Location of where to find the feature IDs. Can be set to index + if the feature IDs are the index. + example: gene_ids + - name: --var_feature_name + type: string + description: Location of where to find the feature names. Can be set to index + if the feature names are the index. + default: index + - name: --mod1 + type: string + description: Name of the first modality. + required: true + example: GEX + - name: --mod2 + type: string + description: Name of the second modality. + required: true + example: ADT + - name: Metadata + arguments: + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. 
+ required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. + - name: --keep_cell_type_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --keep_batch_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --even + type: boolean_true + description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run. 
+ - name: Outputs + arguments: + - name: --output_mod1 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_mod2 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_meta_mod1 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod1.yaml + - name: --output_meta_mod2 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod2.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/multimodal/openproblems_v1_multimodal + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/normalization/prot_clr + - name: datasets/normalization/atac_tfidf + - name: datasets/processors/subsample + - name: datasets/processors/svd + - name: datasets/processors/hvg + - name: common/extract_metadata + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_v1_multimodal/main.nf b/src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/main.nf similarity index 100% rename from src/datasets/workflows/process_openproblems_v1_multimodal/main.nf rename to src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/main.nf diff --git a/src/datasets/workflows/process_cellxgene_census/config.vsh.yaml b/src/datasets/workflows/process_cellxgene_census/config.vsh.yaml deleted file mode 100644 index 3e1fd5263b..0000000000 --- a/src/datasets/workflows/process_cellxgene_census/config.vsh.yaml +++ /dev/null @@ -1,201 +0,0 @@ -functionality: - name: process_cellxgene_census - namespace: datasets/workflows - description: | - Fetch and process datasets 
originating from the CELLxGENE census. - argument_groups: - - name: Input database - description: "Open CellxGene Census by version or URI." - arguments: - - name: "--input_uri" - type: string - description: "If specified, a URI containing the Census SOMA objects. If specified, will take precedence over the `--census_version` argument." - required: false - example: "s3://bucket/path" - - name: "--census_version" - description: "Which release of CellxGene census to use. Possible values are \"latest\", \"stable\", or the date of one of the releases (e.g. \"2023-07-25\"). For more information, check the documentation on [Census data releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html)." - type: string - example: "stable" - required: false - - name: Cell query - description: Arguments related to the query. - arguments: - - name: "--species" - type: string - description: The organism to query, usually one of `Homo sapiens` or `Mus musculus`. - required: false - default: "homo_sapiens" - multiple: false - - name: "--obs_value_filter" - type: string - description: "Filter for selecting the `obs` metadata (i.e. cells). Value is a filter query written in the SOMA `value_filter` syntax." - required: false - example: "is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell'" - - name: Cell filter - description: Filter the cells based on a minimum cell count per specified group - arguments: - - name: "--cell_filter_grouping" - type: string - description: | - A subset of 'obs' columns by which to group the cells for filtering. - Only groups surpassing or equal to the `--cell_filter_minimum_count` - threshold will be retained. Take care not to introduce a selection - bias against cells with more fine-grained ontology annotations. 
- required: false - example: ["dataset_id", "tissue", "assay", "disease", "cell_type"] - multiple: true - - name: "--cell_filter_minimum_count" - type: double - description: | - A minimum number of cells per group to retain. If `--cell_filter_grouping` - is defined, this parameter should also be provided and vice versa. - required: false - example: 100 - - name: Cell metadata - description: Cell metadata arguments - arguments: - - name: "--obs_batch" - type: string - description: | - Location of where to find the observation batch IDs. - - * If not specified, the `.obs["batch"]` field will not be included. - * If one or more values are specified, the `.obs["batch"]` field will be - set to the concatenated values of the specified fields, separated by - the `obs_batch_separator`. - required: false - multiple: true - multiple_sep: "," - example: ["batch"] - - name: "--obs_batch_separator" - type: string - description: Separator to use when concatenating the values of the `--obs_batch` fields. - required: false - default: "+" - - name: Dataset metadata - description: Information about the dataset that will be stored in the `.uns` slot. - arguments: - - name: "--id" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: true - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. - - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." 
- - name: Outputs - arguments: - - name: "--output_dataset" - __merge__: /src/datasets/api/file_common_dataset.yaml - direction: "output" - required: true - - name: "--output_meta" - direction: "output" - type: file - description: "Dataset metadata" - default: "dataset_metadata.yaml" - - name: "--output_raw" - __merge__: /src/datasets/api/file_raw.yaml - direction: "output" - required: false - - name: "--output_normalized" - __merge__: /src/datasets/api/file_normalized.yaml - direction: "output" - required: false - - name: "--output_pca" - __merge__: /src/datasets/api/file_pca.yaml - direction: "output" - required: false - - name: "--output_hvg" - __merge__: /src/datasets/api/file_hvg.yaml - direction: "output" - required: false - - name: "--output_knn" - __merge__: /src/datasets/api/file_knn.yaml - direction: "output" - required: false - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/cellxgene_census - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/processors/subsample - - name: datasets/processors/pca - - name: datasets/processors/hvg - - name: datasets/processors/knn - - name: common/extract_metadata - # test_resources: - # - type: nextflow_script - # path: main.nf - # entrypoint: test_wf -platforms: - - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_neurips2021_bmmc/config.vsh.yaml b/src/datasets/workflows/process_openproblems_neurips2021_bmmc/config.vsh.yaml deleted file mode 100644 index 8d3ca51d0b..0000000000 --- a/src/datasets/workflows/process_openproblems_neurips2021_bmmc/config.vsh.yaml +++ /dev/null @@ -1,137 +0,0 @@ -functionality: - name: process_openproblems_neurips2021_bmmc - namespace: datasets/workflows - description: | - Fetch and process Neurips 2021 multimodal datasets - 
argument_groups: - - name: Inputs - arguments: - - name: "--id" - type: "string" - description: "The ID of the dataset" - required: true - - name: "--input" - type: "file" - description: "Path to the input dataset" - required: true - - name: "--mod1" - type: string - description: Name of the first modality. - required: true - example: GEX - - name: "--mod2" - type: string - description: Name of the second modality. - required: true - example: ADT - - name: Metadata - arguments: - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. 
- - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." - - name: Outputs - arguments: - - name: "--output_mod1" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_mod2" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_meta_mod1" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod1.yaml" - - name: "--output_meta_mod2" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod2.yaml" - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/openproblems_neurips2021_bmmc - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/normalization/prot_clr - - name: datasets/normalization/atac_tfidf - - name: datasets/processors/subsample - - name: datasets/processors/svd - - name: datasets/processors/hvg - - name: common/extract_metadata - - name: common/decompress_gzip - # test_resources: - # - type: nextflow_script - # 
path: main.nf - # entrypoint: test_wf -platforms: - - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_neurips2022_pbmc/config.vsh.yaml b/src/datasets/workflows/process_openproblems_neurips2022_pbmc/config.vsh.yaml deleted file mode 100644 index 96bcc3ee2c..0000000000 --- a/src/datasets/workflows/process_openproblems_neurips2022_pbmc/config.vsh.yaml +++ /dev/null @@ -1,143 +0,0 @@ -functionality: - name: process_openproblems_neurips2022_pbmc - namespace: datasets/workflows - description: | - Fetch and process Neurips 2022 multimodal datasets - argument_groups: - - name: Inputs - arguments: - - name: "--id" - type: "string" - description: "The ID of the dataset" - required: true - - name: "--input_mod1" - type: file - description: "Processed RNA h5ad file" - required: true - example: cite_rna_merged.h5ad - - name: "--input_mod2" - type: file - description: "Processed ADT or ATAC h5ad file" - required: true - example: cite_prot_merged.h5ad - - name: "--mod1" - type: string - description: Name of the first modality. - required: true - example: GEX - - name: "--mod2" - type: string - description: Name of the second modality. - required: true - example: ADT - - name: Metadata - arguments: - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. - - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." 
- - name: Outputs - arguments: - - name: "--output_mod1" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_mod2" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_meta_mod1" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod1.yaml" - - name: "--output_meta_mod2" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod2.yaml" - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/openproblems_neurips2022_pbmc - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/normalization/prot_clr - - name: datasets/normalization/atac_tfidf - - name: datasets/processors/subsample - - name: datasets/processors/svd - - name: datasets/processors/hvg - - name: common/extract_metadata - - name: common/decompress_gzip - # test_resources: - # - type: nextflow_script - # path: main.nf - # entrypoint: test_wf -platforms: - - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_v1/config.vsh.yaml b/src/datasets/workflows/process_openproblems_v1/config.vsh.yaml deleted file mode 100644 index fb0cd73a65..0000000000 --- a/src/datasets/workflows/process_openproblems_v1/config.vsh.yaml +++ /dev/null @@ -1,163 +0,0 @@ -functionality: - name: process_openproblems_v1 - namespace: datasets/workflows - description: | - Fetch and process legacy OpenProblems v1 datasets - argument_groups: - - name: Inputs - arguments: - - name: "--id" - type: string - description: Unique identifier of the dataset. 
- required: true - - name: "--input_id" - type: "string" - description: "The ID of the dataset in OpenProblems v1" - required: true - - name: "--obs_cell_type" - type: "string" - description: "Location of where to find the observation cell types." - - name: "--obs_batch" - type: "string" - description: "Location of where to find the observation batch IDs." - - name: "--obs_tissue" - type: "string" - description: "Location of where to find the observation tissue information." - - name: "--layer_counts" - type: "string" - description: "In which layer to find the counts matrix. Leave undefined to use `.X`." - example: counts - - name: "--sparse" - type: boolean - default: true - description: Convert layers to a sparse CSR format. - - name: "--var_feature_id" - type: "string" - description: "Location of where to find the feature IDs. Can be set to index if the feature IDs are the index." - example: gene_ids - - name: "--var_feature_name" - type: "string" - description: "Location of where to find the feature names. Can be set to index if the feature names are the index." - default: index - - name: Metadata - arguments: - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. - - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." 
- - name: Outputs - arguments: - - name: "--output_dataset" - __merge__: /src/datasets/api/file_common_dataset.yaml - direction: "output" - required: true - - name: "--output_meta" - direction: "output" - type: file - description: "Dataset metadata" - default: "dataset_metadata.yaml" - - name: "--output_raw" - __merge__: /src/datasets/api/file_raw.yaml - direction: "output" - required: false - - name: "--output_normalized" - __merge__: /src/datasets/api/file_normalized.yaml - direction: "output" - required: false - - name: "--output_pca" - __merge__: /src/datasets/api/file_pca.yaml - direction: "output" - required: false - - name: "--output_hvg" - __merge__: /src/datasets/api/file_hvg.yaml - direction: "output" - required: false - - name: "--output_knn" - __merge__: /src/datasets/api/file_knn.yaml - direction: "output" - required: false - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/openproblems_v1 - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/processors/subsample - - name: datasets/processors/pca - - name: datasets/processors/hvg - - name: datasets/processors/knn - - name: common/extract_metadata - # test_resources: - # - type: nextflow_script - # path: main.nf - # entrypoint: test_wf -platforms: - - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_v1_multimodal/config.vsh.yaml b/src/datasets/workflows/process_openproblems_v1_multimodal/config.vsh.yaml deleted file mode 100644 index 58b045cc3b..0000000000 --- a/src/datasets/workflows/process_openproblems_v1_multimodal/config.vsh.yaml +++ /dev/null @@ -1,161 +0,0 @@ -functionality: - name: process_openproblems_v1_multimodal - namespace: datasets/workflows - description: | - Fetch and process legacy OpenProblems v1 multimodal datasets - 
argument_groups: - - name: Inputs - arguments: - - name: "--id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--input_id" - type: "string" - description: "The ID of the dataset in OpenProblems v1" - required: true - - name: "--obs_cell_type" - type: "string" - description: "Location of where to find the observation cell types." - - name: "--obs_batch" - type: "string" - description: "Location of where to find the observation batch IDs." - - name: "--obs_tissue" - type: "string" - description: "Location of where to find the observation tissue information." - - name: "--layer_counts" - type: "string" - description: "In which layer to find the counts matrix. Leave undefined to use `.X`." - example: counts - - name: "--sparse" - type: boolean - default: true - description: Convert layers to a sparse CSR format. - - name: "--var_feature_id" - type: "string" - description: "Location of where to find the feature IDs. Can be set to index if the feature IDs are the index." - example: gene_ids - - name: "--var_feature_name" - type: "string" - description: "Location of where to find the feature names. Can be set to index if the feature names are the index." - default: index - - name: "--mod1" - type: string - description: Name of the first modality. - required: true - example: GEX - - name: "--mod2" - type: string - description: Name of the second modality. - required: true - example: ADT - - name: Metadata - arguments: - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. 
- required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. - - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." 
- - name: Outputs - arguments: - - name: "--output_mod1" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_mod2" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_meta_mod1" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod1.yaml" - - name: "--output_meta_mod2" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod2.yaml" - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/openproblems_v1_multimodal - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/normalization/prot_clr - - name: datasets/normalization/atac_tfidf - - name: datasets/processors/subsample - - name: datasets/processors/svd - - name: datasets/processors/hvg - - name: common/extract_metadata - # test_resources: - # - type: nextflow_script - # path: main.nf - # entrypoint: test_wf -platforms: - - type: nextflow diff --git a/src/datasets/workflows/process_tenx_visium/config.vsh.yaml b/src/datasets/workflows/process_tenx_visium/config.vsh.yaml deleted file mode 100644 index 91a2867820..0000000000 --- a/src/datasets/workflows/process_tenx_visium/config.vsh.yaml +++ /dev/null @@ -1,142 +0,0 @@ -functionality: - name: process_tenx_visium - namespace: datasets/workflows - description: | - Download and process datasets originating from 10x Genomics. - argument_groups: - - name: Input - arguments: - - name: "--input_expression" - type: string - description: URL to the feature / barcode matrix HDF5. - required: true - - name: "--input_spatial" - type: string - description: URL to the Spatial imaging data. 
- required: true - - name: Outputs - arguments: - - name: "--output_dataset" - type: file - direction: output - description: Output h5ad file - required: true - __merge__: /src/datasets/api/file_raw.yaml - - name: "--output_meta" - direction: "output" - type: file - description: "Dataset metadata" - default: "dataset_metadata.yaml" - - name: Metadata - arguments: - - name: "--id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitovhondrial genes? 
- required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - # - name: "--keep_features" - # type: string - # multiple: true - # description: A list of genes to keep. - # - name: "--keep_cell_type_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--keep_batch_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--even" - # type: "boolean_true" - # description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." 
- resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/tenx_visium - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/processors/subsample - - name: common/extract_metadata -platforms: - - type: nextflow \ No newline at end of file diff --git a/src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml b/src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml deleted file mode 100644 index 45b938b716..0000000000 --- a/src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml +++ /dev/null @@ -1,138 +0,0 @@ -functionality: - name: process_zenodo_spatial - namespace: datasets/workflows - description: | - Download and process DBiT seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo. - argument_groups: - - name: Input - arguments: - - name: "--input_data" - type: string - description: URL to the Anndata file. - required: true - - name: Outputs - arguments: - - name: "--output_dataset" - type: file - direction: output - description: Output h5ad file - required: true - __merge__: /src/datasets/api/file_raw.yaml - - name: "--output_meta" - direction: "output" - type: file - description: "Dataset metadata" - default: "dataset_metadata.yaml" - - name: Metadata - arguments: - - name: "--id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. 
- required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitovhondrial genes? - required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 600 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - # - name: "--keep_features" - # type: string - # multiple: true - # description: A list of genes to keep. 
- # - name: "--keep_cell_type_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--keep_batch_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--even" - # type: "boolean_true" - # description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/zenodo_spatial - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/processors/subsample - - name: common/extract_metadata -platforms: - - type: nextflow \ No newline at end of file diff --git a/src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml b/src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml deleted file mode 100644 index 23934fe161..0000000000 --- a/src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml +++ /dev/null @@ -1,138 +0,0 @@ -functionality: - name: process_zenodo_spatial_slidetags - namespace: datasets/workflows - description: | - Download and process slide tags datasets originating from Zenodo. - argument_groups: - - name: Input - arguments: - - name: "--input_data" - type: string - description: URL to the Anndata file. 
- required: true - - name: Outputs - arguments: - - name: "--output_dataset" - type: file - direction: output - description: Output h5ad file - required: true - __merge__: /src/datasets/api/file_raw.yaml - - name: "--output_meta" - direction: "output" - type: file - description: "Dataset metadata" - default: "dataset_metadata.yaml" - - name: Metadata - arguments: - - name: "--id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitovhondrial genes? 
- required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 600 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - # - name: "--keep_features" - # type: string - # multiple: true - # description: A list of genes to keep. - # - name: "--keep_cell_type_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--keep_batch_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--even" - # type: "boolean_true" - # description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." 
- resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/zenodo_spatial_slidetags - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/processors/subsample - - name: common/extract_metadata -platforms: - - type: nextflow \ No newline at end of file diff --git a/src/datasets/workflows/scrnaseq/process_cellxgene_census/config.vsh.yaml b/src/datasets/workflows/scrnaseq/process_cellxgene_census/config.vsh.yaml new file mode 100644 index 0000000000..e379077261 --- /dev/null +++ b/src/datasets/workflows/scrnaseq/process_cellxgene_census/config.vsh.yaml @@ -0,0 +1,209 @@ +name: process_cellxgene_census +namespace: datasets/workflows/scrnaseq +description: | + Fetch and process datasets originating from the CELLxGENE census. +argument_groups: + - name: Input database + description: Open CellxGene Census by version or URI. + arguments: + - name: --input_uri + type: string + description: If specified, a URI containing the Census SOMA objects. If specified, + will take precedence over the `--census_version` argument. + required: false + example: s3://bucket/path + - name: --census_version + description: Which release of CellxGene census to use. Possible values are + "latest", "stable", or the date of one of the releases (e.g. "2023-07-25"). + For more information, check the documentation on [Census data + releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html). + type: string + example: stable + required: false + - name: Cell query + description: Arguments related to the query. + arguments: + - name: --species + type: string + description: The organism to query, usually one of `Homo sapiens` or `Mus + musculus`. 
+ required: false + default: homo_sapiens + multiple: false + - name: --obs_value_filter + type: string + description: Filter for selecting the `obs` metadata (i.e. cells). Value is + a filter query written in the SOMA `value_filter` syntax. + required: false + example: is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell' + - name: Cell filter + description: Filter the cells based on a minimum cell count per specified group + arguments: + - name: --cell_filter_grouping + type: string + description: | + A subset of 'obs' columns by which to group the cells for filtering. + Only groups surpassing or equal to the `--cell_filter_minimum_count` + threshold will be retained. Take care not to introduce a selection + bias against cells with more fine-grained ontology annotations. + required: false + example: [dataset_id, tissue, assay, disease, cell_type] + multiple: true + - name: --cell_filter_minimum_count + type: double + description: | + A minimum number of cells per group to retain. If `--cell_filter_grouping` + is defined, this parameter should also be provided and vice versa. + required: false + example: 100 + - name: Cell metadata + description: Cell metadata arguments + arguments: + - name: --obs_batch + type: string + description: | + Location of where to find the observation batch IDs. + + * If not specified, the `.obs["batch"]` field will not be included. + * If one or more values are specified, the `.obs["batch"]` field will be + set to the concatenated values of the specified fields, separated by + the `obs_batch_separator`. + required: false + multiple: true + multiple_sep: ',' + example: [batch] + - name: --obs_batch_separator + type: string + description: Separator to use when concatenating the values of the `--obs_batch` + fields. + required: false + default: + + - name: Dataset metadata + description: Information about the dataset that will be stored in the `.uns` slot. 
+ arguments: + - name: --id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: true + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. + - name: --keep_cell_type_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --keep_batch_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --even + type: boolean_true + description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. 
+ example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run. + - name: Outputs + arguments: + - name: --output_dataset + __merge__: /src/datasets/api/file_common_dataset.yaml + direction: output + required: true + - name: --output_meta + direction: output + type: file + description: Dataset metadata + default: dataset_metadata.yaml + - name: --output_raw + __merge__: /src/datasets/api/file_raw.yaml + direction: output + required: false + - name: --output_normalized + __merge__: /src/datasets/api/file_normalized.yaml + direction: output + required: false + - name: --output_pca + __merge__: /src/datasets/api/file_pca.yaml + direction: output + required: false + - name: --output_hvg + __merge__: /src/datasets/api/file_hvg.yaml + direction: output + required: false + - name: --output_knn + __merge__: /src/datasets/api/file_knn.yaml + direction: output + required: false +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/scrnaseq/cellxgene_census + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: datasets/processors/pca + - name: datasets/processors/hvg + - name: datasets/processors/knn + - name: common/extract_metadata + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_cellxgene_census/main.nf b/src/datasets/workflows/scrnaseq/process_cellxgene_census/main.nf similarity index 100% rename from 
src/datasets/workflows/process_cellxgene_census/main.nf rename to src/datasets/workflows/scrnaseq/process_cellxgene_census/main.nf diff --git a/src/datasets/workflows/scrnaseq/process_openproblems_v1/config.vsh.yaml b/src/datasets/workflows/scrnaseq/process_openproblems_v1/config.vsh.yaml new file mode 100644 index 0000000000..fe96bf166d --- /dev/null +++ b/src/datasets/workflows/scrnaseq/process_openproblems_v1/config.vsh.yaml @@ -0,0 +1,167 @@ +name: process_openproblems_v1 +namespace: datasets/workflows/scrnaseq +description: | + Fetch and process legacy OpenProblems v1 datasets +argument_groups: + - name: Inputs + arguments: + - name: --id + type: string + description: Unique identifier of the dataset. + required: true + - name: --input_id + type: string + description: The ID of the dataset in OpenProblems v1 + required: true + - name: --obs_cell_type + type: string + description: Location of where to find the observation cell types. + - name: --obs_batch + type: string + description: Location of where to find the observation batch IDs. + - name: --obs_tissue + type: string + description: Location of where to find the observation tissue information. + - name: --layer_counts + type: string + description: In which layer to find the counts matrix. Leave undefined to + use `.X`. + example: counts + - name: --sparse + type: boolean + default: true + description: Convert layers to a sparse CSR format. + - name: --var_feature_id + type: string + description: Location of where to find the feature IDs. Can be set to index + if the feature IDs are the index. + example: gene_ids + - name: --var_feature_name + type: string + description: Location of where to find the feature names. Can be set to index + if the feature names are the index. + default: index + - name: Metadata + arguments: + - name: --dataset_name + type: string + description: Nicely formatted name. 
+ required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. + - name: --keep_cell_type_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --keep_batch_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --even + type: boolean_true + description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run. 
+ - name: Outputs + arguments: + - name: --output_dataset + __merge__: /src/datasets/api/file_common_dataset.yaml + direction: output + required: true + - name: --output_meta + direction: output + type: file + description: Dataset metadata + default: dataset_metadata.yaml + - name: --output_raw + __merge__: /src/datasets/api/file_raw.yaml + direction: output + required: false + - name: --output_normalized + __merge__: /src/datasets/api/file_normalized.yaml + direction: output + required: false + - name: --output_pca + __merge__: /src/datasets/api/file_pca.yaml + direction: output + required: false + - name: --output_hvg + __merge__: /src/datasets/api/file_hvg.yaml + direction: output + required: false + - name: --output_knn + __merge__: /src/datasets/api/file_knn.yaml + direction: output + required: false +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/scrnaseq/openproblems_v1 + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: datasets/processors/pca + - name: datasets/processors/hvg + - name: datasets/processors/knn + - name: common/extract_metadata + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_v1/main.nf b/src/datasets/workflows/scrnaseq/process_openproblems_v1/main.nf similarity index 100% rename from src/datasets/workflows/process_openproblems_v1/main.nf rename to src/datasets/workflows/scrnaseq/process_openproblems_v1/main.nf diff --git a/src/datasets/workflows/spatial/process_tenx_visium/config.vsh.yaml b/src/datasets/workflows/spatial/process_tenx_visium/config.vsh.yaml new file mode 100644 index 0000000000..fac91adc72 --- /dev/null +++ 
b/src/datasets/workflows/spatial/process_tenx_visium/config.vsh.yaml @@ -0,0 +1,143 @@ +name: process_tenx_visium +namespace: datasets/workflows/spatial +description: | + Download and process datasets originating from 10x Genomics. +argument_groups: + - name: Input + arguments: + - name: --input_expression + type: string + description: URL to the feature / barcode matrix HDF5. + required: true + - name: --input_spatial + type: string + description: URL to the Spatial imaging data. + required: true + - name: Outputs + arguments: + - name: --output_dataset + type: file + direction: output + description: Output h5ad file + required: true + __merge__: /src/datasets/api/file_spatial_dataset.yaml + - name: --output_meta + direction: output + type: file + description: Dataset metadata + default: dataset_metadata.yaml + - name: Metadata + arguments: + - name: --id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --spot_filter_min_genes + type: integer + description: Remove spots with less than this number of genes. 
+ required: false + example: 200 + - name: --spot_filter_min_counts + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: --gene_filter_min_spots + type: integer + description: Remove genes expressed in less than this number of spots. + required: false + example: 50 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: --remove_mitochondrial + type: boolean + description: Remove mitochondrial genes? + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + # - name: "--keep_features" + # type: string + # multiple: true + # description: A list of genes to keep. + # - name: "--keep_cell_type_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--keep_batch_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--even" + # type: "boolean_true" + # description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run.
+resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/spatial/tenx_visium + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: common/extract_metadata +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_tenx_visium/main.nf b/src/datasets/workflows/spatial/process_tenx_visium/main.nf similarity index 100% rename from src/datasets/workflows/process_tenx_visium/main.nf rename to src/datasets/workflows/spatial/process_tenx_visium/main.nf diff --git a/src/datasets/workflows/spatial/process_zenodo/config.vsh.yaml b/src/datasets/workflows/spatial/process_zenodo/config.vsh.yaml new file mode 100644 index 0000000000..b2feb4bcb5 --- /dev/null +++ b/src/datasets/workflows/spatial/process_zenodo/config.vsh.yaml @@ -0,0 +1,139 @@ +name: process_zenodo +namespace: datasets/workflows/spatial +description: | + Download and process DBiT seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo. +argument_groups: + - name: Input + arguments: + - name: --input_data + type: string + description: URL to the Anndata file. + required: true + - name: Outputs + arguments: + - name: --output_dataset + type: file + direction: output + description: Output h5ad file + required: true + __merge__: /src/datasets/api/file_spatial_dataset.yaml + - name: --output_meta + direction: output + type: file + description: Dataset metadata + default: dataset_metadata.yaml + - name: Metadata + arguments: + - name: --id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. 
+ required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --spot_filter_min_genes + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: --spot_filter_min_counts + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: --gene_filter_min_spots + type: integer + description: Remove genes expressed in less than this number of spots. + required: false + example: 50 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: --remove_mitochondrial + type: boolean + description: Remove mitochondrial genes? + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 600 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + # - name: "--keep_features" + # type: string + # multiple: true + # description: A list of genes to keep.
+ # - name: "--keep_cell_type_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--keep_batch_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--even" + # type: "boolean_true" + # description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run. +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/spatial/zenodo + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: common/extract_metadata +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_zenodo_spatial/main.nf b/src/datasets/workflows/spatial/process_zenodo/main.nf similarity index 99% rename from src/datasets/workflows/process_zenodo_spatial/main.nf rename to src/datasets/workflows/spatial/process_zenodo/main.nf index a5893c0ab4..6343cdc277 100644 --- a/src/datasets/workflows/process_zenodo_spatial/main.nf +++ b/src/datasets/workflows/spatial/process_zenodo/main.nf @@ -49,7 +49,7 @@ workflow run_wf { } // fetch data from legacy openproblems - | zenodo_spatial.run( + | zenodo.run( fromState: [ "input_data": "input_data", "dataset_id": "id", diff --git a/src/datasets/workflows/spatial/process_zenodo_slidetags/config.vsh.yaml 
b/src/datasets/workflows/spatial/process_zenodo_slidetags/config.vsh.yaml new file mode 100644 index 0000000000..2907477c9a --- /dev/null +++ b/src/datasets/workflows/spatial/process_zenodo_slidetags/config.vsh.yaml @@ -0,0 +1,139 @@ +name: process_zenodo_slidetags +namespace: datasets/workflows/spatial +description: | + Download and process slide tags datasets originating from Zenodo. +argument_groups: + - name: Input + arguments: + - name: --input_data + type: string + description: URL to the Anndata file. + required: true + - name: Outputs + arguments: + - name: --output_dataset + type: file + direction: output + description: Output h5ad file + required: true + __merge__: /src/datasets/api/file_spatial_dataset.yaml + - name: --output_meta + direction: output + type: file + description: Dataset metadata + default: dataset_metadata.yaml + - name: Metadata + arguments: + - name: --id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --spot_filter_min_genes + type: integer + description: Remove spots with less than this number of genes. 
+ required: false + example: 200 + - name: --spot_filter_min_counts + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: --gene_filter_min_spots + type: integer + description: Remove genes expressed in less than this number of spots. + required: false + example: 50 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: --remove_mitochondrial + type: boolean + description: Remove mitochondrial genes? + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 600 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + # - name: "--keep_features" + # type: string + # multiple: true + # description: A list of genes to keep. + # - name: "--keep_cell_type_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--keep_batch_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--even" + # type: "boolean_true" + # description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run.
+resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/spatial/zenodo_slidetags + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: common/extract_metadata +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_zenodo_spatial_slidetags/main.nf b/src/datasets/workflows/spatial/process_zenodo_slidetags/main.nf similarity index 98% rename from src/datasets/workflows/process_zenodo_spatial_slidetags/main.nf rename to src/datasets/workflows/spatial/process_zenodo_slidetags/main.nf index 2bb6b9300a..e2f43188a9 100644 --- a/src/datasets/workflows/process_zenodo_spatial_slidetags/main.nf +++ b/src/datasets/workflows/spatial/process_zenodo_slidetags/main.nf @@ -49,7 +49,7 @@ workflow run_wf { } // fetch data from legacy openproblems - | zenodo_spatial_slidetags.run( + | zenodo_slidetags.run( fromState: [ "input_data": "input_data", "dataset_id": "id",