
Comparing changes

base repository: openproblems-bio/openproblems-v2
base: 6cdba41787f894aa4d1ff6f4b8d00d866fefb62e
head repository: openproblems-bio/openproblems-v2
compare: 45e9e434a7c99ac8beaee17af84a37d99a7e41e2
  • 11 commits
  • 116 files changed
  • 4 contributors

Commits on Dec 15, 2023

  1. openproblems_v1: Separate input_id from dataset_id (#311)

    * allow input ids to be different from the dataset ids
    
    * implement changes in the wf
    rcannood authored Dec 15, 2023 (commit 298dcb1)

Commits on Dec 16, 2023

  1. Add SIMLR (#312)

    * add SIMLR dimensionality reduction method
    
    * add description and reference
    
    * add SIMLR reference
    
    * change default n_dim and write output to file
    
    * Add SIMLR entry
    
    * Update documentation URL
    
    Co-authored-by: Kai Waldrant <[email protected]>
    
    * Reformat code
    
    * Use explicit namespaces
    
    ---------
    
    Co-authored-by: Kai Waldrant <[email protected]>
    sainirmayi and KaiWaldrant authored Dec 16, 2023 (commit bdbf261)
  2. rework dataset scripts (#310)

    * rework dataset scripts
    
    * fix scripts
    
    * Update src/tasks/dimensionality_reduction/resources_scripts/run_test.sh
    
    Co-authored-by: Kai Waldrant <[email protected]>
    
    * fix scripts
    
    * change dataset_id to input_id
    
    * update dataset_id to input_id
    
    * Update compute environment in resource scripts
    
    ---------
    
    Co-authored-by: Kai Waldrant <[email protected]>
    rcannood and KaiWaldrant authored Dec 16, 2023 (commit 35e065d)

Commits on Dec 19, 2023

  1. Add Neurips 2021 dataset loader (#309)

    * Add neurips2021 dataset loader
    
    * add test script
    
    * Add process_openproblems_neurips2021_bmmc workflow
    
    * Add resource_test script for processing NeurIPS 2021 BMMC dataset
    
    * Update predict_modality workflow and resource test script
    
    * Update neurips dataset loader
    
    * fix predict_modality to work with new data format
    
    * update neurips2021_bmmc.sh source path
    
    * force ci test
    
    * Add test resource file for openproblems_neurips2021_bmmc
    
    * download full dataset as tempfile
    
    * make fixes to the PM interface
    
    ---------
    
    Co-authored-by: Robrecht Cannoodt <[email protected]>
    KaiWaldrant and rcannood authored Dec 19, 2023 (commit cef0e51)
  2. Commit e233515
  3. Update benchmarking workflows (#313)

    * update denoising
    
    * WIP label_projection
    
    * update denoising process_datasets to store dataset normalization_id
    
    Co-authored-by: sainirmayi <[email protected]>
    
    * fix run_test typo denoising
    
    * fix wf label_proj and denoising
    
    * add label_proj test
    
    * fix label_projection wf
    
    * update dim_red
    
    * update match_modalities
    
    * update process datasets
    
    * update predict_modality
    
    ---------
    
    Co-authored-by: sainirmayi <[email protected]>
    KaiWaldrant and sainirmayi authored Dec 19, 2023 (commit 0a22803)
  4. Add components for extracting dataset info (#315)

    * add dataset comp
    
    * add get_dataset_info to workflow
    
    * commit
    
    * undo changes -- will be addressed in #314
    
    * move get_dataset_info component
    
    * remove unnecessary dependencies
    
    * add component for extracting the dataset info
    
    * fix script
    
    * fix typo
    
    * fix script
    
    * update script
    
    * fix get_dataset_info
    
    ---------
    
    Co-authored-by: Kai Waldrant <[email protected]>
    rcannood and KaiWaldrant authored Dec 19, 2023 (commit 17cc7cf)
  5. Fix dataset info components (#316)

    * simplify get_dataset_info component
    
    * fix script
    
    * change default into example
    
    * fix script
    
    * fix script
    rcannood authored Dec 19, 2023 (commit 4b7c085)
  6. Remove normalization from denoising (#318)

    * remove normalization from denoising task
    
    * remove unused api file
    rcannood authored Dec 19, 2023 (commit 1930eb1)
  7. remove multimodal starter dataset (#317)

    * remove multimodal starter dataset
    
    * fix indentation
    
    ---------
    
    Co-authored-by: Robrecht Cannoodt <[email protected]>
    KaiWaldrant and rcannood authored Dec 19, 2023 (commit 2cf2a73)
  8. update readmes (#319)

    rcannood authored Dec 19, 2023 (commit 45e9e43)
Showing with 2,350 additions and 1,500 deletions.
  1. +2 −0 CHANGELOG.md
  2. +32 −0 src/common/library.bib
  3. +20 −0 src/common/process_task_results/get_dataset_info/config.vsh.yaml
  4. +25 −0 src/common/process_task_results/get_dataset_info/script.R
  5. +0 −4 src/common/process_task_results/get_method_info/config.vsh.yaml
  6. +0 −4 src/common/process_task_results/get_metric_info/config.vsh.yaml
  7. +2 −2 src/common/process_task_results/get_results/script.R
  8. +1 −0 src/common/process_task_results/run/config.vsh.yaml
  9. +1 −2 src/common/process_task_results/run/main.nf
  10. +3 −3 src/common/process_task_results/run/run_nf_tower_test.sh
  11. +2 −2 src/common/process_task_results/run/run_test.sh
  12. +3 −8 src/common/process_task_results/yaml_to_json/script.py
  13. +1 −4 src/common/resources_test_scripts/task_metadata.sh
  14. +70 −0 src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml
  15. +106 −0 src/datasets/loaders/openproblems_neurips2021_bmmc/script.py
  16. +70 −0 src/datasets/loaders/openproblems_neurips2021_bmmc/test.py
  17. +6 −2 src/datasets/loaders/openproblems_v1/config.vsh.yaml
  18. +1 −1 src/datasets/loaders/openproblems_v1/script.py
  19. +5 −3 src/datasets/loaders/openproblems_v1/test.py
  20. +6 −2 src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml
  21. +1 −1 src/datasets/loaders/openproblems_v1_multimodal/script.py
  22. +6 −4 src/datasets/loaders/openproblems_v1_multimodal/test.py
  23. +126 −6 src/datasets/resource_scripts/cellxgene_census.sh
  24. +32 −0 src/datasets/resource_scripts/cellxgene_census_test.sh
  25. +0 −145 src/datasets/resource_scripts/cellxgene_census_tower.sh
  26. +54 −0 src/datasets/resource_scripts/dataset_info.sh
  27. +55 −0 src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh
  28. +37 −28 src/datasets/resource_scripts/openproblems_v1.sh
  29. +14 −19 src/datasets/resource_scripts/openproblems_v1_multimodal.sh
  30. +0 −58 src/datasets/resource_scripts/openproblems_v1_multimodal_nf_tower.sh
  31. +45 −0 src/datasets/resource_scripts/openproblems_v1_multimodal_test.sh
  32. +0 −154 src/datasets/resource_scripts/openproblems_v1_nf_tower.sh
  33. +51 −0 src/datasets/resource_scripts/openproblems_v1_test.sh
  34. +0 −87 src/datasets/resource_test_scripts/bmmc_x_starter.sh
  35. +58 −0 src/datasets/resource_test_scripts/neurips2021_bmmc.sh
  36. +34 −0 src/datasets/workflows/extract_dataset_info/config.vsh.yaml
  37. +57 −0 src/datasets/workflows/extract_dataset_info/main.nf
  38. +32 −0 src/datasets/workflows/extract_dataset_info/run_test.sh
  39. +135 −0 src/datasets/workflows/process_openproblems_neurips2021_bmmc/config.vsh.yaml
  40. +171 −0 src/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf
  41. +5 −1 src/datasets/workflows/process_openproblems_v1/config.vsh.yaml
  42. +2 −1 src/datasets/workflows/process_openproblems_v1/main.nf
  43. +5 −1 src/datasets/workflows/process_openproblems_v1_multimodal/config.vsh.yaml
  44. +2 −1 src/datasets/workflows/process_openproblems_v1_multimodal/main.nf
  45. +0 −25 src/tasks/batch_integration/nf_tower_scripts/process_datasets.sh
  46. +0 −27 src/tasks/batch_integration/nf_tower_scripts/run_benchmark.sh
  47. +28 −22 src/tasks/batch_integration/resources_scripts/process_datasets.sh
  48. +22 −26 src/tasks/batch_integration/resources_scripts/run_benchmark.sh
  49. +5 −5 src/tasks/batch_integration/{nf_tower_scripts → resources_scripts}/run_test.sh
  50. +3 −1 src/tasks/denoising/README.md
  51. +0 −16 src/tasks/denoising/api/file_dataset.yaml
  52. +0 −25 src/tasks/denoising/nf_tower_scripts/process_datasets.sh
  53. +0 −28 src/tasks/denoising/nf_tower_scripts/run_benchmark.sh
  54. +29 −22 src/tasks/denoising/resources_scripts/process_datasets.sh
  55. +24 −16 src/tasks/denoising/resources_scripts/run_benchmark.sh
  56. +5 −4 src/tasks/denoising/{nf_tower_scripts → resources_scripts}/run_test.sh
  57. +25 −3 src/tasks/denoising/workflows/run_benchmark/config.vsh.yaml
  58. +55 −32 src/tasks/denoising/workflows/run_benchmark/main.nf
  59. +4 −4 src/tasks/denoising/workflows/run_benchmark/run_test.sh
  60. +3 −1 src/tasks/dimensionality_reduction/README.md
  61. +57 −0 src/tasks/dimensionality_reduction/methods/simlr/config.vsh.yaml
  62. +67 −0 src/tasks/dimensionality_reduction/methods/simlr/script.R
  63. +0 −25 src/tasks/dimensionality_reduction/nf_tower_scripts/process_datasets.sh
  64. +0 −27 src/tasks/dimensionality_reduction/nf_tower_scripts/run_benchmark.sh
  65. +29 −20 src/tasks/dimensionality_reduction/resources_scripts/process_datasets.sh
  66. +23 −26 src/tasks/dimensionality_reduction/resources_scripts/run_benchmark.sh
  67. +5 −4 src/tasks/dimensionality_reduction/{nf_tower_scripts → resources_scripts}/run_test.sh
  68. +25 −2 src/tasks/dimensionality_reduction/workflows/run_benchmark/config.vsh.yaml
  69. +63 −19 src/tasks/dimensionality_reduction/workflows/run_benchmark/main.nf
  70. +2 −2 src/tasks/dimensionality_reduction/workflows/run_benchmark/run_test.sh
  71. +0 −25 src/tasks/label_projection/nf_tower_scripts/process_datasets.sh
  72. +0 −25 src/tasks/label_projection/nf_tower_scripts/run_benchmark.sh
  73. +29 −22 src/tasks/label_projection/resources_scripts/process_datasets.sh
  74. +22 −27 src/tasks/label_projection/resources_scripts/run_benchmark.sh
  75. +25 −3 src/tasks/label_projection/workflows/run_benchmark/config.vsh.yaml
  76. +58 −18 src/tasks/label_projection/workflows/run_benchmark/main.nf
  77. +31 −0 src/tasks/label_projection/workflows/run_benchmark/run_test.sh
  78. +0 −25 src/tasks/match_modalities/nf_tower_scripts/process_datasets.sh
  79. +0 −25 src/tasks/match_modalities/nf_tower_scripts/run_benchmark.sh
  80. +29 −22 src/tasks/match_modalities/resources_scripts/process_datasets.sh
  81. +22 −27 src/tasks/match_modalities/resources_scripts/run_benchmark.sh
  82. +25 −3 src/tasks/match_modalities/workflows/run_benchmark/config.vsh.yaml
  83. +62 −19 src/tasks/match_modalities/workflows/run_benchmark/main.nf
  84. +31 −0 src/tasks/match_modalities/workflows/run_benchmark/run_test.sh
  85. +34 −24 src/tasks/predict_modality/README.md
  86. +2 −2 src/tasks/predict_modality/api/comp_control_method.yaml
  87. +2 −2 src/tasks/predict_modality/api/comp_method.yaml
  88. +2 −2 src/tasks/predict_modality/api/comp_metric.yaml
  89. +4 −4 src/tasks/predict_modality/api/comp_process_dataset.yaml
  90. +29 −5 src/tasks/predict_modality/api/file_common_dataset_other_mod.yaml
  91. +5 −5 src/tasks/predict_modality/api/file_common_dataset_rna.yaml
  92. +0 −67 src/tasks/predict_modality/api/file_dataset_other_mod.yaml
  93. +0 −43 src/tasks/predict_modality/api/file_dataset_rna.yaml
  94. +1 −1 src/tasks/predict_modality/api/file_prediction.yaml
  95. +1 −1 src/tasks/predict_modality/api/file_score.yaml
  96. +1 −1 src/tasks/predict_modality/api/file_test_mod1.yaml
  97. +1 −1 src/tasks/predict_modality/api/file_test_mod2.yaml
  98. +1 −1 src/tasks/predict_modality/api/file_train_mod1.yaml
  99. +1 −1 src/tasks/predict_modality/api/file_train_mod2.yaml
  100. +4 −4 src/tasks/predict_modality/control_methods/meanpergene/script.py
  101. +3 −3 src/tasks/predict_modality/control_methods/random_predict/script.R
  102. +1 −1 src/tasks/predict_modality/control_methods/solution/script.R
  103. +3 −3 src/tasks/predict_modality/control_methods/zeros/script.py
  104. +3 −3 src/tasks/predict_modality/methods/knnr_py/script.py
  105. +1 −3 src/tasks/predict_modality/methods/newwave_knnr/script.R
  106. +3 −3 src/tasks/predict_modality/metrics/correlation/script.R
  107. +3 −3 src/tasks/predict_modality/metrics/mse/script.py
  108. +0 −25 src/tasks/predict_modality/nf_tower_scripts/process_datasets.sh
  109. +0 −27 src/tasks/predict_modality/nf_tower_scripts/run_benchmark.sh
  110. +25 −21 src/tasks/predict_modality/process_dataset/script.R
  111. +22 −22 src/tasks/predict_modality/resources_scripts/process_datasets.sh
  112. +23 −26 src/tasks/predict_modality/resources_scripts/run_benchmark.sh
  113. +7 −7 src/tasks/predict_modality/resources_test_scripts/{bmmc_x_starter.sh → neurips2021_bmmc.sh}
  114. +25 −3 src/tasks/predict_modality/workflows/run_benchmark/config.vsh.yaml
  115. +54 −17 src/tasks/predict_modality/workflows/run_benchmark/main.nf
  116. +3 −3 src/tasks/predict_modality/workflows/run_benchmark/run_test.sh
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -307,6 +307,8 @@

* `metrics/trustworthiness` should be removed because it is already included in `metrics/coranking`.

* `methods/simlr`: Added new SIMLR method.


## match_modalities (PR #201)

32 changes: 32 additions & 0 deletions src/common/library.bib
@@ -964,6 +964,18 @@ @article{nestorowa2016single
url = {https://doi.org/10.1182/blood-2016-05-716480}
}

@inproceedings{neurips,
author = {Luecken, Malte and Burkhardt, Daniel and Cannoodt, Robrecht and Lance, Christopher and Agrawal, Aditi and Aliee, Hananeh and Chen, Ann and Deconinck, Louise and Detweiler, Angela and Granados, Alejandro and Huynh, Shelly and Isacco, Laura and Kim, Yang and Klein, Dominik and DE KUMAR, BONY and Kuppasani, Sunil and Lickert, Heiko and McGeever, Aaron and Melgarejo, Joaquin and Mekonen, Honey and Morri, Maurizio and M\"{u}ller, Michaela and Neff, Norma and Paul, Sheryl and Rieck, Bastian and Schneider, Kaylie and Steelman, Scott and Sterr, Michael and Treacy, Daniel and Tong, Alexander and Villani, Alexandra-Chloe and Wang, Guilin and Yan, Jia and Zhang, Ce and Pisco, Angela and Krishnaswamy, Smita and Theis, Fabian and Bloom, Jonathan M},
booktitle = {Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks},
editor = {J. Vanschoren and S. Yeung},
pages = {},
publisher = {Curran},
title = {A sandbox for prediction and integration of DNA, RNA, and proteins in single cells},
url = {https://datasets-benchmarks-proceedings.neurips.cc/paper_files/paper/2021/file/158f3069a435b314a80bdcb024f8e422-Paper-round2.pdf},
volume = {1},
year = {2021}
}


@string{nov = {Nov.}}
@@ -1348,6 +1360,26 @@ @article{wang2013target
}


@article{wang2017visualization,
title = {Visualization and analysis of single-cell {RNA}-seq data by kernel-based similarity learning},
volume = {14},
copyright = {2017 Springer Nature America, Inc.},
issn = {1548-7105},
url = {https://www.nature.com/articles/nmeth.4207},
doi = {10.1038/nmeth.4207},
abstract = {The SIMLR software identifies similarities between cells across a range of single-cell RNA-seq data, enabling effective dimension reduction, clustering and visualization.},
language = {en},
number = {4},
journal = {Nature Methods},
author = {Wang, Bo and Zhu, Junjie and Pierson, Emma and Ramazzotti, Daniele and Batzoglou, Serafim},
month = apr,
year = {2017},
publisher = {Nature Publishing Group},
keywords = {Gene expression, Genome informatics, Machine learning, Statistical methods},
pages = {414--416},
}


@article{welch2019single,
title = {Single-Cell Multi-omic Integration Compares and Contrasts Features of Brain Cell Identity},
author = {Joshua D. Welch and Velina Kozareva and Ashley Ferreira and Charles Vanderburg and Carly Martin and Evan Z. Macosko},
20 changes: 20 additions & 0 deletions src/common/process_task_results/get_dataset_info/config.vsh.yaml
@@ -0,0 +1,20 @@
__merge__: ../api/get_info.yaml
functionality:
  name: "get_dataset_info"
  description: "Extract dataset info and convert to expected format for website results"
  resources:
    - type: r_script
      path: script.R
  test_resources:
    - type: file
      path: /resources_test/common/task_metadata/dataset_info.yaml
      dest: test_file.yaml
platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_r:1.0.2
    setup:
      - type: r
        cran: [ yaml, jsonlite ]
  - type: nextflow
    directives:
      label: [lowmem, lowtime, lowcpu]
25 changes: 25 additions & 0 deletions src/common/process_task_results/get_dataset_info/script.R
@@ -0,0 +1,25 @@
requireNamespace("jsonlite", quietly = TRUE)
requireNamespace("yaml", quietly = TRUE)

## VIASH START
par <- list(
  input = "resources_test/common/task_metadata/dataset_info.yaml",
  output = "output/dataset_info.json"
)
## VIASH END

datasets <- yaml::yaml.load_file(par$input)

# transform into format expected by website
datasets_formatted <- lapply(datasets, function(dataset) {
  dataset$data_url <- dataset$dataset_url
  dataset$data_reference <- dataset$dataset_reference
  dataset
})

jsonlite::write_json(
  datasets_formatted,
  par$output,
  auto_unbox = TRUE,
  pretty = TRUE
)
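For orientation, a component defined this way can normally be executed on its own with the viash CLI. The call below is a hedged sketch, not a command taken from this changeset: it assumes viash is installed and that the test resource referenced in the config is available locally.

# Hypothetical local run of the new get_dataset_info component (paths are illustrative)
viash run src/common/process_task_results/get_dataset_info/config.vsh.yaml -- \
  --input resources_test/common/task_metadata/dataset_info.yaml \
  --output output/dataset_info.json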
4 changes: 0 additions & 4 deletions src/common/process_task_results/get_method_info/config.vsh.yaml
@@ -15,10 +15,6 @@ platforms:
    setup:
      - type: r
        cran: [ purrr, dplyr, yaml, rlang, processx ]
      - type: apt
        packages: [ curl, default-jdk ]
      - type: docker
        run: "curl -fsSL dl.viash.io | bash && mv viash /usr/bin/viash"
  - type: nextflow
    directives:
      label: [lowmem, lowtime, lowcpu]
4 changes: 0 additions & 4 deletions src/common/process_task_results/get_metric_info/config.vsh.yaml
@@ -15,10 +15,6 @@ platforms:
    setup:
      - type: r
        cran: [ purrr, dplyr, yaml, rlang, processx ]
      - type: apt
        packages: [ curl, default-jdk ]
      - type: docker
        run: "curl -fsSL dl.viash.io | bash && mv viash /usr/bin/viash"
  - type: nextflow
    directives:
      label: [lowmem, lowtime, lowcpu]
4 changes: 2 additions & 2 deletions src/common/process_task_results/get_results/script.R
@@ -3,8 +3,8 @@ library(rlang)

## VIASH START
par <- list(
  input_scores = "output/v2/batch_integration/scores.yaml",
  input_execution = "output/v2/batch_integration/trace.txt",
  input_scores = "resources/batch_integration/results/scores.yaml",
  input_execution = "resources/batch_integration/results/trace.txt",
  output = "test.json"
)
## VIASH END
1 change: 1 addition & 0 deletions src/common/process_task_results/run/config.vsh.yaml
@@ -79,6 +79,7 @@ functionality:
    - name: common/process_task_results/get_results
    - name: common/process_task_results/get_method_info
    - name: common/process_task_results/get_metric_info
    - name: common/process_task_results/get_dataset_info
    - name: common/process_task_results/yaml_to_json
platforms:
  - type: nextflow
3 changes: 1 addition & 2 deletions src/common/process_task_results/run/main.nf
@@ -34,8 +34,7 @@ workflow run_wf {
}
)

| yaml_to_json.run(
key: "dataset_info",
| get_dataset_info.run(
fromState: [
"input": "input_dataset_info",
"output": "output_dataset_info"
6 changes: 3 additions & 3 deletions src/common/process_task_results/run/run_nf_tower_test.sh
@@ -1,9 +1,9 @@
#!/bin/bash

DATASETS_DIR="s3://openproblems-nextflow/output/v2/batch_integration"
DATASETS_DIR="s3://openproblems-data/resources/batch_integration/results/"

# try running on nf tower
cat > /tmp/params.yaml << HERE
cat > /tmp/params.yaml << 'HERE'
id: batch_integration_transform
input_scores: "$DATASETS_DIR/scores.yaml"
input_dataset_info: "$DATASETS_DIR/dataset_info.yaml"
@@ -33,6 +33,6 @@ tw launch https://github.com/openproblems-bio/openproblems-v2.git \
--pull-latest \
--main-script target/nextflow/common/workflows/transform_meta/main.nf \
--workspace 53907369739130 \
--compute-env 7IkB9ckC81O0dgNemcPJTD \
--compute-env 1pK56PjjzeraOOC2LDZvN2 \
--params-file /tmp/params.yaml \
--config /tmp/nextflow.config
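One detail of the diff above worth spelling out: quoting the heredoc delimiter (<< 'HERE' instead of << HERE) disables parameter expansion inside the heredoc, so $DATASETS_DIR is written to /tmp/params.yaml as a literal string rather than being substituted by the shell. A minimal standalone bash illustration of the difference (not part of this changeset):

#!/bin/bash
NAME="world"

# Unquoted delimiter: the shell expands variables inside the heredoc
cat << EOF
hello $NAME
EOF
# prints: hello world

# Quoted delimiter: the heredoc body is taken literally, no expansion
cat << 'EOF'
hello $NAME
EOF
# prints: hello $NAME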
4 changes: 2 additions & 2 deletions src/common/process_task_results/run/run_test.sh
@@ -10,8 +10,8 @@ set -e

# export TOWER_WORKSPACE_ID=53907369739130

DATASETS_DIR="output/v2/batch_integration"
OUTPUT_DIR="/home/kai/Documents/openroblems/website/results/batch_integration_feature/data"
DATASETS_DIR="resources/batch_integration/results"
OUTPUT_DIR="../website/results/batch_integration_feature/data"

if [ ! -d "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
11 changes: 3 additions & 8 deletions src/common/process_task_results/yaml_to_json/script.py
@@ -1,21 +1,16 @@
from os import path
import yaml
import json

## VIASH START
par = {
    "input" : ".",
    "task_id" : "denoising",
    "input": ".",
    "task_id": "denoising",
    "output": "output/task.json",

}
meta = { "functionality" : "foo" }

## VIASH END

with open(par["input"], "r") as f:
    yaml_file = yaml.safe_load(f)


with open(par["output"], "w") as out:
    json.dump(yaml_file, out, indent=2)
    json.dump(yaml_file, out, indent=2)
5 changes: 1 addition & 4 deletions src/common/resources_test_scripts/task_metadata.sh
@@ -128,9 +128,6 @@ nextflow run . \
-entry auto \
--input_states "$DATASETS_DIR/**/state.yaml" \
--rename_keys 'input_dataset:output_dataset,input_solution:output_solution' \
--settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml"}' \
--settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \
--publish_dir "$OUTPUT_DIR" \
--output_state "state.yaml"

# Copy task info
cp src/tasks/batch_integration/api/task_info.yaml "$OUTPUT_DIR/task_info.yaml"
70 changes: 70 additions & 0 deletions src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml
@@ -0,0 +1,70 @@
functionality:
  name: "openproblems_neurips2021_bmmc"
  namespace: "datasets/loaders"
  description: "Fetch a dataset from the OpenProblems NeurIPS2021 competition"
  argument_groups:
    - name: Inputs
      arguments:
        - name: "--input"
          type: file
          description: Processed h5ad file published at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE194122.
          required: true
          example: GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad
        - name: "--mod1"
          type: string
          description: Name of the first modality.
          required: true
          example: GEX
        - name: "--mod2"
          type: string
          description: Name of the second modality.
          required: true
          example: ADT
    - name: Metadata
      arguments:
        - name: "--dataset_name"
          type: string
          description: Nicely formatted name.
          required: true
        - name: "--dataset_url"
          type: string
          description: Link to the original source of the dataset.
          required: false
        - name: "--dataset_reference"
          type: string
          description: Bibtex reference of the paper in which the dataset was published.
          required: false
        - name: "--dataset_summary"
          type: string
          description: Short description of the dataset.
          required: true
        - name: "--dataset_description"
          type: string
          description: Long description of the dataset.
          required: true
        - name: "--dataset_organism"
          type: string
          description: The organism of the dataset.
          required: false
    - name: Outputs
      arguments:
        - name: "--output_mod1"
          __merge__: ../../api/file_raw.yaml
          direction: "output"
        - name: "--output_mod2"
          __merge__: ../../api/file_raw.yaml
          direction: "output"
  resources:
    - type: python_script
      path: script.py
  test_resources:
    - type: python_script
      path: test.py
    - type: file
      path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad
platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_python:1.0.2
  - type: nextflow
    directives:
      label: [ highmem, midcpu , midtime]
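As a rough sketch of how this loader could be exercised locally via viash (all values below are placeholders derived from the argument examples in the config; the output filenames are assumptions, not part of this changeset):

# Hypothetical invocation of the NeurIPS 2021 BMMC loader
viash run src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml -- \
  --input GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad \
  --mod1 GEX \
  --mod2 ADT \
  --dataset_name "OpenProblems NeurIPS2021 CITE-seq BMMC" \
  --dataset_summary "BMMC CITE-seq data from the NeurIPS 2021 multimodal competition" \
  --dataset_description "Processed CITE-seq BMMC dataset (GSE194122) used in the NeurIPS 2021 benchmarking competition" \
  --output_mod1 raw_mod1.h5ad \
  --output_mod2 raw_mod2.h5ad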