diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index be5d319748..804a4cbabb 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -6,121 +6,16 @@ on:
   workflow_dispatch:
     inputs:
       version:
-        description: Version tag
-        required: true
+        description: |
+          The version of the project to build. Example: `1.0.3`.
+
+          If not provided, a development build with a version name
+          based on the branch name will be built. Otherwise, a release
+          build with the provided version will be built.
+        required: false
 
 jobs:
-  # phase 1
-  list:
-    runs-on: ubuntu-latest
-
-    outputs:
-      target_branch: ${{ steps.variables.outputs.target_branch }}
-      version: ${{ steps.variables.outputs.version }}
-      docker_matrix: ${{ steps.ns_list_docker.outputs.output_matrix }}
-
-    steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
-        with:
-          submodules: 'recursive'
-          fetch-depth: 0
-
-      - uses: viash-io/viash-actions/setup@v6
-
-      - name: Determine variables
-        id: variables
-        run: |
-          VERSION="${{ inputs.version }}"
-          SOURCE_BRANCH=$(echo "$GITHUB_REF" | sed 's/refs\/heads\///')
-
-          if [[ -z $VERSION ]]; then
-            TARGET_BRANCH="build/$SOURCE_BRANCH"
-            VERSION=$(echo "$TARGET_BRANCH" | sed 's/[^a-zA-Z0-9_]/_/g')
-          else
-            if [[ ! "$VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+.*$ ]]; then
-              echo "Version '$VERSION' does not match PEP440"
-              exit 1
-            fi
-            TARGET_BRANCH="release/${VERSION%.*}.x"
-          fi
-
-          echo "Set version of Viash package to '$VERSION'"
-          echo "version=$VERSION" >> $GITHUB_OUTPUT
-
-          echo "Set target branch to '$TARGET_BRANCH'"
-          echo "target_branch=$TARGET_BRANCH" >> $GITHUB_OUTPUT
-
-      - name: Remove target folder from .gitignore
-        run: |
-          # allow publishing the target folder
-          sed -i 's#^/target/$##g' .gitignore
-
-      - uses: viash-io/viash-actions/ns-build@v6
-        with:
-          config_mod: .functionality.version := '${{ steps.variables.outputs.version }}'
-          parallel: true
-
-      - name: Deploy to target branch
-        uses: peaceiris/actions-gh-pages@v4
-        with:
-          github_token: ${{ secrets.GITHUB_TOKEN }}
-          publish_dir: .
-          publish_branch: ${{ steps.variables.outputs.target_branch }}
-
-      - id: ns_list_docker
-        uses: viash-io/viash-actions/ns-list@v6
-        with:
-          platform: docker
-          src: src
-          format: json
-
-  # phase 2
   build:
-    needs: list
-
-    runs-on: ubuntu-latest
-
-    permissions:
-      contents: read
-      packages: write
-
-    strategy:
-      fail-fast: false
-      matrix:
-        component: ${{ fromJson(needs.list.outputs.docker_matrix) }}
-
-    steps:
-      # Remove unnecessary files to free up space. Otherwise, we get 'no space left on device.'
-      - uses: data-intuitive/reclaim-the-bytes@v2
-
-      - uses: actions/checkout@v4
-        with:
-          submodules: 'recursive'
-          fetch-depth: 0
-          ref: ${{ needs.list.outputs.target_branch }}
-
-      - uses: viash-io/viash-actions/setup@v6
-
-      - name: Build container
-        uses: viash-io/viash-actions/ns-build@v6
-        with:
-          config_mod: .functionality.version := '${{ needs.list.outputs.version }}'
-          platform: docker
-          src: ${{ matrix.component.dir }}
-          setup: build
-
-      - name: Login to container registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Push container
-        uses: viash-io/viash-actions/ns-build@v6
-        with:
-          config_mod: .functionality.version := '${{ needs.list.outputs.version }}'
-          platform: docker
-          src: ${{ matrix.component.dir }}
-          setup: push
\ No newline at end of file
+    uses: viash-io/viash-actions/.github/workflows/build.yaml@v6
+    with:
+      version: ${{ github.event.inputs.version }}
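
# Note (editor's sketch, not part of the patch): with workflow_dispatch now
# taking an optional `version` input, a release build could be dispatched
# manually. Assuming the GitHub CLI (`gh`) is available and authenticated:
gh workflow run build.yml -f version=1.0.3
# Omitting `-f version=...` would instead produce a development build named
# after the branch, as described in the input's help text above.
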
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5d1582d854..1b8d1db6ae 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -1,88 +1,11 @@
 name: Test
 
 on:
-  pull_request:
   push:
-    branches: [ main ]
+    branches:
+      - main
+  pull_request:
 
 jobs:
-
-  # phase 1
-  list:
-    env:
-      s3_bucket: s3://openproblems-data/resources_test/
-    runs-on: ubuntu-latest
-
-    outputs:
-      matrix: ${{ steps.set_matrix.outputs.matrix }}
-      cache_key: ${{ steps.cache.outputs.cache_key }}
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - uses: viash-io/viash-actions/setup@v6
-
-      - uses: viash-io/viash-actions/project/sync-and-cache-s3@v6
-        id: cache
-        with:
-          s3_bucket: $s3_bucket
-          dest_path: resources_test
-          cache_key_prefix: resources_test__
-
-      - id: ns_list
-        uses: viash-io/viash-actions/ns-list@v6
-        with:
-          platform: docker
-          format: json
-
-      - id: ns_list_filtered
-        uses: viash-io/viash-actions/project/detect-changed-components@v6
-        with:
-          input_file: "${{ steps.ns_list.outputs.output_file }}"
-
-      - id: set_matrix
-        run: |
-          echo "matrix=$(jq -c '[ .[] |
-            {
-              "name": (.functionality.namespace + "/" + .functionality.name),
-              "config": .info.config
-            }
-          ]' ${{ steps.ns_list_filtered.outputs.output_file }} )" >> $GITHUB_OUTPUT
-
-  # phase 2
-  viash_test:
-    needs: list
-    if: ${{ needs.list.outputs.matrix != '[]' && needs.list.outputs.matrix != '' }}
-    runs-on: ubuntu-latest
-
-    strategy:
-      fail-fast: false
-      matrix:
-        component: ${{ fromJson(needs.list.outputs.matrix) }}
-
-    steps:
-      # Remove unnecessary files to free up space. Otherwise, we get 'no space left on device.'
-      - uses: data-intuitive/reclaim-the-bytes@v2
-
-      - uses: actions/checkout@v4
-
-      - uses: viash-io/viash-actions/setup@v6
-
-      # use cache
-      - name: Cache resources data
-        uses: actions/cache@v4
-        timeout-minutes: 10
-        with:
-          path: resources_test
-          key: ${{ needs.list.outputs.cache_key }}
-
-      - name: Run test
-        timeout-minutes: 30
-        run: |
-          VIASH_TEMP=$RUNNER_TEMP/viash viash test \
-            "${{ matrix.component.config }}" \
-            --cpus 2 \
-            --memory "5gb"
-
+  test:
+    uses: viash-io/viash-actions/.github/workflows/test.yaml@v6
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000..c07c083aea
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "common"]
+	path = common
+	url = https://github.com/openproblems-bio/common_resources.git
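
# Note (editor's sketch): since `common` is registered as a git submodule
# above, a fresh clone needs it initialised before the scripts below can use
# `common/scripts/...`:
git submodule update --init --recursive
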
diff --git a/.vscode/settings.json b/.vscode/settings.json
index e662fc6472..7695e2a406 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,10 +1,4 @@
 {
   "yaml.schemas": {
-    "src/common/schemas/api_component.yaml": "src/**/api/comp_*.yaml",
-    "src/common/schemas/api_file.yaml": "src/**/api/file_*.yaml",
-    "src/common/schemas/task_info.yaml": "src/**/api/task_info.yaml",
-    "src/common/schemas/task_method.yaml": "src/tasks/**/methods/**/config.vsh.yaml",
-    "src/common/schemas/task_control_method.yaml": "src/tasks/**/control_methods/**/config.vsh.yaml",
-    "src/common/schemas/task_metric.yaml": "src/tasks/**/metrics/**/config.vsh.yaml"
   }
 }
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9d1d8a62ea..276eb63a85 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,21 +2,25 @@
 
 ## Breaking changes
 
-- Moved `src/tasks/batch_integration` to [`task_batch_integration`](https://github.com/openproblems-bio/task_batch_integration).
+- Moved `src/tasks/batch_integration` to [`task_batch_integration`](https://github.com/openproblems-bio/task_batch_integration) (PR #910).
 
-- Moved `src/tasks/denoising` to [`task_denoising`](https://github.com/openproblems-bio/task_denoising).
+- Moved `src/tasks/denoising` to [`task_denoising`](https://github.com/openproblems-bio/task_denoising) (PR #910).
 
-- Moved `src/tasks/dimensionality_reduction` to [`task_dimensionality_reduction`](https://github.com/openproblems-bio/task_dimensionality_reduction).
+- Moved `src/tasks/dimensionality_reduction` to [`task_dimensionality_reduction`](https://github.com/openproblems-bio/task_dimensionality_reduction) (PR #910).
 
-- Moved `src/tasks/label_projection` to [`task_label_projection`](https://github.com/openproblems-bio/task_label_projection).
+- Moved `src/tasks/label_projection` to [`task_label_projection`](https://github.com/openproblems-bio/task_label_projection) (PR #910).
 
-- Moved `src/tasks/match_modalities` to [`task_match_modalities`](https://github.com/openproblems-bio/task_match_modalities).
+- Moved `src/tasks/match_modalities` to [`task_match_modalities`](https://github.com/openproblems-bio/task_match_modalities) (PR #910).
 
-- Moved `src/tasks/predict_modality` to [`task_predict_modality`](https://github.com/openproblems-bio/task_predict_modality).
+- Moved `src/tasks/predict_modality` to [`task_predict_modality`](https://github.com/openproblems-bio/task_predict_modality) (PR #910).
 
-- Moved `src/tasks/spatial_decomposition` to [`task_spatial_decomposition`](https://github.com/openproblems-bio/task_spatial_decomposition).
+- Moved `src/tasks/spatial_decomposition` to [`task_spatial_decomposition`](https://github.com/openproblems-bio/task_spatial_decomposition) (PR #910).
 
-- Moved `src/tasks/spatially_variable_genes` to [`task_spatially_variable_genes`](https://github.com/openproblems-bio/task_spatially_variable_genes).
+- Moved `src/tasks/spatially_variable_genes` to [`task_spatially_variable_genes`](https://github.com/openproblems-bio/task_spatially_variable_genes) (PR #910).
+
+## Major changes
+
+- Update Viash to 0.9.0 (PR #911).
 
 ## Minor changes
diff --git a/_viash.yaml b/_viash.yaml
index de6a7af122..90409c8f69 100644
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -1,14 +1,35 @@
-viash_version: 0.8.6
+name: openproblems
+version: dev
+organization: openproblems-bio
+viash_version: 0.9.0
 
-source: src
-target: target
+description: |
+  Open Problems is a living, extensible, community-guided benchmarking platform.
+license: MIT
+keywords: [openproblems, benchmarking, single-cell omics]
+
+references:
+  doi:
+    # Malte Luecken, Scott Gigante, Daniel Burkhardt, Robrecht Cannoodt, et al.
+    # Defining and benchmarking open problems in single-cell analysis,
+    # 03 April 2024, PREPRINT (Version 1) available at Research Square [https://doi.org/10.21203/rs.3.rs-4181617/v1]
+    - 10.21203/rs.3.rs-4181617/v1
+
+links:
+  issue_tracker: https://github.com/openproblems-bio/openproblems/issues
+  repository: https://github.com/openproblems-bio/openproblems
+  docker_registry: ghcr.io
 
 config_mods: |
-  .functionality.version := 'dev'
-  .platforms[.type == 'docker'].target_registry := 'ghcr.io'
-  .platforms[.type == 'docker'].target_organization := 'openproblems-bio/openproblems'
-  .platforms[.type == 'docker'].target_image_source := 'https://github.com/openproblems-bio/openproblems'
-  .platforms[.type == "nextflow"].directives.tag := "$id"
-  .platforms[.type == "nextflow"].auto.simplifyOutput := false
-  .platforms[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
-  .platforms[.type == "nextflow"].config.script := "process.errorStrategy = 'ignore'"
\ No newline at end of file
+  .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
+  .runners[.type == "nextflow"].config.script := "process.errorStrategy = 'ignore'"
+
+
+info:
+  test_resources:
+    - type: s3
+      path: s3://openproblems-data/resources_test/common
+      dest: resources_test/common
+    - type: s3
+      path: s3://openproblems-data/resources_test/openproblems
+      dest: resources_test/openproblems
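
# Note (editor's sketch): the `info.test_resources` entries above pair an S3
# path with a local destination. Syncing them by hand would look roughly like
# this (bucket paths taken from the config above; flags may vary):
aws s3 sync s3://openproblems-data/resources_test/common resources_test/common --delete
aws s3 sync s3://openproblems-data/resources_test/openproblems resources_test/openproblems --delete
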
">> Fetch results in v1 format" +mkdir -p "$OUT_DIR/data/" +TMPDIR=$(mktemp -d) + +wget https://github.com/openproblems-bio/website/archive/refs/tags/v2.3.6.zip -O "$TMPDIR/website-v2.3.6.zip" +unzip "$TMPDIR/website-v2.3.6.zip" -d "$TMPDIR" +cp -r "$TMPDIR/website-2.3.6/results/batch_integration_embed/data/" "$OUT_DIR/processed" + +echo ">> Uploading results to S3" +aws s3 sync --profile op \ + "resources_test/openproblems/task_results_v1/" \ + "s3://openproblems-data/resources_test/openproblems/task_results_v1/" \ + --delete --dryrun diff --git a/scripts/create_resources/task_results_v2.sh b/scripts/create_resources/task_results_v2.sh new file mode 100755 index 0000000000..2df4230e21 --- /dev/null +++ b/scripts/create_resources/task_results_v2.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +OUT_DIR="resources_test/openproblems/task_results_v2" + +echo ">> Fetch results in v2 format" +aws s3 sync \ + s3://openproblems-data/resources/batch_integration/results/run_2024-06-28_13-20-27/ \ + "$OUT_DIR/raw/" \ + --delete + +echo ">> Process results" +nextflow run openproblems-bio/openproblems \ + -r v2.0.0 \ + -main-script "target/nextflow/common/process_task_results/run/main.nf" \ + -profile docker \ + --input_scores "$OUT_DIR/raw/score_uns.yaml" \ + --input_method_configs "$OUT_DIR/raw/method_configs.yaml" \ + --input_metric_configs "$OUT_DIR/raw/metric_configs.yaml" \ + --input_dataset_info "$OUT_DIR/raw/dataset_uns.yaml" \ + --input_execution "$OUT_DIR/raw/trace.txt" \ + --input_task_info "$OUT_DIR/raw/task_info.yaml" \ + --output_state "state.yaml" \ + --publish_dir "$OUT_DIR/processed/" + +echo ">> Uploading results to S3" +aws s3 sync --profile op \ + "resources_test/openproblems/task_results_v2/" \ + "s3://openproblems-data/resources_test/openproblems/task_results_v2/" \ + --delete --dryrun + +aws s3 sync --profile op \ + "resources_test/openproblems/" \ + "s3://openproblems-data/resources_test/openproblems/" \ + --delete --dryrun \ No newline at end of file diff --git a/scripts/create_resources/task_results_v3.sh b/scripts/create_resources/task_results_v3.sh new file mode 100755 index 0000000000..a21aeb66d4 --- /dev/null +++ b/scripts/create_resources/task_results_v3.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +OUT_DIR="resources_test/openproblems/task_results_v3" + +echo ">> Fetch results in v3 format" +aws s3 sync \ + s3://openproblems-data/resources/task_batch_integration/results/run_2024-09-29_08-33-24/ \ + "$OUT_DIR/raw/" \ + --delete + +echo ">> Process results" +nextflow run . 
diff --git a/scripts/sync_resources.sh b/scripts/sync_resources.sh
index 76e88e4a04..20b87e7d36 100755
--- a/scripts/sync_resources.sh
+++ b/scripts/sync_resources.sh
@@ -2,4 +2,4 @@
 
 set -e
 
-viash run src/common/sync_test_resources/config.vsh.yaml
+common/scripts/sync_resources
diff --git a/src/common/check_dataset_schema/config.vsh.yaml b/src/common/check_dataset_schema/config.vsh.yaml
deleted file mode 100644
index 08449c3e7d..0000000000
--- a/src/common/check_dataset_schema/config.vsh.yaml
+++ /dev/null
@@ -1,45 +0,0 @@
-functionality:
-  name: check_dataset_schema
-  namespace: common
-  description: Checks if the dataset has the necessary slots that are predefined in a schema.
-  argument_groups:
-    - name: Inputs
-      arguments:
-        - name: --input
-          type: file
-          required: true
-          description: A h5ad file.
-        - name: --schema
-          type: file
-          required: true
-          description: A schema file for the h5ad object.
-    - name: Arguments
-      arguments:
-        - name: --stop_on_error
-          type: boolean
-          default: false
-          description: Whether or not to stop with exit code 1 if the input file does not adhere to the schema.
-    - name: Output
-      arguments:
-        - name: --output
-          type: file
-          required: true
-          description: If specified, this file will contain a structured log of which checks succeeded (or not).
-          example: checks.json
-          direction: output
-  resources:
-    - type: python_script
-      path: script.py
-  test_resources:
-    - path: /resources_test/common/pancreas
-    - type: python_script
-      path: test.py
-platforms:
-  - type: docker
-    image: openproblems/base_python:1.0.0
-    test_setup:
-      - type: python
-        packages: viashpy
-  - type: nextflow
-    directives:
-      label: [midtime, midmem, midcpu]
diff --git a/src/common/check_dataset_schema/script.py b/src/common/check_dataset_schema/script.py
deleted file mode 100644
index cd84f9cdcf..0000000000
--- a/src/common/check_dataset_schema/script.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import anndata as ad
-import yaml
-import json
-
-## VIASH START
-par = {
-    'input': 'work/d4/f4fabc8aa4f2308841d4ab57bcff62/_viash_par/input_1/dataset.h5ad',
-    'schema': 'work/d4/f4fabc8aa4f2308841d4ab57bcff62/_viash_par/schema_1/schema.yaml',
-    'stop_on_error': False,
-    'output': 'work/d4/f4fabc8aa4f2308841d4ab57bcff62/out.yaml',
-}
-## VIASH END
-
-def check_structure(slot, slot_info, adata_slot):
-    missing = []
-    if slot == "X":
-        slot_info["name"] = "X"
-        slot_info = [slot_info]
-    for obj in slot_info:
-        adata_data = adata_slot.get(obj['name']) if slot != 'X' else adata_slot
-        if obj.get('required') and adata_data is None:
-            missing.append(obj['name'])
-        # todo: check types
-    return missing
-
-print('Load data', flush=True)
-adata = ad.read_h5ad(par['input'])
-
-# create data structure
-out = {
-    "exit_code": 0,
-    "error": {},
-    "data_schema": "ok"
-}
-
-print("Check AnnData against schema", flush=True)
-with open(par["schema"], "r") as f:
-    data_struct = yaml.safe_load(f)
-
-def_slots = data_struct['info']['slots']
-
-out = {
-    "exit_code": 0,
-    "error": {},
-    "data_schema": "ok"
-}
-for slot in def_slots:
-    print("Checking slot", slot, flush=True)
-    missing = check_structure(slot, def_slots[slot], getattr(adata, slot))
-    if missing:
-        print(f"Dataset is missing {slot} {missing}", flush=True)
-        out['exit_code'] = 1
-        out['data_schema'] = 'not ok'
-        out['error'][slot] = missing
-
-with open(par["output"], "w") as f:
-    json.dump(out, f, indent=2)
-
-if par['stop_on_error']:
-    exit(out['exit_code'])
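
# Note (editor's sketch): the removed component above was a standalone viash
# executable; an invocation in the style used elsewhere in this repository
# (file names are hypothetical):
viash run src/common/check_dataset_schema/config.vsh.yaml -- \
  --input dataset.h5ad \
  --schema schema.yaml \
  --output checks.json \
  --stop_on_error true
# Per the script's logic, a failing check would write something like
# {"exit_code": 1, "error": {"obs": ["batch"]}, "data_schema": "not ok"}
# to checks.json and exit with code 1.
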
diff --git a/src/common/check_yaml_schema/config.vsh.yaml b/src/common/check_yaml_schema/config.vsh.yaml
deleted file mode 100644
index b87bec5429..0000000000
--- a/src/common/check_yaml_schema/config.vsh.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-functionality:
-  name: check_yaml_schema
-  namespace: common
-  description: Checks if a YAML file adheres to a custom schema file.
-  argument_groups:
-    - name: Inputs
-      arguments:
-        - name: --input
-          type: file
-          required: true
-          description: A yaml file.
-        - name: --schema
-          type: file
-          required: true
-          description: A schema file for the yaml file.
-  resources:
-    - type: python_script
-      path: script.py
-platforms:
-  - type: docker
-    image: openproblems/base_python:1.0.0
-    setup:
-      - type: python
-        pypi:
-          - jsonschema
-  - type: nextflow
diff --git a/src/common/comp_tests/check_get_info.py b/src/common/comp_tests/check_get_info.py
deleted file mode 100644
index a00f1d702d..0000000000
--- a/src/common/comp_tests/check_get_info.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import subprocess
-from os import path
-import json
-
-## VIASH START
-## VIASH END
-
-input_path = meta["resources_dir"] + "/test_file.yaml"
-task_id = "denoising"
-output_path = "output.json"
-
-cmd = [
-    meta['executable'],
-    "--input", input_path,
-    "--task_id", task_id,
-    "--output", output_path,
-]
-
-print(">> Running script as test", flush=True)
-out = subprocess.run(cmd, stderr=subprocess.STDOUT)
-
-if out.stdout:
-    print(out.stdout)
-
-if out.returncode:
-    print(f"script: '{cmd}' exited with an error.")
-    exit(out.returncode)
-
-print(">> Checking whether output file exists", flush=True)
-assert path.exists(output_path), "Output does not exist"
-
-print(">> Reading json file", flush=True)
-with open(output_path, 'r') as f:
-    out = json.load(f)
-    print(out)
-
-print("All checks succeeded!", flush=True)
\ No newline at end of file
diff --git a/src/common/comp_tests/check_method_config.py b/src/common/comp_tests/check_method_config.py
deleted file mode 100644
index a30111d648..0000000000
--- a/src/common/comp_tests/check_method_config.py
+++ /dev/null
@@ -1,132 +0,0 @@
-import yaml
-
-## VIASH START
-meta = {
-    "config" : "foo"
-}
-## VIASH END
-
-
-NAME_MAXLEN = 50
-
-SUMMARY_MAXLEN = 400
-
-DESCRIPTION_MAXLEN = 5000
-
-_MISSING_DOIS = ["vandermaaten2008visualizing", "hosmer2013applied"]
-
-TIME_LABELS = ["lowtime", "midtime", "hightime", "veryhightime"]
-MEM_LABELS = ["lowmem", "midmem", "highmem"]
-CPU_LABELS = ["lowcpu", "midcpu", "highcpu"]
-
-def _load_bib():
-    with open(f"{meta['resources_dir']}/library.bib", "r") as file:
-        return file.read()
-
-def check_url(url):
-    import requests
-    from urllib3.util.retry import Retry
-    from requests.adapters import HTTPAdapter
-
-    # configure retry strategy
-    session = requests.Session()
-    retry = Retry(connect=3, backoff_factor=0.5)
-    adapter = HTTPAdapter(max_retries=retry)
-    session.mount('http://', adapter)
-    session.mount('https://', adapter)
-
-    get = session.head(url)
-
-    if get.ok or get.status_code == 429: # 429 rejected, too many requests
-        return True
-    else:
-        return False
-
-def search_ref_bib(reference):
-    import re
-    bib = _load_bib()
-
-    entry_pattern = r"(@\w+{[^}]*" + reference + r"[^}]*}(.|\n)*?)(?=@)"
-
-    bib_entry = re.search(entry_pattern, bib)
-
-    if bib_entry:
-
-        type_pattern = r"@(.*){" + reference
-        doi_pattern = r"(?=[Dd][Oo][Ii]\s*=\s*{([^,}]+)})"
-
-        entry_type = re.search(type_pattern, bib_entry.group(1))
-
-        if not (entry_type.group(1) == "misc" or reference in _MISSING_DOIS):
-            entry_doi = re.search(doi_pattern, bib_entry.group(1))
-            assert entry_doi.group(1), "doi not found in bibtex reference"
-            url = f"https://doi.org/{entry_doi.group(1)}"
-            assert check_url(url), f"{url} is not reachable, ref= {reference}."
-
-        return True
-
-    else:
-        return False
-
-print("Load config data", flush=True)
-with open(meta["config"], "r") as file:
-    config = yaml.safe_load(file)
-
-print("Check general fields", flush=True)
-assert len(config["functionality"]["name"]) <= NAME_MAXLEN, f"Component id (.functionality.name) should not exceed {NAME_MAXLEN} characters."
-assert "namespace" in config["functionality"] is not None, "namespace not a field or is empty" - -print("Check info fields", flush=True) -info = config['functionality']['info'] -assert "type" in info, "type not an info field" -info_types = ["method", "control_method"] -assert info["type"] in info_types , f"got {info['type']} expected one of {info_types}" -assert "label" in info is not None, "label not an info field or is empty" -assert "summary" in info is not None, "summary not an info field or is empty" -assert "FILL IN:" not in info["summary"], "Summary not filled in" -assert len(info["summary"]) <= SUMMARY_MAXLEN, f"Component id (.functionality.info.summary) should not exceed {SUMMARY_MAXLEN} characters." -assert "description" in info is not None, "description not an info field or is empty" -assert "FILL IN:" not in info["description"], "description not filled in" -assert len(info["description"]) <= DESCRIPTION_MAXLEN, f"Component id (.functionality.info.description) should not exceed {DESCRIPTION_MAXLEN} characters." -if info["type"] == "method": - assert "reference" in info, "reference not an info field" - bib = _load_bib() - if info["reference"]: - reference = info["reference"] - if not isinstance(reference, list): - reference = [reference] - for ref in reference: - assert search_ref_bib(ref), f"reference {ref} not added to library.bib" - assert "documentation_url" in info is not None, "documentation_url not an info field or is empty" - assert "repository_url" in info is not None, "repository_url not an info field or is empty" - assert check_url(info["documentation_url"]), f"{info['documentation_url']} is not reachable" - assert check_url(info["repository_url"]), f"{info['repository_url']} is not reachable" - -if "variants" in info: - arg_names = [arg["name"].replace("--", "") for arg in config["functionality"]["arguments"]] + ["preferred_normalization"] - - for paramset_id, paramset in info["variants"].items(): - if paramset: - for arg_id in paramset: - assert arg_id in arg_names, f"Argument '{arg_id}' in `.functionality.info.variants['{paramset_id}']` is not an argument in `.functionality.arguments`." - -assert "preferred_normalization" in info, "preferred_normalization not an info field" -norm_methods = ["log_cpm", "log_cp10k", "counts", "log_scran_pooling", "sqrt_cpm", "sqrt_cp10k", "l1_sqrt"] -assert info["preferred_normalization"] in norm_methods, "info['preferred_normalization'] not one of '" + "', '".join(norm_methods) + "'." 
- -print("Check platform fields", flush=True) -platforms = config['platforms'] -for platform in platforms: - if not platform["type"] == "nextflow": - continue - nextflow= platform - -assert nextflow, "nextflow not a platform" -assert nextflow["directives"], "directives not a field in nextflow platform" -assert nextflow["directives"]["label"], "label not a field in nextflow platform directives" - -assert [i for i in nextflow["directives"]["label"] if i in TIME_LABELS], "time label not filled in" -assert [i for i in nextflow["directives"]["label"] if i in MEM_LABELS], "mem label not filled in" -assert [i for i in nextflow["directives"]["label"] if i in CPU_LABELS], "cpu label not filled in" - -print("All checks succeeded!", flush=True) diff --git a/src/common/comp_tests/check_metric_config.py b/src/common/comp_tests/check_metric_config.py deleted file mode 100644 index 45fa1efc2b..0000000000 --- a/src/common/comp_tests/check_metric_config.py +++ /dev/null @@ -1,139 +0,0 @@ -import yaml -from typing import Dict - -## VIASH START - -meta = { - "config" : "foo" -} - -## VIASH END - -NAME_MAXLEN = 50 - -SUMMARY_MAXLEN = 400 - -DESCRIPTION_MAXLEN = 5000 - -_MISSING_DOIS = ["vandermaaten2008visualizing", "hosmer2013applied"] - -TIME_LABELS = ["lowtime", "midtime", "hightime"] -MEM_LABELS = ["lowmem", "midmem", "highmem"] -CPU_LABELS = ["lowcpu", "midcpu", "highcpu"] - - -def _load_bib(): - bib_path = meta["resources_dir"]+"/library.bib" - with open(bib_path, "r") as file: - return file.read() - -def check_url(url): - import requests - from urllib3.util.retry import Retry - from requests.adapters import HTTPAdapter - - # configure retry strategy - session = requests.Session() - retry = Retry(connect=3, backoff_factor=0.5) - adapter = HTTPAdapter(max_retries=retry) - session.mount('http://', adapter) - session.mount('https://', adapter) - - get = session.head(url) - - if get.ok or get.status_code == 429: # 429 rejected, too many requests - return True - else: - return False - -def search_ref_bib(reference): - import re - bib = _load_bib() - - entry_pattern = r"(@\w+{[^}]*" + reference + r"[^}]*}(.|\n)*?)(?=@)" - - bib_entry = re.search(entry_pattern, bib) - - if bib_entry: - - type_pattern = r"@(.*){" + reference - doi_pattern = r"(?=[Dd][Oo][Ii]\s*=\s*{([^,}]+)})" - - entry_type = re.search(type_pattern, bib_entry.group(1)) - - if not (entry_type.group(1) == "misc" or reference in _MISSING_DOIS): - entry_doi = re.search(doi_pattern, bib_entry.group(1)) - assert entry_doi.group(1), "doi not found in bibtex reference" - url = f"https://doi.org/{entry_doi.group(1)}" - assert check_url(url), f"{url} is not reachable, ref= {reference}." - - return True - - else: - return False - -def check_metric(metric: Dict[str, str]) -> str: - assert "name" in metric is not None, "name not a field or is empty" - assert len(metric["name"]) <= NAME_MAXLEN, f"Component id (.functionality.info.metrics.metric.name) should not exceed {NAME_MAXLEN} characters." - assert "label" in metric is not None, "label not a field in metric or is empty" - assert "summary" in metric is not None, "summary not a field in metric or is empty" - assert "FILL IN:" not in metric["summary"], "Summary not filled in" - assert len(metric["summary"]) <= SUMMARY_MAXLEN, f"Component id (.functionality.info.metrics.metric.summary) should not exceed {SUMMARY_MAXLEN} characters." 
- assert "description" in metric is not None, "description not a field in metric or is empty" - assert len(metric["description"]) <= DESCRIPTION_MAXLEN, f"Component id (.functionality.info.metrics.metric.description) should not exceed {DESCRIPTION_MAXLEN} characters." - assert "FILL IN:" not in metric["description"], "description not filled in" - # assert "reference" in metric, "reference not a field in metric" - if "reference" in metric: - reference = metric["reference"] - if not isinstance(reference, list): - reference = [reference] - for ref in reference: - assert search_ref_bib(ref), f"reference {ref} not added to library.bib" - # assert "documentation_url" in metric , "documentation_url not a field in metric" - # assert "repository_url" in metric , "repository_url not a metric field" - if "documentation_url" in metric: - assert check_url(metric["documentation_url"]), f"{metric['documentation_url']} is not reachable" - if "repository_url" in metric: - assert check_url(metric["repository_url"]), f"{metric['repository_url']} is not reachable" - assert "min" in metric is not None, f"min not a field in metric or is emtpy" - assert "max" in metric is not None, f"max not a field in metric or is empty" - assert "maximize" in metric is not None, f"maximize not a field in metric or is emtpy" - assert isinstance(metric['min'], (int, str)), "not an int or string (-.inf)" - assert isinstance(metric['max'], (int, str)), "not an int or string (+.inf)" - assert isinstance(metric['maximize'], bool) or metric["maximize"] not in ["-inf", "+inf"], "not a bool" - - -print("Load config data", flush=True) -with open(meta["config"], "r") as file: - config = yaml.safe_load(file) - -print("check general fields", flush=True) -assert "name" in config["functionality"] is not None, "Name not a field or is empty" -assert len(config["functionality"]["name"]) <= NAME_MAXLEN, f"Component id (.functionality.name) should not exceed {NAME_MAXLEN} characters." 
-assert "namespace" in config["functionality"] is not None, "namespace not a field or is empty" - - -print("Check info fields", flush=True) -info = config['functionality']['info'] -assert "type" in info, "type not an info field" -assert info["type"] == "metric" , f"got {info['type']} expected 'metric'" -assert "metrics" in info, "metrics not an info field" -for metric in info["metrics"]: - check_metric(metric) - -print("Check platform fields", flush=True) -platforms = config['platforms'] -for platform in platforms: - if not platform["type"] == "nextflow": - continue - nextflow= platform - -assert nextflow, "nextflow not a platform" -assert nextflow["directives"], "directives not a field in nextflow platform" -assert nextflow["directives"]["label"], "label not a field in nextflow platform directives" - -assert [i for i in nextflow["directives"]["label"] if i in TIME_LABELS], "time label not filled in" -assert [i for i in nextflow["directives"]["label"] if i in MEM_LABELS], "mem label not filled in" -assert [i for i in nextflow["directives"]["label"] if i in CPU_LABELS], "cpu label not filled in" - -print("All checks succeeded!", flush=True) diff --git a/src/common/comp_tests/run_and_check_adata.py b/src/common/comp_tests/run_and_check_adata.py deleted file mode 100644 index d2cda5af94..0000000000 --- a/src/common/comp_tests/run_and_check_adata.py +++ /dev/null @@ -1,127 +0,0 @@ -import anndata as ad -import subprocess -from os import path -import yaml -import re - -## VIASH START -meta = { - "executable": "target/docker/denoising/methods/dca/dca", - "config": "target/docker/denoising/methods/dca/.config.vsh.yaml", - "resources_dir": "resources_test/denoising" -} -## VIASH END - -# helper functions -def check_slots(adata, arg): - """Check whether an AnnData file contains all for the required - slots in the corresponding .info.slots field. 
- """ - for struc_name, slot_items in arg["info"].get("slots", {}).items(): - struc_x = getattr(adata, struc_name) - - if struc_name == "X": - if slot_items.get("required", True): - assert struc_x is not None,\ - f"File '{arg['value']}' is missing slot .{struc_name}" - - else: - for slot_item in slot_items: - if slot_item.get("required", True): - assert slot_item["name"] in struc_x,\ - f"File '{arg['value']}' is missing slot .{struc_name}['{slot_item['name']}']" - -def run_and_check(arguments, cmd): - print(">> Checking whether input files exist", flush=True) - for arg in arguments: - if arg["type"] == "file" and arg["direction"] == "input": - assert path.exists(arg["value"]), f"Input file '{arg['value']}' does not exist" - - print(f">> Running script as test", flush=True) - out = subprocess.run(cmd, stderr=subprocess.STDOUT) - - if out.stdout: - print(out.stdout) - - if out.returncode: - print(f"script: \'{' '.join(cmd)}\' exited with an error.") - exit(out.returncode) - - print(">> Checking whether output file exists", flush=True) - for arg in arguments: - if arg["type"] == "file" and arg["direction"] == "output": - assert path.exists(arg["value"]), f"Output file '{arg['value']}' does not exist" - - print(">> Reading h5ad files and checking formats", flush=True) - adatas = {} - for arg in arguments: - if arg["type"] == "file" and "slots" in arg["info"]: - print(f"Reading and checking {arg['clean_name']}", flush=True) - adata = ad.read_h5ad(arg["value"]) - - print(f" {adata}") - - check_slots(adata, arg) - - adatas[arg["clean_name"]] = adata - - print("All checks succeeded!", flush=True) - - -# read viash config -with open(meta["config"], "r") as file: - config = yaml.safe_load(file) - -# get resources -arguments = [] - -for arg in config["functionality"]["arguments"]: - new_arg = arg.copy() - arg_info = new_arg.get("info") or {} - - # set clean name - clean_name = re.sub("^--", "", arg["name"]) - new_arg["clean_name"] = clean_name - - # use example to find test resource file - if arg["type"] == "file": - if arg["direction"] == "input": - value = f"{meta['resources_dir']}/{arg['example'][0]}" - else: - value = f"{clean_name}.h5ad" - new_arg["value"] = value - elif "test_default" in arg_info: - new_arg["value"] = arg_info["test_default"] - - arguments.append(new_arg) - - -if "test_setup" not in config["functionality"]["info"]: - argument_sets = {"run": arguments} -else: - test_setup = config["functionality"]["info"]["test_setup"] - argument_sets = {} - for name, test_instance in test_setup.items(): - new_arguments = [] - for arg in arguments: - new_arg = arg.copy() - if arg["clean_name"] in test_instance: - val = test_instance[arg["clean_name"]] - if new_arg["type"] == "file" and new_arg["direction"] == "input": - val = f"{meta['resources_dir']}/{val}" - new_arg["value"] = val - new_arguments.append(new_arg) - argument_sets[name] = new_arguments - -for argset_name, argset_args in argument_sets.items(): - print(f">> Running test '{argset_name}'", flush=True) - # construct command - cmd = [ meta["executable"] ] - for arg in argset_args: - if "value" in arg: - value = arg["value"] - if arg["multiple"] and isinstance(value, list): - value = arg["multiple_sep"].join(value) - cmd.extend([arg["name"], str(value)]) - - run_and_check(argset_args, cmd) \ No newline at end of file diff --git a/src/common/create_component/config.vsh.yaml b/src/common/create_component/config.vsh.yaml deleted file mode 100644 index 58303a1ca8..0000000000 --- a/src/common/create_component/config.vsh.yaml +++ /dev/null @@ 
@@ -1,72 +0,0 @@
-functionality:
-  name: create_component
-  status: disabled
-  namespace: common
-  description: |
-    Create a component Viash component.
-
-    Usage:
-    ```
-    bin/create_component --task denoising --type method --language r --name foo
-    bin/create_component --task denoising --type metric --language python --name bar
-    ```
-  arguments:
-    - type: string
-      name: --task
-      description: Which task the component will be added to.
-      example: denoising
-    - type: string
-      name: --type
-      example: metric
-      description: The type of component to create. Typically must be one of 'method', 'control_method' or 'metric'.
-    - type: string
-      name: --language
-      description: Which scripting language to use. Options are 'python', 'r'.
-      default: python
-      choices: [python, r]
-    - type: string
-      name: --name
-      example: new_comp
-      description: Name of the new method, formatted in snake case.
-    - type: file
-      name: --output
-      direction: output
-      # required: true
-      description: Path to the component directory. Suggested location is `src/<task>/<type>s/<name>`.
-      default: src/tasks/${VIASH_PAR_TASK}/${VIASH_PAR_TYPE}s/${VIASH_PAR_NAME}
-    - type: file
-      name: --api_file
-      description: |
-        Which API file to use. Defaults to `src/<task>/api/comp_<type>.yaml`.
-        In tasks with different subtypes of method, this location might not exist and you might need
-        to manually specify a different API file to inherit from.
-      must_exist: false
-      # required: true
-      default: src/tasks/${VIASH_PAR_TASK}/api/comp_${VIASH_PAR_TYPE}.yaml
-    - type: file
-      name: --viash_yaml
-      description: |
-        Path to the project config file. Needed for knowing the relative location of a file to the project root.
-      # required: true
-      default: "_viash.yaml"
-  resources:
-    - type: python_script
-      path: script.py
-    - path: /src/common/helper_functions/read_and_merge_yaml.py
-  test_resources:
-    - type: python_script
-      path: test.py
-    - path: /src
-      dest: openproblems/src
-    - path: /_viash.yaml
-      dest: openproblems/_viash.yaml
-platforms:
-  - type: docker
-    image: python:3.10-slim
-    setup:
-      - type: python
-        pypi: ruamel.yaml
-  - type: native
-  - type: nextflow
-
-
diff --git a/src/common/create_component/script.sh b/src/common/create_component/script.sh
deleted file mode 100755
index 9fef9ef3a7..0000000000
--- a/src/common/create_component/script.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-TASK=dimensionality_reduction
-viash run src/common/create_component/config.vsh.yaml -- --task $TASK --type metric --name foor --language r
-viash run src/common/create_component/config.vsh.yaml -- --task $TASK --type method --name foor --language r
-viash run src/common/create_component/config.vsh.yaml -- --task $TASK --type method --name foopy
-viash run src/common/create_component/config.vsh.yaml -- --task $TASK --type metric --name foopy
\ No newline at end of file
diff --git a/src/common/create_component/test.py b/src/common/create_component/test.py
deleted file mode 100644
index a53e54a18e..0000000000
--- a/src/common/create_component/test.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import os
-import subprocess
-from os import path
-from ruamel.yaml import YAML
-
-## VIASH START
-meta = {
-    'executable': 'foo'
-}
-## VIASH END
-
-opv2 = f"{meta['resources_dir']}/openproblems"
-output_path = f"{opv2}/src/tasks/label_projection/methods/test_method"
-
-cmd = [
-    meta['executable'],
-    '--task', 'label_projection',
-    '--type', 'method',
-    '--name', 'test_method',
-    '--language', 'python'
-]
-
-print('>> Running the script as test', flush=True)
-out = subprocess.run(cmd, stderr=subprocess.STDOUT, cwd=opv2)
-
-if out.stdout:
-    print(out.stdout)
-
-if out.returncode:
-    print(f"script: '{cmd}' exited with an error.")
-    exit(out.returncode)
-
-print('>> Checking whether output files exist', flush=True)
-assert os.path.exists(output_path), "Output dir does not exist"
-
-conf_f = path.join(output_path, 'config.vsh.yaml')
-assert os.path.exists(conf_f), "Config file does not exist"
-
-script_f = path.join(output_path, "script.py")
-assert os.path.exists(script_f), "Script file does not exist"
-
-print('>> Checking file contents', flush=True)
-yaml = YAML(typ='safe', pure=True)
-with open(conf_f) as f:
-    conf_data = yaml.load(f)
-
-assert conf_data['functionality']['name'] == 'test_method', "Name should be equal to 'test_method'"
-# assert conf_data['platforms'][0]['image'] == 'python:3.10', "Python image should be equal to python:3.10"
-
-
-print('All checks succeeded!', flush=True)
-
diff --git a/src/common/create_task_readme/config.vsh.yaml b/src/common/create_task_readme/config.vsh.yaml
deleted file mode 100644
index 273e196ffb..0000000000
--- a/src/common/create_task_readme/config.vsh.yaml
+++ /dev/null
@@ -1,70 +0,0 @@
-functionality:
-  name: create_task_readme
-  status: disabled
-  namespace: common
-  description: |
-    Create a README for the task.
-  argument_groups:
-    - name: Inputs
-      arguments:
-        - type: string
-          name: --task
-          description: Which task the component will be added to.
-          example: denoising
-          required: false
-        - type: file
-          name: --task_dir
-          description: Path to the task directory.
-          default: src/tasks/${VIASH_PAR_TASK}
-          required: false
-        - type: file
-          name: --viash_yaml
-          description: |
-            Path to the project config file. Needed for knowing the relative location of a file to the project root.
-          default: "_viash.yaml"
-        - type: string
-          name: --github_url
-          description: |
-            URL to the GitHub repository. Needed for linking to the source code.
-          default: "https://github.com/openproblems-bio/openproblems/tree/main/"
-    - name: Outputs
-      arguments:
-        - type: file
-          name: --output
-          direction: output
-          description: Path to the component directory. Suggested location is `src/tasks/<task>/README.md`.
-          default: src/tasks/${VIASH_PAR_TASK}/README.md
-  resources:
-    - type: r_script
-      path: script.R
-    - path: /src/common/helper_functions/read_and_merge_yaml.R
-    - path: /src/common/helper_functions/read_api_files.R
-    - path: /src/common/helper_functions/strip_margin.R
-  test_resources:
-    - type: r_script
-      path: test.R
-    - path: /src
-      dest: openproblems/src
-    - path: /_viash.yaml
-      dest: openproblems/_viash.yaml
-platforms:
-  - type: docker
-    image: openproblems/base_r:1.0.0
-    setup:
-      - type: r
-        packages: [dplyr, purrr, rlang, glue, yaml, fs, cli, igraph, rmarkdown, processx]
-      - type: apt
-        packages: [jq, curl]
-      - type: docker
-        # download and install quarto-*-linux-amd64.deb from latest release
-        run: |
-          release_info=$(curl -s https://api.github.com/repos/quarto-dev/quarto-cli/releases/latest) && \
-          download_url=$(printf "%s" "$release_info" | jq -r '.assets[] | select(.name | test("quarto-.*-linux-amd64.deb")) | .browser_download_url') && \
-          curl -sL "$download_url" -o /opt/quarto.deb && \
-          dpkg -i /opt/quarto.deb && \
-          rm /opt/quarto.deb
-  - type: native
-  - type: nextflow
-    directives:
-      label: [midtime, lowmem, lowcpu]
-
diff --git a/src/common/create_task_readme/render_all.sh b/src/common/create_task_readme/render_all.sh
deleted file mode 100755
index e44195c1ed..0000000000
--- a/src/common/create_task_readme/render_all.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-
-set -e
-
-TASK_IDS=`ls src/tasks`
-
-for task_id in $TASK_IDS; do
-  echo ">> Processing $task_id"
-  viash run src/common/create_task_readme/config.vsh.yaml -- --task $task_id
-done
\ No newline at end of file
diff --git a/src/common/create_task_readme/script.R b/src/common/create_task_readme/script.R
deleted file mode 100644
index 35320e4d97..0000000000
--- a/src/common/create_task_readme/script.R
+++ /dev/null
@@ -1,134 +0,0 @@
-library(rlang, quietly = TRUE, warn.conflicts = FALSE)
-library(purrr, quietly = TRUE, warn.conflicts = FALSE)
-library(dplyr, quietly = TRUE, warn.conflicts = FALSE)
-
-## VIASH START
-par <- list(
-  "task" = "batch_integration",
-  "task_dir" = "src/tasks/batch_integration",
-  "output" = "src/tasks/batch_integration/README.md",
-  "viash_yaml" = "_viash.yaml",
-  "github_url" = "https://github.com/openproblems-bio/openproblems/tree/main/"
-)
-meta <- list(
-  "resources_dir" = "src/common/helper_functions",
-  "temp_dir" = "temp/"
-)
-## VIASH END
-
-if (is.null(par$task) && is.null(par$task_dir)) {
-  stop("Either 'task' or 'task_dir' must be provided")
-}
-if (is.null(par$viash_yaml)) {
-  stop("Argument 'viash_yaml' must be provided")
-}
-if (is.null(par$output)) {
-  stop("Argument 'output' must be provided")
-}
-
-# import helper function
-source(paste0(meta["resources_dir"], "/read_and_merge_yaml.R"))
-source(paste0(meta["resources_dir"], "/strip_margin.R"))
-source(paste0(meta["resources_dir"], "/read_api_files.R"))
-
-cat("Read task info\n")
-task_api <- read_task_api(par[["task_dir"]])
-
-# determine ordering
-root <- .task_graph_get_root(task_api)
-
-r_graph <- render_task_graph(task_api, root)
-
-cat("Render API details\n")
-order <- names(igraph::bfs(task_api$task_graph, root)$order)
-r_details <- map_chr(
-  order,
-  function(file_name) {
-    if (file_name %in% names(task_api$comp_specs)) {
-      render_component(task_api$comp_specs[[file_name]])
-    } else {
-      render_file(task_api$file_specs[[file_name]])
-    }
-  }
-)
-
-cat("Render authors\n")
-authors_str <-
-  if (nrow(task_api$authors) > 0) {
-    paste0(
-      "\n## Authors & contributors\n\n",
paste(collapse = "\n"), - "\n" - ) - } else { - "" - } -readme_str <- - if (is.null(task_api$task_info$readme) || is.na(task_api$task_info$readme)) { - "" - } else { - paste0( - "\n## README\n\n", - task_api$task_info$readme, - "\n" - ) - } - -cat("Generate qmd content\n") -relative_path <- par[["task_dir"]] %>% - gsub(paste0(dirname(par[["viash_yaml"]]), "/*"), "", .) %>% - gsub("/*$", "", .) -source_url <- paste0(par[["github_url"]], relative_path) -qmd_content <- strip_margin(glue::glue(" - §--- - §title: \"{task_api$task_info$label}\" - §format: gfm - §--- - § - § - § - §{task_api$task_info$summary} - § - §Path to source: [`{relative_path}`]({source_url}) - § - §{readme_str} - § - §## Motivation - § - §{task_api$task_info$motivation} - § - §## Description - § - §{task_api$task_info$description} - §{authors_str} - §## API - § - §{r_graph} - § - §{paste(r_details, collapse = '\n\n')} - § - §"), symbol = "§") - -cat("Write README.qmd to file\n") -qmd_file <- tempfile( - pattern = "README_", - fileext = ".qmd", - tmpdir = meta$temp_dir -) - -if (!dir.exists(meta$temp_dir)) { - dir.create(meta$temp_dir, recursive = TRUE) -} -writeLines(qmd_content, qmd_file) - -cat("Render README.qmd to README.md\n") -out <- processx::run( - command = "quarto", - args = c("render", qmd_file, "--output", "-"), - echo = TRUE -) - -writeLines(out$stdout, par$output) diff --git a/src/common/create_task_readme/test.R b/src/common/create_task_readme/test.R deleted file mode 100644 index 3a981fe7ca..0000000000 --- a/src/common/create_task_readme/test.R +++ /dev/null @@ -1,30 +0,0 @@ -requireNamespace("assertthat", quietly = TRUE) - -## VIASH START -## VIASH END - -opv2 <- paste0(meta$resources_dir, "/openproblems") -output_path <- "output.md" - -cat(">> Running the script as test\n") -system(paste( - meta["executable"], - "--task", "label_projection", - "--output", output_path, - "--task_dir", paste0(opv2, "/src/tasks/label_projection"), - "--viash_yaml", paste0(opv2, "/_viash.yaml") -)) - -cat(">> Checking whether output files exist\n") -assertthat::assert_that(file.exists(output_path)) - -cat(">> Checking file contents\n") -lines <- readLines(output_path) -assertthat::assert_that(any(grepl("# Label projection", lines))) -assertthat::assert_that(any(grepl("# Description", lines))) -assertthat::assert_that(any(grepl("# Motivation", lines))) -assertthat::assert_that(any(grepl("# Authors", lines))) -assertthat::assert_that(any(grepl("flowchart LR", lines))) -assertthat::assert_that(any(grepl("# File format:", lines))) - -cat("All checks succeeded!\n") diff --git a/src/common/decompress_gzip/config.vsh.yaml b/src/common/decompress_gzip/config.vsh.yaml deleted file mode 100644 index 2716dc554d..0000000000 --- a/src/common/decompress_gzip/config.vsh.yaml +++ /dev/null @@ -1,25 +0,0 @@ -functionality: - name: decompress_gzip - namespace: common - arguments: - - name: --input - type: file - description: Input file - example: /path/to/file.gz - - name: --output - type: file - description: Output file - example: /path/to/file - direction: output - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh -platforms: - - type: docker - image: ubuntu:latest - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] diff --git a/src/common/extract_metadata/config.vsh.yaml b/src/common/extract_metadata/config.vsh.yaml deleted file mode 100644 index 76e73cb975..0000000000 --- a/src/common/extract_metadata/config.vsh.yaml +++ /dev/null @@ -1,40 +0,0 @@ -functionality: - 
-  name: extract_metadata
-  namespace: common
-  description: Extract the metadata from an h5ad file.
-  argument_groups:
-    - name: Inputs
-      arguments:
-        - name: --input
-          type: file
-          required: true
-          description: A h5ad file.
-        - name: --schema
-          type: file
-          required: false
-          description: An optional schema with which to annotate the output
-    - name: Output
-      arguments:
-        - name: --output
-          type: file
-          required: true
-          description: A yaml file containing the metadata.
-          example: output_meta.yaml
-          direction: output
-  resources:
-    - type: python_script
-      path: script.py
-  test_resources:
-    - path: /resources_test/common/pancreas
-    - path: /src/datasets/api/file_raw.yaml
-    - type: python_script
-      path: test.py
-platforms:
-  - type: docker
-    image: openproblems/base_python:1.0.0
-    test_setup:
-      - type: python
-        packages: viashpy
-  - type: nextflow
-    directives:
-      label: [midtime, midmem, midcpu]
diff --git a/src/common/extract_metadata/test.py b/src/common/extract_metadata/test.py
deleted file mode 100644
index 8af023d8f6..0000000000
--- a/src/common/extract_metadata/test.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import sys
-import re
-import pytest
-import json
-import subprocess
-
-## VIASH START
-## VIASH END
-
-input_path = meta["resources_dir"] + "/pancreas/dataset.h5ad"
-schema_path = meta["resources_dir"] + "/file_raw.yaml"
-
-def test_run(run_component, tmp_path):
-    output_path = tmp_path / "meta.yaml"
-
-    run_component([
-        "--input", input_path,
-        "--schema", schema_path,
-        "--output", str(output_path),
-    ])
-
-    assert output_path.exists(), "Output path does not exist"
-
-
-if __name__ == "__main__":
-    sys.exit(pytest.main([__file__]))
diff --git a/src/common/extract_scores/config.vsh.yaml b/src/common/extract_scores/config.vsh.yaml
deleted file mode 100644
index 72270b7a95..0000000000
--- a/src/common/extract_scores/config.vsh.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-functionality:
-  name: "extract_scores"
-  status: disabled
-  namespace: "common"
-  description: "Extract evaluation data frame on output"
-  arguments:
-    - name: "--input"
-      alternatives: ["-i"]
-      type: "file"
-      multiple: true
-      default: "input.h5ad"
-      description: "Input h5ad files containing metadata and metrics in adata.uns"
-    - name: "--column_names"
-      type: "string"
-      multiple: true
-      default: [ "dataset_id", "method_id", "metric_ids", "metric_values" ]
-      description: "Which fields from adata.uns to extract and store as a data frame."
- - name: "--output" - alternatives: ["-o"] - type: "file" - direction: "output" - default: "output.tsv" - description: "Output tsv" - resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - cran: [ tidyverse ] - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] diff --git a/src/common/extract_scores/script.R b/src/common/extract_scores/script.R deleted file mode 100644 index 6b540380ab..0000000000 --- a/src/common/extract_scores/script.R +++ /dev/null @@ -1,30 +0,0 @@ -cat("Loading dependencies\n") -library(anndata, warn.conflicts = FALSE) -options(tidyverse.quiet = TRUE) -library(tidyverse) -library(assertthat) - -## VIASH START -par <- list( - input = "resources_test/label_projection/pancreas/knn_accuracy.h5ad", - output = "scores.tsv" -) -inp <- par$input[[1]] -## VIASH END - -cat("Reading input h5ad files\n") -scores <- map_df(par$input, function(inp) { - cat("Reading '", inp, "'\n", sep = "") - ad <- read_h5ad(inp) - - for (uns_name in par$column_names) { - assert_that( - uns_name %in% names(ad$uns), - msg = paste0("File ", inp, " must contain `uns['", uns_name, "']`") - ) - } - - data.frame(ad$uns[par$column_names]) -}) - -write_tsv(scores, par$output) diff --git a/src/common/helper_functions/read_and_merge_yaml.R b/src/common/helper_functions/read_and_merge_yaml.R deleted file mode 100644 index 932d3feb92..0000000000 --- a/src/common/helper_functions/read_and_merge_yaml.R +++ /dev/null @@ -1,144 +0,0 @@ -#' Read a Viash YAML -#' -#' If the YAML contains a "__merge__" key anywhere in the yaml, -#' the path specified in that YAML will be read and the two -#' lists will be merged. This is a recursive procedure. -#' -#' @param path Path to Viash YAML -read_and_merge_yaml <- function(path, project_path = .ram_find_project(path)) { - path <- normalizePath(path, mustWork = FALSE) - data <- tryCatch({ - suppressWarnings(yaml::read_yaml(path)) - }, error = function(e) { - stop("Could not read ", path, ". 
Error: ", e) - }) - .ram_process_merge(data, data, path, project_path) -} - -.ram_find_project <- function(path) { - path <- normalizePath(path, mustWork = FALSE) - check <- paste0(dirname(path), "/_viash.yaml") - if (file.exists(check)) { - dirname(check) - } else if (check == "//_viash.yaml") { - NULL - } else { - .ram_find_project(dirname(check)) - } -} - -.ram_is_named_list <- function(obj) { - is.null(obj) || (is.list(obj) && (length(obj) == 0 || !is.null(names(obj)))) -} - -.ram_process_merge <- function(data, root_data, path, project_path) { - if (.ram_is_named_list(data)) { - # check whether children have `__merge__` entries - processed_data <- lapply(data, function(dat) { - .ram_process_merge(dat, root_data, path, project_path) - }) - processed_data <- lapply(names(data), function(nm) { - dat <- data[[nm]] - .ram_process_merge(dat, root_data, path, project_path) - }) - names(processed_data) <- names(data) - - # if current element has __merge__, read list2 yaml and combine with data - new_data <- - if ("__merge__" %in% names(processed_data) && !.ram_is_named_list(processed_data$`__merge__`)) { - new_data_path <- .ram_resolve_path( - path = processed_data$`__merge__`, - project_path = project_path, - parent_path = dirname(path) - ) - read_and_merge_yaml(new_data_path, project_path) - } else if ("$ref" %in% names(processed_data) && !.ram_is_named_list(processed_data$`$ref`)) { - ref_parts <- strsplit(processed_data$`$ref`, "#")[[1]] - - # resolve the path in $ref - x <- - if (ref_parts[[1]] == "") { - root_data - } else { - new_data_path <- .ram_resolve_path( - path = ref_parts[[1]], - project_path = project_path, - parent_path = dirname(path) - ) - new_data_path <- normalizePath(new_data_path, mustWork = FALSE) - - # read in the new data - tryCatch({ - suppressWarnings(yaml::read_yaml(new_data_path)) - }, error = function(e) { - stop("Could not read ", new_data_path, ". 
Error: ", e) - }) - } - x_root <- x - - - # Navigate the path and retrieve the referenced data - ref_path_parts <- unlist(strsplit(ref_parts[[2]], "/")) - for (part in ref_path_parts) { - if (part == "") { - next - } else if (part %in% names(x)) { - x <- x[[part]] - } else { - stop("Could not find ", processed_data$`$ref`, " in ", path) - } - } - - # postprocess the new data - if (ref_parts[[1]] == "") { - x - } else { - .ram_process_merge(x, x_root, new_data_path, project_path) - } - } else { - list() - } - - .ram_deep_merge(new_data, processed_data) - } else if (is.list(data)) { - lapply(data, function(dat) { - .ram_process_merge(dat, root_data, path, project_path) - }) - } else { - data - } -} - -.ram_resolve_path <- function(path, project_path, parent_path) { - ifelse( - grepl("^/", path), - paste0(project_path, "/", path), - fs::path_abs(path, parent_path) - ) -} - -.ram_deep_merge <- function(list1, list2) { - if (.ram_is_named_list(list1) && .ram_is_named_list(list2)) { - # if list1 and list2 are objects, recursively merge - keys <- unique(c(names(list1), names(list2))) - out <- lapply(keys, function(key) { - if (key %in% names(list1)) { - if (key %in% names(list2)) { - .ram_deep_merge(list1[[key]], list2[[key]]) - } else { - list1[[key]] - } - } else { - list2[[key]] - } - }) - names(out) <- keys - out - } else if (is.list(list1) && is.list(list2)) { - # if list1 and list2 are both lists, append - c(list1, list2) - } else { - # else override list1 with list2 - list2 - } -} \ No newline at end of file diff --git a/src/common/helper_functions/read_and_merge_yaml.py b/src/common/helper_functions/read_and_merge_yaml.py deleted file mode 100644 index b74995aed1..0000000000 --- a/src/common/helper_functions/read_and_merge_yaml.py +++ /dev/null @@ -1,52 +0,0 @@ -def read_and_merge_yaml(path): - """Read a Viash YAML - - If the YAML contains a "__merge__" key anywhere in the yaml, - the path specified in that YAML will be read and the two - lists will be merged. This is a recursive procedure. 
- - Arguments: - path -- Path to the Viash YAML""" - from ruamel.yaml import YAML - - yaml = YAML(typ='safe', pure=True) - - with open(path, 'r') as stream: - data = yaml.load(stream) - return _ram_process_merge(data, path) - -def _ram_deep_merge(dict1, dict2): - if isinstance(dict1, dict) and isinstance(dict2, dict): - keys = set(list(dict1.keys()) + list(dict2.keys())) - out = {} - for key in keys: - if key in dict1: - if key in dict2: - out[key] = _ram_deep_merge(dict1[key], dict2[key]) - else: - out[key] = dict1[key] - else: - out[key] = dict2[key] - return out - elif isinstance(dict1, list) and isinstance(dict2, list): - return dict1 + dict2 - else: - return dict2 - -def _ram_process_merge(data, path): - import os - if isinstance(data, dict): - processed_data = {k: _ram_process_merge(v, path) for k, v in data.items()} - - if "__merge__" in processed_data: - new_data_path = os.path.join(os.path.dirname(path), processed_data["__merge__"]) - new_data = read_and_merge_yaml(new_data_path) - else: - new_data = {} - - return _ram_deep_merge(new_data, processed_data) - elif isinstance(data, list): - return [_ram_process_merge(dat, path) for dat in data] - else: - return data - diff --git a/src/common/helper_functions/read_anndata_partial.py b/src/common/helper_functions/read_anndata_partial.py deleted file mode 100644 index efbea0592d..0000000000 --- a/src/common/helper_functions/read_anndata_partial.py +++ /dev/null @@ -1,77 +0,0 @@ -import warnings -from pathlib import Path -import anndata as ad -import h5py -from scipy.sparse import csr_matrix -from anndata.experimental import read_elem, sparse_dataset - - -def read_anndata( - file: str, - backed: bool = False, - **kwargs -) -> ad.AnnData: - """ - Read anndata file - :param file: path to anndata file in h5ad format - :param kwargs: AnnData parameter to group mapping - """ - assert Path(file).exists(), f'File not found: {file}' - - f = h5py.File(file, 'r') - kwargs = {x: x for x in f} if not kwargs else kwargs - if len(f.keys()) == 0: - return ad.AnnData() - # check if keys are available - for name, slot in kwargs.items(): - if slot not in f: - warnings.warn( - f'Cannot find "{slot}" for AnnData parameter `{name}` from "{file}"' - ) - adata = read_partial(f, backed=backed, **kwargs) - if not backed: - f.close() - - return adata - - -def read_partial( - group: h5py.Group, - backed: bool = False, - force_sparse_types: [str, list] = None, - **kwargs -) -> ad.AnnData: - """ - Partially read h5py groups - :params group: file group - :params force_sparse_types: encoding types to convert to sparse_dataset via csr_matrix - :params backed: read sparse matrix as sparse_dataset - :params **kwargs: dict of slot_name: slot, by default use all available slot for the h5py file - :return: AnnData object - """ - if force_sparse_types is None: - force_sparse_types = [] - elif isinstance(force_sparse_types, str): - force_sparse_types = [force_sparse_types] - slots = {} - if backed: - print('Read as backed sparse matrix...') - - for slot_name, slot in kwargs.items(): - print(f'Read slot "{slot}", store as "{slot_name}"...') - if slot not in group: - warnings.warn(f'Slot "{slot}" not found, skip...') - slots[slot_name] = None - else: - elem = group[slot] - iospec = ad._io.specs.get_spec(elem) - if iospec.encoding_type in ("csr_matrix", "csc_matrix") and backed: - slots[slot_name] = sparse_dataset(elem) - elif iospec.encoding_type in force_sparse_types: - slots[slot_name] = csr_matrix(read_elem(elem)) - if backed: - slots[slot_name] = 
sparse_dataset(slots[slot_name]) - else: - slots[slot_name] = read_elem(elem) - return ad.AnnData(**slots) - diff --git a/src/common/helper_functions/read_api_files.R b/src/common/helper_functions/read_api_files.R deleted file mode 100644 index f2cf49b2f8..0000000000 --- a/src/common/helper_functions/read_api_files.R +++ /dev/null @@ -1,493 +0,0 @@ - -anndata_struct_names <- c("obs", "var", "obsm", "obsp", "varm", "varp", "layers", "uns") - -read_file_spec <- function(path) { - spec <- read_and_merge_yaml(path) - out <- list( - info = read_file_info(spec, path) - ) - if (out$info$file_type == "h5ad" || "slots" %in% names(spec$info)) { - out$info$file_type <- "h5ad" - out$slots <- read_anndata_slots(spec, path) - } - if (out$info$file_type == "csv" || out$info$file_type == "tsv" || out$info$file_type == "parquet") { - out$columns <- read_tabular_columns(spec, path) - } - out -} -read_file_info <- function(spec, path) { - # TEMP: make it readable - spec$info$slots <- NULL - df <- list_as_tibble(spec) - if (list_contains_tibble(spec$info)) { - df <- dplyr::bind_cols(df, list_as_tibble(spec$info)) - } - df$file_name <- basename(path) %>% gsub("\\.yaml", "", .) - df$description <- df$description %||% NA_character_ %>% as.character - df$summary <- df$summary %||% NA_character_ %>% as.character - as_tibble(df) -} -read_anndata_slots <- function(spec, path) { - map_df( - anndata_struct_names, - function(struct_name, slot) { - slot <- spec$info$slots[[struct_name]] - if (is.null(slot)) return(NULL) - df <- map_df(slot, as.data.frame) - df$struct <- struct_name - df$file_name <- basename(path) %>% gsub("\\.yaml", "", .) - df$required <- df$required %||% TRUE %|% TRUE - df$multiple <- df$multiple %||% FALSE %|% FALSE - as_tibble(df) - } - ) -} -read_tabular_columns <- function(spec, path) { - map_df( - spec$info$columns, - function(column) { - df <- list_as_tibble(column) - df$file_name <- basename(path) %>% gsub("\\.yaml", "", .) - df$required <- df$required %||% TRUE %|% TRUE - df$multiple <- df$multiple %||% FALSE %|% FALSE - as_tibble(df) - } - ) -} - -format_file_format <- function(spec) { - if (spec$info$file_type == "h5ad") { - example <- spec$slots %>% - group_by(struct) %>% - summarise( - str = paste0(unique(struct), ": ", paste0("'", name, "'", collapse = ", ")) - ) %>% - arrange(match(struct, anndata_struct_names)) - - c(" AnnData object", paste0(" ", example$str)) - } else if (spec$info$file_type == "csv" || spec$info$file_type == "tsv" || spec$info$file_type == "parquet") { - example <- spec$columns %>% - summarise( - str = paste0("'", name, "'", collapse = ", ") - ) - - c(" Tabular data", paste0(" ", example$str)) - } else { - "" - } -} - -format_file_format_as_kable <- function(spec) { - if (spec$info$file_type == "h5ad") { - spec$slots %>% - mutate( - tag_str = pmap_chr(lst(required), function(required) { - out <- c() - if (!required) { - out <- c(out, "Optional") - } - if (length(out) == 0) { - "" - } else { - paste0("(_", paste(out, collapse = ", "), "_) ") - } - }) - ) %>% - transmute( - Slot = paste0("`", struct, "[\"", name, "\"]`"), - Type = paste0("`", type, "`"), - Description = paste0( - tag_str, - description %>% gsub(" *\n *", " ", .) %>% gsub("\\. *$", "", .), - "." 
- ) - ) %>% - knitr::kable() - } else if (spec$info$file_type == "csv" || spec$info$file_type == "tsv" || spec$info$file_type == "parquet") { - spec$columns %>% - mutate( - tag_str = pmap_chr(lst(required), function(required) { - out <- c() - if (!required) { - out <- c(out, "Optional") - } - if (length(out) == 0) { - "" - } else { - paste0("(_", paste(out, collapse = ", "), "_) ") - } - }) - ) %>% - transmute( - Column = paste0("`", name, "`"), - Type = paste0("`", type, "`"), - Description = paste0( - tag_str, - description %>% gsub(" *\n *", " ", .) %>% gsub("\\. *$", "", .), - "." - ) - ) %>% - knitr::kable() - } else { - "" - } -} - -list_contains_tibble <- function(li) { - is.list(li) && any(sapply(li, is.atomic)) -} - -list_as_tibble <- function(li) { - as.data.frame(li[sapply(li, is.atomic)], check.names = FALSE) -} - -read_comp_spec <- function(path) { - spec_yaml <- read_and_merge_yaml(path) - list( - info = read_comp_info(spec_yaml, path), - args = read_comp_args(spec_yaml, path) - ) -} - -read_comp_info <- function(spec_yaml, path) { - # TEMP: make it readable - spec_yaml$functionality$arguments <- NULL - spec_yaml$functionality$argument_groups <- NULL - - df <- list_as_tibble(spec_yaml$functionality) - if (nrow(df) == 0) { - df <- data.frame(a = 1)[, integer(0)] - } - if (list_contains_tibble(spec_yaml$functionality$info)) { - df <- dplyr::bind_cols(df, list_as_tibble(spec_yaml$functionality$info)) - } - if (list_contains_tibble(spec_yaml$functionality$info$type_info)) { - df <- dplyr::bind_cols(df, list_as_tibble(spec_yaml$functionality$info$type_info)) - } - df$file_name <- basename(path) %>% gsub("\\.yaml", "", .) - as_tibble(df) -} - -read_comp_args <- function(spec_yaml, path) { - arguments <- spec_yaml$functionality$arguments - for (arg_group in spec_yaml$functionality$argument_groups) { - arguments <- c(arguments, arg_group$arguments) - } - map_df(arguments, function(arg) { - df <- list_as_tibble(arg) - if (list_contains_tibble(arg$info)) { - df <- dplyr::bind_cols(df, list_as_tibble(arg$info)) - } - df$file_name <- basename(path) %>% gsub("\\.yaml", "", .) - df$arg_name <- gsub("^-*", "", arg$name) - df$direction <- df$direction %||% "input" %|% "input" - df$parent <- df$`__merge__` %||% NA_character_ %>% basename() %>% gsub("\\.yaml", "", .) - df$required <- df$required %||% FALSE %|% FALSE - df$default <- df$default %||% NA_character_ %>% as.character - df$example <- df$example %||% NA_character_ %>% as.character - df$description <- df$description %||% NA_character_ %>% as.character - df$summary <- df$summary %||% NA_character_ %>% as.character - df - }) -} - -format_comp_args_as_tibble <- function(spec) { - if (nrow(spec$args) == 0) return("") - spec$args %>% - mutate( - tag_str = pmap_chr(lst(required, direction), function(required, direction) { - out <- c() - if (!required) { - out <- c(out, "Optional") - } - if (direction == "output") { - out <- c(out, "Output") - } - if (length(out) == 0) { - "" - } else { - paste0("(_", paste(out, collapse = ", "), "_) ") - } - }) - ) %>% - transmute( - Name = paste0("`--", arg_name, "`"), - Type = paste0("`", type, "`"), - Description = paste0( - tag_str, - (summary %|% description) %>% gsub(" *\n *", " ", .) %>% gsub("\\. 
*$", "", .), - ".", - ifelse(!is.na(default), paste0(" Default: `", default, "`."), "") - ) - ) %>% - knitr::kable() -} - -# path <- "src/datasets/api/comp_processor_knn.yaml" -render_component <- function(spec) { - if (is.character(spec)) { - spec <- read_comp_spec(spec) - } - - strip_margin(glue::glue(" - §## Component type: {spec$info$label} - § - §Path: [`src/{spec$info$namespace}`](https://github.com/openproblems-bio/openproblems/tree/main/src/{spec$info$namespace}) - § - §{spec$info$summary} - § - §Arguments: - § - §:::{{.small}} - §{paste(format_comp_args_as_tibble(spec), collapse = '\n')} - §::: - § - §"), symbol = "§") -} - -# path <- "src/datasets/api/file_pca.yaml" -render_file <- function(spec) { - if (is.character(spec)) { - spec <- read_file_spec(spec) - } - - if (!"label" %in% names(spec$info)) { - spec$info$label <- basename(spec$info$example) - } - - example <- - if (is.null(spec$info$example) || is.na(spec$info$example)) { - "" - } else { - paste0("Example file: `", spec$info$example, "`") - } - - description <- - if (is.null(spec$info$description) || is.na(spec$info$description)) { - "" - } else { - paste0("Description:\n\n", spec$info$description) - } - - strip_margin(glue::glue(" - §## File format: {spec$info$label} - § - §{spec$info$summary %||% ''} - § - §{example} - § - §{description} - § - §Format: - § - §:::{{.small}} - §{paste(format_file_format(spec), collapse = '\n')} - §::: - § - §Slot description: - § - §:::{{.small}} - §{paste(format_file_format_as_kable(spec), collapse = '\n')} - §::: - § - §"), symbol = "§") -} - -# path <- "src/tasks/denoising" -read_task_api <- function(path) { - cli::cli_inform("Looking for project root") - project_path <- .ram_find_project(path) - api_dir <- paste0(path, "/api") - - cli::cli_inform("Reading task info") - task_info_yaml <- list.files(api_dir, pattern = "task_info.ya?ml", full.names = TRUE) - assertthat::assert_that(length(task_info_yaml) == 1) - task_info <- read_and_merge_yaml(task_info_yaml, project_path) - - cli::cli_inform("Reading task authors") - authors <- map_df(task_info$authors, function(aut) { - aut$roles <- paste(aut$roles, collapse = ", ") - list_as_tibble(aut) - }) - - cli::cli_inform("Reading component yamls") - comp_yamls <- list.files(api_dir, pattern = "comp_.*\\.ya?ml", full.names = TRUE) - comps <- map(comp_yamls, read_comp_spec) - comp_info <- map_df(comps, "info") - comp_args <- map_df(comps, "args") - names(comps) <- basename(comp_yamls) %>% gsub("\\..*$", "", .) - - cli::cli_inform("Reading file yamls") - file_yamls <- .ram_resolve_path( - path = na.omit(unique(comp_args$`__merge__`)), - project_path = project_path, - parent_path = api_dir - ) - files <- map(file_yamls, read_file_spec) - names(files) <- basename(file_yamls) %>% gsub("\\..*$", "", .) 
- file_info <- map_df(files, "info") - file_slots <- map_df(files, "slots") - - cli::cli_inform("Generating task graph") - task_graph <- create_task_graph(file_info, comp_info, comp_args) - - list( - task_info = task_info, - file_specs = files, - file_info = file_info, - file_slots = file_slots, - comp_specs = comps, - comp_info = comp_info, - comp_args = comp_args, - task_graph = task_graph, - authors = authors - ) -} - - -create_task_graph <- function(file_info, comp_info, comp_args) { - clean_id <- function(id) { - gsub("graph", "graaf", id) - } - nodes <- - bind_rows( - file_info %>% - mutate(id = file_name, label = label, is_comp = FALSE), - comp_info %>% - mutate(id = file_name, label = label, is_comp = TRUE) - ) %>% - select(id, label, everything()) %>% - mutate(str = paste0( - " ", - clean_id(id), - ifelse(is_comp, "[/\"", "(\""), - label, - ifelse(is_comp, "\"/]", "\")") - )) - edges <- bind_rows( - comp_args %>% - filter(type == "file", direction == "input") %>% - mutate( - from = parent, - to = file_name, - arrow = "---" - ), - comp_args %>% - filter(type == "file", direction == "output") %>% - mutate( - from = file_name, - to = parent, - arrow = "-->" - ) - ) %>% - select(from, to, everything()) %>% - mutate(str = paste0(" ", clean_id(from), arrow, clean_id(to))) - - igraph::graph_from_data_frame( - edges, - vertices = nodes, - directed = TRUE - ) -} - -.task_graph_get_root <- function(task_api) { - root <- names(which(igraph::degree(task_api$task_graph, mode = "in") == 0)) - if (length(root) > 1) { - warning( - "There should probably only be one node with in-degree equal to 0.\n", - " Nodes with in-degree == 0: ", paste(root, collapse = ", ") - ) - } - root[[1]] -} - -render_task_graph <- function(task_api, root = .task_graph_get_root(task_api)) { - order <- names(igraph::bfs(task_api$task_graph, root)$order) - - vdf <- igraph::as_data_frame(task_api$task_graph, "vertices") %>% - arrange(match(name, order)) - edf <- igraph::as_data_frame(task_api$task_graph, "edges") %>% - arrange(match(from, order), match(to, order)) - - strip_margin(glue::glue(" - §```mermaid - §flowchart LR - §{paste(vdf$str, collapse = '\n')} - §{paste(edf$str, collapse = '\n')} - §``` - §"), symbol = "§") -} - - - -# Recursive function to process each property with indentation -.render_example_process_property <- function(prop, prop_name = NULL, indent_level = 0) { - if (is.null(prop_name)) { - prop_name <- "" - } - - out <- c() - - # define helper variables - indent_spaces <- strrep(" ", indent_level) - next_indent_spaces <- strrep(" ", indent_level + 2) - - # add comment if available - if ("description" %in% names(prop)) { - comment <- gsub("\n", paste0("\n", indent_spaces, "# "), stringr::str_trim(prop$description)) - out <- c(out, indent_spaces, "# ", comment, "\n") - } - - # add variable - out <- c(out, indent_spaces, prop_name, ": ") - - if (prop$type == "object" && "properties" %in% names(prop)) { - # Handle object with properties - prop_names <- setdiff(names(prop$properties), "additionalProperties") - sub_props <- unlist(lapply(prop_names, function(sub_prop_name) { - prop_out <- .render_example_process_property( - prop$properties[[sub_prop_name]], - sub_prop_name, - indent_level + 2 - ) - c(prop_out, "\n") - })) - c(out, "\n", sub_props[-length(sub_props)]) - } else if (prop$type == "array") { - if (is.list(prop$items) && "properties" %in% names(prop$items)) { - # Handle array of objects - array_items_yaml <- unlist(lapply(names(prop$items$properties), function(item_prop_name) { - prop_out <- 
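-            # recurse into the item schema's properties; array items are
-            # rendered as a `- ` list entry, hence the deeper indent level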
.render_example_process_property( - prop$items$properties[[item_prop_name]], - item_prop_name, - indent_level + 4 - ) - c(prop_out, "\n") - })) - c(out, "\n", next_indent_spaces, "- ", array_items_yaml[-1]) - } else { - # Handle simple array - c(out, "[ ... ]") - } - } else { - c(out, "...") - } -} - -# Function for rendering an example yaml based on a JSON schema -render_example <- function(json_schema) { - if (!"properties" %in% names(json_schema)) { - return("") - } - text <- - unlist(lapply(names(json_schema$properties), function(prop_name) { - out <- .render_example_process_property( - json_schema$properties[[prop_name]], - prop_name, - 0 - ) - c(out, "\n") - })) - - paste(text, collapse = "") -} \ No newline at end of file diff --git a/src/common/helper_functions/setup_logger.py b/src/common/helper_functions/setup_logger.py deleted file mode 100644 index ae71eb9611..0000000000 --- a/src/common/helper_functions/setup_logger.py +++ /dev/null @@ -1,12 +0,0 @@ -def setup_logger(): - import logging - from sys import stdout - - logger = logging.getLogger() - logger.setLevel(logging.INFO) - console_handler = logging.StreamHandler(stdout) - logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") - console_handler.setFormatter(logFormatter) - logger.addHandler(console_handler) - - return logger \ No newline at end of file diff --git a/src/common/helper_functions/strip_margin.R b/src/common/helper_functions/strip_margin.R deleted file mode 100644 index 3830d58d79..0000000000 --- a/src/common/helper_functions/strip_margin.R +++ /dev/null @@ -1,3 +0,0 @@ -strip_margin <- function(text, symbol = "\\|") { - gsub(paste0("(^|\n)[ \t]*", symbol), "\\1", text) -} \ No newline at end of file diff --git a/src/common/helper_functions/strip_margin.py b/src/common/helper_functions/strip_margin.py deleted file mode 100644 index fbfb39dec9..0000000000 --- a/src/common/helper_functions/strip_margin.py +++ /dev/null @@ -1,3 +0,0 @@ -def strip_margin(text: str) -> str: - import re - return re.sub("(^|\n)[ \t]*\|", "\\1", text) \ No newline at end of file diff --git a/src/common/helper_functions/subset_anndata.py b/src/common/helper_functions/subset_anndata.py deleted file mode 100644 index 80bd160872..0000000000 --- a/src/common/helper_functions/subset_anndata.py +++ /dev/null @@ -1,83 +0,0 @@ -"""Helper functions related to subsetting AnnData objects based on the file format -specifications in the .config.vsh.yaml and slot mapping overrides.""" - -def read_config_slots_info(config_file, slot_mapping = {}): - """Read the .config.vsh.yaml to find out which output slots need to be copied to which output file. - - Arguments: - config_file -- Path to the .config.vsh.yaml file (required). - slot_mapping -- Which slots to retain. Must be a dictionary whose keys are the names - of the AnnData structs, and values is another dictionary with destination value - names as keys and source value names as values. 
- Example of slot_mapping: - ``` - slot_mapping = { - "layers": { - "counts": par["layer_counts"], - }, - "obs": { - "cell_type": par["obs_cell_type"], - "batch": par["obs_batch"], - } - } - ``` - """ - import yaml - import re - - # read output spec from yaml - with open(config_file, "r") as object_name: - config = yaml.safe_load(object_name) - - output_struct_slots = {} - - # fetch info on which slots should be copied to which file - for arg in config["functionality"]["arguments"]: - # argument is an output file with a slot specification - if arg["direction"] == "output" and arg.get("info", {}).get("slots"): - object_name = re.sub("--", "", arg["name"]) - - struct_slots = arg['info']['slots'] - out = {} - for (struct, slots) in struct_slots.items(): - out_struct = {} - for slot in slots: - # if slot_mapping[struct][slot['name']] exists, use that as the source slot name - # otherwise use slot['name'] - source_slot = slot_mapping.get(struct, {}).get(slot["name"], slot["name"]) - out_struct[slot["name"]] = source_slot - out[struct] = out_struct - - output_struct_slots[object_name] = out - - return output_struct_slots - -# create new anndata objects according to api spec -def subset_anndata(adata, slot_info): - """Create new anndata object according to slot info specifications. - - Arguments: - adata -- An AnnData object to subset (required) - slot_info -- Which slots to retain, typically one of the items in the output of read_config_slots_info. - Must be a dictionary whose keys are the names of the AnnData structs, and values is another - dictionary with destination value names as keys and source value names as values. - """ - import pandas as pd - import anndata as ad - - structs = ["layers", "obs", "var", "uns", "obsp", "obsm", "varp", "varm"] - kwargs = {} - - for struct in structs: - slot_mapping = slot_info.get(struct, {}) - data = {dest : getattr(adata, struct)[src] for (dest, src) in slot_mapping.items()} - if len(data) > 0: - if struct in ['obs', 'var']: - data = pd.concat(data, axis=1) - kwargs[struct] = data - elif struct in ['obs', 'var']: - # if no columns need to be copied, we still need an 'obs' and a 'var' - # to help determine the shape of the adata - kwargs[struct] = getattr(adata, struct).iloc[:,[]] - - return ad.AnnData(**kwargs) \ No newline at end of file diff --git a/src/common/library.bib b/src/common/library.bib deleted file mode 100644 index af730fe8cd..0000000000 --- a/src/common/library.bib +++ /dev/null @@ -1,2191 +0,0 @@ -@misc{10x2018pbmc, - title = {1k PBMCs from a Healthy Donor (v3 chemistry)}, - author = {{10x Genomics}}, - year = {2018}, - url = {https://www.10xgenomics.com/resources/datasets/1-k-pbm-cs-from-a-healthy-donor-v-3-chemistry-3-standard-3-0-0} -} - - -@misc{10x2019heart, - title = {Human Heart}, - author = {{10x Genomics}}, - year = {2019}, - url = {https://www.10xgenomics.com/datasets/human-heart-1-standard-1-0-0} -} - - -@misc{10x2019lymph, - title = {Human Lymph Node}, - author = {{10x Genomics}}, - year = {2019}, - url = {https://www.10xgenomics.com/datasets/human-lymph-node-1-standard-1-0-0} -} - - -@misc{10x2019pbmc, - title = {5k Peripheral Blood Mononuclear Cells (PBMCs) from a Healthy Donor with a Panel of TotalSeq-B Antibodies (v3 chemistry)}, - author = {{10x Genomics}}, - year = {2019}, - url = {https://www.10xgenomics.com/resources/datasets/5-k-peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-with-cell-surface-proteins-v-3-chemistry-3-1-standard-3-1-0} -} - - -@misc{10x2020breast, - title = {Human Breast Cancer: Whole 
Transcriptome Analysis}, - author = {{10x Genomics}}, - year = {2020}, - url = {https://www.10xgenomics.com/datasets/human-breast-cancer-whole-transcriptome-analysis-1-standard-1-2-0} -} - - -@misc{10x2020cerebellum, - title = {Human Cerebellum: Whole Transcriptome Analysis}, - author = {{10x Genomics}}, - year = {2020}, - url = {https://www.10xgenomics.com/datasets/human-cerebellum-whole-transcriptome-analysis-1-standard-1-2-0} -} - - -@misc{10x2020kidney, - title = {Mouse Kidney Section (Coronal)}, - author = {{10x Genomics}}, - year = {2020}, - url = {https://www.10xgenomics.com/datasets/mouse-kidney-section-coronal-1-standard-1-1-0} -} - - -@misc{10x2021breast, - title = {Human Breast Cancer: Ductal Carcinoma In Situ, Invasive Carcinoma (FFPE)}, - author = {{10x Genomics}}, - year = {2021}, - url = {https://www.10xgenomics.com/datasets/human-breast-cancer-ductal-carcinoma-in-situ-invasive-carcinoma-ffpe-1-standard-1-3-0} -} - - -@misc{10x2021prostate, - title = {Normal Human Prostate (FFPE)}, - author = {{10x Genomics}}, - year = {2021}, - url = {https://www.10xgenomics.com/datasets/normal-human-prostate-ffpe-1-standard-1-3-0} -} - - -@misc{10x2022brain, - title = {Mouse Brain Coronal Section 1 (FFPE)}, - author = {{10x Genomics}}, - year = {2022}, - url = {https://www.10xgenomics.com/datasets/mouse-brain-coronal-section-1-ffpe-2-standard} -} - - -@misc{10x2022cervical, - title = {Human Cervical Cancer (FFPE)}, - author = {{10x Genomics}}, - year = {2022}, - url = {https://www.10xgenomics.com/datasets/human-cervical-cancer-1-standard} -} - - -@misc{10x2022olfactory, - title = {Adult Mouse Olfactory Bulb}, - author = {{10x Genomics}}, - year = {2022}, - url = {https://www.10xgenomics.com/datasets/adult-mouse-olfactory-bulb-1-standard-1} -} - - -@misc{10x2022intestine, - title = {Human Intestine Cancer (FPPE)}, - author = {{10x Genomics}}, - year = {2022}, - url = {https://www.10xgenomics.com/datasets/human-intestine-cancer-1-standard} -} - - -@misc{10x2022melanoma, - title = {Human Melanoma, IF Stained (FFPE)}, - author = {{10x Genomics}}, - year = {2022}, - url = {https://www.10xgenomics.com/datasets/human-melanoma-if-stained-ffpe-2-standard} -} - - -@misc{10x2022prostate, - title = {Human Prostate Cancer, Adjacent Normal Section with IF Staining (FFPE)}, - author = {{10x Genomics}}, - year = {2022}, - url = {https://www.10xgenomics.com/datasets/human-prostate-cancer-adjacent-normal-section-with-if-staining-ffpe-1-standard} -} - - -@misc{10x2023brain, - title = {Human Brain Cancer, 11 mm Capture Area (FFPE)}, - author = {{10x Genomics}}, - year = {2023}, - url = {https://www.10xgenomics.com/datasets/human-brain-cancer-11-mm-capture-area-ffpe-2-standard} -} - - -@misc{10x2023colon, - title = {Visium CytAssist Gene Expression Libraries of Post-Xenium Human Colon Cancer (FFPE)}, - author = {{10x Genomics}}, - year = {2023}, - url = {https://www.10xgenomics.com/datasets/visium-cytassist-gene-expression-libraries-of-post-xenium-human-colon-cancer-ffpe-using-the-human-whole-transcriptome-probe-set-2-standard} -} - - -@misc{10x2023colorectal, - title = {Human Colorectal Cancer, 11 mm Capture Area (FFPE)}, - author = {{10x Genomics}}, - year = {2023}, - url = {https://www.10xgenomics.com/datasets/human-colorectal-cancer-11-mm-capture-area-ffpe-2-standard} -} - - -@misc{10x2023embryo, - title = {Visium CytAssist, Mouse Embryo, 11 mm Capture Area (FFPE)}, - author = {{10x Genomics}}, - year = {2023}, - url = 
{https://www.10xgenomics.com/datasets/visium-cytassist-mouse-embryo-11-mm-capture-area-ffpe-2-standard} -} - - -@misc{10x2023kidney, - title = {Human Kidney, 11 mm Capture Area (FFPE)}, - author = {{10x Genomics}}, - year = {2023}, - url = {https://www.10xgenomics.com/datasets/human-kidney-11-mm-capture-area-ffpe-2-standard} -} - - -@misc{10x2023lung, - title = {Human Lung Cancer, 11 mm Capture Area (FFPE)}, - author = {{10x Genomics}}, - year = {2023}, - url = {https://www.10xgenomics.com/datasets/human-lung-cancer-11-mm-capture-area-ffpe-2-standard} -} - - -@misc{10x2023mousebrain, - title = {Visium CytAssist Gene Expression Libraries of Post-Xenium Mouse Brain (FF)}, - author = {{10x Genomics}}, - year = {2023}, - url = {https://www.10xgenomics.com/datasets/visium-cytassist-gene-expression-libraries-of-post-xenium-mouse-brain-ff-using-the-mouse-whole-transcriptome-probe-set-2-standard} -} - - -@article{agostinis2022newwave, - doi = {10.1093/bioinformatics/btac149}, - url = {https://doi.org/10.1093/bioinformatics/btac149}, - year = {2022}, - month = {Mar.}, - publisher = {Oxford University Press ({OUP})}, - volume = {38}, - number = {9}, - pages = {2648--2650}, - author = {Federico Agostinis and Chiara Romualdi and Gabriele Sales and Davide Risso}, - editor = {Yann Ponty}, - title = {NewWave: a scalable R/Bioconductor package for the dimensionality reduction and batch effect removal of single-cell {RNA}-seq data}, - journal = {Bioinformatics} -} - - -@article{agrawal2021mde, - title = {Minimum-Distortion Embedding}, - author = {Akshay Agrawal and Alnur Ali and Stephen Boyd}, - year = {2021}, - journal = {Foundations and Trends{\textregistered} in Machine Learning}, - publisher = {Now Publishers}, - volume = {14}, - number = {3}, - pages = {211--378}, - doi = {10.1561/2200000090}, - url = {https://doi.org/10.1561/2200000090} -} - - -@article{aliee2021autogenes, - title = {{AutoGeneS}: Automatic gene selection using multi-objective optimization for {RNA}-seq deconvolution}, - author = {Hananeh Aliee and Fabian J. 
Theis}, - year = {2021}, - month = {Jul.}, - journal = {Cell Systems}, - publisher = {Elsevier {BV}}, - volume = {12}, - number = {7}, - pages = {706--715.e4}, - doi = {10.1016/j.cels.2021.05.006}, - url = {https://doi.org/10.1016/j.cels.2021.05.006} -} - - -@inproceedings{amelio2015normalized, - doi = {10.1145/2808797.2809344}, - url = {https://doi.org/10.1145/2808797.2809344}, - year = {2015}, - month = {Aug.}, - publisher = {{ACM}}, - author = {Alessia Amelio and Clara Pizzuti}, - title = {Is Normalized Mutual Information a Fair Measure for Comparing Community Detection Methods?}, - booktitle = {Proceedings of the 2015 {IEEE}/{ACM} International Conference on Advances in Social Networks Analysis and Mining 2015} -} - - -@article{andersson2020single, - title = {Single-cell and spatial transcriptomics enables probabilistic inference of cell type topography}, - author = {Alma Andersson and Joseph Bergenstr{\aa}hle and Michaela Asp and Ludvig Bergenstr{\aa}hle and Aleksandra Jurek and Jos{\'{e}} Fern{\'{a}}ndez Navarro and Joakim Lundeberg}, - year = {2020}, - month = {Oct.}, - journal = {Communications Biology}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {3}, - number = {1}, - doi = {10.1038/s42003-020-01247-y}, - url = {https://doi.org/10.1038/s42003-020-01247-y} -} - - -@article{andersson2021sepal, - title={sepal: Identifying transcript profiles with spatial patterns by diffusion-based modeling}, - author={Andersson, Alma and Lundeberg, Joakim}, - journal={Bioinformatics}, - volume={37}, - number={17}, - pages={2644--2650}, - year={2021}, - publisher={Oxford University Press}, - doi={10.1093/bioinformatics/btab164} -} - - -@string{apr = {Apr.}} - - -@string{aug = {Aug.}} - - -@article{batson2019molecular, - title = {Molecular Cross-Validation for Single-Cell RNA-seq}, - author = {Batson, Joshua and Royer, Lo{\"\i}c and Webber, James}, - year = {2019}, - journal = {bioRxiv}, - publisher = {Cold Spring Harbor Laboratory}, - doi = {10.1101/786269}, - url = {https://www.biorxiv.org/content/early/2019/09/30/786269}, - elocation-id = {786269}, - eprint = {https://www.biorxiv.org/content/early/2019/09/30/786269.full.pdf} -} - - -@article{biancalani2021deep, - title = {Deep learning and alignment of spatially resolved single-cell transcriptomes with Tangram}, - author = {Tommaso Biancalani and Gabriele Scalia and Lorenzo Buffoni and Raghav Avasthi and Ziqing Lu and Aman Sanger and Neriman Tokcan and Charles R. Vanderburg and {\AA}sa Segerstolpe and Meng Zhang and Inbal Avraham-Davidi and Sanja Vickovic and Mor Nitzan and Sai Ma and Ayshwarya Subramanian and Michal Lipinski and Jason Buenrostro and Nik Bear Brown and Duccio Fanelli and Xiaowei Zhuang and Evan Z. 
Macosko and Aviv Regev}, - year = {2021}, - month = {Oct.}, - journal = {Nature Methods}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {18}, - number = {11}, - pages = {1352--1362}, - doi = {10.1038/s41592-021-01264-7}, - url = {https://doi.org/10.1038/s41592-021-01264-7} -} - - -@article{bintayyash2021non, - author = {BinTayyash, Nuha and Georgaka, Sokratia and John, S T and Ahmed, Sumon and Boukouvalas, Alexis and Hensman, James and Rattray, Magnus}, - title = "{Non-parametric modelling of temporal and spatial counts data from RNA-seq experiments}", - journal = {Bioinformatics}, - volume = {37}, - number = {21}, - pages = {3788-3795}, - year = {2021}, - month = {07}, - issn = {1367-4803}, - doi = {10.1093/bioinformatics/btab486}, - url = {https://doi.org/10.1093/bioinformatics/btab486}, - eprint = {https://academic.oup.com/bioinformatics/article-pdf/37/21/3788/50336570/btab486.pdf}, -} - - -@article{bland2000odds, - title = {Statistics Notes: The odds ratio}, - author = {J. M. Bland}, - year = {2000}, - month = {May}, - journal = {{BMJ}}, - publisher = {{BMJ}}, - volume = {320}, - number = {7247}, - pages = {1468--1468}, - doi = {10.1136/bmj.320.7247.1468}, - url = {https://doi.org/10.1136/bmj.320.7247.1468} -} - - -@article{breiman2001random, - doi = {10.1023/a:1010933404324}, - url = {https://doi.org/10.1023/a:1010933404324}, - year = {2001}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {45}, - number = {1}, - pages = {5--32}, - author = {Leo Breiman}, - journal = {Machine Learning} -} - - -@article{bttner2018test, - title = {A test metric for assessing single-cell {RNA}-seq batch correction}, - author = {Maren B\"{u}ttner and Zhichao Miao and F. Alexander Wolf and Sarah A. Teichmann and Fabian J. Theis}, - year = {2018}, - month = {Dec.}, - journal = {Nature Methods}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {16}, - number = {1}, - pages = {43--49}, - doi = {10.1038/s41592-018-0254-1}, - url = {https://doi.org/10.1038/s41592-018-0254-1} -} - - -@article{cabello2020singlecellsignalr, - title = {{SingleCellSignalR}: inference of intercellular networks from single-cell transcriptomics}, - author = {Simon Cabello-Aguilar and M{\'{e}}lissa Alame and Fabien Kon-Sun-Tack and Caroline Fau and Matthieu Lacroix and Jacques Colinge}, - year = {2020}, - month = {Mar.}, - journal = {Nucleic Acids Research}, - publisher = {Oxford University Press ({OUP})}, - volume = {48}, - number = {10}, - pages = {e55--e55}, - doi = {10.1093/nar/gkaa183}, - url = {https://doi.org/10.1093/nar/gkaa183} -} - - -@article{cable2021robust, - title = {Robust decomposition of cell type mixtures in spatial transcriptomics}, - author = {Dylan M. Cable and Evan Murray and Luli S. Zou and Aleksandrina Goeva and Evan Z. Macosko and Fei Chen and Rafael A. 
Irizarry}, - year = {2021}, - month = {Feb.}, - journal = {Nature Biotechnology}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {40}, - number = {4}, - pages = {517--526}, - doi = {10.1038/s41587-021-00830-w}, - url = {https://doi.org/10.1038/s41587-021-00830-w} -} - - -@misc{cannoodt2021viashfromscripts, - doi = {10.48550/ARXIV.2110.11494}, - url = {https://arxiv.org/abs/2110.11494}, - author = {Cannoodt, Robrecht and Cannoodt, Hendrik and Van de Kerckhove, Eric and Boschmans, Andy and De Maeyer, Dries and Verbeiren, Toni}, - keywords = {Software Engineering (cs.SE), FOS: Computer and information sciences, FOS: Computer and information sciences}, - title = {Viash: from scripts to pipelines}, - publisher = {arXiv}, - year = {2021}, - copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International} -} - - -@article{cai2023spanve, - title={Spanve: an Statistical Method to Detect Clustering-friendly Spatially Variable Genes in Large-scale Spatial Transcriptomics Data}, - author={Cai, Guoxin and Chen, Yichang and Chen, Shuqing and Gu, Xun and Zhou, Zhan}, - journal={bioRxiv}, - pages={2023--02}, - year={2023}, - publisher={Cold Spring Harbor Laboratory}, - doi={10.1101/2023.02.08.527623} -} - - -@article{cao2018joint, - title = {Joint profiling of chromatin accessibility and gene expression in thousands of single cells}, - author = {Junyue Cao and Darren A. Cusanovich and Vijay Ramani and Delasa Aghamirzaie and Hannah A. Pliner and Andrew J. Hill and Riza M. Daza and Jose L. McFaline-Figueroa and Jonathan S. Packer and Lena Christiansen and Frank J. Steemers and Andrew C. Adey and Cole Trapnell and Jay Shendure}, - year = {2018}, - month = {Sept.}, - journal = {Science}, - publisher = {American Association for the Advancement of Science ({AAAS})}, - volume = {361}, - number = {6409}, - pages = {1380--1385}, - doi = {10.1126/science.aau0730}, - url = {https://doi.org/10.1126/science.aau0730} -} - - -@article{cao2020human, - title = {A human cell atlas of fetal gene expression}, - author = {Junyue Cao and Diana R. O'Day and Hannah A. Pliner and Paul D. Kingsley and Mei Deng and Riza M. Daza and Michael A. Zager and Kimberly A. Aldinger and Ronnie Blecher-Gonen and Fan Zhang and Malte Spielmann and James Palis and Dan Doherty and Frank J. Steemers and Ian A. Glass and Cole Trapnell and Jay Shendure}, - year = {2020}, - month = {Nov.}, - journal = {Science}, - publisher = {American Association for the Advancement of Science ({AAAS})}, - volume = {370}, - number = {6518}, - doi = {10.1126/science.aba7721}, - url = {https://doi.org/10.1126/science.aba7721} -} - - -@article{chai2014root, - doi = {10.5194/gmdd-7-1525-2014}, - url = {https://doi.org/10.5194/gmdd-7-1525-2014}, - year = {2014}, - month = {Feb.}, - publisher = {Copernicus {GmbH}}, - author = {T. Chai and R. R. 
Draxler}, - title = {Root mean square error ({RMSE}) or mean absolute error ({MAE})?} -} - - -@article{chang2022spatial, - title={Spatial omics representation and functional tissue module inference using graph Fourier transform}, - author={Chang, Yuzhou and Liu, Jixin and Ma, Anjun and Jiang, Sizun and Krull, Jordan and Yeo, Yao Yu and Liu, Yang and Rodig, Scott J and Barouch, Dan H and Fan, Rong and others}, - journal={bioRxiv}, - pages={2022--12}, - year={2022}, - publisher={Cold Spring Harbor Laboratory}, - doi={10.1101/2022.12.10.519929} -} - - -@article{chazarragil2021flexible, - doi = {10.1093/nar/gkab004}, - url = {https://doi.org/10.1093/nar/gkab004}, - year = {2021}, - month = {Feb.}, - publisher = {Oxford University Press ({OUP})}, - volume = {49}, - number = {7}, - pages = {e42--e42}, - author = {Ruben Chazarra-Gil and Stijn van~Dongen and Vladimir~Yu Kiselev and Martin Hemberg}, - title = {Flexible comparison of batch correction methods for single-cell {RNA}-seq using {BatchBench}}, - journal = {Nucleic Acids Research} -} - - -@article{chen2009local, - title = {Local Multidimensional Scaling for Nonlinear Dimension Reduction, Graph Drawing, and Proximity Analysis}, - author = {Lisha Chen and Andreas Buja}, - year = {2009}, - month = {Mar.}, - journal = {Journal of the American Statistical Association}, - publisher = {Informa {UK} Limited}, - volume = {104}, - number = {485}, - pages = {209--219}, - doi = {10.1198/jasa.2009.0111}, - url = {https://doi.org/10.1198/jasa.2009.0111} -} - - -@inproceedings{chen2016xgboost, - title = {{XGBoost}}, - author = {Tianqi Chen and Carlos Guestrin}, - year = {2016}, - month = {Aug.}, - booktitle = {Proceedings of the 22nd {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining}, - publisher = {{Acm}}, - doi = {10.1145/2939672.2939785}, - url = {https://doi.org/10.1145/2939672.2939785} -} - - -@article{cichocki2009fast, - title = {Fast Local Algorithms for Large Scale Nonnegative Matrix and Tensor Factorizations}, - author = {Andrzej Cichocki and Anh-Huy Phan}, - year = {2009}, - journal = {{IEICE} Transactions on Fundamentals of Electronics, Communications and Computer Sciences}, - publisher = {Institute of Electronics, Information and Communications Engineers ({IEICE})}, - volume = {E92-a}, - number = {3}, - pages = {708--721}, - doi = {10.1587/transfun.e92.a.708}, - url = {https://doi.org/10.1587/transfun.e92.a.708} -} - - -@article{coifman2006diffusion, - title = {Diffusion maps}, - author = {Ronald R. Coifman and St{\'{e}}phane Lafon}, - year = {2006}, - month = {Jul.}, - journal = {Applied and Computational Harmonic Analysis}, - publisher = {Elsevier {BV}}, - volume = {21}, - number = {1}, - pages = {5--30}, - doi = {10.1016/j.acha.2006.04.006}, - url = {https://doi.org/10.1016/j.acha.2006.04.006} -} - - -@article{cover1967nearest, - title = {Nearest neighbor pattern classification}, - author = {T. Cover and P. 
Hart}, - year = {1967}, - month = {Jan}, - journal = {{IEEE} Transactions on Information Theory}, - publisher = {Institute of Electrical and Electronics Engineers ({IEEE})}, - volume = {13}, - number = {1}, - pages = {21--27}, - doi = {10.1109/tit.1967.1053964}, - url = {https://doi.org/10.1109/tit.1967.1053964} -} - - -@inproceedings{davis2006prauc, - title = {The relationship between Precision-Recall and {ROC} curves}, - author = {Jesse Davis and Mark Goadrich}, - year = {2006}, - booktitle = {Proceedings of the 23rd international conference on Machine learning - {ICML} {\textquotesingle}06}, - publisher = {{ACM} Press}, - doi = {10.1145/1143844.1143874}, - url = {https://doi.org/10.1145/1143844.1143874} -} - - -@string{dec = {Dec.}} - -@article{Demetci2020scot, - author = {Pinar Demetci and Rebecca Santorella and Bj{\"o}rn Sandstede and William Stafford Noble and Ritambhara Singh}, - title = {Gromov-Wasserstein optimal transport to align single-cell multi-omics data}, - elocation-id = {2020.04.28.066787}, - year = {2020}, - doi = {10.1101/2020.04.28.066787}, - publisher = {Cold Spring Harbor Laboratory}, - URL = {https://www.biorxiv.org/content/early/2020/11/11/2020.04.28.066787}, - eprint = {https://www.biorxiv.org/content/early/2020/11/11/2020.04.28.066787.full.pdf}, - journal = {bioRxiv} -} - - -@article{dimitrov2022comparison, - title = {Comparison of methods and resources for cell-cell communication inference from single-cell {RNA}-Seq data}, - author = {Daniel Dimitrov and D{\'{e}}nes T\"{u}rei and Martin Garrido-Rodriguez and Paul L. Burmedi and James S. Nagai and Charlotte Boys and Ricardo O. Ramirez Flores and Hyojin Kim and Bence Szalai and Ivan G. Costa and Alberto Valdeolivas and Aur{\'{e}}lien Dugourd and Julio Saez-Rodriguez}, - year = {2022}, - month = {Jun.}, - journal = {Nature Communications}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {13}, - number = {1}, - doi = {10.1038/s41467-022-30755-0}, - url = {https://doi.org/10.1038/s41467-022-30755-0} -} - - -@article{donoho2017yearsdatascience, - doi = {10.1080/10618600.2017.1384734}, - url = {https://doi.org/10.1080/10618600.2017.1384734}, - year = {2017}, - month = {Oct.}, - publisher = {Informa {UK} Limited}, - volume = {26}, - number = {4}, - pages = {745--766}, - author = {David Donoho}, - title = {50 Years of Data Science}, - journal = {Journal of Computational and Graphical Statistics} -} - - -@article{efremova2020cellphonedb, - title = {{CellPhoneDB}: inferring cell{\textendash}cell communication from combined expression of multi-subunit ligand{\textendash}receptor complexes}, - author = {Mirjana Efremova and Miquel Vento-Tormo and Sarah A. 
Teichmann and Roser Vento-Tormo}, - year = {2020}, - month = {Feb.}, - journal = {Nature Protocols}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {15}, - number = {4}, - pages = {1484--1506}, - doi = {10.1038/s41596-020-0292-x}, - url = {https://doi.org/10.1038/s41596-020-0292-x} -} - - -@article{emmons2016analysis, - title = {Analysis of Network Clustering Algorithms and Cluster Quality Metrics at Scale}, - volume = {11}, - ISSN = {1932-6203}, - url = {http://dx.doi.org/10.1371/journal.pone.0159161}, - doi = {10.1371/journal.pone.0159161}, - number = {7}, - journal = {PLOS ONE}, - publisher = {Public Library of Science (PLoS)}, - author = {Emmons, Scott and Kobourov, Stephen and Gallant, Mike and B\"{o}rner, Katy}, - editor = {Dovrolis, Constantine}, - year = {2016}, - month = jul, - pages = {e0159161} -} - - -@article{eraslan2019single, - title = {Single-cell {RNA}-seq denoising using a deep count autoencoder}, - author = {G\"{o}kcen Eraslan and Lukas M. Simon and Maria Mircea and Nikola S. Mueller and Fabian J. Theis}, - year = {2019}, - month = {Jan}, - journal = {Nature Communications}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {10}, - number = {1}, - doi = {10.1038/s41467-018-07931-2}, - url = {https://doi.org/10.1038/s41467-018-07931-2} -} - - -@article{fang2022conservation, - title = {Conservation and divergence of cortical cell organization in human and mouse revealed by MERFISH}, - volume = {377}, - ISSN = {1095-9203}, - url = {http://dx.doi.org/10.1126/science.abm1741}, - DOI = {10.1126/science.abm1741}, - number = {6601}, - journal = {Science}, - publisher = {American Association for the Advancement of Science (AAAS)}, - author = {Fang, Rongxin and Xia, Chenglong and Close, Jennie L. and Zhang, Meng and He, Jiang and Huang, Zhengkai and Halpern, Aaron R. and Long, Brian and Miller, Jeremy A. and Lein, Ed S. and Zhuang, Xiaowei}, - year = {2022}, - month = jul, - pages = {56-62} -} - - -@string{feb = {Feb.}} - - -@article{fix1989discriminatory, - doi = {10.2307/1403797}, - url = {https://doi.org/10.2307/1403797}, - year = {1989}, - month = {Dec.}, - publisher = {{JSTOR}}, - volume = {57}, - number = {3}, - pages = {238}, - author = {Evelyn Fix and J. L. Hodges}, - title = {Discriminatory Analysis. Nonparametric Discrimination: Consistency Properties}, - journal = {International Statistical Review / Revue Internationale de Statistique} -} - - -@article{gower1975generalized, - title = {Generalized procrustes analysis}, - author = {J. C. Gower}, - year = {1975}, - month = {Mar.}, - journal = {Psychometrika}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {40}, - number = {1}, - pages = {33--51}, - doi = {10.1007/bf02291478}, - url = {https://doi.org/10.1007/bf02291478} -} - - -@article{grandini2020metrics, - title = {Metrics for Multi-Class Classification: an Overview}, - author = {Grandini, Margherita and Bagli, Enrico and Visani, Giorgio}, - year = {2020}, - journal = {arXiv}, - publisher = {Cornell University}, - doi = {10.48550/arxiv.2008.05756}, - url = {https://arxiv.org/abs/2008.05756}, - copyright = {arXiv.org perpetual, non-exclusive license}, - keywords = {Machine Learning (stat.ML), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences} -} - - -@article{granja2021archr, - title = {{ArchR} is a scalable software package for integrative single-cell chromatin accessibility analysis}, - author = {Jeffrey M. Granja and M. Ryan Corces and Sarah E. 
Pierce and S. Tansu Bagdatli and Hani Choudhry and Howard Y. Chang and William J. Greenleaf}, - year = {2021}, - month = {Feb.}, - journal = {Nature Genetics}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {53}, - number = {3}, - pages = {403--411}, - doi = {10.1038/s41588-021-00790-6}, - url = {https://doi.org/10.1038/s41588-021-00790-6} -} - - -@article{grn2014validation, - title = {Validation of noise models for single-cell transcriptomics}, - author = {Dominic Gr\"{u}n and Lennart Kester and Alexander van Oudenaarden}, - year = {2014}, - month = {Apr.}, - journal = {Nature Methods}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {11}, - number = {6}, - pages = {637--640}, - doi = {10.1038/nmeth.2930}, - url = {https://doi.org/10.1038/nmeth.2930} -} - - -@article{haghverdi2018batch, - title = {Batch effects in single-cell {RNA}-sequencing data are corrected by matching mutual nearest neighbors}, - author = {Laleh Haghverdi and Aaron T L Lun and Michael D Morgan and John C Marioni}, - year = {2018}, - month = {Apr.}, - journal = {Nature Biotechnology}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {36}, - number = {5}, - pages = {421--427}, - doi = {10.1038/nbt.4091}, - url = {https://doi.org/10.1038/nbt.4091} -} - - -@article{hammarlund2018cengen, - title = {The {CeNGEN} Project: The Complete Gene Expression Map of an Entire Nervous System}, - author = {Marc Hammarlund and Oliver Hobert and David M. Miller and Nenad Sestan}, - year = {2018}, - month = {Aug.}, - journal = {Neuron}, - publisher = {Elsevier {BV}}, - volume = {99}, - number = {3}, - pages = {430--433}, - doi = {10.1016/j.neuron.2018.07.042}, - url = {https://doi.org/10.1016/j.neuron.2018.07.042} -} - - -@article{hansen2012removing, - title = {Adjusting batch effects in microarray expression data using empirical Bayes methods}, - author = {W. Evan Johnson and Cheng Li and Ariel Rabinovic}, - year = {2006}, - month = {Apr.}, - journal = {Biostatistics}, - publisher = {Oxford University Press ({OUP})}, - volume = {8}, - number = {1}, - pages = {118--127}, - doi = {10.1093/biostatistics/kxj037}, - url = {https://doi.org/10.1093/biostatistics/kxj037} -} - - -@article{hao2021integrated, - title = {Integrated analysis of multimodal single-cell data}, - author = {Yuhan Hao and Stephanie Hao and Erica Andersen-Nissen and William M. Mauck and Shiwei Zheng and Andrew Butler and Maddie J. Lee and Aaron J. Wilk and Charlotte Darby and Michael Zager and Paul Hoffman and Marlon Stoeckius and Efthymia Papalexi and Eleni P. Mimitou and Jaison Jain and Avi Srivastava and Tim Stuart and Lamar M. Fleming and Bertrand Yeung and Angela J. Rogers and Juliana M. McElrath and Catherine A. 
Blish and Raphael Gottardo and Peter Smibert and Rahul Satija}, - year = {2021}, - month = {Jun.}, - journal = {Cell}, - publisher = {Elsevier {BV}}, - volume = {184}, - number = {13}, - pages = {3573--3587.e29}, - doi = {10.1016/j.cell.2021.04.048}, - url = {https://doi.org/10.1016/j.cell.2021.04.048} -} - - -@article{hao2021somde, - title={SOMDE: a scalable method for identifying spatially variable genes with self-organizing map}, - author={Hao, Minsheng and Hua, Kui and Zhang, Xuegong}, - journal={Bioinformatics}, - volume={37}, - number={23}, - pages={4392--4398}, - year={2021}, - publisher={Oxford University Press}, - doi={10.1093/bioinformatics/btab471} -} - - -@article{hie2019efficient, - title = {Efficient integration of heterogeneous single-cell transcriptomes using Scanorama}, - author = {Brian Hie and Bryan Bryson and Bonnie Berger}, - year = {2019}, - month = {May}, - journal = {Nature Biotechnology}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {37}, - number = {6}, - pages = {685--691}, - doi = {10.1038/s41587-019-0113-3}, - url = {https://doi.org/10.1038/s41587-019-0113-3} -} - - -@article{hinton1989connectionist, - title = {Connectionist learning procedures}, - author = {Geoffrey E. Hinton}, - year = {1989}, - month = {Sept.}, - journal = {Artificial Intelligence}, - publisher = {Elsevier {BV}}, - volume = {40}, - number = {1-3}, - pages = {185--234}, - doi = {10.1016/0004-3702(89)90049-0}, - url = {https://doi.org/10.1016/0004-3702(89)90049-0} -} - - -@book{hosmer2013applied, - title = {Applied logistic regression}, - author = {Hosmer Jr, D.W. and Lemeshow, S. and Sturdivant, R.X.}, - year = {2013}, - publisher = {John Wiley \& Sons}, - volume = {398} -} - - -@article{hou2019scmatch, - title = {{scMatch}: a single-cell gene expression profile annotation tool using reference datasets}, - author = {Rui Hou and Elena Denisenko and Alistair R R Forrest}, - year = {2019}, - month = {Apr.}, - journal = {Bioinformatics}, - publisher = {Oxford University Press ({OUP})}, - volume = {35}, - number = {22}, - pages = {4688--4695}, - doi = {10.1093/bioinformatics/btz292}, - url = {https://doi.org/10.1093/bioinformatics/btz292}, - editor = {Janet Kelso} -} - - -@article{hou2020predicting, - title = {Predicting cell-to-cell communication networks using {NATMI}}, - author = {Rui Hou and Elena Denisenko and Huan Ting Ong and Jordan A. Ramilowski and Alistair R. R. Forrest}, - year = {2020}, - month = {Oct.}, - journal = {Nature Communications}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {11}, - number = {1}, - doi = {10.1038/s41467-020-18873-z}, - url = {https://doi.org/10.1038/s41467-020-18873-z} -} - - -@article{hou2020systematic, - title = {A systematic evaluation of single-cell {RNA}-sequencing imputation methods}, - author = {Wenpin Hou and Zhicheng Ji and Hongkai Ji and Stephanie C. 
Hicks}, - year = {2020}, - month = {Aug.}, - journal = {Genome Biology}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {21}, - number = {1}, - doi = {10.1186/s13059-020-02132-x}, - url = {https://doi.org/10.1186/s13059-020-02132-x} -} - - -@article{hubert1985comparing, - doi = {10.1007/bf01908075}, - url = {https://doi.org/10.1007/bf01908075}, - year = {1985}, - month = {Dec.}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {2}, - number = {1}, - pages = {193--218}, - author = {Lawrence Hubert and Phipps Arabie}, - title = {Comparing partitions}, - journal = {Journal of Classification} -} - - -@article{hu2021spagcn, - title={SpaGCN: Integrating gene expression, spatial location and histology to identify spatial domains and spatially variable genes by graph convolutional network}, - author={Hu, Jian and Li, Xiangjie and Coleman, Kyle and Schroeder, Amelia and Ma, Nan and Irwin, David J and Lee, Edward B and Shinohara, Russell T and Li, Mingyao}, - journal={Nature methods}, - volume={18}, - number={11}, - pages={1342--1351}, - year={2021}, - publisher={Nature Publishing Group US New York}, - doi={10.1038/s41592-021-01255-8} -} - - -@string{jan = {Jan}} - - -@string{jul = {Jul.}} - - -@string{jun = {Jun.}} - - -@article{kats2021spatialde2, - title={SpatialDE2: fast and localized variance component analysis of spatial transcriptomics}, - author={Kats, Ilia and Vento-Tormo, Roser and Stegle, Oliver}, - journal={Biorxiv}, - pages={2021--10}, - year={2021}, - publisher={Cold Spring Harbor Laboratory}, - doi={10.1101/2021.10.27.466045} -} - - -@article{kendall1938new, - doi = {10.1093/biomet/30.1-2.81}, - url = {https://doi.org/10.1093/biomet/30.1-2.81}, - year = {1938}, - month = {Jun.}, - publisher = {Oxford University Press ({OUP})}, - volume = {30}, - number = {1-2}, - pages = {81--93}, - author = {M. G. KENDALL}, - title = {A new measure of rank correlation}, - journal = {Biometrika} -} - - -@article{kiselev2019challenges, - title = {Challenges in unsupervised clustering of single-cell {RNA}-seq data}, - author = {Vladimir Yu Kiselev and Tallulah S. Andrews and Martin Hemberg}, - year = {2019}, - month = {Jan}, - journal = {Nature Reviews Genetics}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {20}, - number = {5}, - pages = {273--282}, - doi = {10.1038/s41576-018-0088-9}, - url = {https://doi.org/10.1038/s41576-018-0088-9} -} - - -@article{kleshchevnikov2022cell2location, - title = {Cell2location maps fine-grained cell types in spatial transcriptomics}, - author = {Vitalii Kleshchevnikov and Artem Shmatko and Emma Dann and Alexander Aivazidis and Hamish W. 
King and Tong Li and Rasa Elmentaite and Artem Lomakin and Veronika Kedlian and Adam Gayoso and Mika Sarkin Jain and Jun Sung Park and Lauma Ramona and Elizabeth Tuck and Anna Arutyunyan and Roser Vento-Tormo and Moritz Gerstung and Louisa James and Oliver Stegle and Omer Ali Bayraktar}, - year = {2022}, - month = {Jan}, - journal = {Nature Biotechnology}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {40}, - number = {5}, - pages = {661--671}, - doi = {10.1038/s41587-021-01139-4}, - url = {https://doi.org/10.1038/s41587-021-01139-4} -} - - -@article{korsunsky2019fast, - title = {Fast, sensitive and accurate integration of single-cell data with Harmony}, - author = {Ilya Korsunsky and Nghia Millard and Jean Fan and Kamil Slowikowski and Fan Zhang and Kevin Wei and Yuriy Baglaenko and Michael Brenner and Po-ru Loh and Soumya Raychaudhuri}, - year = {2019}, - month = {Nov.}, - journal = {Nature Methods}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {16}, - number = {12}, - pages = {1289--1296}, - doi = {10.1038/s41592-019-0619-0}, - url = {https://doi.org/10.1038/s41592-019-0619-0} -} - - -@article{kraemer2018dimred, - title = {{dimRed} and {coRanking} - Unifying Dimensionality Reduction in R}, - author = {Guido Kraemer and Markus Reichstein and Miguel, D. Mahecha}, - year = {2018}, - journal = {The R Journal}, - publisher = {The R Foundation}, - volume = {10}, - number = {1}, - pages = {342}, - doi = {10.32614/rj-2018-039}, - url = {https://doi.org/10.32614/rj-2018-039} -} - - -@article{kruskal1964mds, - title = {Multidimensional scaling by optimizing goodness of fit to a nonmetric hypothesis}, - author = {J. B. Kruskal}, - year = {1964}, - month = {Mar.}, - journal = {Psychometrika}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {29}, - number = {1}, - pages = {1--27}, - doi = {10.1007/bf02289565}, - url = {https://doi.org/10.1007/bf02289565} -} - - -@article{kuppe2022spatial, - title={Spatial multi-omic map of human myocardial infarction}, - author={Kuppe, Christoph and Ramirez Flores, Ricardo O and Li, Zhijian and Hayat, Sikander and Levinson, Rebecca T and Liao, Xian and Hannani, Monica T and Tanevski, Jovan and W{\"u}nnemann, Florian and Nagai, James S and others}, - journal={Nature}, - volume={608}, - number={7924}, - pages={766--777}, - year={2022}, - publisher={Nature Publishing Group UK London} -} - - -@article{lance2022multimodal, - title = {Multimodal single cell data integration challenge: results and lessons learned}, - author = {Lance, Christopher and Luecken, Malte D. and Burkhardt, Daniel B. 
and Cannoodt, Robrecht and Rautenstrauch, Pia and Laddach, Anna and Ubingazhibov, Aidyn and Cao, Zhi-Jie and Deng, Kaiwen and Khan, Sumeer and Liu, Qiao and Russkikh, Nikolay and Ryazantsev, Gleb and Ohler, Uwe and Pisco, Angela Oliveira and Bloom, Jonathan and Krishnaswamy, Smita and Theis, Fabian J.},
- year = {2022},
- journal = {bioRxiv},
- publisher = {Cold Spring Harbor Laboratory},
- doi = {10.1101/2022.04.11.487796},
- url = {https://www.biorxiv.org/content/early/2022/04/12/2022.04.11.487796},
- elocation-id = {2022.04.11.487796},
- eprint = {https://www.biorxiv.org/content/early/2022/04/12/2022.04.11.487796.full.pdf}
-}
-
-
-@article{lance2024predicting,
- title = {Predicting cellular profiles across modalities in longitudinal single-cell data: An Open Problems competition},
- author = {...},
- year = {2024},
- journal = {In preparation},
-}
-
-
-@book{lawson1995solving,
- title = {Solving Least Squares Problems},
- author = {Charles L. Lawson and Richard J. Hanson},
- year = {1995},
- month = {Jan},
- publisher = {Society for Industrial and Applied Mathematics},
- doi = {10.1137/1.9781611971217},
- url = {https://doi.org/10.1137/1.9781611971217}
-}
-
-
-@article{lee2009quality,
- title = {Quality assessment of dimensionality reduction: Rank-based criteria},
- author = {John A. Lee and Michel Verleysen},
- year = {2009},
- month = {Mar.},
- journal = {Neurocomputing},
- publisher = {Elsevier {BV}},
- volume = {72},
- number = {7-9},
- pages = {1431--1443},
- doi = {10.1016/j.neucom.2008.12.017},
- url = {https://doi.org/10.1016/j.neucom.2008.12.017}
-}
-
-
-@article{li2021bayesian,
- author = {Li, Qiwei and Zhang, Minzhe and Xie, Yang and Xiao, Guanghua},
- title = "{Bayesian modeling of spatial molecular profiling data via Gaussian process}",
- journal = {Bioinformatics},
- volume = {37},
- number = {22},
- pages = {4129--4136},
- year = {2021},
- month = {06},
- abstract = "{The location, timing and abundance of gene expression (both mRNA and proteins) within a tissue define the molecular mechanisms of cell functions. Recent technology breakthroughs in spatial molecular profiling, including imaging-based technologies and sequencing-based technologies, have enabled the comprehensive molecular characterization of single cells while preserving their spatial and morphological contexts. This new bioinformatics scenario calls for effective and robust computational methods to identify genes with spatial patterns. We represent a novel Bayesian hierarchical model to analyze spatial transcriptomics data, with several unique characteristics. It models the zero-inflated and over-dispersed counts by deploying a zero-inflated negative binomial model that greatly increases model stability and robustness. Besides, the Bayesian inference framework allows us to borrow strength in parameter estimation in a de novo fashion. As a result, the proposed model shows competitive performances in accuracy and robustness over existing methods in both simulation studies and two real data applications. The related R/C++ source code is available at https://github.com/Minzhe/BOOST-GP. Supplementary data are available at Bioinformatics online.
}", - issn = {1367-4803}, - doi = {10.1093/bioinformatics/btab455}, - url = {https://doi.org/10.1093/bioinformatics/btab455}, - eprint = {https://academic.oup.com/bioinformatics/article-pdf/37/22/4129/50335106/btab455.pdf}, -} - - -@article{linderman2018zero, - title = {Zero-preserving imputation of scRNA-seq data using low-rank approximation}, - author = {Linderman, George C. and Zhao, Jun and Kluger, Yuval}, - year = {2018}, - journal = {bioRxiv}, - publisher = {Cold Spring Harbor Laboratory}, - doi = {10.1101/397588}, - url = {https://www.biorxiv.org/content/early/2018/08/22/397588}, - elocation-id = {397588}, - eprint = {https://www.biorxiv.org/content/early/2018/08/22/397588.full.pdf} -} - - -@article{liu2020high, - title = {High-Spatial-Resolution Multi-Omics Sequencing via Deterministic Barcoding in Tissue}, - volume = {183}, - ISSN = {0092-8674}, - url = {http://dx.doi.org/10.1016/j.cell.2020.10.026}, - DOI = {10.1016/j.cell.2020.10.026}, - number = {6}, - journal = {Cell}, - publisher = {Elsevier BV}, - author = {Liu, Yang and Yang, Mingyu and Deng, Yanxiang and Su, Graham and Enninful, Archibald and Guo, Cindy C. and Tebaldi, Toma and Zhang, Di and Kim, Dongjoo and Bai, Zhiliang and Norris, Eileen and Pan, Alisia and Li, Jiatong and Xiao, Yang and Halene, Stephanie and Fan, Rong}, - year = {2020}, - month = dec, - pages = {1665--1681.e18} -} - - -@article{lohoff2021integration, - title = {Integration of spatial and single-cell transcriptomic data elucidates mouse organogenesis}, - volume = {40}, - ISSN = {1546-1696}, - url = {http://dx.doi.org/10.1038/s41587-021-01006-2}, - DOI = {10.1038/s41587-021-01006-2}, - number = {1}, - journal = {Nature Biotechnology}, - publisher = {Springer Science and Business Media LLC}, - author = {Lohoff, T. and Ghazanfar, S. and Missarova, A. and Koulena, N. and Pierson, N. and Griffiths, J. A. and Bardot, E. S. and Eng, C.-H. L. and Tyser, R. C. V. and Argelaguet, R. and Guibentif, C. and Srinivas, S. and Briscoe, J. and Simons, B. D. and Hadjantonakis, A.-K. and G\"{o}ttgens, B. and Reik, W. and Nichols, J. and Cai, L. and Marioni, J. C.}, - year = {2021}, - month = sep, - pages = {74-85} -} - - -@article{lopez2018deep, - title = {Deep generative modeling for single-cell transcriptomics}, - author = {Romain Lopez and Jeffrey Regier and Michael B. Cole and Michael I. Jordan and Nir Yosef}, - year = {2018}, - month = {Nov.}, - journal = {Nature Methods}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {15}, - number = {12}, - pages = {1053--1058}, - doi = {10.1038/s41592-018-0229-2}, - url = {https://doi.org/10.1038/s41592-018-0229-2} -} - - -@article{lopez2022destvi, - title = {{DestVI} identifies continuums of cell types in spatial transcriptomics data}, - author = {Romain Lopez and Baoguo Li and Hadas Keren-Shaul and Pierre Boyeau and Merav Kedmi and David Pilzer and Adam Jelinski and Ido Yofe and Eyal David and Allon Wagner and Can Ergen and Yoseph Addadi and Ofra Golani and Franca Ronchese and Michael I. Jordan and Ido Amit and Nir Yosef}, - year = {2022}, - month = {Apr.}, - journal = {Nature Biotechnology}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {40}, - number = {9}, - pages = {1360--1369}, - doi = {10.1038/s41587-022-01272-8}, - url = {https://doi.org/10.1038/s41587-022-01272-8} -} - - -@article{lotfollahi2020query, - title = {Query to reference single-cell integration with transfer learning}, - author = {Lotfollahi, Mohammad and Naghipourfar, Mohsen and Luecken, Malte D. 
and Khajavi, Matin and B{\"u}ttner, Maren and Avsec, Ziga and Misharin, Alexander V. and Theis, Fabian J.}, - year = {2020}, - journal = {bioRxiv}, - publisher = {Cold Spring Harbor Laboratory}, - doi = {10.1101/2020.07.16.205997}, - url = {https://doi.org/10.1101/2020.07.16.205997}, - elocation-id = {2020.07.16.205997}, - eprint = {https://www.biorxiv.org/content/early/2020/07/16/2020.07.16.205997.full.pdf} -} - - -@article{luecken2022benchmarking, - title = {Benchmarking atlas-level data integration in single-cell genomics}, - author = {Malte D. Luecken and M. B\"{u}ttner and K. Chaichoompu and A. Danese and M. Interlandi and M. F. Mueller and D. C. Strobl and L. Zappia and M. Dugas and M. Colom{\'{e}}-Tatch{\'{e}} and Fabian J. Theis}, - year = {2021}, - month = {Dec.}, - journal = {Nature Methods}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {19}, - number = {1}, - pages = {41--50}, - doi = {10.1038/s41592-021-01336-8}, - url = {https://doi.org/10.1038/s41592-021-01336-8} -} - - -@article{lueks2011evaluate, - title = {How to Evaluate Dimensionality Reduction? - Improving the Co-ranking Matrix}, - author = {Lueks, Wouter and Mokbel, Bassam and Biehl, Michael and Hammer, Barbara}, - year = {2011}, - journal = {arXiv}, - doi = {10.48550/ARXIV.1110.3917}, - url = {https://arxiv.org/abs/1110.3917}, - copyright = {arXiv.org perpetual, non-exclusive license}, - keywords = {Machine Learning (cs.LG), Information Retrieval (cs.IR), FOS: Computer and information sciences, FOS: Computer and information sciences} -} - - -@misc{lun2019fastmnn, - title = {A description of the theory behind the fastMNN algorithm}, - author = {Lun, Aaron}, - year = {2019}, - url = {https://marionilab.github.io/FurtherMNN2018/theory/description.html} -} - - -@string{mar = {Mar.}} - - -@string{may = {May}} - - -@article{mcinnes2018umap, - title = {UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction}, - author = {McInnes, Leland and Healy, John and Melville, James}, - year = {2018}, - journal = {arXiv}, - publisher = {Cornell University}, - doi = {10.48550/arxiv.1802.03426}, - url = {https://arxiv.org/abs/1802.03426}, - copyright = {arXiv.org perpetual, non-exclusive license}, - keywords = {Machine Learning (stat.ML), Computational Geometry (cs.CG), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences} -} - - -@article{mereu2020benchmarking, - doi = {10.1038/s41587-020-0469-4}, - author = {Mereu, Elisabetta and Lafzi, Atefeh and Moutinho, Catia and Ziegenhain, Christoph and McCarthy, Davis J and Alvarez-Varela, Adrian and Batlle, Eduard and Sagar and Gruen, Dominic and Lau, Julia K and others}, - journal = {Nature biotechnology}, - number = {6}, - pages = {747--755}, - publisher = {Nature Publishing Group US New York}, - title = {Benchmarking single-cell {RNA}-sequencing protocols for cell atlas projects}, - volume = {38}, - year = {2020} -} - - -@inbook{miles2005rsquared, - title = {Encyclopedia of Statistics in Behavioral Science}, - author = {Jeremy Miles}, - year = {2005}, - month = {Oct.}, - publisher = {John Wiley {\&} Sons, Ltd}, - doi = {10.1002/0470013192.bsa526}, - url = {https://doi.org/10.1002/0470013192.bsa526}, - chapter = {{R-Squared}, Adjusted {R-Squared}} -} - - -@article{moon2019visualizing, - title = {Visualizing structure and transitions in high-dimensional biological data}, - author = {Kevin R. Moon and David van Dijk and Zheng Wang and Scott Gigante and Daniel B. Burkhardt and William S. 
Chen and Kristina Yim and Antonia van den Elzen and Matthew J. Hirn and Ronald R. Coifman and Natalia B. Ivanova and Guy Wolf and Smita Krishnaswamy},
- year = {2019},
- month = {Dec.},
- journal = {Nature Biotechnology},
- publisher = {Springer Science and Business Media {LLC}},
- volume = {37},
- number = {12},
- pages = {1482--1492},
- doi = {10.1038/s41587-019-0336-3},
- url = {https://doi.org/10.1038/s41587-019-0336-3}
-}
-
-
-@article{narayan2021assessing,
- title = {Assessing single-cell transcriptomic variability through density-preserving data visualization},
- author = {Ashwin Narayan and Bonnie Berger and Hyunghoon Cho},
- year = {2021},
- month = {Jan},
- journal = {Nature Biotechnology},
- publisher = {Springer Science and Business Media {LLC}},
- volume = {39},
- number = {6},
- pages = {765--774},
- doi = {10.1038/s41587-020-00801-7},
- url = {https://doi.org/10.1038/s41587-020-00801-7}
-}
-
-
-@article{nestorowa2016single,
- title = {A single-cell resolution map of mouse hematopoietic stem and progenitor cell differentiation},
- author = {Sonia Nestorowa and Fiona K. Hamey and Blanca Pijuan Sala and Evangelia Diamanti and Mairi Shepherd and Elisa Laurenti and Nicola K. Wilson and David G. Kent and Berthold G\"{o}ttgens},
- year = {2016},
- month = {Aug.},
- journal = {Blood},
- publisher = {American Society of Hematology},
- volume = {128},
- number = {8},
- pages = {e20--e31},
- doi = {10.1182/blood-2016-05-716480},
- url = {https://doi.org/10.1182/blood-2016-05-716480}
-}
-
-
-@inproceedings{luecken2021neurips,
- author = {Luecken, Malte and Burkhardt, Daniel and Cannoodt, Robrecht and Lance, Christopher and Agrawal, Aditi and Aliee, Hananeh and Chen, Ann and Deconinck, Louise and Detweiler, Angela and Granados, Alejandro and Huynh, Shelly and Isacco, Laura and Kim, Yang and Klein, Dominik and De Kumar, Bony and Kuppasani, Sunil and Lickert, Heiko and McGeever, Aaron and Melgarejo, Joaquin and Mekonen, Honey and Morri, Maurizio and M\"{u}ller, Michaela and Neff, Norma and Paul, Sheryl and Rieck, Bastian and Schneider, Kaylie and Steelman, Scott and Sterr, Michael and Treacy, Daniel and Tong, Alexander and Villani, Alexandra-Chloe and Wang, Guilin and Yan, Jia and Zhang, Ce and Pisco, Angela and Krishnaswamy, Smita and Theis, Fabian and Bloom, Jonathan M},
- booktitle = {Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks},
- editor = {J. Vanschoren and S. Yeung},
- publisher = {Curran},
- title = {A sandbox for prediction and integration of DNA, RNA, and proteins in single cells},
- url = {https://datasets-benchmarks-proceedings.neurips.cc/paper_files/paper/2021/file/158f3069a435b314a80bdcb024f8e422-Paper-round2.pdf},
- volume = {1},
- year = {2021}
-}
-
-
-@string{nov = {Nov.}}
-
-
-@string{oct = {Oct.}}
-
-
-@article{olsson2016single,
- title = {Single-cell analysis of mixed-lineage states leading to a binary cell fate choice},
- author = {Andre Olsson and Meenakshi Venkatasubramanian and Viren K. Chaudhri and Bruce J. Aronow and Nathan Salomonis and Harinder Singh and H.
Leighton Grimes},
- year = {2016},
- month = {Aug.},
- journal = {Nature},
- publisher = {Springer Science and Business Media {LLC}},
- volume = {537},
- number = {7622},
- pages = {698--702},
- doi = {10.1038/nature19348},
- url = {https://doi.org/10.1038/nature19348}
-}
-
-
-@misc{openproblems,
- title = {Open Problems},
- author = {{Open Problems for Single Cell Analysis Consortium}},
- year = {2022},
- url = {https://openproblems.bio}
-}
-
-
-@article{palla2022squidpy,
- title={Squidpy: a scalable framework for spatial omics analysis},
- author={Palla, Giovanni and Spitzer, Hannah and Klein, Michal and Fischer, David and Schaar, Anna Christina and Kuemmerle, Louis Benedikt and Rybakov, Sergei and Ibarra, Ignacio L and Holmberg, Olle and Virshup, Isaac and others},
- journal={Nature methods},
- volume={19},
- number={2},
- pages={171--178},
- year={2022},
- publisher={Nature Publishing Group US New York},
- doi={10.1038/s41592-021-01358-2}
-}
-
-
-@article{pearson1895regression,
- doi = {10.1098/rspl.1895.0041},
- title = {VII. Note on regression and inheritance in the case of two parents},
- author = {Pearson, Karl},
- journal = {Proceedings of the Royal Society of London},
- volume = {58},
- number = {347-352},
- pages = {240--242},
- year = {1895},
- publisher = {The Royal Society London}
-}
-
-
-@article{pearson1901pca,
- title = {On lines and planes of closest fit to systems of points in space},
- author = {Karl Pearson},
- year = {1901},
- month = {Nov.},
- journal = {The London, Edinburgh, and Dublin Philosophical Magazine and Journal of Science},
- publisher = {Informa {UK} Limited},
- volume = {2},
- number = {11},
- pages = {559--572},
- doi = {10.1080/14786440109462720},
- url = {https://doi.org/10.1080/14786440109462720}
-}
-
-
-@article{pliner2019supervised,
- title = {Supervised classification enables rapid annotation of cell atlases},
- author = {Hannah A. Pliner and Jay Shendure and Cole Trapnell},
- year = {2019},
- month = {Sept.},
- journal = {Nature Methods},
- publisher = {Springer Science and Business Media {LLC}},
- volume = {16},
- number = {10},
- pages = {983--986},
- doi = {10.1038/s41592-019-0535-3},
- url = {https://doi.org/10.1038/s41592-019-0535-3}
-}
-
-
-@article{polanski2020bbknn,
- title = {{BBKNN}: fast batch alignment of single cell transcriptomes},
- author = {Krzysztof Pola{\'{n}}ski and Matthew D Young and Zhichao Miao and Kerstin B Meyer and Sarah A Teichmann and Jong-Eun Park},
- year = {2019},
- month = {Aug.},
- journal = {Bioinformatics},
- publisher = {Oxford University Press ({OUP})},
- doi = {10.1093/bioinformatics/btz625},
- url = {https://doi.org/10.1093/bioinformatics/btz625},
- editor = {Bonnie Berger}
-}
-
-
-@article{raredon2022computation,
- title = {Computation and visualization of cell{\textendash}cell signaling topologies in single-cell systems data using Connectome},
- author = {Micha Sam Brickman Raredon and Junchen Yang and James Garritano and Meng Wang and Dan Kushnir and Jonas Christian Schupp and Taylor S. Adams and Allison M. Greaney and Katherine L. Leiby and Naftali Kaminski and Yuval Kluger and Andre Levchenko and Laura E.
Niklason},
- year = {2022},
- month = {Mar.},
- journal = {Scientific Reports},
- publisher = {Springer Science and Business Media {LLC}},
- volume = {12},
- number = {1},
- doi = {10.1038/s41598-022-07959-x},
- url = {https://doi.org/10.1038/s41598-022-07959-x}
-}
-
-
-@article{rodriques2019slide,
- title = {Slide-seq: A scalable technology for measuring genome-wide expression at high spatial resolution},
- author = {Samuel G. Rodriques and Robert R. Stickels and Aleksandrina Goeva and Carly A. Martin and Evan Murray and Charles R. Vanderburg and Joshua Welch and Linlin M. Chen and Fei Chen and Evan Z. Macosko},
- year = {2019},
- month = {Mar.},
- journal = {Science},
- publisher = {American Association for the Advancement of Science ({AAAS})},
- volume = {363},
- number = {6434},
- pages = {1463--1467},
- doi = {10.1126/science.aaw1219},
- url = {https://doi.org/10.1126/science.aaw1219}
-}
-
-
-@article{russell2023slide,
- title = {Slide-tags enables single-nucleus barcoding for multimodal spatial genomics},
- volume = {625},
- ISSN = {1476-4687},
- url = {http://dx.doi.org/10.1038/s41586-023-06837-4},
- DOI = {10.1038/s41586-023-06837-4},
- number = {7993},
- journal = {Nature},
- publisher = {Springer Science and Business Media LLC},
- author = {Russell, Andrew J. C. and Weir, Jackson A. and Nadaf, Naeem M. and Shabet, Matthew and Kumar, Vipin and Kambhampati, Sandeep and Raichur, Ruth and Marrero, Giovanni J. and Liu, Sophia and Balderrama, Karol S. and Vanderburg, Charles R. and Shanmugam, Vignesh and Tian, Luyi and Iorgulescu, J. Bryan and Yoon, Charles H. and Wu, Catherine J. and Macosko, Evan Z. and Chen, Fei},
- year = {2023},
- month = dec,
- pages = {101--109}
-}
-
-
-@InProceedings{santos2009on,
- author = {Santos, Jorge M. and Embrechts, Mark},
- editor = {Alippi, Cesare and Polycarpou, Marios and Panayiotou, Christos and Ellinas, Georgios},
- title = {On the Use of the Adjusted Rand Index as a Metric for Evaluating Supervised Classification},
- booktitle = {Artificial Neural Networks -- ICANN 2009},
- year = {2009},
- publisher = {Springer Berlin Heidelberg},
- address = {Berlin, Heidelberg},
- pages = {175--184},
- isbn = {978-3-642-04277-5},
- doi = {10.1007/978-3-642-04277-5_18},
- url = {https://doi.org/10.1007/978-3-642-04277-5_18}
-}
-
-
-@article{sarkar2021separating,
- title = {Separating measurement and expression models clarifies confusion in single-cell {RNA} sequencing analysis},
- author = {Abhishek Sarkar and Matthew Stephens},
- year = {2021},
- month = {May},
- journal = {Nature Genetics},
- publisher = {Springer Science and Business Media {LLC}},
- volume = {53},
- number = {6},
- pages = {770--777},
- doi = {10.1038/s41588-021-00873-4},
- url = {https://doi.org/10.1038/s41588-021-00873-4}
-}
-
-
-@article{schober2018correlation,
- title = {Correlation Coefficients},
- author = {Patrick Schober and Christa Boer and Lothar A. Schwarte},
- year = {2018},
- month = {May},
- journal = {Anesthesia {\&} Analgesia},
- publisher = {Ovid Technologies (Wolters Kluwer Health)},
- volume = {126},
- number = {5},
- pages = {1763--1768},
- doi = {10.1213/ane.0000000000002864},
- url = {https://doi.org/10.1213/ane.0000000000002864}
-}
-
-
-@string{sep = {Sept.}}
-
-
-@inproceedings{stanley2020harmonic,
- title = {Harmonic Alignment},
- author = {Jay S.
Stanley and Scott Gigante and Guy Wolf and Smita Krishnaswamy}, - year = {2020}, - month = {Jan}, - booktitle = {Proceedings of the 2020 {SIAM} International Conference on Data Mining}, - publisher = {Society for Industrial and Applied Mathematics}, - pages = {316--324}, - doi = {10.1137/1.9781611976236.36}, - url = {https://doi.org/10.1137/1.9781611976236.36} -} - - -@article{stickels2020highly, - title = {Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2}, - volume = {39}, - ISSN = {1546-1696}, - url = {http://dx.doi.org/10.1038/s41587-020-0739-1}, - DOI = {10.1038/s41587-020-0739-1}, - number = {3}, - journal = {Nature Biotechnology}, - publisher = {Springer Science and Business Media LLC}, - author = {Stickels, Robert R. and Murray, Evan and Kumar, Pawan and Li, Jilong and Marshall, Jamie L. and Di Bella, Daniela J. and Arlotta, Paola and Macosko, Evan Z. and Chen, Fei}, - year = {2020}, - month = dec, - pages = {313–319} -} - - -@article{stoeckius2017simultaneous, - title = {Simultaneous epitope and transcriptome measurement in single cells}, - author = {Marlon Stoeckius and Christoph Hafemeister and William Stephenson and Brian Houck-Loomis and Pratip K Chattopadhyay and Harold Swerdlow and Rahul Satija and Peter Smibert}, - year = {2017}, - month = {Jul.}, - journal = {Nature Methods}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {14}, - number = {9}, - pages = {865--868}, - doi = {10.1038/nmeth.4380}, - url = {https://doi.org/10.1038/nmeth.4380} -} - - -@article{stuart2019comprehensive, - title = {Comprehensive Integration of Single-Cell Data}, - author = {Stuart, T. and Butler, A. and Hoffman, P. and Hafemeister, C. and Papalexi, E. and Mauck, W.M. and Hao, Y. and Stoeckius, M. and Smibert, P. and Satija, R.}, - year = {2019}, - journal = {Cell}, - volume = {177}, - number = {7}, - pages = {1888--1902.e21}, - doi = {10.1016/j.cell.2019.05.031} -} - - -@article{sun2020statistical, - title={Statistical analysis of spatial expression patterns for spatially resolved transcriptomic studies}, - author={Sun, Shiquan and Zhu, Jiaqiang and Zhou, Xiang}, - journal={Nature methods}, - volume={17}, - number={2}, - pages={193--200}, - year={2020}, - publisher={Nature Publishing Group US New York}, - doi={10.1038/s41592-019-0701-7} -} - - -@article{svensson2018spatialde, - title={SpatialDE: identification of spatially variable genes}, - author={Svensson, Valentine and Teichmann, Sarah A and Stegle, Oliver}, - journal={Nature methods}, - volume={15}, - number={5}, - pages={343--346}, - year={2018}, - publisher={Nature Publishing Group}, - doi={10.1038/nmeth.4636} -} - - -@article{szubert2019structurepreserving, - title = {Structure-preserving visualisation of high dimensional single-cell datasets}, - author = {Benjamin Szubert and Jennifer E. 
Cole and Claudia Monaco and Ignat Drozdov}, - year = {2019}, - month = {Jun.}, - journal = {Scientific Reports}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {9}, - number = {1}, - doi = {10.1038/s41598-019-45301-0}, - url = {https://doi.org/10.1038/s41598-019-45301-0} -} - - -@article{tabula2018single, - title = {Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris}, - author = {{Tabula Muris Consortium}}, - year = {2018}, - month = {Oct.}, - journal = {Nature}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {562}, - number = {7727}, - pages = {367--372}, - doi = {10.1038/s41586-018-0590-4}, - url = {https://doi.org/10.1038/s41586-018-0590-4} -} - - -@article{tabula2020single, - title = {A single-cell transcriptomic atlas characterizes ageing tissues in the mouse}, - author = {{Tabula Muris Consortium}}, - year = {2020}, - month = {Jul.}, - journal = {Nature}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {583}, - number = {7817}, - pages = {590--595}, - doi = {10.1038/s41586-020-2496-1}, - url = {https://doi.org/10.1038/s41586-020-2496-1} -} - - -@article{tasic2016adult, - title = {Adult mouse cortical cell taxonomy revealed by single cell transcriptomics}, - author = {Bosiljka Tasic and Vilas Menon and Thuc Nghi Nguyen and Tae Kyung Kim and Tim Jarsky and Zizhen Yao and Boaz Levi and Lucas T Gray and Staci A Sorensen and Tim Dolbeare and Darren Bertagnolli and Jeff Goldy and Nadiya Shapovalova and Sheana Parry and Changkyu Lee and Kimberly Smith and Amy Bernard and Linda Madisen and Susan M Sunkin and Michael Hawrylycz and Christof Koch and Hongkui Zeng}, - year = {2016}, - month = {Jan}, - journal = {Nature Neuroscience}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {19}, - number = {2}, - pages = {335--346}, - doi = {10.1038/nn.4216}, - url = {https://doi.org/10.1038/nn.4216} -} - - -@article{tian2019benchmarking, - title = {Benchmarking single cell {RNA}-sequencing analysis pipelines using mixture control experiments}, - author = {Luyi Tian and Xueyi Dong and Saskia Freytag and Kim-Anh L{\^{e}} Cao and Shian Su and Abolfazl JalalAbadi and Daniela Amann-Zalcenstein and Tom S. Weber and Azadeh Seidi and Jafar S. Jabbari and Shalin H. Naik and Matthew E. Ritchie}, - year = {2019}, - month = {May}, - journal = {Nature Methods}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {16}, - number = {6}, - pages = {479--487}, - doi = {10.1038/s41592-019-0425-8}, - url = {https://doi.org/10.1038/s41592-019-0425-8} -} - - -@article{tran2020benchmark, - doi = {10.1186/s13059-019-1850-9}, - url = {https://doi.org/10.1186/s13059-019-1850-9}, - year = {2020}, - month = {Jan}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {21}, - number = {1}, - author = {Hoa Thi Nhu Tran and Kok Siong Ang and Marion Chevrier and Xiaomeng Zhang and Nicole Yee Shin Lee and Michelle Goh and Jinmiao Chen}, - title = {A benchmark of batch-effect correction methods for single-cell {RNA} sequencing data}, - journal = {Genome Biology} -} - - -@article{van2018recovering, - title = {Recovering Gene Interactions from Single-Cell Data Using Data Diffusion}, - author = {David van Dijk and Roshan Sharma and Juozas Nainys and Kristina Yim and Pooja Kathail and Ambrose J. Carr and Cassandra Burdziak and Kevin R. Moon and Christine L. 
Chaffer and Diwakar Pattabiraman and Brian Bierie and Linas Mazutis and Guy Wolf and Smita Krishnaswamy and Dana Pe'er}, - year = {2018}, - month = {Jul.}, - journal = {Cell}, - publisher = {Elsevier {BV}}, - volume = {174}, - number = {3}, - pages = {716--729.e27}, - doi = {10.1016/j.cell.2018.05.061}, - url = {https://doi.org/10.1016/j.cell.2018.05.061} -} - - -@article{vandermaaten2008visualizing, - title = {Visualizing Data using t-SNE}, - author = {{van der} Maaten, Laurens and Hinton, Geoffrey}, - year = {2008}, - journal = {Journal of Machine Learning Research}, - volume = {9}, - number = {86}, - pages = {2579--2605}, - url = {http://jmlr.org/papers/v9/vandermaaten08a.html} -} - - -@inproceedings{venna2001neighborhood, - title = {Neighborhood Preservation in Nonlinear Projection Methods: An Experimental Study}, - author = {Jarkko Venna and Samuel Kaski}, - year = {2001}, - booktitle = {Artificial Neural Networks {\textemdash} {ICANN} 2001}, - publisher = {Springer Berlin Heidelberg}, - pages = {485--491}, - doi = {{10.1007/3-540-44668-0\_68}}, - url = {{https://doi.org/10.1007/3-540-44668-0\_68}} -} - - -@article{venna2006local, - title = {Local multidimensional scaling}, - author = {Jarkko Venna and Samuel Kaski}, - year = {2006}, - month = {Jul.}, - journal = {Neural Networks}, - publisher = {Elsevier {BV}}, - volume = {19}, - number = {6-7}, - pages = {889--899}, - doi = {10.1016/j.neunet.2006.05.014}, - url = {https://doi.org/10.1016/j.neunet.2006.05.014} -} - - -@article{virshup2021anndataannotateddata, - doi = {10.1101/2021.12.16.473007}, - url = {https://doi.org/10.1101/2021.12.16.473007}, - year = {2021}, - month = {Dec.}, - publisher = {Cold Spring Harbor Laboratory}, - author = {Isaac Virshup and Sergei Rybakov and Fabian J. Theis and Philipp Angerer and F. Alexander Wolf}, - title = {anndata: Annotated data} -} - - -@article{wagner2018knearest, - title = {K-nearest neighbor smoothing for high-throughput single-cell RNA-Seq data}, - author = {Wagner, Florian and Yan, Yun and Yanai, Itai}, - year = {2018}, - journal = {bioRxiv}, - publisher = {Cold Spring Harbor Laboratory}, - doi = {10.1101/217737}, - url = {https://www.biorxiv.org/content/early/2018/04/09/217737}, - elocation-id = {217737}, - eprint = {https://www.biorxiv.org/content/early/2018/04/09/217737.full.pdf} -} - - -@article{wagner2018single, - title = {Single-cell mapping of gene expression landscapes and lineage in the zebrafish embryo}, - author = {Daniel E. Wagner and Caleb Weinreb and Zach M. Collins and James A. Briggs and Sean G. Megason and Allon M. 
Klein}, - year = {2018}, - month = {Jun.}, - journal = {Science}, - publisher = {American Association for the Advancement of Science ({AAAS})}, - volume = {360}, - number = {6392}, - pages = {981--987}, - doi = {10.1126/science.aar4362}, - url = {https://doi.org/10.1126/science.aar4362} -} - - -@article{wang2013target, - title = {Target analysis by integration of transcriptome and {ChIP}-seq data with {BETA}}, - author = {Su Wang and Hanfei Sun and Jian Ma and Chongzhi Zang and Chenfei Wang and Juan Wang and Qianzi Tang and Clifford A Meyer and Yong Zhang and X Shirley Liu}, - year = {2013}, - month = {Nov.}, - journal = {Nature Protocols}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {8}, - number = {12}, - pages = {2502--2515}, - doi = {10.1038/nprot.2013.150}, - url = {https://doi.org/10.1038/nprot.2013.150} -} - - -@article{wang2017visualization, - title = {Visualization and analysis of single-cell {RNA}-seq data by kernel-based similarity learning}, - volume = {14}, - copyright = {2017 Springer Nature America, Inc.}, - issn = {1548-7105}, - url = {https://www.nature.com/articles/nmeth.4207}, - doi = {10.1038/nmeth.4207}, - abstract = {The SIMLR software identifies similarities between cells across a range of single-cell RNA-seq data, enabling effective dimension reduction, clustering and visualization.}, - language = {en}, - number = {4}, - journal = {Nature Methods}, - author = {Wang, Bo and Zhu, Junjie and Pierson, Emma and Ramazzotti, Daniele and Batzoglou, Serafim}, - month = apr, - year = {2017}, - publisher = {Nature Publishing Group}, - keywords = {Gene expression, Genome informatics, Machine learning, Statistical methods}, - pages = {414--416}, -} - - -@article{wang2018three, - title = {Three-dimensional intact-tissue sequencing of single-cell transcriptional states}, - volume = {361}, - ISSN = {1095-9203}, - url = {http://dx.doi.org/10.1126/science.aat5691}, - DOI = {10.1126/science.aat5691}, - number = {6400}, - journal = {Science}, - publisher = {American Association for the Advancement of Science (AAAS)}, - author = {Wang, Xiao and Allen, William E. and Wright, Matthew A. and Sylwestrak, Emily L. and Samusik, Nikolay and Vesuna, Sam and Evans, Kathryn and Liu, Cindy and Ramakrishnan, Charu and Liu, Jia and Nolan, Garry P. 
and Bava, Felice-Alessio and Deisseroth, Karl}, - year = {2018}, - month = jul -} - - -@article{wang2022high, - title = {High-resolution 3D spatiotemporal transcriptomic maps of developing Drosophila embryos and larvae}, - volume = {57}, - ISSN = {1534-5807}, - url = {http://dx.doi.org/10.1016/j.devcel.2022.04.006}, - DOI = {10.1016/j.devcel.2022.04.006}, - number = {10}, - journal = {Developmental Cell}, - publisher = {Elsevier BV}, - author = {Wang, Mingyue and Hu, Qinan and Lv, Tianhang and Wang, Yuhang and Lan, Qing and Xiang, Rong and Tu, Zhencheng and Wei, Yanrong and Han, Kai and Shi, Chang and Guo, Junfu and Liu, Chao and Yang, Tao and Du, Wensi and An, Yanru and Cheng, Mengnan and Xu, Jiangshan and Lu, Haorong and Li, Wangsheng and Zhang, Shaofang and Chen, Ao and Chen, Wei and Li, Yuxiang and Wang, Xiaoshan and Xu, Xun and Hu, Yuhui and Liu, Longqi}, - year = {2022}, - month = may, - pages = {1271--1283.e4} -} - - -@article{weber2023nnsvg, - title={nnSVG for the scalable identification of spatially variable genes using nearest-neighbor Gaussian processes}, - author={Weber, Lukas M and Saha, Arkajyoti and Datta, Abhirup and Hansen, Kasper D and Hicks, Stephanie C}, - journal={Nature communications}, - volume={14}, - number={1}, - pages={4059}, - year={2023}, - publisher={Nature Publishing Group UK London}, - doi={10.1038/s41467-023-39748-z} -} - - -@article{welch2019single, - title = {Single-Cell Multi-omic Integration Compares and Contrasts Features of Brain Cell Identity}, - author = {Joshua D. Welch and Velina Kozareva and Ashley Ferreira and Charles Vanderburg and Carly Martin and Evan Z. Macosko}, - year = {2019}, - month = {Jun.}, - journal = {Cell}, - publisher = {Elsevier {BV}}, - volume = {177}, - number = {7}, - pages = {1873--1887.e17}, - doi = {10.1016/j.cell.2019.05.006}, - url = {https://doi.org/10.1016/j.cell.2019.05.006} -} - - -@article{wilkinson1973symbolic, - doi = {10.2307/2346786}, - url = {https://doi.org/10.2307/2346786}, - year = {1973}, - publisher = {{JSTOR}}, - volume = {22}, - number = {3}, - pages = {392}, - author = {G. N. Wilkinson and C. E. Rogers}, - title = {Symbolic Description of Factorial Models for Analysis of Variance}, - journal = {Applied Statistics} -} - - -@article{wu2021single, - title = {A single-cell and spatially resolved atlas of human breast cancers}, - author = {Sunny Z. Wu and Ghamdan Al-Eryani and Daniel Lee Roden and Simon Junankar and Kate Harvey and Alma Andersson and Aatish Thennavan and Chenfei Wang and James R. Torpy and Nenad Bartonicek and Taopeng Wang and Ludvig Larsson and Dominik Kaczorowski and Neil I. Weisenfeld and Cedric R. Uytingco and Jennifer G. Chew and Zachary W. Bent and Chia-Ling Chan and Vikkitharan Gnanasambandapillai and Charles-Antoine Dutertre and Laurence Gluch and Mun N. Hui and Jane Beith and Andrew Parker and Elizabeth Robbins and Davendra Segara and Caroline Cooper and Cindy Mak and Belinda Chan and Sanjay Warrier and Florent Ginhoux and Ewan Millar and Joseph E. Powell and Stephen R. Williams and X. Shirley Liu and Sandra O'Toole and Elgene Lim and Joakim Lundeberg and Charles M. 
Perou and Alexander Swarbrick},
- year = {2021},
- month = {Sept.},
- journal = {Nature Genetics},
- publisher = {Springer Science and Business Media {LLC}},
- volume = {53},
- number = {9},
- pages = {1334--1347},
- doi = {10.1038/s41588-021-00911-1},
- url = {https://doi.org/10.1038/s41588-021-00911-1}
-}
-
-
-@article{xiong2020neuralee,
- title = {{NeuralEE}: A {GPU}-Accelerated Elastic Embedding Dimensionality Reduction Method for Visualizing Large-Scale {scRNA}-Seq Data},
- author = {Jiankang Xiong and Fuzhou Gong and Lin Wan and Liang Ma},
- year = {2020},
- month = {Oct.},
- journal = {Frontiers in Genetics},
- publisher = {Frontiers Media {SA}},
- volume = {11},
- doi = {10.3389/fgene.2020.00786},
- url = {https://doi.org/10.3389/fgene.2020.00786}
-}
-
-
-@article{xiong2021online,
- title = {Online single-cell data integration through projecting heterogeneous datasets into a common cell-embedding space},
- author = {Lei Xiong and Kang Tian and Yuzhe Li and Weixi Ning and Xin Gao and Qiangfeng Cliff Zhang},
- year = {2022},
- month = {Oct.},
- journal = {Nature Communications},
- publisher = {Springer Science and Business Media {LLC}},
- volume = {13},
- number = {1},
- doi = {10.1038/s41467-022-33758-z},
- url = {https://doi.org/10.1038/s41467-022-33758-z}
-}
-
-
-@article{xu2021probabilistic,
- title = {Probabilistic harmonization and annotation of single-cell transcriptomics data with deep generative models},
- author = {Chenling Xu and Romain Lopez and Edouard Mehlman and Jeffrey Regier and Michael I Jordan and Nir Yosef},
- year = {2021},
- month = {Jan},
- journal = {Molecular Systems Biology},
- publisher = {{EMBO}},
- volume = {17},
- number = {1},
- doi = {10.15252/msb.20209620},
- url = {https://doi.org/10.15252/msb.20209620}
-}
-
-
-@article{zappia2018exploring,
- doi = {10.1371/journal.pcbi.1006245},
- url = {https://doi.org/10.1371/journal.pcbi.1006245},
- year = {2018},
- month = {Jun.},
- publisher = {Public Library of Science ({PLoS})},
- volume = {14},
- number = {6},
- pages = {e1006245},
- author = {Luke Zappia and Belinda Phipson and Alicia Oshlack},
- editor = {Dina Schneidman},
- title = {Exploring the single-cell {RNA}-seq analysis landscape with the {scRNA}-tools database},
- journal = {{PLOS} Computational Biology}
-}
-
-
-@article{zhang2021pydrmetrics,
- title = {{pyDRMetrics} - A Python toolkit for dimensionality reduction quality assessment},
- author = {Yinsheng Zhang and Qian Shang and Guoming Zhang},
- year = {2021},
- month = {Feb.},
- journal = {Heliyon},
- publisher = {Elsevier {BV}},
- volume = {7},
- number = {2},
- pages = {e06199},
- doi = {10.1016/j.heliyon.2021.e06199},
- url = {https://doi.org/10.1016/j.heliyon.2021.e06199}
-}
-
-
-@article{zhang2022identification,
- title={Identification of spatially variable genes with graph cuts},
- author={Zhang, Ke and Feng, Wanwan and Wang, Peng},
- journal={Nature Communications},
- volume={13},
- number={1},
- pages={5488},
- year={2022},
- publisher={Nature Publishing Group UK London},
- doi={10.1038/s41467-022-33182-3}
-}
-
-
-@article{zhu2021spark,
- title={SPARK-X: non-parametric modeling enables scalable and robust detection of spatial expression patterns for large spatial transcriptomic studies},
- author={Zhu, Jiaqiang and Sun, Shiquan and Zhou, Xiang},
- journal={Genome biology},
- volume={22},
- number={1},
- pages={184},
- year={2021},
- publisher={Springer},
- doi={10.1186/s13059-021-02404-0}
-}
-
-
-@article{hrovatin2023delineating,
- author = {Karin Hrovatin and Aim{\'e}e Bastidas-Ponce
and Mostafa Bakhti and Luke Zappia and Maren B{\"u}ttner and Ciro Sallino and Michael Sterr and Anika B{\"o}ttcher and Adriana Migliorini and Heiko Lickert and Fabian J. Theis}, - title = {Delineating mouse β-cell identity during lifetime and in diabetes with a single cell atlas}, - elocation-id = {2022.12.22.521557}, - year = {2023}, - doi = {10.1101/2022.12.22.521557}, - publisher = {Cold Spring Harbor Laboratory}, - URL = {https://www.biorxiv.org/content/early/2023/04/25/2022.12.22.521557}, - eprint = {https://www.biorxiv.org/content/early/2023/04/25/2022.12.22.521557.full.pdf}, - journal = {bioRxiv} -} - -@article{sikkema2023integrated, - title = {An integrated cell atlas of the lung in health and disease}, - volume = {29}, - ISSN = {1546-170X}, - url = {http://dx.doi.org/10.1038/s41591-023-02327-2}, - DOI = {10.1038/s41591-023-02327-2}, - number = {6}, - journal = {Nature Medicine}, - publisher = {Springer Science and Business Media LLC}, - author = {Sikkema, Lisa and Ramírez-Suástegui, Ciro and Strobl, Daniel C. and Gillett, Tessa E. and Zappia, Luke and Madissoon, Elo and Markov, Nikolay S. and Zaragosi, Laure-Emmanuelle and Ji, Yuge and Ansari, Meshal and Arguel, Marie-Jeanne and Apperloo, Leonie and Banchero, Martin and Bécavin, Christophe and Berg, Marijn and Chichelnitskiy, Evgeny and Chung, Mei-i and Collin, Antoine and Gay, Aurore C. A. and Gote-Schniering, Janine and Hooshiar Kashani, Baharak and Inecik, Kemal and Jain, Manu and Kapellos, Theodore S. and Kole, Tessa M. and Leroy, Sylvie and Mayr, Christoph H. and Oliver, Amanda J. and von Papen, Michael and Peter, Lance and Taylor, Chase J. and Walzthoeni, Thomas and Xu, Chuan and Bui, Linh T. and De Donno, Carlo and Dony, Leander and Faiz, Alen and Guo, Minzhe and Gutierrez, Austin J. and Heumos, Lukas and Huang, Ni and Ibarra, Ignacio L. and Jackson, Nathan D. and Kadur Lakshminarasimha Murthy, Preetish and Lotfollahi, Mohammad and Tabib, Tracy and Talavera-López, Carlos and Travaglini, Kyle J. and Wilbrey-Clark, Anna and Worlock, Kaylee B. and Yoshida, Masahiro and Chen, Yuexin and Hagood, James S. and Agami, Ahmed and Horvath, Peter and Lundeberg, Joakim and Marquette, Charles-Hugo and Pryhuber, Gloria and Samakovlis, Chistos and Sun, Xin and Ware, Lorraine B. and Zhang, Kun and van den Berge, Maarten and Bossé, Yohan and Desai, Tushar J. and Eickelberg, Oliver and Kaminski, Naftali and Krasnow, Mark A. and Lafyatis, Robert and Nikolic, Marko Z. and Powell, Joseph E. and Rajagopal, Jayaraj and Rojas, Mauricio and Rozenblatt-Rosen, Orit and Seibold, Max A. and Sheppard, Dean and Shepherd, Douglas P. and Sin, Don D. and Timens, Wim and Tsankov, Alexander M. and Whitsett, Jeffrey and Xu, Yan and Banovich, Nicholas E. and Barbry, Pascal and Duong, Thu Elizabeth and Falk, Christine S. and Meyer, Kerstin B. and Kropski, Jonathan A. and Pe’er, Dana and Schiller, Herbert B. and Tata, Purushothama Rao and Schultze, Joachim L. and Teichmann, Sara A. and Misharin, Alexander V. and Nawijn, Martijn C. and Luecken, Malte D. and Theis, Fabian J.}, - year = {2023}, - month = jun, - pages = {1563–1577} -} - -@article{consortium2022tabula, - title = {The Tabula Sapiens: A multiple-organ, single-cell transcriptomic atlas of humans}, - volume = {376}, - ISSN = {1095-9203}, - url = {http://dx.doi.org/10.1126/science.abl4896}, - DOI = {10.1126/science.abl4896}, - number = {6594}, - journal = {Science}, - publisher = {American Association for the Advancement of Science (AAAS)}, - author = {Jones, Robert C. 
and Karkanias, Jim and Krasnow, Mark A. and Pisco, Angela Oliveira and Quake, Stephen R. and Salzman, Julia and Yosef, Nir and Bulthaup, Bryan and Brown, Phillip and Harper, William and Hemenez, Marisa and Ponnusamy, Ravikumar and Salehi, Ahmad and Sanagavarapu, Bhavani A. and Spallino, Eileen and Aaron, Ksenia A. and Concepcion, Waldo and Gardner, James M. and Kelly, Burnett and Neidlinger, Nikole and Wang, Zifa and Crasta, Sheela and Kolluru, Saroja and Morri, Maurizio and Pisco, Angela Oliveira and Tan, Serena Y. and Travaglini, Kyle J. and Xu, Chenling and Alcántara-Hernández, Marcela and Almanzar, Nicole and Antony, Jane and Beyersdorf, Benjamin and Burhan, Deviana and Calcuttawala, Kruti and Carter, Matthew M. and Chan, Charles K. F. and Chang, Charles A. and Chang, Stephen and Colville, Alex and Crasta, Sheela and Culver, Rebecca N. and Cvijović, Ivana and D’Amato, Gaetano and Ezran, Camille and Galdos, Francisco X. and Gillich, Astrid and Goodyer, William R. and Hang, Yan and Hayashi, Alyssa and Houshdaran, Sahar and Huang, Xianxi and Irwin, Juan C. and Jang, SoRi and Juanico, Julia Vallve and Kershner, Aaron M. and Kim, Soochi and Kiss, Bernhard and Kolluru, Saroja and Kong, William and Kumar, Maya E. and Kuo, Angera H. and Leylek, Rebecca and Li, Baoxiang and Loeb, Gabriel B. and Lu, Wan-Jin and Mantri, Sruthi and Markovic, Maxim and McAlpine, Patrick L. and de Morree, Antoine and Morri, Maurizio and Mrouj, Karim and Mukherjee, Shravani and Muser, Tyler and Neuh\"{o}fer, Patrick and Nguyen, Thi D. and Perez, Kimberly and Phansalkar, Ragini and Pisco, Angela Oliveira and Puluca, Nazan and Qi, Zhen and Rao, Poorvi and Raquer-McKay, Hayley and Schaum, Nicholas and Scott, Bronwyn and Seddighzadeh, Bobak and Segal, Joe and Sen, Sushmita and Sikandar, Shaheen and Spencer, Sean P. and Steffes, Lea C. and Subramaniam, Varun R. and Swarup, Aditi and Swift, Michael and Travaglini, Kyle J. and Van Treuren, Will and Trimm, Emily and Veizades, Stefan and Vijayakumar, Sivakamasundari and Vo, Kim Chi and Vorperian, Sevahn K. and Wang, Wanxin and Weinstein, Hannah N. W. and Winkler, Juliane and Wu, Timothy T. H. and Xie, Jamie and Yung, Andrea R. and Zhang, Yue and Detweiler, Angela M. and Mekonen, Honey and Neff, Norma F. and Sit, Rene V. and Tan, Michelle and Yan, Jia and Bean, Gregory R. and Charu, Vivek and Forgó, Erna and Martin, Brock A. and Ozawa, Michael G. and Silva, Oscar and Tan, Serena Y. and Toland, Angus and Vemuri, Venkata N. P. and Afik, Shaked and Awayan, Kyle and Botvinnik, Olga Borisovna and Byrne, Ashley and Chen, Michelle and Dehghannasiri, Roozbeh and Detweiler, Angela M. and Gayoso, Adam and Granados, Alejandro A. and Li, Qiqing and Mahmoudabadi, Gita and McGeever, Aaron and de Morree, Antoine and Olivieri, Julia Eve and Park, Madeline and Pisco, Angela Oliveira and Ravikumar, Neha and Salzman, Julia and Stanley, Geoff and Swift, Michael and Tan, Michelle and Tan, Weilun and Tarashansky, Alexander J. and Vanheusden, Rohan and Vorperian, Sevahn K. and Wang, Peter and Wang, Sheng and Xing, Galen and Xu, Chenling and Yosef, Nir and Alcántara-Hernández, Marcela and Antony, Jane and Chan, Charles K. F. and Chang, Charles A. and Colville, Alex and Crasta, Sheela and Culver, Rebecca and Dethlefsen, Les and Ezran, Camille and Gillich, Astrid and Hang, Yan and Ho, Po-Yi and Irwin, Juan C. and Jang, SoRi and Kershner, Aaron M. and Kong, William and Kumar, Maya E. and Kuo, Angera H. and Leylek, Rebecca and Liu, Shixuan and Loeb, Gabriel B. and Lu, Wan-Jin and Maltzman, Jonathan S. 
and Metzger, Ross J. and de Morree, Antoine and Neuh\"{o}fer, Patrick and Perez, Kimberly and Phansalkar, Ragini and Qi, Zhen and Rao, Poorvi and Raquer-McKay, Hayley and Sasagawa, Koki and Scott, Bronwyn and Sinha, Rahul and Song, Hanbing and Spencer, Sean P. and Swarup, Aditi and Swift, Michael and Travaglini, Kyle J. and Trimm, Emily and Veizades, Stefan and Vijayakumar, Sivakamasundari and Wang, Bruce and Wang, Wanxin and Winkler, Juliane and Xie, Jamie and Yung, Andrea R. and Artandi, Steven E. and Beachy, Philip A. and Clarke, Michael F. and Giudice, Linda C. and Huang, Franklin W. and Huang, Kerwyn Casey and Idoyaga, Juliana and Kim, Seung K. and Krasnow, Mark and Kuo, Christin S. and Nguyen, Patricia and Quake, Stephen R. and Rando, Thomas A. and Red-Horse, Kristy and Reiter, Jeremy and Relman, David A. and Sonnenburg, Justin L. and Wang, Bruce and Wu, Albert and Wu, Sean M. and Wyss-Coray, Tony}, - year = {2022}, - month = may -} - -@article{dominguez2022crosstissue, - title = {Cross-tissue immune cell analysis reveals tissue-specific features in humans}, - volume = {376}, - ISSN = {1095-9203}, - url = {http://dx.doi.org/10.1126/science.abl5197}, - DOI = {10.1126/science.abl5197}, - number = {6594}, - journal = {Science}, - publisher = {American Association for the Advancement of Science (AAAS)}, - author = {Domínguez Conde, C. and Xu, C. and Jarvis, L. B. and Rainbow, D. B. and Wells, S. B. and Gomes, T. and Howlett, S. K. and Suchanek, O. and Polanski, K. and King, H. W. and Mamanova, L. and Huang, N. and Szabo, P. A. and Richardson, L. and Bolt, L. and Fasouli, E. S. and Mahbubani, K. T. and Prete, M. and Tuck, L. and Richoz, N. and Tuong, Z. K. and Campos, L. and Mousa, H. S. and Needham, E. J. and Pritchard, S. and Li, T. and Elmentaite, R. and Park, J. and Rahmani, E. and Chen, D. and Menon, D. K. and Bayraktar, O. A. and James, L. K. and Meyer, K. B. and Yosef, N. and Clatworthy, M. R. and Sims, P. A. and Farber, D. L. and Saeb-Parsy, K. and Jones, J. L. and Teichmann, S. A.}, - year = {2022}, - month = may -} - -@article{eraslan2022singlenucleus, - title = {Single-nucleus cross-tissue molecular reference maps toward understanding disease gene function}, - volume = {376}, - ISSN = {1095-9203}, - url = {http://dx.doi.org/10.1126/science.abl4290}, - DOI = {10.1126/science.abl4290}, - number = {6594}, - journal = {Science}, - publisher = {American Association for the Advancement of Science (AAAS)}, - author = {Eraslan, G\"{o}kcen and Drokhlyansky, Eugene and Anand, Shankara and Fiskin, Evgenij and Subramanian, Ayshwarya and Slyper, Michal and Wang, Jiali and Van Wittenberghe, Nicholas and Rouhana, John M. and Waldman, Julia and Ashenberg, Orr and Lek, Monkol and Dionne, Danielle and Win, Thet Su and Cuoco, Michael S. and Kuksenko, Olena and Tsankov, Alexander M. and Branton, Philip A. and Marshall, Jamie L. and Greka, Anna and Getz, Gad and Segrè, Ayellet V. and Aguet, Fran\c{c}ois and Rozenblatt-Rosen, Orit and Ardlie, Kristin G. 
and Regev, Aviv}, - year = {2022}, - month = may -} - -@article{li2023integrated, - title = {Integrated multi-omics single cell atlas of the human retina}, - url = {http://dx.doi.org/10.1101/2023.11.07.566105}, - DOI = {10.1101/2023.11.07.566105}, - publisher = {Cold Spring Harbor Laboratory}, - author = {Li, Jin and Wang, Jun and Ibarra, Ignacio L and Cheng, Xuesen and Luecken, Malte D and Lu, Jiaxiong and Monavarfeshani, Aboozar and Yan, Wenjun and Zheng, Yiqiao and Zuo, Zhen and Zayas Colborn, Samantha Lynn and Cortez, Berenice Sarahi and Owen, Leah A and Tran, Nicholas M and Shekhar, Karthik and Sanes, Joshua R and Stout, J Timothy and Chen, Shiming and Li, Yumei and DeAngelis, Margaret M and Theis, Fabian J and Chen, Rui}, - year = {2023}, - month = nov -} - -@article{wilson2022multimodal, - title = {Multimodal single cell sequencing implicates chromatin accessibility and genetic background in diabetic kidney disease progression}, - volume = {13}, - ISSN = {2041-1723}, - url = {http://dx.doi.org/10.1038/s41467-022-32972-z}, - DOI = {10.1038/s41467-022-32972-z}, - number = {1}, - journal = {Nature Communications}, - publisher = {Springer Science and Business Media LLC}, - author = {Wilson, Parker C. and Muto, Yoshiharu and Wu, Haojia and Karihaloo, Anil and Waikar, Sushrut S. and Humphreys, Benjamin D.}, - year = {2022}, - month = sep -} - -@article{steuernagel2022hypomap, - title = {HypoMap—a unified single-cell gene expression atlas of the murine hypothalamus}, - volume = {4}, - ISSN = {2522-5812}, - url = {http://dx.doi.org/10.1038/s42255-022-00657-y}, - DOI = {10.1038/s42255-022-00657-y}, - number = {10}, - journal = {Nature Metabolism}, - publisher = {Springer Science and Business Media LLC}, - author = {Steuernagel, Lukas and Lam, Brian Y. H. and Klemm, Paul and Dowsett, Georgina K. C. and Bauder, Corinna A. and Tadross, John A. and Hitschfeld, Tamara Sotelo and del Rio Martin, Almudena and Chen, Weiyi and de Solis, Alain J. and Fenselau, Henning and Davidsen, Peter and Cimino, Irene and Kohnke, Sara N. and Rimmington, Debra and Coll, Anthony P. and Beyer, Andreas and Yeo, Giles S. H. and Br\"{u}ning, Jens C.}, - year = {2022}, - month = oct, - pages = {1402–1419} -} - -@article{tian2023singlecell, - title = {Single-cell DNA methylation and 3D genome architecture in the human brain}, - volume = {382}, - ISSN = {1095-9203}, - url = {http://dx.doi.org/10.1126/science.adf5357}, - DOI = {10.1126/science.adf5357}, - number = {6667}, - journal = {Science}, - publisher = {American Association for the Advancement of Science (AAAS)}, - author = {Tian, Wei and Zhou, Jingtian and Bartlett, Anna and Zeng, Qiurui and Liu, Hanqing and Castanon, Rosa G. and Kenworthy, Mia and Altshul, Jordan and Valadon, Cynthia and Aldridge, Andrew and Nery, Joseph R. and Chen, Huaming and Xu, Jiaying and Johnson, Nicholas D. and Lucero, Jacinta and Osteen, Julia K. and Emerson, Nora and Rink, Jon and Lee, Jasper and Li, Yang E. and Siletti, Kimberly and Liem, Michelle and Claffey, Naomi and O’Connor, Carolyn and Yanny, Anna Marie and Nyhus, Julie and Dee, Nick and Casper, Tamara and Shapovalova, Nadiya and Hirschstein, Daniel and Ding, Song-Lin and Hodge, Rebecca and Levi, Boaz P. and Keene, C. Dirk and Linnarsson, Sten and Lein, Ed and Ren, Bing and Behrens, M. 
Margarita and Ecker, Joseph R.}, - year = {2023}, - month = oct -} - - -@article{sonrel2023metaanalysis, - title = {Meta-analysis of (single-cell method) benchmarks reveals the need for extensibility and interoperability}, - volume = {24}, - ISSN = {1474-760X}, - url = {http://dx.doi.org/10.1186/s13059-023-02962-5}, - DOI = {10.1186/s13059-023-02962-5}, - number = {1}, - journal = {Genome Biology}, - publisher = {Springer Science and Business Media LLC}, - author = {Sonrel, Anthony and Luetge, Almut and Soneson, Charlotte and Mallona, Izaskun and Germain, Pierre-Luc and Knyazev, Sergey and Gilis, Jeroen and Gerber, Reto and Seurinck, Ruth and Paul, Dominique and Sonder, Emanuel and Crowell, Helena L. and Fanaswala, Imran and Al-Ajami, Ahmad and Heidari, Elyas and Schmeing, Stephan and Milosavljevic, Stefan and Saeys, Yvan and Mangul, Serghei and Robinson, Mark D.}, - year = {2023}, - month = may -} - - -@article{saelens2019comparison, - title = {A comparison of single-cell trajectory inference methods}, - volume = {37}, - ISSN = {1546-1696}, - url = {http://dx.doi.org/10.1038/s41587-019-0071-9}, - DOI = {10.1038/s41587-019-0071-9}, - number = {5}, - journal = {Nature Biotechnology}, - publisher = {Springer Science and Business Media LLC}, - author = {Saelens, Wouter and Cannoodt, Robrecht and Todorov, Helena and Saeys, Yvan}, - year = {2019}, - month = apr, - pages = {547–554} -} - - -@article{huang2018savergene, - title = {SAVER: gene expression recovery for single-cell RNA sequencing}, - volume = {15}, - ISSN = {1548-7105}, - url = {http://dx.doi.org/10.1038/s41592-018-0033-z}, - DOI = {10.1038/s41592-018-0033-z}, - number = {7}, - journal = {Nature Methods}, - publisher = {Springer Science and Business Media LLC}, - author = {Huang, Mo and Wang, Jingshu and Torre, Eduardo and Dueck, Hannah and Shaffer, Sydney and Bonasio, Roberto and Murray, John I. and Raj, Arjun and Li, Mingyao and Zhang, Nancy R.}, - year = {2018}, - month = jun, - pages = {539–542} -} - - -@article{chari2023speciousart, - title = {The specious art of single-cell genomics}, - volume = {19}, - ISSN = {1553-7358}, - url = {http://dx.doi.org/10.1371/journal.pcbi.1011288}, - DOI = {10.1371/journal.pcbi.1011288}, - number = {8}, - journal = {PLOS Computational Biology}, - publisher = {Public Library of Science (PLoS)}, - author = {Chari, Tara and Pachter, Lior}, - editor = {Papin, Jason A.}, - year = {2023}, - month = aug, - pages = {e1011288} -} - diff --git a/src/common/process_dataset_metadata/run/config.vsh.yaml b/src/common/process_dataset_metadata/run/config.vsh.yaml deleted file mode 100644 index 550b621ef6..0000000000 --- a/src/common/process_dataset_metadata/run/config.vsh.yaml +++ /dev/null @@ -1,29 +0,0 @@ -functionality: - name: run - namespace: common/process_dataset_metadata - description: >- - This workflow transforms the meta information of the datasets into a format - that can be used by the website. 
- argument_groups: - - name: Inputs - arguments: - - name: "--input" - type: file - required: true - direction: input - example: meta.yaml - - name: Outputs - arguments: - - name: "--output" - type: file - required: true - direction: output - default: meta.json - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - dependencies: - - name: common/process_task_results/yaml_to_json -platforms: - - type: nextflow \ No newline at end of file diff --git a/src/common/process_task_results/api/get_info.yaml b/src/common/process_task_results/api/get_info.yaml deleted file mode 100644 index 9691936615..0000000000 --- a/src/common/process_task_results/api/get_info.yaml +++ /dev/null @@ -1,23 +0,0 @@ -functionality: - namespace: common/process_task_results - arguments: - - name: "--input" - type: "file" - example: - description: "A yaml file" - - name: "--task_id" - type: "string" - description: "A task dir" - example: label_projection - - name: "--output" - type: "file" - direction: "output" - default: "output.json" - description: "Output json" - test_resources: - - type: python_script - path: /src/common/comp_tests/check_get_info.py - - path: /src - dest: openproblems/src - - path: /_viash.yaml - dest: openproblems/_viash.yaml \ No newline at end of file diff --git a/src/common/process_task_results/generate_qc/config.vsh.yaml b/src/common/process_task_results/generate_qc/config.vsh.yaml deleted file mode 100644 index 68a5d19682..0000000000 --- a/src/common/process_task_results/generate_qc/config.vsh.yaml +++ /dev/null @@ -1,39 +0,0 @@ -functionality: - name: "generate_qc" - description: "Generate task QC metrics" - namespace: common/process_task_results - arguments: - - name: "--task_info" - type: "file" - example: task_info.json - description: "Task info file" - - name: "--method_info" - type: "file" - example: method_info.json - description: "Method info file" - - name: "--metric_info" - type: "file" - example: metric_info.json - description: "Metric info file" - - name: "--dataset_info" - type: "file" - example: dataset_info.json - description: "Dataset info file" - - name: "--results" - type: "file" - example: results.json - description: "Results file" - - name: "--output" - type: "file" - direction: "output" - default: "output.json" - description: "Output json" - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/get_api_info/config.vsh.yaml b/src/common/process_task_results/get_api_info/config.vsh.yaml deleted file mode 100644 index 0e7eb1696e..0000000000 --- a/src/common/process_task_results/get_api_info/config.vsh.yaml +++ /dev/null @@ -1,18 +0,0 @@ -__merge__: ../api/get_info.yaml -functionality: - status: disabled - name: get_api_info - description: "Extract api info" - resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - cran: [ purrr, dplyr, yaml, rlang, processx ] - - type: nextflow - directives: - label: [lowmem, lowtime, lowcpu] - - type: native diff --git a/src/common/process_task_results/get_api_info/script.R b/src/common/process_task_results/get_api_info/script.R deleted file mode 100644 index 1686dee222..0000000000 --- a/src/common/process_task_results/get_api_info/script.R +++ /dev/null @@ -1,79 +0,0 @@ -library(purrr) -library(dplyr) -library(yaml) -library(rlang) - -## VIASH START -par <- list( - 
input = ".", - task_id = "label_projection", - output = "output/api.json" -) -## VIASH END - -comp_yamls <- list.files(paste(par$input, "src/tasks", par$task_id, "api", sep = "/"), pattern = "comp_", full.names = TRUE) -file_yamls <- list.files(paste(par$input, "src/tasks", par$task_id, "api", sep = "/"), pattern = "file_", full.names = TRUE) - -# list component - file args links -comp_file <- map_df(comp_yamls, function(yaml_file) { - conf <- yaml::read_yaml(yaml_file) - - map_df(conf$functionality$arguments, function(arg) { - tibble( - comp_name = basename(yaml_file) %>% gsub("\\.yaml", "", .), - arg_name = gsub("^-*", "", arg$name), - direction = arg$direction %||% "input", - file_name = basename(arg$`__merge__`) %>% gsub("\\.yaml", "", .) - ) - }) -}) - -# get component info -comp_info <- map_df(comp_yamls, function(yaml_file) { - conf <- yaml::read_yaml(yaml_file) - - tibble( - name = basename(yaml_file) %>% gsub("\\.yaml", "", .), - label = name %>% gsub("comp_", "", .) %>% gsub("_", " ", .) - ) -}) - -# get file info -file_info <- map_df(file_yamls, function(yaml_file) { - arg <- yaml::read_yaml(yaml_file) - - tibble( - name = basename(yaml_file) %>% gsub("\\.yaml", "", .), - description = arg$description, - label = arg$info$label, - example = arg$example, - clean_label = name %>% gsub("file_", "", .) %>% gsub("_", " ", .) - ) -}) - -# get file - slot args -file_slot <- map_df(file_yamls, function(yaml_file) { - arg <- yaml::read_yaml(yaml_file) - - map2_df(names(arg$info$slots), arg$info$slots, function(group_name, slot) { - df <- map_df(slot, as.data.frame) - df$struct <- group_name - df$file_name <- basename(yaml_file) %>% gsub("\\.yaml", "", .) - as_tibble(df) - }) -}) %>% - mutate(multiple = multiple %|% FALSE) - -out <- list( - comp_info = purrr::transpose(comp_info), - file_info = purrr::transpose(file_info), - comp_file_io = purrr::transpose(comp_file), - file_schema = purrr::transpose(file_slot) -) - -jsonlite::write_json( - out, - par$output, - auto_unbox = TRUE, - pretty = TRUE -) diff --git a/src/common/process_task_results/get_dataset_info/config.vsh.yaml b/src/common/process_task_results/get_dataset_info/config.vsh.yaml deleted file mode 100644 index 10247a22ba..0000000000 --- a/src/common/process_task_results/get_dataset_info/config.vsh.yaml +++ /dev/null @@ -1,20 +0,0 @@ -__merge__: ../api/get_info.yaml -functionality: - name: "get_dataset_info" - description: "Extract dataset info and convert to expected format for website results" - resources: - - type: r_script - path: script.R - test_resources: - - type: file - path: /resources_test/common/task_metadata/dataset_info.yaml - dest: test_file.yaml -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - cran: [ purrr, yaml, rlang, processx ] - - type: nextflow - directives: - label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/get_method_info/config.vsh.yaml b/src/common/process_task_results/get_method_info/config.vsh.yaml deleted file mode 100644 index 053bbac53c..0000000000 --- a/src/common/process_task_results/get_method_info/config.vsh.yaml +++ /dev/null @@ -1,20 +0,0 @@ -__merge__: ../api/get_info.yaml -functionality: - name: "get_method_info" - description: "Extract method info" - resources: - - type: r_script - path: script.R - test_resources: - - type: file - path: /resources_test/common/task_metadata/method_configs.yaml - dest: test_file.yaml -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - cran: [ purrr, yaml, 
rlang, processx ] - - type: nextflow - directives: - label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/get_metric_info/config.vsh.yaml b/src/common/process_task_results/get_metric_info/config.vsh.yaml deleted file mode 100644 index ee5833b5b9..0000000000 --- a/src/common/process_task_results/get_metric_info/config.vsh.yaml +++ /dev/null @@ -1,20 +0,0 @@ -__merge__: ../api/get_info.yaml -functionality: - name: "get_metric_info" - description: "Extract metric info" - resources: - - type: r_script - path: script.R - test_resources: - - type: file - path: /resources_test/common/task_metadata/metric_configs.yaml - dest: test_file.yaml -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - cran: [ purrr, yaml, rlang, processx ] - - type: nextflow - directives: - label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/get_results/config.vsh.yaml b/src/common/process_task_results/get_results/config.vsh.yaml deleted file mode 100644 index cd639fad4d..0000000000 --- a/src/common/process_task_results/get_results/config.vsh.yaml +++ /dev/null @@ -1,51 +0,0 @@ -functionality: - name: "get_results" - description: "Extract execution info" - namespace: common/process_task_results - arguments: - - name: "--task_id" - type: "string" - example: "batch_integration" - description: "Task id" - - name: "--input_scores" - type: "file" - example: score_uns.yaml - description: "Scores file" - - name: "--input_execution" - type: "file" - example: trace.txt - description: "Nextflow log file" - - name: "--input_dataset_info" - type: "file" - example: dataset_info.json - description: "Dataset info file" - - name: "--input_method_info" - type: "file" - example: method_info.json - description: "Method info file" - - name: "--input_metric_info" - type: "file" - example: metric_info.json - description: "Metric info file" - - name: "--output_results" - type: "file" - direction: "output" - default: "results.json" - description: "Output json" - - name: "--output_metric_execution_info" - type: "file" - direction: "output" - default: "metric_execution_info.json" - description: "Output metric execution info" - resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - cran: [ purrr, yaml, rlang, dplyr, tidyr, readr, lubridate, dynutils, processx ] - - type: nextflow - directives: - label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/get_task_info/config.vsh.yaml b/src/common/process_task_results/get_task_info/config.vsh.yaml deleted file mode 100644 index 2e8fbd2b66..0000000000 --- a/src/common/process_task_results/get_task_info/config.vsh.yaml +++ /dev/null @@ -1,20 +0,0 @@ -__merge__: ../api/get_info.yaml -functionality: - name: "get_task_info" - description: "Extract task info" - resources: - - type: r_script - path: script.R - test_resources: - - type: file - path: /resources_test/common/task_metadata/task_info.yaml - dest: test_file.yaml -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - cran: [ purrr, yaml, rlang, processx ] - - type: nextflow - directives: - label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/run/config.vsh.yaml b/src/common/process_task_results/run/config.vsh.yaml deleted file mode 100644 index d746a54245..0000000000 --- a/src/common/process_task_results/run/config.vsh.yaml +++ /dev/null @@ -1,91 +0,0 @@ -functionality: - name: run - namespace:
common/process_task_results - description: >- - This workflow transforms the meta information of the results into a format - that can be used by the website. - argument_groups: - - name: Inputs - arguments: - - name: "--input_scores" - type: file - required: true - direction: input - description: A yaml file containing the scores of each of the methods - example: score_uns.yaml - - name: "--input_method_configs" - type: file - required: true - direction: input - example: method_configs.yaml - - name: "--input_metric_configs" - type: file - required: true - direction: input - example: metric_configs.yaml - - name: "--input_dataset_info" - type: file - required: true - direction: input - example: dataset_info.yaml - - name: "--input_execution" - type: file - required: true - direction: input - example: trace.txt - - name: "--input_task_info" - type: file - required: true - direction: input - example: task_info.yaml - - name: Outputs - arguments: - - name: "--output_scores" - type: file - required: true - direction: output - description: A json file containing the scores of each of the methods - default: results.json - - name: "--output_method_info" - type: file - required: true - direction: output - default: method_info.json - - name: "--output_metric_info" - type: file - required: true - direction: output - default: metric_info.json - - name: "--output_dataset_info" - type: file - required: true - direction: output - default: dataset_info.json - - name: "--output_task_info" - type: file - required: true - direction: output - default: task_info.json - - name: "--output_qc" - type: file - required: true - direction: output - default: quality_control.json - - name: "--output_metric_execution_info" - type: file - required: true - direction: output - default: metric_execution_info.json - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - dependencies: - - name: common/process_task_results/get_results - - name: common/process_task_results/get_method_info - - name: common/process_task_results/get_metric_info - - name: common/process_task_results/get_dataset_info - - name: common/process_task_results/get_task_info - - name: common/process_task_results/generate_qc -platforms: - - type: nextflow \ No newline at end of file diff --git a/src/common/process_task_results/yaml_to_json/config.vsh.yaml b/src/common/process_task_results/yaml_to_json/config.vsh.yaml deleted file mode 100644 index 7231cdcdbf..0000000000 --- a/src/common/process_task_results/yaml_to_json/config.vsh.yaml +++ /dev/null @@ -1,16 +0,0 @@ -__merge__: ../api/get_info.yaml -functionality: - name: "yaml_to_json" - description: "Convert yaml file to json file" - resources: - - type: python_script - path: script.py - test_resources: - - type: file - path: /resources_test/common/task_metadata/dataset_info.yaml - dest: test_file.yaml -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - - type: native diff --git a/src/common/resources_test_scripts/aws_sync.sh b/src/common/resources_test_scripts/aws_sync.sh deleted file mode 100644 index 0541df125a..0000000000 --- a/src/common/resources_test_scripts/aws_sync.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -echo "Run the command in this script manually" -exit 1 - -aws s3 sync "resources_test" "s3://openproblems-data/resources_test" --exclude "*/temp*" --exclude "*/tmp*" --delete --dryrun -aws s3 sync "resources" "s3://openproblems-data/resources" --exclude */temp_* --delete --dryrun diff --git
a/src/common/resources_test_scripts/task_metadata.sh b/src/common/resources_test_scripts/task_metadata.sh deleted file mode 100755 index cd9072f443..0000000000 --- a/src/common/resources_test_scripts/task_metadata.sh +++ /dev/null @@ -1,139 +0,0 @@ -#!/bin/bash - -# make sure the following command has been executed -# viash ns build -q 'common' - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - -DATASETS_DIR="resources_test/batch_integration" -OUTPUT_DIR="resources_test/common/task_metadata" - - -if [ ! -d "$OUTPUT_DIR" ]; then - mkdir -p "$OUTPUT_DIR" -fi - -# Create small git sha input file -sha_file="$OUTPUT_DIR/input_git_sha.json" - -cat <<EOT > $sha_file -[ - { - "path": "tasks/denoising/README.md", - "last_modified": "2022-09-20 14:26:51 -0400", - "sha": "3fe9251ba906061b6769eed2ac9da0db5f8e26bb" - }, - { - "path": "tasks/denoising/__init__.py", - "last_modified": "2022-09-30 14:49:17 +0200", - "sha": "c97decf07adb2e3050561d6fa9ae46132be07bef" - }, - { - "path": "tasks/denoising/api.py", - "last_modified": "2022-10-21 13:56:15 -0400", - "sha": "b460ecb183328c857cbbf653488f522a4034a61c" - }, - { - "path": "tasks/denoising/datasets/__init__.py", - "last_modified": "2022-11-23 10:32:02 -0500", - "sha": "725ff0c46140aaa6bbded68646256f64bc63df6d" - }, - { - "path": "tasks/denoising/datasets/pancreas.py", - "last_modified": "2022-12-04 12:06:43 -0500", - "sha": "4bb8a7e04545a06c336d3d9364a1dd84fa2af1a4" - }, - { - "path": "tasks/denoising/datasets/pbmc.py", - "last_modified": "2022-12-04 12:06:43 -0500", - "sha": "4bb8a7e04545a06c336d3d9364a1dd84fa2af1a4" - }, - { - "path": "tasks/denoising/datasets/tabula_muris_senis.py", - "last_modified": "2022-12-04 12:06:43 -0500", - "sha": "4bb8a7e04545a06c336d3d9364a1dd84fa2af1a4" - }, - { - "path": "tasks/denoising/datasets/utils.py", - "last_modified": "2022-11-15 17:19:16 -0500", - "sha": "c2470ce02e6f196267cec1c554ba7ae389c0956a" - }, - { - "path": "tasks/denoising/methods/__init__.py", - "last_modified": "2022-10-21 13:56:15 -0400", - "sha": "b460ecb183328c857cbbf653488f522a4034a61c" - }, - { - "path": "tasks/denoising/methods/alra.R", - "last_modified": "2022-05-16 15:10:42 -0400", - "sha": "ba06cf71b564eb23823a662341055dc5ac2be231" - }, - { - "path": "tasks/denoising/methods/alra.py", - "last_modified": "2022-07-25 12:29:34 -0400", - "sha": "411a416150ecabce25e1f59bde422a029d0a8baa" - }, - { - "path": "tasks/denoising/methods/baseline.py", - "last_modified": "2022-10-21 13:56:15 -0400", - "sha": "b460ecb183328c857cbbf653488f522a4034a61c" - }, - { - "path": "tasks/denoising/methods/dca.py", - "last_modified": "2022-12-01 15:38:21 -0500", - "sha": "aa2253779e9aa9cd178f54ac0f3b6ba521ecd59f" - }, - { - "path": "tasks/denoising/methods/knn_smoothing.py", - "last_modified": "2022-11-14 11:54:15 -0500", - "sha": "bbecf4e9ad90007c2711394e7fbd8e49cbd3e4a1" - }, - { - "path": "tasks/denoising/methods/magic.py", - "last_modified": "2022-11-14 11:57:35 -0500", - "sha": "2af9a4918ed3370859f71774558068961f6d22c6" - }, - { - "path": "tasks/denoising/metrics/__init__.py", - "last_modified": "2021-01-19 13:31:20 -0500", - "sha": "8e0600c516c392fa747137415b6a93b8af0f61d8" - }, - { - "path": "tasks/denoising/metrics/mse.py", - "last_modified": "2022-11-15 17:19:16 -0500", - "sha": "c2470ce02e6f196267cec1c554ba7ae389c0956a" - }, - { - "path": "tasks/denoising/metrics/poisson.py", - "last_modified": "2022-12-04 12:06:43 -0500", - "sha":
"4bb8a7e04545a06c336d3d9364a1dd84fa2af1a4" - } -] -EOT - -# Create all metadata -export NXF_VER=22.04.5 - -nextflow run . \ - -main-script target/nextflow/batch_integration/workflows/run_benchmark/main.nf \ - -profile docker \ - -resume \ - -c src/wf_utils/labels_ci.config \ - -with-trace \ - -entry auto \ - --input_states "$DATASETS_DIR/pancreas/state.yaml" \ - --rename_keys 'input_dataset:output_dataset,input_solution:output_solution' \ - --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml", "method_ids": ["bbknn", "mnnpy", "mnnr"]}' \ - --publish_dir "$OUTPUT_DIR" \ - --output_state "state.yaml" - -cp trace.txt "$OUTPUT_DIR/trace.txt" - - -viash run src/common/process_task_results/get_method_info/config.vsh.yaml -- --input "$OUTPUT_DIR/method_configs.yaml" --output "$OUTPUT_DIR/method_info.json" diff --git a/src/common/schemas/api_component.yaml b/src/common/schemas/api_component.yaml deleted file mode 100644 index b197e2e367..0000000000 --- a/src/common/schemas/api_component.yaml +++ /dev/null @@ -1,67 +0,0 @@ -title: Component API -description: | - A component type specification file. -type: object -required: [functionality] -properties: - functionality: - type: object - description: Information regarding the functionality of the component. - required: [namespace, info, arguments, test_resources] - additionalProperties: false - properties: - namespace: - "$ref": "defs_common.yaml#/definitions/Namespace" - info: - type: object - description: Metadata of the component. - additionalProperties: false - required: [type, type_info] - properties: - type: - "$ref": "defs_common.yaml#/definitions/ComponentType" - subtype: - "$ref": "defs_common.yaml#/definitions/ComponentSubtype" - type_info: - type: object - description: Metadata related to the component type. - required: [label, summary, description] - properties: - label: - $ref: "defs_common.yaml#/definitions/Label" - summary: - $ref: "defs_common.yaml#/definitions/Summary" - description: - $ref: "defs_common.yaml#/definitions/Description" - arguments: - type: array - description: Component-specific parameters. - items: - anyOf: - - $ref: 'defs_common.yaml#/definitions/ComponentAPIFile' - - $ref: 'defs_viash.yaml#/definitions/BooleanArgument' - - $ref: 'defs_viash.yaml#/definitions/BooleanArgument' - - $ref: 'defs_viash.yaml#/definitions/BooleanTrueArgument' - - $ref: 'defs_viash.yaml#/definitions/BooleanFalseArgument' - - $ref: 'defs_viash.yaml#/definitions/DoubleArgument' - - $ref: 'defs_viash.yaml#/definitions/IntegerArgument' - - $ref: 'defs_viash.yaml#/definitions/LongArgument' - - $ref: 'defs_viash.yaml#/definitions/StringArgument' - resources: - type: array - description: Resources required to run the component. - items: - "$ref": "defs_viash.yaml#/definitions/Resource" - test_resources: - type: array - description: One or more scripts and resources used to test the component. - items: - "$ref": "defs_viash.yaml#/definitions/Resource" - platforms: - type: array - description: A list of platforms which Viash generates target artifacts for. 
- items: - anyOf: - - "$ref": "defs_common.yaml#/definitions/PlatformDocker" - - "$ref": "defs_common.yaml#/definitions/PlatformNative" - - "$ref": "defs_common.yaml#/definitions/PlatformVdsl3" diff --git a/src/common/schemas/api_file.yaml b/src/common/schemas/api_file.yaml deleted file mode 100644 index 6294439eda..0000000000 --- a/src/common/schemas/api_file.yaml +++ /dev/null @@ -1,26 +0,0 @@ -title: File API -description: A file format specification file. -type: object -additionalProperties: false -required: [type, example, info] -properties: - type: - const: file - example: - description: A file in the `resources_test` folder which is an example of this file format. - type: string - __merge__: - $ref: "defs_common.yaml#/definitions/Merge" - info: - description: 'Structured information. Can be any shape: a string, vector, map or even nested map.' - type: object - required: [label, summary] - properties: - label: - $ref: "defs_common.yaml#/definitions/Label" - summary: - $ref: "defs_common.yaml#/definitions/Summary" - description: - $ref: "defs_common.yaml#/definitions/Description" - slots: - $ref: "defs_common.yaml#/definitions/AnnDataSlots" diff --git a/src/common/schemas/defs_common.yaml b/src/common/schemas/defs_common.yaml deleted file mode 100644 index 60b9946210..0000000000 --- a/src/common/schemas/defs_common.yaml +++ /dev/null @@ -1,256 +0,0 @@ -definitions: - PlatformVdsl3: - title: VDSL3 - description: Next-gen platform for generating NextFlow VDSL3 modules. - properties: - type: - const: nextflow - description: Next-gen platform for generating NextFlow VDSL3 modules. - directives: - $ref: 'defs_viash.yaml#/definitions/NextflowDirectives' - required: [ type ] - additionalProperties: false - PlatformDocker: - title: Docker platform - description: | - Run a Viash component on a Docker backend platform. - By specifying which dependencies your component needs, users are able to build - a docker container from scratch using the setup flag, or pull it from a docker repository. - type: object - properties: - type: - const: docker - description: Run a Viash component on a Docker backend platform. - image: - type: string - description: The base container to start from. You can also add the tag here - if you wish. - run_args: - anyOf: - - type: string - description: Add docker run arguments. - - type: array - items: - type: string - description: Add docker run arguments. - target_image_source: - type: string - description: The source of the target image. This is used for defining labels - in the dockerfile. - setup: - type: array - items: - "$ref": "defs_viash.yaml#/definitions/Requirements" - test_setup: - type: array - items: - "$ref": "defs_viash.yaml#/definitions/Requirements" - required: [type, image] - additionalProperties: false - PlatformNative: - title: Native platform - type: object - properties: - type: - const: native - description: Specifies the type of the platform. Running a Viash component - on a native platform means that the script will be executed in your current - environment. - required: [ type ] - additionalProperties: false - PreferredNormalization: - enum: [l1_sqrt, log_cpm, log_cp10k, log_scran_pooling, sqrt_cpm, sqrt_cp10k, counts] - description: | - Which normalization method a component prefers. - - Each value corresponds to a normalization component in the directory `src/datasets/normalization`. - ComponentSubtype: - type: string - description: | - A component subtype, in case the task has multiple subtypes of methods and metrics.
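For reference, a minimal `platforms:` section that would validate against the `PlatformDocker`, `PlatformNative` and `PlatformVdsl3` definitions above could look as follows. This is only a sketch assembled from values that appear elsewhere in this diff (the base image, CRAN packages and directive labels), not a config taken from the repository; note that the Docker entry must provide both `type` and `image`, while the native and Nextflow entries only require `type`:

platforms:
  - type: docker
    image: openproblems/base_r:1.0.0   # required alongside `type`
    setup:
      - type: r                        # a Requirements entry (defs_viash.yaml)
        cran: [ purrr, yaml ]
  - type: native                       # only `type` is required
  - type: nextflow
    directives:
      label: [lowmem, lowtime, lowcpu] # forwarded to NextflowDirectives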
- ComponentType: - type: string - description: | - A component type, in case the task has multiple types of methods and metrics. - Name: - type: string - description: | - A unique identifier. Can only contain lowercase letters, numbers or underscores. - pattern: "^[a-z_][a-z0-9_]*$" - maxLength: 50 - Namespace: - type: string - description: | - The namespace a component is part of. - pattern: "^[a-z_][a-z0-9_/]*$" - Label: - type: string - description: | - A unique, human-readable, short label. Used for creating summary tables and visualisations. - maxLength: 50 - Image: - type: string - description: | - The name of the image file to use for the component on the website. - Summary: - type: string - description: | - A one sentence summary of purpose and methodology. Used for creating overview tables. - minLength: 15 - maxLength: 180 - Description: - type: string - description: | - A longer description (one or more paragraphs). Used for creating reference documentation and supplementary information. - minLength: 30 - BibtexReference: - type: string - description: | - A bibtex reference key to the paper where the component is described. - DocumentationURL: - type: string - format: uri - pattern: "^https://" - description: The url to the documentation of the used software library. - RepositoryURL: - type: string - format: uri - pattern: "^https://" - description: The url to the repository of the used software library. - MethodVariants: - type: object - description: Alternative parameter sets which should be evaluated in the benchmark. - properties: - preferred_normalization: - "$ref": "#/definitions/PreferredNormalization" - CompAPIMerge: - type: string - description: | - The API specifies which type of component this is. - It contains specifications for: - - - The input/output files - - Common parameters - - A unit test - Merge: - type: string - description: | - Another YAML to inherit values from. - ComponentAPIFile: - description: A `file` type argument has a string value that points to a file or folder path. - type: object - properties: - name: - description: "The name of the argument. Can be in the formats `--foo`, `-f` or `foo`. The number of dashes determines how values can be passed: \n\n - `--foo` is a long option, which can be passed with `executable_name --foo=value` or `executable_name --foo value`\n - `-f` is a short option, which can be passed with `executable_name -f value`\n - `foo` is an argument, which can be passed with `executable_name value` \n" - type: string - __merge__: - type: string - description: The file format specification file. - direction: - description: Makes this argument an `input` or an `output`, as in whether the file/folder needs to be read or written. `input` by default. - $ref: 'defs_viash.yaml#/definitions/Direction' - info: - description: 'Structured information. Can be any shape: a string, vector, map or even nested map.' - type: object - required: - description: Make the value for this argument required. If set to `true`, an error will be produced if no value was provided. `false` by default.
- type: boolean - required: [name, __merge__, direction, required] - additionalProperties: false - AnnDataSlots: - properties: - X: - $ref: "#/definitions/AnnDataSlot" - layers: - type: array - items: - $ref: "#/definitions/AnnDataSlot" - var: - type: array - items: - $ref: "#/definitions/AnnDataSlot" - varm: - type: array - items: - $ref: "#/definitions/AnnDataSlot" - varp: - type: array - items: - $ref: "#/definitions/AnnDataSlot" - obs: - type: array - items: - $ref: "#/definitions/AnnDataSlot" - obsm: - type: array - items: - $ref: "#/definitions/AnnDataSlot" - obsp: - type: array - items: - $ref: "#/definitions/AnnDataSlot" - uns: - type: array - items: - oneOf: - - $ref: "#/definitions/AnnDataSlot" - - $ref: "#/definitions/AnnDataSlotObject" - AnnDataSlot: - properties: - type: - enum: [integer, double, string, boolean] - name: - type: string - description: A unique identifier. - pattern: "^[a-zA-Z_][a-zA-Z0-9_]*$" - description: - type: string - required: - type: boolean - required: [type, name, description, required] - AnnDataSlotObject: - properties: - type: - enum: [object] - name: - type: string - description: A unique identifier. - pattern: "^[a-zA-Z_][a-zA-Z0-9_]*$" - description: - type: string - required: - type: boolean - required: [type, name, description, required] - Author: - description: Author metadata. - type: object - additionalProperties: false - properties: - name: - description: Full name of the author, usually in the name of FirstName MiddleName LastName. - type: string - info: - description: Additional information on the author - type: object - additionalProperties: false - properties: - github: - type: string - orcid: - type: string - email: - type: string - twitter: - type: string - linkedin: - type: string - roles: - description: | - Role of the author. Possible values: - - * `"author"`: Authors who have made substantial contributions to the component. - * `"maintainer"`: The maintainer of the component. - * `"contributor"`: Authors who have made smaller contributions (such as code patches etc.). - type: array - items: - enum: [maintainer, author, contributor] \ No newline at end of file diff --git a/src/common/schemas/defs_viash.yaml b/src/common/schemas/defs_viash.yaml deleted file mode 100644 index fff25ab382..0000000000 --- a/src/common/schemas/defs_viash.yaml +++ /dev/null @@ -1,2252 +0,0 @@ -$schema: "https://json-schema.org/draft-07/schema#" -title: Viash config schema definitions. -oneOf: - - $ref: "#/definitions/Config" -definitions: - Config: - description: "A Viash Config" - properties: - functionality: - description: "The functionality-part of the config file describes the behaviour\ - \ of the script in terms of arguments and resources.\nBy specifying a few restrictions\ - \ (e.g. 
mandatory arguments) and adding some descriptions, Viash will automatically\ - \ generate a stylish command-line interface for you.\n" - $ref: "#/definitions/Functionality" - platforms: - description: "Definition of the platforms" - type: "array" - items: - $ref: "#/definitions/Platforms" - info: - description: "Definition of meta data" - $ref: "#/definitions/Info" - required: - - "functionality" - additionalProperties: false - NativePlatform: - description: "Running a Viash component on a native platform means that the script\ - \ will be executed in your current environment.\nAny dependencies are assumed\ - \ to have been installed by the user, so the native platform is meant for developers\ - \ (who know what they're doing) or for simple bash scripts (which have no extra\ - \ dependencies).\n" - type: "object" - properties: - id: - description: "As with all platforms, you can give a platform a different name.\ - \ By specifying `id: foo`, you can target this platform (only) by specifying\ - \ `-p foo` in any of the Viash commands." - type: "string" - type: - description: "Running a Viash component on a native platform means that the\ - \ script will be executed in your current environment.\nAny dependencies\ - \ are assumed to have been installed by the user, so the native platform\ - \ is meant for developers (who know what they're doing) or for simple bash\ - \ scripts (which have no extra dependencies).\n" - const: "native" - required: - - "type" - additionalProperties: false - DockerPlatform: - description: "Run a Viash component on a Docker backend platform.\nBy specifying\ - \ which dependencies your component needs, users will be able to build a docker\ - \ container from scratch using the setup flag, or pull it from a docker repository.\n" - type: "object" - properties: - organization: - description: "Name of a container's [organization](https://docs.docker.com/docker-hub/orgs/)." - type: "string" - registry: - description: "The URL to a [custom Docker registry](https://docs.docker.com/registry/)" - type: "string" - image: - description: "The base container to start from. You can also add the tag here\ - \ if you wish." - type: "string" - tag: - description: "Specify a Docker image based on its tag." - type: "string" - target_tag: - description: "The tag the resulting image gets. Advanced usage only." - type: "string" - run_args: - anyOf: - - description: "Add [docker run](https://docs.docker.com/engine/reference/run/)\ - \ arguments." - type: "string" - - description: "Add [docker run](https://docs.docker.com/engine/reference/run/)\ - \ arguments." - type: "array" - items: - type: "string" - namespace_separator: - description: "The separator between the namespace and the name of the component,\ - \ used for determining the image name. Default: `\"/\"`." - type: "string" - resolve_volume: - description: "Enables or disables automatic volume mapping. Enabled when set\ - \ to `Automatic` or disabled when set to `Manual`. Default: `Automatic`." - $ref: "#/definitions/DockerResolveVolume" - port: - anyOf: - - description: "A list of enabled ports. This doesn't change the Dockerfile\ - \ but gets added as a command-line argument at runtime." - type: "string" - - description: "A list of enabled ports. This doesn't change the Dockerfile\ - \ but gets added as a command-line argument at runtime."
- type: "array" - items: - type: "string" - setup: - description: "A list of requirements for installing the following types of\ - \ packages:\n\n - @[apt](apt_req)\n - @[apk](apk_req)\n - @[Docker setup\ - \ instructions](docker_req)\n - @[JavaScript](javascript_req)\n - @[Python](python_req)\n\ - \ - @[R](r_req)\n - @[Ruby](ruby_req)\n - @[yum](yum_req)\n\nThe order in\ - \ which these dependencies are specified determines the order in which they\ - \ will be installed.\n" - type: "array" - items: - $ref: "#/definitions/Requirements" - workdir: - description: "The working directory when starting the container. This doesn't\ - \ change the Dockerfile but gets added as a command-line argument at runtime." - type: "string" - target_image: - description: "If anything is specified in the setup section, running the `---setup`\ - \ will result in an image with the name of `:`. If\ - \ nothing is specified in the `setup` section, simply `image` will be used.\ - \ Advanced usage only." - type: "string" - cmd: - anyOf: - - description: "Set the default command being executed when running the Docker\ - \ container." - type: "string" - - description: "Set the default command being executed when running the Docker\ - \ container." - type: "array" - items: - type: "string" - target_image_source: - description: "The source of the target image. This is used for defining labels\ - \ in the dockerfile." - type: "string" - test_setup: - description: "Additional requirements specific for running unit tests." - type: "array" - items: - $ref: "#/definitions/Requirements" - entrypoint: - anyOf: - - description: "Override the entrypoint of the base container. Default set\ - \ `ENTRYPOINT []`." - type: "string" - - description: "Override the entrypoint of the base container. Default set\ - \ `ENTRYPOINT []`." - type: "array" - items: - type: "string" - id: - description: "As with all platforms, you can give a platform a different name.\ - \ By specifying `id: foo`, you can target this platform (only) by specifying\ - \ `-p foo` in any of the Viash commands." - type: "string" - target_registry: - description: "The URL where the resulting image will be pushed to. Advanced\ - \ usage only." - type: "string" - setup_strategy: - description: "The Docker setup strategy to use when building a container.\n\ - \n| Strategy | Description |\n|-----|----------|\n| `alwaysbuild` / `build`\ - \ / `b` | Always build the image from the dockerfile. This is the default\ - \ setup strategy.\n| `alwayscachedbuild` / `cachedbuild` / `cb` | Always\ - \ build the image from the dockerfile, with caching enabled.\n| `ifneedbebuild`\ - \ | Build the image if it does not exist locally.\n| `ifneedbecachedbuild`\ - \ | Build the image with caching enabled if it does not exist locally, with\ - \ caching enabled.\n| `alwayspull` / `pull` / `p` | Try to pull the container\ - \ from [Docker Hub](https://hub.docker.com) or the @[specified docker registry](docker_registry).\n\ - | `alwayspullelsebuild` / `pullelsebuild` | Try to pull the image from\ - \ a registry and build it if it doesn't exist.\n| `alwayspullelsecachedbuild`\ - \ / `pullelsecachedbuild` | Try to pull the image from a registry and build\ - \ it with caching if it doesn't exist.\n| `ifneedbepull` | If the image\ - \ does not exist locally, pull the image.\n| `ifneedbepullelsebuild` | \ - \ If the image does not exist locally, pull the image. If the image does\ - \ exist, build it.\n| `ifneedbepullelsecachedbuild` | If the image does\ - \ not exist locally, pull the image. 
If that fails, build it with\ - \ caching enabled.\n| `push` | Push the container to [Docker Hub](https://hub.docker.com)\ - \ or the @[specified docker registry](docker_registry).\n| `pushifnotpresent`\ - \ | Push the container to [Docker Hub](https://hub.docker.com) or the @[specified\ - \ docker registry](docker_registry) if the @[tag](docker_tag) does not exist\ - \ yet.\n| `donothing` / `meh` | Do not build or pull anything.\n\n" - $ref: "#/definitions/DockerSetupStrategy" - type: - description: "Run a Viash component on a Docker backend platform.\nBy specifying\ - \ which dependencies your component needs, users will be able to build a\ - \ docker container from scratch using the setup flag, or pull it from a\ - \ docker repository.\n" - const: "docker" - target_organization: - description: "The organization set in the resulting image. Advanced usage\ - \ only." - type: "string" - chown: - description: "In Linux, files created by a Docker container will be owned\ - \ by `root`. With `chown: true`, Viash will automatically change the ownership\ - \ of output files (arguments with `type: file` and `direction: output`)\ - \ to the user running the Viash command after execution of the component.\ - \ Default value: `true`." - type: "boolean" - required: - - "image" - - "type" - additionalProperties: false - NextflowVdsl3Platform: - description: "Next-gen platform for generating NextFlow VDSL3 modules." - type: "object" - properties: - auto: - description: "@[Automated processing flags](nextflow_auto) which can be toggled\ - \ on or off:\n\n| Flag | Description | Default |\n|---|---------|----|\n\ - | `simplifyInput` | If `true`, an input tuple containing only a single\ - \ File (e.g. `[\"foo\", file(\"in.h5ad\")]`) is automatically transformed\ - \ to a map (i.e. `[\"foo\", [ input: file(\"in.h5ad\") ] ]`). | `true` |\n\ - | `simplifyOutput` | If `true`, an output tuple containing a map with a\ - \ File (e.g. `[\"foo\", [ output: file(\"out.h5ad\") ] ]`) is automatically\ - \ transformed to a tuple (i.e. `[\"foo\", file(\"out.h5ad\")]`). | `true`\ - \ |\n| `transcript` | If `true`, the module's transcripts from `work/` are\ - \ automatically published to `params.transcriptDir`. If not defined, `params.publishDir\ - \ + \"/_transcripts\"` will be used. Will throw an error if neither are\ - \ defined. | `false` |\n| `publish` | If `true`, the module's outputs are\ - \ automatically published to `params.publishDir`. Will throw an error if\ - \ `params.publishDir` is not defined. | `false` |\n\n" - $ref: "#/definitions/NextflowAuto" - directives: - description: "@[Directives](nextflow_directives) are optional settings that\ - \ affect the execution of the process. These mostly match up with the Nextflow\ - \ counterparts. \n" - $ref: "#/definitions/NextflowDirectives" - container: - description: "Specifies the Docker platform id to be used to run Nextflow." - type: "string" - debug: - description: "Whether or not to print debug messages." - type: "boolean" - id: - description: "Every platform can be given a specific id that can later be\ - \ referred to explicitly when running or building the Viash component." - type: "string" - type: - description: "Next-gen platform for generating NextFlow VDSL3 modules." - const: "nextflow" - config: - description: "Allows tweaking how the @[Nextflow Config](nextflow_config)\ - \ file is generated."
- $ref: "#/definitions/NextflowConfig" - required: - - "type" - additionalProperties: false - Platforms: - anyOf: - - $ref: "#/definitions/NativePlatform" - - $ref: "#/definitions/DockerPlatform" - - $ref: "#/definitions/NextflowVdsl3Platform" - Info: - description: "Meta information fields filled in by Viash during build." - type: "object" - properties: - git_tag: - description: "Git tag." - type: "string" - git_remote: - description: "Git remote name." - type: "string" - viash_version: - description: "The Viash version that was used to build the component." - type: "string" - config: - description: "Path to the config used during build." - type: "string" - output: - description: "Folder path to the build artifacts." - type: "string" - platform: - description: "The platform id used during build." - type: "string" - git_commit: - description: "Git commit hash." - type: "string" - executable: - description: "Output folder with main executable path." - type: "string" - required: - - "config" - additionalProperties: false - Functionality: - description: "The functionality-part of the config file describes the behaviour\ - \ of the script in terms of arguments and resources.\nBy specifying a few restrictions\ - \ (e.g. mandatory arguments) and adding some descriptions, Viash will automatically\ - \ generate a stylish command-line interface for you.\n" - type: "object" - properties: - name: - description: "Name of the component and the filename of the executable when\ - \ built with `viash build`." - type: "string" - info: - description: "Structured information. Can be any shape: a string, vector,\ - \ map or even nested map." - type: "object" - version: - description: "Version of the component. This field will be used to version\ - \ the executable and the Docker container." - type: "string" - authors: - description: "A list of @[authors](author). An author must at least have a\ - \ name, but can also have a list of roles, an e-mail address, and a map\ - \ of custom properties.\n\nSuggested values for roles are:\n \n| Role |\ - \ Abbrev. | Description |\n|------|---------|-------------|\n| maintainer\ - \ | mnt | for the maintainer of the code. Ideally, exactly one maintainer\ - \ is specified. |\n| author | aut | for persons who have made substantial\ - \ contributions to the software. |\n| contributor | ctb| for persons who\ - \ have made smaller contributions (such as code patches).\n| datacontributor\ - \ | dtc | for persons or organisations that contributed data sets for the\ - \ software\n| copyrightholder | cph | for all copyright holders. This is\ - \ a legal concept so should use the legal name of an institution or corporate\ - \ body.\n| funder | fnd | for persons or organizations that furnished financial\ - \ support for the development of the software\n\nThe [full list of roles](https://www.loc.gov/marc/relators/relaterm.html)\ - \ is extremely comprehensive.\n" - type: "array" - items: - $ref: "#/definitions/Author" - status: - description: "Allows setting a component to active, deprecated or disabled." - $ref: "#/definitions/Status" - requirements: - description: "@[Computational requirements](computational_requirements) related\ - \ to running the component. \n`cpus` specifies the maximum number of (logical)\ - \ cpus a component is allowed to use., whereas\n`memory` specifies the maximum\ - \ amount of memory a component is allowed to allicate. Memory units must\ - \ be\nin B, KB, MB, GB, TB or PB." 
- $ref: "#/definitions/ComputationalRequirements" - resources: - description: "@[Resources](resources) are files that support the component.\ - \ The first resource should be @[a script](scripting_languages) that will\ - \ be executed when the functionality is run. Additional resources will be\ - \ copied to the same directory.\n\nCommon properties:\n\n * type: `file`\ - \ / `r_script` / `python_script` / `bash_script` / `javascript_script` /\ - \ `scala_script` / `csharp_script`, specifies the type of the resource.\ - \ The first resource cannot be of type `file`. When the type is not specified,\ - \ the default type is simply `file`.\n * dest: filename, the resulting name\ - \ of the resource. From within a script, the file can be accessed at `meta[\"\ - resources_dir\"] + \"/\" + dest`. If unspecified, `dest` will be set to\ - \ the basename of the `path` parameter.\n * path: `path/to/file`, the path\ - \ of the input file. Can be a relative or an absolute path, or a URI. Mutually\ - \ exclusive with `text`.\n * text: ...multiline text..., the content of\ - \ the resulting file specified as a string. Mutually exclusive with `path`.\n\ - \ * is_executable: `true` / `false`, whether the resulting resource file\ - \ should be made executable.\n" - type: "array" - items: - $ref: "#/definitions/Resource" - test_resources: - description: "One or more @[scripts](scripting_languages) to be used to test\ - \ the component behaviour when `viash test` is invoked. Additional files\ - \ of type `file` will be made available only during testing. Each test script\ - \ should expect no command-line inputs, be platform-independent, and return\ - \ an exit code >0 when unexpected behaviour occurs during testing. See @[Unit\ - \ Testing](unit_testing) for more info." - type: "array" - items: - $ref: "#/definitions/Resource" - argument_groups: - description: "A grouping of the @[arguments](argument), used to display the\ - \ help message.\n\n - `name: foo`, the name of the argument group. \n -\ - \ `description: Description of foo`, a description of the argument group.\ - \ Multiline descriptions are supported.\n - `arguments: [arg1, arg2, ...]`,\ - \ list of the arguments names.\n\n" - type: "array" - items: - $ref: "#/definitions/ArgumentGroup" - description: - description: "A description of the component. This will be displayed with\ - \ `--help`." - type: "string" - usage: - description: "A description on how to use the component. This will be displayed\ - \ with `--help` under the 'Usage:' section." - type: "string" - namespace: - description: "Namespace this component is a part of. See the @[Namespaces\ - \ guide](namespace) for more information on namespaces." - type: "string" - arguments: - description: "A list of @[arguments](argument) for this component. For each\ - \ argument, a type and a name must be specified. Depending on the type of\ - \ argument, different properties can be set. See these reference pages per\ - \ type for more information: \n\n - @[string](arg_string)\n - @[file](arg_file)\n\ - \ - @[integer](arg_integer)\n - @[double](arg_double)\n - @[boolean](arg_boolean)\n\ - \ - @[boolean_true](arg_boolean_true)\n - @[boolean_false](arg_boolean_false)\n" - type: "array" - items: - $ref: "#/definitions/Argument" - required: - - "name" - additionalProperties: false - Author: - description: "Author metadata." - type: "object" - properties: - name: - description: "Full name of the author, usually in the name of FirstName MiddleName\ - \ LastName." 
- type: "string" - email: - description: "E-mail of the author." - type: "string" - info: - description: "Structured information. Can be any shape: a string, vector,\ - \ map or even nested map." - type: "object" - roles: - anyOf: - - description: "Role of the author. Suggested items:\n\n* `\"author\"`: Authors\ - \ who have made substantial contributions to the component.\n* `\"maintainer\"\ - `: The maintainer of the component.\n* `\"contributor\"`: Authors who\ - \ have made smaller contributions (such as code patches etc.).\n" - type: "string" - - description: "Role of the author. Suggested items:\n\n* `\"author\"`: Authors\ - \ who have made substantial contributions to the component.\n* `\"maintainer\"\ - `: The maintainer of the component.\n* `\"contributor\"`: Authors who\ - \ have made smaller contributions (such as code patches etc.).\n" - type: "array" - items: - type: "string" - props: - description: "Author properties. Must be a map of strings." - type: "object" - additionalProperties: - description: "Author properties. Must be a map of strings." - type: "string" - required: - - "name" - additionalProperties: false - ComputationalRequirements: - description: "Computational requirements related to running the component." - type: "object" - properties: - cpus: - description: "The maximum number of (logical) cpus a component is allowed\ - \ to use." - type: "integer" - commands: - description: "A list of commands which should be present on the system for\ - \ the script to function." - type: "array" - items: - type: "string" - memory: - description: "The maximum amount of memory a component is allowed to allocate.\ - \ Unit must be one of B, KB, MB, GB, TB or PB." - type: "string" - required: [] - additionalProperties: false - RubyRequirements: - description: "Specify which Ruby packages should be available in order to run\ - \ the component." - type: "object" - properties: - type: - description: "Specify which Ruby packages should be available in order to\ - \ run the component." - const: "ruby" - packages: - anyOf: - - description: "Specifies which packages to install." - type: "string" - - description: "Specifies which packages to install." - type: "array" - items: - type: "string" - required: - - "type" - additionalProperties: false - YumRequirements: - description: "Specify which yum packages should be available in order to run the\ - \ component." - type: "object" - properties: - type: - description: "Specify which yum packages should be available in order to run\ - \ the component." - const: "yum" - packages: - anyOf: - - description: "Specifies which packages to install." - type: "string" - - description: "Specifies which packages to install." - type: "array" - items: - type: "string" - required: - - "type" - additionalProperties: false - JavascriptRequirements: - description: "Specify which JavaScript packages should be available in order to\ - \ run the component." - type: "object" - properties: - github: - anyOf: - - description: "Specifies which packages to install from GitHub." - type: "string" - - description: "Specifies which packages to install from GitHub." - type: "array" - items: - type: "string" - url: - anyOf: - - description: "Specifies which packages to install using a generic URI." - type: "string" - - description: "Specifies which packages to install using a generic URI." - type: "array" - items: - type: "string" - git: - anyOf: - - description: "Specifies which packages to install using a Git URI." 
- type: "string" - - description: "Specifies which packages to install using a Git URI." - type: "array" - items: - type: "string" - npm: - anyOf: - - description: "Specifies which packages to install from npm." - type: "string" - - description: "Specifies which packages to install from npm." - type: "array" - items: - type: "string" - type: - description: "Specify which JavaScript packages should be available in order\ - \ to run the component." - const: "javascript" - packages: - anyOf: - - description: "Specifies which packages to install from npm." - type: "string" - - description: "Specifies which packages to install from npm." - type: "array" - items: - type: "string" - required: - - "type" - additionalProperties: false - DockerRequirements: - description: "Specify which Docker commands should be run during setup." - type: "object" - properties: - run: - anyOf: - - description: "Specifies which `RUN` entries to add to the Dockerfile while\ - \ building it." - type: "string" - - description: "Specifies which `RUN` entries to add to the Dockerfile while\ - \ building it." - type: "array" - items: - type: "string" - label: - anyOf: - - description: "Specifies which `LABEL` entries to add to the Dockerfile while\ - \ building it." - type: "string" - - description: "Specifies which `LABEL` entries to add to the Dockerfile while\ - \ building it." - type: "array" - items: - type: "string" - build_args: - anyOf: - - description: "Specifies which `ARG` entries to add to the Dockerfile while\ - \ building it." - type: "string" - - description: "Specifies which `ARG` entries to add to the Dockerfile while\ - \ building it." - type: "array" - items: - type: "string" - type: - description: "Specify which Docker commands should be run during setup." - const: "docker" - add: - anyOf: - - description: "Specifies which `ADD` entries to add to the Dockerfile while\ - \ building it." - type: "string" - - description: "Specifies which `ADD` entries to add to the Dockerfile while\ - \ building it." - type: "array" - items: - type: "string" - env: - anyOf: - - description: "Specifies which `ENV` entries to add to the Dockerfile while\ - \ building it. Unlike `ARG`, `ENV` entries are also accessible from inside\ - \ the container." - type: "string" - - description: "Specifies which `ENV` entries to add to the Dockerfile while\ - \ building it. Unlike `ARG`, `ENV` entries are also accessible from inside\ - \ the container." - type: "array" - items: - type: "string" - copy: - anyOf: - - description: "Specifies which `COPY` entries to add to the Dockerfile while\ - \ building it." - type: "string" - - description: "Specifies which `COPY` entries to add to the Dockerfile while\ - \ building it." - type: "array" - items: - type: "string" - required: - - "type" - additionalProperties: false - RRequirements: - description: "Specify which R packages should be available in order to run the\ - \ component." - type: "object" - properties: - bioc: - anyOf: - - description: "Specifies which packages to install from BioConductor." - type: "string" - - description: "Specifies which packages to install from BioConductor." - type: "array" - items: - type: "string" - github: - anyOf: - - description: "Specifies which packages to install from GitHub." - type: "string" - - description: "Specifies which packages to install from GitHub." - type: "array" - items: - type: "string" - gitlab: - anyOf: - - description: "Specifies which packages to install from GitLab." 
- type: "string" - - description: "Specifies which packages to install from GitLab." - type: "array" - items: - type: "string" - url: - anyOf: - - description: "Specifies which packages to install using a generic URI." - type: "string" - - description: "Specifies which packages to install using a generic URI." - type: "array" - items: - type: "string" - bioc_force_install: - description: "Forces packages specified in `bioc` to be reinstalled, even\ - \ if they are already present in the container. Default: false." - type: "boolean" - git: - anyOf: - - description: "Specifies which packages to install using a Git URI." - type: "string" - - description: "Specifies which packages to install using a Git URI." - type: "array" - items: - type: "string" - cran: - anyOf: - - description: "Specifies which packages to install from CRAN." - type: "string" - - description: "Specifies which packages to install from CRAN." - type: "array" - items: - type: "string" - bitbucket: - anyOf: - - description: "Specifies which packages to install from Bitbucket." - type: "string" - - description: "Specifies which packages to install from Bitbucket." - type: "array" - items: - type: "string" - svn: - anyOf: - - description: "Specifies which packages to install using an SVN URI." - type: "string" - - description: "Specifies which packages to install using an SVN URI." - type: "array" - items: - type: "string" - packages: - anyOf: - - description: "Specifies which packages to install from CRAN." - type: "string" - - description: "Specifies which packages to install from CRAN." - type: "array" - items: - type: "string" - script: - anyOf: - - description: "Specifies a code block to run as part of the build." - type: "string" - - description: "Specifies a code block to run as part of the build." - type: "array" - items: - type: "string" - type: - description: "Specify which R packages should be available in order to run\ - \ the component." - const: "r" - required: - - "type" - additionalProperties: false - ApkRequirements: - description: "Specify which apk packages should be available in order to run the\ - \ component." - type: "object" - properties: - type: - description: "Specify which apk packages should be available in order to run\ - \ the component." - const: "apk" - packages: - anyOf: - - description: "Specifies which packages to install." - type: "string" - - description: "Specifies which packages to install." - type: "array" - items: - type: "string" - required: - - "type" - additionalProperties: false - PythonRequirements: - description: "Specify which Python packages should be available in order to run\ - \ the component." - type: "object" - properties: - github: - anyOf: - - description: "Specifies which packages to install from GitHub." - type: "string" - - description: "Specifies which packages to install from GitHub." - type: "array" - items: - type: "string" - gitlab: - anyOf: - - description: "Specifies which packages to install from GitLab." - type: "string" - - description: "Specifies which packages to install from GitLab." - type: "array" - items: - type: "string" - pip: - anyOf: - - description: "Specifies which packages to install from pip." - type: "string" - - description: "Specifies which packages to install from pip." - type: "array" - items: - type: "string" - pypi: - anyOf: - - description: "Specifies which packages to install from PyPI using pip." - type: "string" - - description: "Specifies which packages to install from PyPI using pip." 
- type: "array" - items: - type: "string" - git: - anyOf: - - description: "Specifies which packages to install using a Git URI." - type: "string" - - description: "Specifies which packages to install using a Git URI." - type: "array" - items: - type: "string" - upgrade: - description: "Sets the `--upgrade` flag when set to true. Default: true." - type: "boolean" - packages: - anyOf: - - description: "Specifies which packages to install from pip." - type: "string" - - description: "Specifies which packages to install from pip." - type: "array" - items: - type: "string" - url: - anyOf: - - description: "Specifies which packages to install using a generic URI." - type: "string" - - description: "Specifies which packages to install using a generic URI." - type: "array" - items: - type: "string" - svn: - anyOf: - - description: "Specifies which packages to install using an SVN URI." - type: "string" - - description: "Specifies which packages to install using an SVN URI." - type: "array" - items: - type: "string" - bazaar: - anyOf: - - description: "Specifies which packages to install using a Bazaar URI." - type: "string" - - description: "Specifies which packages to install using a Bazaar URI." - type: "array" - items: - type: "string" - script: - anyOf: - - description: "Specifies a code block to run as part of the build." - type: "string" - - description: "Specifies a code block to run as part of the build." - type: "array" - items: - type: "string" - type: - description: "Specify which Python packages should be available in order to\ - \ run the component." - const: "python" - mercurial: - anyOf: - - description: "Specifies which packages to install using a Mercurial URI." - type: "string" - - description: "Specifies which packages to install using a Mercurial URI." - type: "array" - items: - type: "string" - user: - description: "Sets the `--user` flag when set to true. Default: false." - type: "boolean" - required: - - "type" - additionalProperties: false - AptRequirements: - description: "Specify which apt packages should be available in order to run the\ - \ component." - type: "object" - properties: - interactive: - description: "If `false`, the Debian frontend is set to non-interactive (recommended).\ - \ Default: false." - type: "boolean" - type: - description: "Specify which apt packages should be available in order to run\ - \ the component." - const: "apt" - packages: - anyOf: - - description: "Specifies which packages to install." - type: "string" - - description: "Specifies which packages to install." - type: "array" - items: - type: "string" - required: - - "type" - additionalProperties: false - Requirements: - anyOf: - - $ref: "#/definitions/RubyRequirements" - - $ref: "#/definitions/YumRequirements" - - $ref: "#/definitions/JavascriptRequirements" - - $ref: "#/definitions/DockerRequirements" - - $ref: "#/definitions/RRequirements" - - $ref: "#/definitions/ApkRequirements" - - $ref: "#/definitions/PythonRequirements" - - $ref: "#/definitions/AptRequirements" - StringArgument: - description: "A `string` type argument has a value made up of an ordered sequences\ - \ of characters, like \"Hello\" or \"I'm a string\"." - type: "object" - properties: - alternatives: - anyOf: - - description: "List of alternative format variations for this argument." - type: "string" - - description: "List of alternative format variations for this argument." - type: "array" - items: - type: "string" - name: - description: "The name of the argument. Can be in the formats `--foo`, `-f`\ - \ or `foo`. 
-   StringArgument:
-     description: "A `string` type argument has a value made up of an ordered sequence of characters, like \"Hello\" or \"I'm a string\"."
-     type: "object"
-     properties:
-       alternatives:
-         anyOf:
-         - description: "List of alternative format variations for this argument."
-           type: "string"
-         - description: "List of alternative format variations for this argument."
-           type: "array"
-           items:
-             type: "string"
-       name:
-         description: "The name of the argument. Can be in the formats `--foo`, `-f` or `foo`. The number of dashes determines how values can be passed: \n\n - `--foo` is a long option, which can be passed with `executable_name --foo=value` or `executable_name --foo value`\n - `-f` is a short option, which can be passed with `executable_name -f value`\n - `foo` is an argument, which can be passed with `executable_name value` \n"
-         type: "string"
-       choices:
-         description: "Limit the number of valid values for this argument to those set in this list. When set and a value not present in the list is provided, an error will be produced."
-         type: "array"
-         items:
-           type: "string"
-       info:
-         description: "Structured information. Can be any shape: a string, vector, map or even nested map."
-         type: "object"
-       default:
-         anyOf:
-         - description: "The default value when no argument value is provided. This will not work if the [`required`](#required) property is enabled."
-           type: "string"
-         - description: "The default value when no argument value is provided. This will not work if the [`required`](#required) property is enabled."
-           type: "array"
-           items:
-             type: "string"
-       example:
-         anyOf:
-         - description: "An example value for this argument. If no [`default`](#default) property was specified, this will be used for that purpose."
-           type: "string"
-         - description: "An example value for this argument. If no [`default`](#default) property was specified, this will be used for that purpose."
-           type: "array"
-           items:
-             type: "string"
-       description:
-         description: "A description of the argument. This will be displayed with `--help`."
-         type: "string"
-       multiple_sep:
-         description: "The delimiter character for providing [`multiple`](#multiple) values. `:` by default."
-         type: "string"
-       multiple:
-         description: "Treat the argument value as an array. Arrays can be passed using the delimiter `--foo=1:2:3` or by providing the same argument multiple times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep) property. `false` by default."
-         type: "boolean"
-       type:
-         description: "A `string` type argument has a value made up of an ordered sequence of characters, like \"Hello\" or \"I'm a string\"."
-         const: "string"
-       required:
-         description: "Make the value for this argument required. If set to `true`, an error will be produced if no value was provided. `false` by default."
-         type: "boolean"
-     required:
-     - "name"
-     - "type"
-     additionalProperties: false
-   BooleanArgument:
-     description: "A `boolean` type argument has two possible values: `true` or `false`."
-     type: "object"
-     properties:
-       alternatives:
-         anyOf:
-         - description: "List of alternative format variations for this argument."
-           type: "string"
-         - description: "List of alternative format variations for this argument."
-           type: "array"
-           items:
-             type: "string"
-       name:
-         description: "The name of the argument. Can be in the formats `--trim`, `-t` or `trim`. The number of dashes determines how values can be passed: \n\n - `--trim` is a long option, which can be passed with `executable_name --trim`\n - `-t` is a short option, which can be passed with `executable_name -t`\n - `trim` is an argument, which can be passed with `executable_name trim` \n"
-         type: "string"
-       info:
-         description: "Structured information. Can be any shape: a string, vector, map or even nested map."
-         type: "object"
-       default:
-         anyOf:
-         - description: "The default value when no argument value is provided. 
This\ - \ will not work if the [`required`](#required) property is enabled." - type: "boolean" - - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - type: "array" - items: - type: "boolean" - example: - anyOf: - - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - type: "boolean" - - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - type: "array" - items: - type: "boolean" - description: - description: "A description of the argument. This will be displayed with `--help`." - type: "string" - multiple_sep: - description: "The delimiter character for providing [`multiple`](#multiple)\ - \ values. `:` by default." - type: "string" - multiple: - description: "Treat the argument value as an array. Arrays can be passed using\ - \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ - \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ - \ property. `false` by default." - type: "boolean" - type: - description: "A `boolean` type argument has two possible values: `true` or\ - \ `false`." - const: "boolean" - required: - description: "Make the value for this argument required. If set to `true`,\ - \ an error will be produced if no value was provided. `false` by default." - type: "boolean" - required: - - "name" - - "type" - additionalProperties: false - BooleanTrueArgument: - description: "An argument of the `boolean_true` type acts like a `boolean` flag\ - \ with a default value of `false`. When called as an argument it sets the `boolean`\ - \ to `true`." - type: "object" - properties: - alternatives: - anyOf: - - description: "List of alternative format variations for this argument." - type: "string" - - description: "List of alternative format variations for this argument." - type: "array" - items: - type: "string" - name: - description: "The name of the argument. Can be in the formats `--silent`,\ - \ `-s` or `silent`. The number of dashes determines how values can be passed:\ - \ \n\n - `--silent` is a long option, which can be passed with `executable_name\ - \ --silent`\n - `-s` is a short option, which can be passed with `executable_name\ - \ -s`\n - `silent` is an argument, which can be passed with `executable_name\ - \ silent` \n" - type: "string" - info: - description: "Structured information. Can be any shape: a string, vector,\ - \ map or even nested map." - type: "object" - description: - description: "A description of the argument. This will be displayed with `--help`." - type: "string" - type: - description: "An argument of the `boolean_true` type acts like a `boolean`\ - \ flag with a default value of `false`. When called as an argument it sets\ - \ the `boolean` to `true`." - const: "boolean_true" - required: - - "name" - - "type" - additionalProperties: false - IntegerArgument: - description: "An `integer` type argument has a numeric value without decimal points." - type: "object" - properties: - alternatives: - anyOf: - - description: "List of alternative format variations for this argument." - type: "string" - - description: "List of alternative format variations for this argument." - type: "array" - items: - type: "string" - name: - description: "The name of the argument. Can be in the formats `--foo`, `-f`\ - \ or `foo`. 
The number of dashes determines how values can be passed: \n\n - `--foo` is a long option, which can be passed with `executable_name --foo=value` or `executable_name --foo value`\n - `-f` is a short option, which can be passed with `executable_name -f value`\n - `foo` is an argument, which can be passed with `executable_name value` \n"
-         type: "string"
-       choices:
-         description: "Limit the number of valid values for this argument to those set in this list. When set and a value not present in the list is provided, an error will be produced."
-         type: "array"
-         items:
-           type: "integer"
-       info:
-         description: "Structured information. Can be any shape: a string, vector, map or even nested map."
-         type: "object"
-       max:
-         description: "Maximum allowed value for this argument. If set and the provided value is higher than the maximum, an error will be produced. Can be combined with [`min`](#min) to clamp values."
-         type: "integer"
-       default:
-         anyOf:
-         - description: "The default value when no argument value is provided. This will not work if the [`required`](#required) property is enabled."
-           type: "integer"
-         - description: "The default value when no argument value is provided. This will not work if the [`required`](#required) property is enabled."
-           type: "array"
-           items:
-             type: "integer"
-       example:
-         anyOf:
-         - description: "An example value for this argument. If no [`default`](#default) property was specified, this will be used for that purpose."
-           type: "integer"
-         - description: "An example value for this argument. If no [`default`](#default) property was specified, this will be used for that purpose."
-           type: "array"
-           items:
-             type: "integer"
-       description:
-         description: "A description of the argument. This will be displayed with `--help`."
-         type: "string"
-       multiple_sep:
-         description: "The delimiter character for providing [`multiple`](#multiple) values. `:` by default."
-         type: "string"
-       min:
-         description: "Minimum allowed value for this argument. If set and the provided value is lower than the minimum, an error will be produced. Can be combined with [`max`](#max) to clamp values."
-         type: "integer"
-       multiple:
-         description: "Treat the argument value as an array. Arrays can be passed using the delimiter `--foo=1:2:3` or by providing the same argument multiple times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep) property. `false` by default."
-         type: "boolean"
-       type:
-         description: "An `integer` type argument has a numeric value without decimal points."
-         const: "integer"
-       required:
-         description: "Make the value for this argument required. If set to `true`, an error will be produced if no value was provided. `false` by default."
-         type: "boolean"
-     required:
-     - "name"
-     - "type"
-     additionalProperties: false
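A hypothetical `integer` argument exercising the `min`, `max` and `default` fields validated above (the name and values are illustrative):

    arguments:
      - type: integer
        name: "--n_neighbors"
        description: "Number of neighbors to use."
        min: 1        # values below 1 produce an error
        max: 100      # values above 100 produce an error
        default: 15   # used when the argument is omitted
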
-   LongArgument:
-     description: "A `long` type argument has a numeric value without decimal points."
-     type: "object"
-     properties:
-       alternatives:
-         anyOf:
-         - description: "List of alternative format variations for this argument."
-           type: "string"
-         - description: "List of alternative format variations for this argument."
-           type: "array"
-           items:
-             type: "string"
-       name:
-         description: "The name of the argument. Can be in the formats `--foo`, `-f` or `foo`. The number of dashes determines how values can be passed: \n\n - `--foo` is a long option, which can be passed with `executable_name --foo=value` or `executable_name --foo value`\n - `-f` is a short option, which can be passed with `executable_name -f value`\n - `foo` is an argument, which can be passed with `executable_name value` \n"
-         type: "string"
-       choices:
-         description: "Limit the number of valid values for this argument to those set in this list. When set and a value not present in the list is provided, an error will be produced."
-         type: "array"
-         items:
-           type: "integer"
-       info:
-         description: "Structured information. Can be any shape: a string, vector, map or even nested map."
-         type: "object"
-       max:
-         description: "Maximum allowed value for this argument. If set and the provided value is higher than the maximum, an error will be produced. Can be combined with [`min`](#min) to clamp values."
-         type: "integer"
-       default:
-         anyOf:
-         - description: "The default value when no argument value is provided. This will not work if the [`required`](#required) property is enabled."
-           type: "integer"
-         - description: "The default value when no argument value is provided. This will not work if the [`required`](#required) property is enabled."
-           type: "array"
-           items:
-             type: "integer"
-       example:
-         anyOf:
-         - description: "An example value for this argument. If no [`default`](#default) property was specified, this will be used for that purpose."
-           type: "integer"
-         - description: "An example value for this argument. If no [`default`](#default) property was specified, this will be used for that purpose."
-           type: "array"
-           items:
-             type: "integer"
-       description:
-         description: "A description of the argument. This will be displayed with `--help`."
-         type: "string"
-       multiple_sep:
-         description: "The delimiter character for providing [`multiple`](#multiple) values. `:` by default."
-         type: "string"
-       min:
-         description: "Minimum allowed value for this argument. If set and the provided value is lower than the minimum, an error will be produced. Can be combined with [`max`](#max) to clamp values."
-         type: "integer"
-       multiple:
-         description: "Treat the argument value as an array. Arrays can be passed using the delimiter `--foo=1:2:3` or by providing the same argument multiple times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep) property. `false` by default."
-         type: "boolean"
-       type:
-         description: "A `long` type argument has a numeric value without decimal points."
-         const: "long"
-       required:
-         description: "Make the value for this argument required. If set to `true`, an error will be produced if no value was provided. `false` by default."
-         type: "boolean"
-     required:
-     - "name"
-     - "type"
-     additionalProperties: false
-   BooleanFalseArgument:
-     description: "An argument of the `boolean_false` type acts like an inverted `boolean` flag with a default value of `true`. When called as an argument it sets the `boolean` to `false`."
-     type: "object"
-     properties:
-       alternatives:
-         anyOf:
-         - description: "List of alternative format variations for this argument."
-           type: "string"
-         - description: "List of alternative format variations for this argument."
-           type: "array"
-           items:
-             type: "string"
-       name:
-         description: "The name of the argument. Can be in the formats `--no-log`, `-n` or `no-log`. 
The number of dashes determines how values can be passed:\ - \ \n\n - `--no-log` is a long option, which can be passed with `executable_name\ - \ --no-log`\n - `-n` is a short option, which can be passed with `executable_name\ - \ -n`\n - `no-log` is an argument, which can be passed with `executable_name\ - \ no-log` \n" - type: "string" - info: - description: "Structured information. Can be any shape: a string, vector,\ - \ map or even nested map." - type: "object" - description: - description: "A description of the argument. This will be displayed with `--help`." - type: "string" - type: - description: "An argument of the `boolean_false` type acts like an inverted\ - \ `boolean` flag with a default value of `true`. When called as an argument\ - \ it sets the `boolean` to `false`." - const: "boolean_false" - required: - - "name" - - "type" - additionalProperties: false - DoubleArgument: - description: "A `double` type argument has a numeric value with decimal points" - type: "object" - properties: - alternatives: - anyOf: - - description: "List of alternative format variations for this argument." - type: "string" - - description: "List of alternative format variations for this argument." - type: "array" - items: - type: "string" - name: - description: "The name of the argument. Can be in the formats `--foo`, `-f`\ - \ or `foo`. The number of dashes determines how values can be passed: \n\ - \n - `--foo` is a long option, which can be passed with `executable_name\ - \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ - \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ - \ which can be passed with `executable_name value` \n" - type: "string" - info: - description: "Structured information. Can be any shape: a string, vector,\ - \ map or even nested map." - type: "object" - max: - description: "Maximum allowed value for this argument. If set and the provided\ - \ value is higher than the maximum, an error will be produced. Can be combined\ - \ with [`min`](#min) to clamp values." - type: "number" - default: - anyOf: - - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - type: "number" - - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - type: "array" - items: - type: "number" - example: - anyOf: - - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - type: "number" - - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - type: "array" - items: - type: "number" - description: - description: "A description of the argument. This will be displayed with `--help`." - type: "string" - multiple_sep: - description: "The delimiter character for providing [`multiple`](#multiple)\ - \ values. `:` by default." - type: "string" - min: - description: "Minimum allowed value for this argument. If set and the provided\ - \ value is lower than the minimum, an error will be produced. Can be combined\ - \ with [`max`](#max) to clamp values." - type: "number" - multiple: - description: "Treat the argument value as an array. Arrays can be passed using\ - \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ - \ times `--foo 1 --foo 2`. 
You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ - \ property. `false` by default." - type: "boolean" - type: - description: "A `double` type argument has a numeric value with decimal points" - const: "double" - required: - description: "Make the value for this argument required. If set to `true`,\ - \ an error will be produced if no value was provided. `false` by default." - type: "boolean" - required: - - "name" - - "type" - additionalProperties: false - FileArgument: - description: "A `file` type argument has a string value that points to a file\ - \ or folder path." - type: "object" - properties: - alternatives: - anyOf: - - description: "List of alternative format variations for this argument." - type: "string" - - description: "List of alternative format variations for this argument." - type: "array" - items: - type: "string" - name: - description: "The name of the argument. Can be in the formats `--foo`, `-f`\ - \ or `foo`. The number of dashes determines how values can be passed: \n\ - \n - `--foo` is a long option, which can be passed with `executable_name\ - \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ - \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ - \ which can be passed with `executable_name value` \n" - type: "string" - create_parent: - description: "If the output filename is a path and it does not exist, create\ - \ it before executing the script (only for `direction: output`)." - type: "boolean" - direction: - description: "Makes this argument an `input` or an `output`, as in does the\ - \ file/folder needs to be read or written. `input` by default." - $ref: "#/definitions/Direction" - info: - description: "Structured information. Can be any shape: a string, vector,\ - \ map or even nested map." - type: "object" - must_exist: - description: "Checks whether the file or folder exists. For input files, this\ - \ check will happen before the execution of the script, while for output\ - \ files the check will happen afterwards." - type: "boolean" - default: - anyOf: - - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - type: "string" - - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - type: "array" - items: - type: "string" - example: - anyOf: - - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - type: "string" - - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - type: "array" - items: - type: "string" - description: - description: "A description of the argument. This will be displayed with `--help`." - type: "string" - multiple_sep: - description: "The delimiter character for providing [`multiple`](#multiple)\ - \ values. `:` by default." - type: "string" - multiple: - description: "Treat the argument value as an array. Arrays can be passed using\ - \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ - \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ - \ property. `false` by default." - type: "boolean" - type: - description: "A `file` type argument has a string value that points to a file\ - \ or folder path." 
- const: "file" - required: - description: "Make the value for this argument required. If set to `true`,\ - \ an error will be produced if no value was provided. `false` by default." - type: "boolean" - required: - - "name" - - "type" - additionalProperties: false - Argument: - anyOf: - - $ref: "#/definitions/StringArgument" - - $ref: "#/definitions/BooleanArgument" - - $ref: "#/definitions/BooleanTrueArgument" - - $ref: "#/definitions/IntegerArgument" - - $ref: "#/definitions/LongArgument" - - $ref: "#/definitions/BooleanFalseArgument" - - $ref: "#/definitions/DoubleArgument" - - $ref: "#/definitions/FileArgument" - ArgumentGroup: - type: "object" - properties: - name: - description: "The name of the argument group." - type: "string" - description: - description: "A description of the argument group. Multiline descriptions\ - \ are supported." - type: "string" - arguments: - description: "List of the arguments names." - type: "array" - items: - $ref: "#/definitions/Argument" - required: - - "name" - - "arguments" - additionalProperties: false - JavaScriptScript: - description: "An executable JavaScript script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." - type: "boolean" - type: - description: "An executable JavaScript script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." - const: "javascript_script" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "type" - additionalProperties: false - CSharpScript: - description: "An executable C# script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." - type: "boolean" - type: - description: "An executable C# script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." 
- const: "csharp_script" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "type" - additionalProperties: false - Executable: - description: "An executable file." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." - type: "boolean" - type: - description: "An executable file." - const: "executable" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "type" - additionalProperties: false - ScalaScript: - description: "An executable Scala script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." - type: "boolean" - type: - description: "An executable Scala script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." - const: "scala_script" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "type" - additionalProperties: false - NextflowScript: - description: "A Nextflow script. Work in progress; added mainly for annotation\ - \ at the moment." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - entrypoint: - description: "The name of the workflow to be executed." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." - type: "boolean" - type: - description: "A Nextflow script. Work in progress; added mainly for annotation\ - \ at the moment." - const: "nextflow_script" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. 
If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "type" - additionalProperties: false - PlainFile: - description: "A plain file. This can only be used as a supporting resource for\ - \ the main script or unit tests." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." - type: "boolean" - type: - description: "A plain file. This can only be used as a supporting resource\ - \ for the main script or unit tests." - const: "file" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "path" - additionalProperties: false - BashScript: - description: "An executable Bash script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." - type: "boolean" - type: - description: "An executable Bash script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." - const: "bash_script" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "type" - additionalProperties: false - PythonScript: - description: "An executable Python script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." 
- type: "boolean" - type: - description: "An executable Python script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." - const: "python_script" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "type" - additionalProperties: false - RScript: - description: "An executable R script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." - type: "boolean" - type: - description: "An executable R script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." - const: "r_script" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "type" - additionalProperties: false - Resource: - anyOf: - - $ref: "#/definitions/JavaScriptScript" - - $ref: "#/definitions/CSharpScript" - - $ref: "#/definitions/Executable" - - $ref: "#/definitions/ScalaScript" - - $ref: "#/definitions/NextflowScript" - - $ref: "#/definitions/PlainFile" - - $ref: "#/definitions/BashScript" - - $ref: "#/definitions/PythonScript" - - $ref: "#/definitions/RScript" - NextflowDirectives: - description: "Directives are optional settings that affect the execution of the\ - \ process.\n" - type: "object" - properties: - beforeScript: - description: "The `beforeScript` directive allows you to execute a custom\ - \ (Bash) snippet before the main process script is run. 
This may be useful to initialise the underlying cluster environment or for other custom initialisation.\n\nSee [`beforeScript`](https://www.nextflow.io/docs/latest/process.html#beforeScript).\n"
-         type: "string"
-       module:
-         anyOf:
-         - description: "Environment Modules is a package manager that allows you to dynamically configure your execution environment and easily switch between multiple versions of the same software tool.\n\nIf it is available in your system you can use it with Nextflow in order to configure the process execution environment in your pipeline.\n\nIn a process definition you can use the `module` directive to load a specific module version to be used in the process execution environment.\n\nSee [`module`](https://www.nextflow.io/docs/latest/process.html#module).\n"
-           type: "string"
-         - description: "Environment Modules is a package manager that allows you to dynamically configure your execution environment and easily switch between multiple versions of the same software tool.\n\nIf it is available in your system you can use it with Nextflow in order to configure the process execution environment in your pipeline.\n\nIn a process definition you can use the `module` directive to load a specific module version to be used in the process execution environment.\n\nSee [`module`](https://www.nextflow.io/docs/latest/process.html#module).\n"
-           type: "array"
-           items:
-             type: "string"
-       queue:
-         anyOf:
-         - description: "The `queue` directive allows you to set the queue where jobs are scheduled when using a grid based executor in your pipeline.\n\nSee [`queue`](https://www.nextflow.io/docs/latest/process.html#queue).\n"
-           type: "string"
-         - description: "The `queue` directive allows you to set the queue where jobs are scheduled when using a grid based executor in your pipeline.\n\nSee [`queue`](https://www.nextflow.io/docs/latest/process.html#queue).\n"
-           type: "array"
-           items:
-             type: "string"
-       label:
-         anyOf:
-         - description: "The `label` directive allows the annotation of processes with a mnemonic identifier of your choice.\n\nSee [`label`](https://www.nextflow.io/docs/latest/process.html#label).\n"
-           type: "string"
-         - description: "The `label` directive allows the annotation of processes with a mnemonic identifier of your choice.\n\nSee [`label`](https://www.nextflow.io/docs/latest/process.html#label).\n"
-           type: "array"
-           items:
-             type: "string"
-       container:
-         anyOf:
-         - description: "The `container` directive allows you to execute the process script in a Docker container.\n\nIt requires the Docker daemon to be running on the machine where the pipeline is executed, i.e. the local machine when using the local executor or the cluster nodes when the pipeline is deployed through a grid executor.\n\nViash allows either a string value or a map. In case a map is used, the allowed keys are: `registry`, `image`, and `tag`. The `image` value must be specified.\n\nSee [`container`](https://www.nextflow.io/docs/latest/process.html#container).\n"
-           type: "object"
-           additionalProperties:
-             description: "The `container` directive allows you to execute the process script in a Docker container.\n\nIt requires the Docker daemon to be running on the machine where the pipeline is executed, i.e. the local machine when using the local executor or the cluster nodes when the pipeline is deployed through a grid executor.\n\nViash allows either a string value or a map. In case a map is used, the allowed keys are: `registry`, `image`, and `tag`. The `image` value must be specified.\n\nSee [`container`](https://www.nextflow.io/docs/latest/process.html#container).\n"
-             type: "string"
-         - description: "The `container` directive allows you to execute the process script in a Docker container.\n\nIt requires the Docker daemon to be running on the machine where the pipeline is executed, i.e. the local machine when using the local executor or the cluster nodes when the pipeline is deployed through a grid executor.\n\nViash allows either a string value or a map. In case a map is used, the allowed keys are: `registry`, `image`, and `tag`. The `image` value must be specified.\n\nSee [`container`](https://www.nextflow.io/docs/latest/process.html#container).\n"
-           type: "string"
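A sketch of the map form of the `container` directive described above (registry, image and tag are placeholders):

    directives:
      container:
        registry: ghcr.io
        image: my_org/my_image   # `image` is the only required key
        tag: "1.0"
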
-       publishDir:
-         anyOf:
-         - anyOf:
-           - description: "The `publishDir` directive allows you to publish the process output files to a specified folder.\n\nViash implements this directive as a plain string or a map. The allowed keywords for the map are: `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The `path` key and value are required.\nThe allowed values for `mode` are: `symlink`, `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n"
-             type: "string"
-           - description: "The `publishDir` directive allows you to publish the process output files to a specified folder.\n\nViash implements this directive as a plain string or a map. The allowed keywords for the map are: `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The `path` key and value are required.\nThe allowed values for `mode` are: `symlink`, `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n"
-             type: "object"
-             additionalProperties:
-               description: "The `publishDir` directive allows you to publish the process output files to a specified folder.\n\nViash implements this directive as a plain string or a map. The allowed keywords for the map are: `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The `path` key and value are required.\nThe allowed values for `mode` are: `symlink`, `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n"
-               type: "string"
-         - description: "The `publishDir` directive allows you to publish the process output files to a specified folder.\n\nViash implements this directive as a plain string or a map. The allowed keywords for the map are: `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The `path` key and value are required.\nThe allowed values for `mode` are: `symlink`, `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n"
-           type: "array"
-           items:
-             anyOf:
-             - description: "The `publishDir` directive allows you to publish the process output files to a specified folder.\n\nViash implements this directive as a plain string or a map. The allowed keywords for the map are: `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The `path` key and value are required.\nThe allowed values for `mode` are: `symlink`, `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n"
-               type: "string"
-             - description: "The `publishDir` directive allows you to publish the process output files to a specified folder.\n\nViash implements this directive as a plain string or a map. The allowed keywords for the map are: `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The `path` key and value are required.\nThe allowed values for `mode` are: `symlink`, `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n"
-               type: "object"
-               additionalProperties:
-                 description: "The `publishDir` directive allows you to publish the process output files to a specified folder.\n\nViash implements this directive as a plain string or a map. The allowed keywords for the map are: `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The `path` key and value are required.\nThe allowed values for `mode` are: `symlink`, `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n"
-                 type: "string"
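The two accepted shapes of `publishDir` described above, sketched with placeholder paths:

    # as a plain string:
    directives:
      publishDir: /path/to/output

    # or as a map (only `path` is required):
    directives:
      publishDir:
        path: /path/to/output
        mode: copy
        overwrite: true
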
-       maxForks:
-         anyOf:
-         - description: "The `maxForks` directive allows you to define the maximum number of process instances that can be executed in parallel. By default this value is equal to the number of CPU cores available minus 1.\n\nIf you want to execute a process in a sequential manner, set this directive to one.\n\nSee [`maxForks`](https://www.nextflow.io/docs/latest/process.html#maxforks).\n"
-           type: "string"
-         - description: "The `maxForks` directive allows you to define the maximum number of process instances that can be executed in parallel. By default this value is equal to the number of CPU cores available minus 1.\n\nIf you want to execute a process in a sequential manner, set this directive to one.\n\nSee [`maxForks`](https://www.nextflow.io/docs/latest/process.html#maxforks).\n"
-           type: "integer"
-       maxErrors:
-         anyOf:
-         - description: "The `maxErrors` directive allows you to specify the maximum number of times a process can fail when using the `retry` error strategy. By default this directive is disabled.\n\nSee [`maxErrors`](https://www.nextflow.io/docs/latest/process.html#maxerrors).\n"
-           type: "string"
-         - description: "The `maxErrors` directive allows you to specify the maximum number of times a process can fail when using the `retry` error strategy. By default this directive is disabled.\n\nSee [`maxErrors`](https://www.nextflow.io/docs/latest/process.html#maxerrors).\n"
-           type: "integer"
-       cpus:
-         anyOf:
-         - description: "The `cpus` directive allows you to define the number of (logical) CPUs required by the process' task.\n\nSee [`cpus`](https://www.nextflow.io/docs/latest/process.html#cpus).\n"
-           type: "integer"
-         - description: "The `cpus` directive allows you to define the number of (logical) CPUs required by the process' task.\n\nSee [`cpus`](https://www.nextflow.io/docs/latest/process.html#cpus).\n"
-           type: "string"
-       accelerator:
-         description: "The `accelerator` directive allows you to specify the hardware accelerator requirement for the task execution e.g. GPU processor.\n\nViash implements this directive as a map with accepted keywords: `type`, `limit`, `request`, and `runtime`.\n\nSee [`accelerator`](https://www.nextflow.io/docs/latest/process.html#accelerator).\n"
-         type: "object"
-         additionalProperties:
-           description: "The `accelerator` directive allows you to specify the hardware accelerator requirement for the task execution e.g. GPU processor.\n\nViash implements this directive as a map with accepted keywords: `type`, `limit`, `request`, and `runtime`.\n\nSee [`accelerator`](https://www.nextflow.io/docs/latest/process.html#accelerator).\n"
-           type: "string"
-       time:
-         description: "The `time` directive allows you to define how long a process is allowed to run.\n\nSee [`time`](https://www.nextflow.io/docs/latest/process.html#time).\n"
-         type: "string"
-       afterScript:
-         description: "The `afterScript` directive allows you to execute a custom (Bash) snippet immediately after the main process has run. This may be useful to clean up your staging area.\n\nSee [`afterScript`](https://www.nextflow.io/docs/latest/process.html#afterscript).\n"
-         type: "string"
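A combined sketch of the scheduling directives just defined; all values are illustrative:

    directives:
      cpus: 4
      time: "2h"
      maxForks: 10   # at most 10 parallel instances
      maxErrors: 5   # only meaningful with the `retry` error strategy
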
-       executor:
-         description: "The `executor` defines the underlying system where processes are executed. By default a process uses the executor defined globally in the nextflow.config file.\n\nThe `executor` directive allows you to configure what executor has to be used by the process, overriding the default configuration. The following values can be used:\n\n| Name | Description |\n|------|----------|\n| awsbatch | The process is executed using the AWS Batch service. | \n| azurebatch | The process is executed using the Azure Batch service. | \n| condor | The process is executed using the HTCondor job scheduler. | \n| google-lifesciences | The process is executed using the Google Genomics Pipelines service. | \n| ignite | The process is executed using the Apache Ignite cluster. | \n| k8s | The process is executed using the Kubernetes cluster. | \n| local | The process is executed in the computer where Nextflow is launched. | \n| lsf | The process is executed using the Platform LSF job scheduler. | \n| moab | The process is executed using the Moab job scheduler. | \n| nqsii | The process is executed using the NQSII job scheduler. | \n| oge | Alias for the sge executor. | \n| pbs | The process is executed using the PBS/Torque job scheduler. | \n| pbspro | The process is executed using the PBS Pro job scheduler. | \n| sge | The process is executed using the Sun Grid Engine / Open Grid Engine. | \n| slurm | The process is executed using the SLURM job scheduler. | \n| tes | The process is executed using the GA4GH TES service. | \n| uge | Alias for the sge executor. |\n\nSee [`executor`](https://www.nextflow.io/docs/latest/process.html#executor).\n"
-         type: "string"
-       containerOptions:
-         anyOf:
-         - description: "The `containerOptions` directive allows you to specify any container execution option supported by the underlying container engine (i.e. Docker, Singularity, etc.). This can be useful to provide container settings only for a specific process e.g. mount a custom path.\n\nSee [`containerOptions`](https://www.nextflow.io/docs/latest/process.html#containeroptions).\n"
-           type: "string"
-         - description: "The `containerOptions` directive allows you to specify any container execution option supported by the underlying container engine (i.e. Docker, Singularity, etc.). This can be useful to provide container settings only for a specific process e.g. mount a custom path.\n\nSee [`containerOptions`](https://www.nextflow.io/docs/latest/process.html#containeroptions).\n"
-           type: "array"
-           items:
-             type: "string"
-       disk:
-         description: "The `disk` directive allows you to define how much local disk storage the process is allowed to use.\n\nSee [`disk`](https://www.nextflow.io/docs/latest/process.html#disk).\n"
-         type: "string"
-       tag:
-         description: "The `tag` directive allows you to associate each process execution with a custom label, so that it will be easier to identify them in the log file or in the trace execution report.\n\nSee [`tag`](https://www.nextflow.io/docs/latest/process.html#tag).\n"
-         type: "string"
-       conda:
-         anyOf:
-         - description: "The `conda` directive allows for the definition of the process dependencies using the Conda package manager.\n\nNextflow automatically sets up an environment for the given package names listed in the `conda` directive.\n\nSee [`conda`](https://www.nextflow.io/docs/latest/process.html#conda).\n"
-           type: "string"
-         - description: "The `conda` directive allows for the definition of the process dependencies using the Conda package manager.\n\nNextflow automatically sets up an environment for the given package names listed in the `conda` directive.\n\nSee [`conda`](https://www.nextflow.io/docs/latest/process.html#conda).\n"
-           type: "array"
-           items:
-             type: "string"
-       machineType:
-         description: "The `machineType` can be used to specify a predefined Google Compute Platform machine type when running using the Google Life Sciences executor.\n\nSee [`machineType`](https://www.nextflow.io/docs/latest/process.html#machinetype).\n"
-         type: "string"
-       stageInMode:
-         description: "The `stageInMode` directive defines how input files are staged-in to the process work directory. 
The following values are allowed:\n\n|\ - \ Value | Description |\n|-------|-------------| \n| copy | Input files\ - \ are staged in the process work directory by creating a copy. | \n| link\ - \ | Input files are staged in the process work directory by creating an\ - \ (hard) link for each of them. | \n| symlink | Input files are staged in\ - \ the process work directory by creating a symbolic link with an absolute\ - \ path for each of them (default). | \n| rellink | Input files are staged\ - \ in the process work directory by creating a symbolic link with a relative\ - \ path for each of them. | \n\nSee [`stageInMode`](https://www.nextflow.io/docs/latest/process.html#stageinmode).\n" - type: "string" - cache: - anyOf: - - description: "The `cache` directive allows you to store the process results\ - \ to a local cache. When the cache is enabled and the pipeline is launched\ - \ with the resume option, any following attempt to execute the process,\ - \ along with the same inputs, will cause the process execution to be skipped,\ - \ producing the stored data as the actual results.\n\nThe caching feature\ - \ generates a unique key by indexing the process script and inputs. This\ - \ key is used to identify univocally the outputs produced by the process\ - \ execution.\n\nThe `cache` is enabled by default, you can disable it\ - \ for a specific process by setting the cache directive to `false`.\n\n\ - Accepted values are: `true`, `false`, `\"deep\"`, and `\"lenient\"`.\n\ - \nSee [`cache`](https://www.nextflow.io/docs/latest/process.html#cache).\n" - type: "boolean" - - description: "The `cache` directive allows you to store the process results\ - \ to a local cache. When the cache is enabled and the pipeline is launched\ - \ with the resume option, any following attempt to execute the process,\ - \ along with the same inputs, will cause the process execution to be skipped,\ - \ producing the stored data as the actual results.\n\nThe caching feature\ - \ generates a unique key by indexing the process script and inputs. 
This\ - \ key is used to identify univocally the outputs produced by the process\ - \ execution.\n\nThe `cache` is enabled by default, you can disable it\ - \ for a specific process by setting the cache directive to `false`.\n\n\ - Accepted values are: `true`, `false`, `\"deep\"`, and `\"lenient\"`.\n\ - \nSee [`cache`](https://www.nextflow.io/docs/latest/process.html#cache).\n" - type: "string" - pod: - anyOf: - - description: "The `pod` directive allows the definition of pods specific\ - \ settings, such as environment variables, secrets and config maps when\ - \ using the Kubernetes executor.\n\nSee [`pod`](https://www.nextflow.io/docs/latest/process.html#pod).\n" - type: "object" - additionalProperties: - description: "The `pod` directive allows the definition of pods specific\ - \ settings, such as environment variables, secrets and config maps when\ - \ using the Kubernetes executor.\n\nSee [`pod`](https://www.nextflow.io/docs/latest/process.html#pod).\n" - type: "string" - - description: "The `pod` directive allows the definition of pods specific\ - \ settings, such as environment variables, secrets and config maps when\ - \ using the Kubernetes executor.\n\nSee [`pod`](https://www.nextflow.io/docs/latest/process.html#pod).\n" - type: "array" - items: - type: "object" - additionalProperties: - type: "string" - penv: - description: "The `penv` directive allows you to define the parallel environment\ - \ to be used when submitting a parallel task to the SGE resource manager.\n\ - \nSee [`penv`](https://www.nextflow.io/docs/latest/process.html#penv).\n" - type: "string" - scratch: - anyOf: - - description: "The `scratch` directive allows you to execute the process\ - \ in a temporary folder that is local to the execution node.\n\nSee [`scratch`](https://www.nextflow.io/docs/latest/process.html#scratch).\n" - type: "boolean" - - description: "The `scratch` directive allows you to execute the process\ - \ in a temporary folder that is local to the execution node.\n\nSee [`scratch`](https://www.nextflow.io/docs/latest/process.html#scratch).\n" - type: "string" - storeDir: - description: "The `storeDir` directive allows you to define a directory that\ - \ is used as a permanent cache for your process results.\n\nSee [`storeDir`](https://www.nextflow.io/docs/latest/process.html#storeDir).\n" - type: "string" - maxRetries: - anyOf: - - description: "The `maxRetries` directive allows you to define the maximum\ - \ number of times a process instance can be re-submitted in case of failure.\ - \ This value is applied only when using the retry error strategy. By default\ - \ only one retry is allowed.\n\nSee [`maxRetries`](https://www.nextflow.io/docs/latest/process.html#maxretries).\n" - type: "string" - - description: "The `maxRetries` directive allows you to define the maximum\ - \ number of times a process instance can be re-submitted in case of failure.\ - \ This value is applied only when using the retry error strategy. By default\ - \ only one retry is allowed.\n\nSee [`maxRetries`](https://www.nextflow.io/docs/latest/process.html#maxretries).\n" - type: "integer" - echo: - anyOf: - - description: "By default the stdout produced by the commands executed in\ - \ all processes is ignored. 
By setting the `echo` directive to true, you can forward the process stdout to the current top running process stdout file, showing it in the shell terminal.\n \nSee [`echo`](https://www.nextflow.io/docs/latest/process.html#echo).\n"
-           type: "boolean"
-         - description: "By default the stdout produced by the commands executed in all processes is ignored. By setting the `echo` directive to true, you can forward the process stdout to the current top running process stdout file, showing it in the shell terminal.\n \nSee [`echo`](https://www.nextflow.io/docs/latest/process.html#echo).\n"
-           type: "string"
-       errorStrategy:
-         description: "The `errorStrategy` directive allows you to define how an error condition is managed by the process. By default when an error status is returned by the executed script, the process stops immediately. This in turn forces the entire pipeline to terminate.\n\nTable of available error strategies:\n| Name | Description |\n|------|----------|\n| `terminate` | Terminates the execution as soon as an error condition is reported. Pending jobs are killed (default) |\n| `finish` | Initiates an orderly pipeline shutdown when an error condition is raised, waiting for the completion of any submitted job. |\n| `ignore` | Ignores process execution errors. |\n| `retry` | Re-submits for execution a process returning an error condition. |\n\nSee [`errorStrategy`](https://www.nextflow.io/docs/latest/process.html#errorstrategy).\n"
-         type: "string"
-       memory:
-         description: "The `memory` directive allows you to define how much memory the process is allowed to use.\n\nSee [`memory`](https://www.nextflow.io/docs/latest/process.html#memory).\n"
-         type: "string"
-       stageOutMode:
-         description: "The `stageOutMode` directive defines how output files are staged-out from the scratch directory to the process work directory. The following values are allowed:\n\n| Value | Description |\n|-------|-------------| \n| copy | Output files are copied from the scratch directory to the work directory. | \n| move | Output files are moved from the scratch directory to the work directory. | \n| rsync | Output files are copied from the scratch directory to the work directory by using the rsync utility. |\n\nSee [`stageOutMode`](https://www.nextflow.io/docs/latest/process.html#stageoutmode).\n"
-         type: "string"
-     required: []
-     additionalProperties: false
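For example, a retry policy under these directives might look like this (values illustrative; the memory string follows Nextflow's unit syntax):

    directives:
      errorStrategy: retry
      maxRetries: 3
      memory: "16 GB"
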
`[\"foo\", file(\"out.h5ad\")]`).\n\nDefault: `true`.\n" - type: "boolean" - publish: - description: "If `true`, the module's outputs are automatically published\ - \ to `params.publishDir`.\nWill throw an error if `params.publishDir` is\ - \ not defined.\n\nDefault: `false`.\n" - type: "boolean" - transcript: - description: "If `true`, the module's transcripts from `work/` are automatically\ - \ published to `params.transcriptDir`.\nIf not defined, `params.publishDir\ - \ + \"/_transcripts\"` will be used.\nWill throw an error if neither are\ - \ defined.\n\nDefault: `false`.\n" - type: "boolean" - required: [] - additionalProperties: false - NextflowConfig: - description: "Allows tweaking how the Nextflow Config file is generated." - type: "object" - properties: - labels: - description: "A series of default labels to specify memory and cpu constraints.\n\ - \nThe default memory labels are defined as \"mem1gb\", \"mem2gb\", \"mem4gb\"\ - , ... upto \"mem512tb\" and follows powers of 2.\nThe default cpu labels\ - \ are defined as \"cpu1\", \"cpu2\", \"cpu5\", \"cpu10\", ... upto \"cpu1000\"\ - \ and follows a semi logarithmic scale (1, 2, 5 per decade).\n\nConceptually\ - \ it is possible for a Viash Config to overwrite the full labels parameter,\ - \ however likely it is more efficient to add additional labels\nin the Viash\ - \ Project with a config mod.\n" - type: "object" - additionalProperties: - description: "A series of default labels to specify memory and cpu constraints.\n\ - \nThe default memory labels are defined as \"mem1gb\", \"mem2gb\", \"\ - mem4gb\", ... upto \"mem512tb\" and follows powers of 2.\nThe default\ - \ cpu labels are defined as \"cpu1\", \"cpu2\", \"cpu5\", \"cpu10\", ...\ - \ upto \"cpu1000\" and follows a semi logarithmic scale (1, 2, 5 per decade).\n\ - \nConceptually it is possible for a Viash Config to overwrite the full\ - \ labels parameter, however likely it is more efficient to add additional\ - \ labels\nin the Viash Project with a config mod.\n" - type: "string" - script: - anyOf: - - description: "Includes a single string or list of strings into the nextflow.config\ - \ file.\nThis can be used to add custom profiles or include an additional\ - \ config file.\n" - type: "string" - - description: "Includes a single string or list of strings into the nextflow.config\ - \ file.\nThis can be used to add custom profiles or include an additional\ - \ config file.\n" - type: "array" - items: - type: "string" - required: [] - additionalProperties: false - DockerSetupStrategy: - $comment: "TODO add descriptions to different strategies" - enum: - - "cb" - - "ifneedbepullelsecachedbuild" - - "donothing" - - "gentlepush" - - "alwayspullelsebuild" - - "build" - - "alwayspull" - - "alwaysbuild" - - "ifneedbebuild" - - "pullelsebuild" - - "p" - - "alwayspullelsecachedbuild" - - "pull" - - "maybepush" - - "ifneedbepullelsebuild" - - "cachedbuild" - - "pullelsecachedbuild" - - "push" - - "forcepush" - - "alwayspush" - - "b" - - "pushifnotpresent" - - "alwayscachedbuild" - - "meh" - - "ifneedbepull" - - "ifneedbecachedbuild" - description: "The Docker setup strategy to use when building a container." - Direction: - enum: - - "input" - - "output" - description: "Makes this argument an `input` or an `output`, as in does the file/folder\ - \ needs to be read or written. `input` by default." - Status: - enum: - - "enabled" - - "disabled" - - "deprecated" - description: "Allows setting a component to active, deprecated or disabled." 
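# For orientation, a minimal sketch of how the NextflowAuto and NextflowConfig
# settings documented above would typically surface in a component config.
# All values here are illustrative assumptions, not defaults from the schema:
#
#   platforms:
#     - type: nextflow
#       auto:
#         simplifyInput: true
#         publish: false
#       config:
#         labels:
#           lowmem: "memory = 4.GB"   # assumed label/directive syntax
#         script: "process.errorStrategy = 'retry'"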
- DockerResolveVolume: - $comment: "TODO make fully case insensitive" - enum: - - "manual" - - "automatic" - - "auto" - - "Manual" - - "Automatic" - - "Auto" - description: "Enables or disables automatic volume mapping. Enabled when set to\ - \ `Automatic` or disabled when set to `Manual`. Default: `Automatic`" diff --git a/src/common/schemas/task_control_method.yaml b/src/common/schemas/task_control_method.yaml deleted file mode 100644 index 8d62f6be43..0000000000 --- a/src/common/schemas/task_control_method.yaml +++ /dev/null @@ -1,68 +0,0 @@ -title: Control Method -description: | - A control method is used to test the relative performance of all other methods, - and also as a quality control for the pipeline as a whole. A control method can - either be a positive control or a negative control. The positive control and - negative control methods set a maximum and minimum threshold for performance, - so any new method should perform better than the negative control methods and - worse than the positive control method. -type: object -required: [__merge__, functionality, platforms] -properties: - __merge__: - "$ref": "defs_common.yaml#/definitions/CompAPIMerge" - functionality: - type: object - description: Information regarding the functionality of the component. - required: [name, info, resources] - additionalProperties: false - properties: - name: - "$ref": "defs_common.yaml#/definitions/Name" - status: - "$ref": "defs_viash.yaml#/definitions/Status" - info: - type: object - description: Metadata of the component. - additionalProperties: false - required: [label, summary, description, preferred_normalization] - properties: - label: - "$ref": "defs_common.yaml#/definitions/Label" - summary: - "$ref": "defs_common.yaml#/definitions/Summary" - description: - "$ref": "defs_common.yaml#/definitions/Description" - preferred_normalization: - "$ref": "defs_common.yaml#/definitions/PreferredNormalization" - reference: - "$ref": "defs_common.yaml#/definitions/BibtexReference" - documentation_url: - "$ref": "defs_common.yaml#/definitions/DocumentationURL" - repository_url: - "$ref": "defs_common.yaml#/definitions/RepositoryURL" - variants: - "$ref": "defs_common.yaml#/definitions/MethodVariants" - arguments: - type: array - description: Component-specific parameters. - items: - "$ref": "defs_viash.yaml#/definitions/Argument" - resources: - type: array - description: Resources required to run the component. - items: - "$ref": "defs_viash.yaml#/definitions/Resource" - test_resources: - type: array - description: One or more scripts and resources used to test the component. - items: - "$ref": "defs_viash.yaml#/definitions/Resource" - platforms: - type: array - description: A list of platforms which Viash generates target artifacts for. - items: - anyOf: - - "$ref": "defs_common.yaml#/definitions/PlatformDocker" - - "$ref": "defs_common.yaml#/definitions/PlatformNative" - - "$ref": "defs_common.yaml#/definitions/PlatformVdsl3" diff --git a/src/common/schemas/task_info.yaml b/src/common/schemas/task_info.yaml deleted file mode 100644 index be6a1e3447..0000000000 --- a/src/common/schemas/task_info.yaml +++ /dev/null @@ -1,22 +0,0 @@ -title: Task info -description: A file format specification file. 
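# A minimal sketch of a component config that would have validated against the
# control-method schema deleted above; the name, summary and image are
# illustrative assumptions:
#
#   __merge__: ../../api/comp_control_method.yaml
#   functionality:
#     name: true_labels
#     info:
#       label: True labels
#       summary: "Positive control: predictions equal the ground-truth labels."
#       description: "..."
#       preferred_normalization: counts
#     resources:
#       - type: python_script
#         path: script.py
#   platforms:
#     - type: docker
#       image: openproblems/base_python:1.0.0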
-type: object -additionalProperties: false -required: [name, label, summary, motivation, description] -properties: - name: - $ref: "defs_common.yaml#/definitions/Name" - label: - $ref: "defs_common.yaml#/definitions/Label" - summary: - $ref: "defs_common.yaml#/definitions/Summary" - image: - $ref: "defs_common.yaml#/definitions/Image" - motivation: - $ref: "defs_common.yaml#/definitions/Description" - description: - $ref: "defs_common.yaml#/definitions/Description" - authors: - type: array - items: - $ref: "defs_common.yaml#/definitions/Author" diff --git a/src/common/schemas/task_method.yaml b/src/common/schemas/task_method.yaml deleted file mode 100644 index 25c59c7a47..0000000000 --- a/src/common/schemas/task_method.yaml +++ /dev/null @@ -1,65 +0,0 @@ -title: Method -description: | - A method is a specific technique used to solve the task problem and is - compared to the control methods and other methods to determine the best - approach for the task depending on the type of dataset. -type: object -required: [__merge__, functionality, platforms] -properties: - __merge__: - "$ref": "defs_common.yaml#/definitions/CompAPIMerge" - functionality: - type: object - description: Information regarding the functionality of the component. - required: [name, info, resources] - additionalProperties: false - properties: - name: - "$ref": "defs_common.yaml#/definitions/Name" - status: - "$ref": "defs_viash.yaml#/definitions/Status" - info: - type: object - description: Metadata of the component. - additionalProperties: false - required: [label, summary, description, preferred_normalization, reference, documentation_url, repository_url] - properties: - label: - "$ref": "defs_common.yaml#/definitions/Label" - summary: - "$ref": "defs_common.yaml#/definitions/Summary" - description: - "$ref": "defs_common.yaml#/definitions/Description" - preferred_normalization: - "$ref": "defs_common.yaml#/definitions/PreferredNormalization" - reference: - "$ref": "defs_common.yaml#/definitions/BibtexReference" - documentation_url: - "$ref": "defs_common.yaml#/definitions/DocumentationURL" - repository_url: - "$ref": "defs_common.yaml#/definitions/RepositoryURL" - variants: - "$ref": "defs_common.yaml#/definitions/MethodVariants" - arguments: - type: array - description: Component-specific parameters. - items: - "$ref": "defs_viash.yaml#/definitions/Argument" - resources: - type: array - description: Resources required to run the component. - items: - "$ref": "defs_viash.yaml#/definitions/Resource" - test_resources: - type: array - description: One or more scripts and resources used to test the component. - items: - "$ref": "defs_viash.yaml#/definitions/Resource" - platforms: - type: array - description: A list of platforms which Viash generates target artifacts for. - items: - anyOf: - - "$ref": "defs_common.yaml#/definitions/PlatformDocker" - - "$ref": "defs_common.yaml#/definitions/PlatformNative" - - "$ref": "defs_common.yaml#/definitions/PlatformVdsl3" diff --git a/src/common/schemas/task_metric.yaml b/src/common/schemas/task_metric.yaml deleted file mode 100644 index 35932e9e7a..0000000000 --- a/src/common/schemas/task_metric.yaml +++ /dev/null @@ -1,86 +0,0 @@ -title: Metric -description: | - A metric is a quantitative measure used to evaluate the performance of the - different methods in solving the specific task problem. 
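# Note how the method schema above differs from the control-method schema: its
# info block additionally requires `reference`, `documentation_url` and
# `repository_url`. An abridged, illustrative info block (values assumed):
#
#   info:
#     label: My Method
#     summary: "..."
#     description: "..."
#     preferred_normalization: log_cp10k
#     reference: mymethod2023
#     documentation_url: https://example.com/docs
#     repository_url: https://github.com/example/mymethod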
-type: object -required: [__merge__, functionality, platforms] -properties: - __merge__: - "$ref": "defs_common.yaml#/definitions/CompAPIMerge" - functionality: - type: object - description: Information regarding the functionality of the component. - required: [name, info, resources] - additionalProperties: false - properties: - name: - "$ref": "defs_common.yaml#/definitions/Name" - status: - "$ref": "defs_viash.yaml#/definitions/Status" - info: - type: object - description: Metadata of the component. - additionalProperties: false - required: [metrics] - properties: - metrics: - type: array - minItems: 1 - items: - type: object - description: Metadata of each metric. - additionalProperties: false - required: [label, summary, description, reference, min, max, maximize] - properties: - name: - "$ref": "defs_common.yaml#/definitions/Name" - label: - "$ref": "defs_common.yaml#/definitions/Label" - summary: - "$ref": "defs_common.yaml#/definitions/Summary" - description: - "$ref": "defs_common.yaml#/definitions/Description" - reference: - "$ref": "defs_common.yaml#/definitions/BibtexReference" - documentation_url: - "$ref": "defs_common.yaml#/definitions/DocumentationURL" - repository_url: - "$ref": "defs_common.yaml#/definitions/RepositoryURL" - variants: - "$ref": "defs_common.yaml#/definitions/MethodVariants" - min: - description: The lowest possible value of the metric. - oneOf: - - type: number - - const: "-.inf" - max: - description: The highest possible value of the metric. - oneOf: - - type: number - - const: "+.inf" - maximize: - type: boolean - description: Whether a higher metric value is better. - arguments: - type: array - description: Component-specific parameters. - items: - "$ref": "defs_viash.yaml#/definitions/Argument" - resources: - type: array - description: Resources required to run the component. - items: - "$ref": "defs_viash.yaml#/definitions/Resource" - test_resources: - type: array - description: One or more scripts and resources used to test the component. - items: - "$ref": "defs_viash.yaml#/definitions/Resource" - platforms: - type: array - description: A list of platforms which Viash generates target artifacts for. - items: - anyOf: - - "$ref": "defs_common.yaml#/definitions/PlatformDocker" - - "$ref": "defs_common.yaml#/definitions/PlatformNative" - - "$ref": "defs_common.yaml#/definitions/PlatformVdsl3" diff --git a/src/common/sync_test_resources/config.vsh.yaml b/src/common/sync_test_resources/config.vsh.yaml deleted file mode 100644 index f443d634e8..0000000000 --- a/src/common/sync_test_resources/config.vsh.yaml +++ /dev/null @@ -1,44 +0,0 @@ -functionality: - name: "sync_test_resources" - namespace: "common" - version: "dev" - description: Synchronise the test resources from s3 to resources_test - usage: | - sync_test_resources - sync_test_resources --input s3://openproblems-data/resources_test --output resources_test - arguments: - - name: "--input" - alternatives: ["-i"] - type: string - description: "Path to the S3 bucket to sync from." - default: "s3://openproblems-data/resources_test" - - name: "--output" - alternatives: ["-o"] - type: file - default: resources_test - direction: output - description: "Path to the test resource directory." - - name: "--quiet" - type: boolean_true - description: "Displays the operations that would be performed using the specified command without actually running them." - - name: "--dryrun" - type: boolean_true - description: "Does not display the operations performed from the specified command." 
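# The metric schema above accepts either a number or the YAML infinity
# sentinels "-.inf" / "+.inf" for `min` and `max`. An abridged, illustrative
# metrics entry (all values assumed):
#
#   info:
#     metrics:
#       - name: accuracy
#         label: Accuracy
#         summary: "Fraction of correctly predicted labels."
#         description: "..."
#         reference: somemetric2023
#         min: 0
#         max: 1
#         maximize: true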
- - name: "--delete" - type: boolean_true - description: "Files that exist in the destination but not in the source are deleted during sync." - - name: "--exclude" - type: "string" - multiple: true - description: Exclude all files or objects from the command that matches the specified pattern. - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: run_test.sh -platforms: - - type: docker - image: "amazon/aws-cli:2.7.12" - - type: native - - type: nextflow diff --git a/src/common/sync_test_resources/script.sh b/src/common/sync_test_resources/script.sh deleted file mode 100644 index c97b9fcdfd..0000000000 --- a/src/common/sync_test_resources/script.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -## VIASH START -par_input='s3://openproblems-data/resources_test' -par_output='resources_test' -## VIASH END - -extra_params=( ) - -if [ "$par_quiet" == "true" ]; then - extra_params+=( "--quiet" ) -fi -if [ "$par_dryrun" == "true" ]; then - extra_params+=( "--dryrun" ) -fi -if [ "$par_delete" == "true" ]; then - extra_params+=( "--delete" ) -fi - -if [ ! -z ${par_exclude+x} ]; then - IFS=":" - for var in $par_exclude; do - unset IFS - extra_params+=( "--exclude" "$var" ) - done -fi - - -# Disable the use of the Amazon EC2 instance metadata service (IMDS). -# see https://florian.ec/blog/github-actions-awscli-errors/ -# or https://github.com/aws/aws-cli/issues/5234#issuecomment-705831465 -export AWS_EC2_METADATA_DISABLED=true - -aws s3 sync "$par_input" "$par_output" --no-sign-request "${extra_params[@]}" diff --git a/src/datasets/api/comp_dataset_loader.yaml b/src/datasets/api/comp_dataset_loader.yaml index 75909b106a..f3ea6426bb 100644 --- a/src/datasets/api/comp_dataset_loader.yaml +++ b/src/datasets/api/comp_dataset_loader.yaml @@ -1,16 +1,15 @@ -functionality: - namespace: "datasets/loaders" - info: - type: dataset_loader - type_info: - label: Dataset loader - summary: A component which generates a "Common dataset". - description: | - A dataset loader will typically have an identifier (e.g. a GEO identifier) - or URL as input argument and additional arguments to define where the script needs to download a dataset from and how to process it. - arguments: - - name: "--output" - __merge__: file_raw.yaml - direction: "output" - required: true - test_resources: [] \ No newline at end of file +# namespace: "datasets/loaders" +info: + type: dataset_loader + type_info: + label: Dataset loader + summary: A component which generates a "Common dataset". + description: | + A dataset loader will typically have an identifier (e.g. a GEO identifier) + or URL as input argument and additional arguments to define where the script needs to download a dataset from and how to process it. +arguments: + - name: "--output" + __merge__: file_raw.yaml + direction: "output" + required: true +test_resources: [] \ No newline at end of file diff --git a/src/datasets/api/comp_normalization.yaml b/src/datasets/api/comp_normalization.yaml index 6f2c1ffa64..38cd4efe81 100644 --- a/src/datasets/api/comp_normalization.yaml +++ b/src/datasets/api/comp_normalization.yaml @@ -1,36 +1,35 @@ -functionality: - namespace: "datasets/normalization" - info: - type: dataset_normalization - type_info: - label: Dataset normalization - summary: | - A normalization method which processes the raw counts into a normalized dataset. - description: - A component for normalizing the raw counts as output by dataset loaders into a normalized dataset. 
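# The migration pattern applied to this and the following api configs,
# sketched: the `functionality:` wrapper of older Viash versions is dropped
# and its children move to the top level of the config:
#
#   # before                        # after
#   functionality:                  namespace: "datasets/..."
#     namespace: "datasets/..."     info: {...}
#     info: {...}                   arguments: [...]
#     arguments: [...]              test_resources: [...]
#     test_resources: [...]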
- arguments: - - name: "--input" - __merge__: file_raw.yaml - direction: input - required: true - - name: "--output" - __merge__: file_normalized.yaml - direction: output - required: true - - name: "--normalization_id" - type: string - description: "The normalization id to store in the dataset metadata. If not specified, the functionality name will be used." - required: false - - name: "--layer_output" - type: string - default: "normalized" - description: The name of the layer in which to store the normalized data. - - name: "--obs_size_factors" - type: string - default: "size_factors" - description: In which .obs slot to store the size factors (if any). - test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/normalization" +info: + type: dataset_normalization + type_info: + label: Dataset normalization + summary: | + A normalization method which processes the raw counts into a normalized dataset. + description: + A component for normalizing the raw counts as output by dataset loaders into a normalized dataset. +arguments: + - name: "--input" + __merge__: file_raw.yaml + direction: input + required: true + - name: "--output" + __merge__: file_normalized.yaml + direction: output + required: true + - name: "--normalization_id" + type: string + description: "The normalization id to store in the dataset metadata. If not specified, the functionality name will be used." + required: false + - name: "--layer_output" + type: string + default: "normalized" + description: The name of the layer in which to store the normalized data. + - name: "--obs_size_factors" + type: string + default: "size_factors" + description: In which .obs slot to store the size factors (if any). +test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/comp_processor_hvg.yaml b/src/datasets/api/comp_processor_hvg.yaml index 2e24033aac..bfed255d02 100644 --- a/src/datasets/api/comp_processor_hvg.yaml +++ b/src/datasets/api/comp_processor_hvg.yaml @@ -1,40 +1,39 @@ -functionality: - namespace: "datasets/processors" - info: - type: dataset_processor - type_info: - label: HVG - summary: | - Computes the highly variable genes scores. - description: | - The resulting AnnData will contain both a boolean 'hvg' column in 'var', as well as a numerical 'hvg_score' in 'var'. - arguments: - - name: "--input" - __merge__: file_normalized.yaml - required: true - direction: input - - name: "--input_layer" - type: string - default: "normalized" - description: Which layer to use as input. - - name: "--output" - direction: output - __merge__: file_hvg.yaml - required: true - - name: "--var_hvg" - type: string - default: "hvg" - description: "In which .var slot to store whether a feature is considered to be hvg." - - name: "--var_hvg_score" - type: string - default: "hvg_score" - description: "In which .var slot to store the gene variance score (normalized dispersion)." 
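# Output contract implied by the normalization api above, sketched against
# file_normalized.yaml; placing the id in `.uns` is an assumption based on
# the argument descriptions:
#
#   layers:
#     normalized: ...       # the normalized matrix, per --layer_output
#   obs:
#     size_factors: ...     # per-cell size factors, per --obs_size_factors
#   uns:
#     normalization_id: ... # per --normalization_id, defaults to the component name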
- - name: "--num_features" - type: integer - default: 1000 - description: "The number of HVG to select" - test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/processors" +info: + type: dataset_processor + type_info: + label: HVG + summary: | + Computes the highly variable genes scores. + description: | + The resulting AnnData will contain both a boolean 'hvg' column in 'var', as well as a numerical 'hvg_score' in 'var'. +arguments: + - name: "--input" + __merge__: file_normalized.yaml + required: true + direction: input + - name: "--input_layer" + type: string + default: "normalized" + description: Which layer to use as input. + - name: "--output" + direction: output + __merge__: file_hvg.yaml + required: true + - name: "--var_hvg" + type: string + default: "hvg" + description: "In which .var slot to store whether a feature is considered to be hvg." + - name: "--var_hvg_score" + type: string + default: "hvg_score" + description: "In which .var slot to store the gene variance score (normalized dispersion)." + - name: "--num_features" + type: integer + default: 1000 + description: "The number of HVG to select" +test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/comp_processor_knn.yaml b/src/datasets/api/comp_processor_knn.yaml index b0e16f8fc4..be95b83e38 100644 --- a/src/datasets/api/comp_processor_knn.yaml +++ b/src/datasets/api/comp_processor_knn.yaml @@ -1,39 +1,38 @@ -functionality: - namespace: "datasets/processors" - info: - type: dataset_processor - type_info: - label: KNN - summary: | - Computes the k-nearest-neighbours for each cell. - description: | - The resulting AnnData will contain both the knn distances and the knn connectivities in 'obsp'. - arguments: - - name: "--input" - __merge__: file_pca.yaml - required: true - direction: input - - name: "--input_layer" - type: string - default: "normalized" - description: Which layer to use as input. - - name: "--output" - direction: output - __merge__: file_knn.yaml - required: true - - name: "--key_added" - type: string - default: "knn" - description: | - The neighbors data is added to `.uns[key_added]`, - distances are stored in `.obsp[key_added+'_distances']` and - connectivities in `.obsp[key_added+'_connectivities']`. - - name: "--num_neighbors" - type: integer - default: 15 - description: "The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation." - test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/processors" +info: + type: dataset_processor + type_info: + label: KNN + summary: | + Computes the k-nearest-neighbours for each cell. + description: | + The resulting AnnData will contain both the knn distances and the knn connectivities in 'obsp'. +arguments: + - name: "--input" + __merge__: file_pca.yaml + required: true + direction: input + - name: "--input_layer" + type: string + default: "normalized" + description: Which layer to use as input. 
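# Output contract of the HVG processor above, sketched: two .var columns are
# added, matching the file_hvg.yaml spec:
#
#   var:
#     hvg: ...        # boolean, per --var_hvg
#     hvg_score: ...  # double (normalized dispersion), per --var_hvg_score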
+ - name: "--output" + direction: output + __merge__: file_knn.yaml + required: true + - name: "--key_added" + type: string + default: "knn" + description: | + The neighbors data is added to `.uns[key_added]`, + distances are stored in `.obsp[key_added+'_distances']` and + connectivities in `.obsp[key_added+'_connectivities']`. + - name: "--num_neighbors" + type: integer + default: 15 + description: "The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation." +test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/comp_processor_pca.yaml b/src/datasets/api/comp_processor_pca.yaml index a7ca82bc07..051532cf1e 100644 --- a/src/datasets/api/comp_processor_pca.yaml +++ b/src/datasets/api/comp_processor_pca.yaml @@ -1,49 +1,48 @@ -functionality: - namespace: "datasets/processors" - info: - type: dataset_processor - type_info: - label: PCA - summary: | - Computes a PCA embedding of the normalized data. - description: - The resulting AnnData will contain an embedding in obsm, as well as optional loadings in 'varm'. - arguments: - - name: "--input" - __merge__: file_hvg.yaml - required: true - direction: input - - name: "--input_layer" - type: string - default: "normalized" - description: Which layer to use as input. - - name: "--input_var_features" - type: string - description: Column name in .var matrix that will be used to select which genes to run the PCA on. - default: hvg - - name: "--output" - direction: output - __merge__: file_pca.yaml - required: true - - name: "--obsm_embedding" - type: string - default: "X_pca" - description: "In which .obsm slot to store the resulting embedding." - - name: "--varm_loadings" - type: string - default: "pca_loadings" - description: "In which .varm slot to store the resulting loadings matrix." - - name: "--uns_variance" - type: string - default: "pca_variance" - description: "In which .uns slot to store the resulting variance objects." - - name: "--num_components" - type: integer - example: 25 - description: Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation. - test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/processors" +info: + type: dataset_processor + type_info: + label: PCA + summary: | + Computes a PCA embedding of the normalized data. + description: + The resulting AnnData will contain an embedding in obsm, as well as optional loadings in 'varm'. +arguments: + - name: "--input" + __merge__: file_hvg.yaml + required: true + direction: input + - name: "--input_layer" + type: string + default: "normalized" + description: Which layer to use as input. + - name: "--input_var_features" + type: string + description: Column name in .var matrix that will be used to select which genes to run the PCA on. + default: hvg + - name: "--output" + direction: output + __merge__: file_pca.yaml + required: true + - name: "--obsm_embedding" + type: string + default: "X_pca" + description: "In which .obsm slot to store the resulting embedding." + - name: "--varm_loadings" + type: string + default: "pca_loadings" + description: "In which .varm slot to store the resulting loadings matrix." 
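# Slot naming convention of the kNN processor above with the default
# `--key_added knn`, sketched:
#
#   uns:
#     knn: ...                  # neighbors metadata
#   obsp:
#     knn_distances: ...        # pairwise distances
#     knn_connectivities: ...   # connectivity weights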
+ - name: "--uns_variance" + type: string + default: "pca_variance" + description: "In which .uns slot to store the resulting variance objects." + - name: "--num_components" + type: integer + example: 25 + description: Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation. +test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/comp_processor_subset.yaml b/src/datasets/api/comp_processor_subset.yaml index bad64a6762..c49e7f2ece 100644 --- a/src/datasets/api/comp_processor_subset.yaml +++ b/src/datasets/api/comp_processor_subset.yaml @@ -1,31 +1,30 @@ -functionality: - namespace: "datasets/processors" - info: - type: dataset_processor - type_info: - label: Subset - summary: Sample cells and genes randomly. - description: This component subsets the layers, obs and var to create smaller test datasets. - arguments: - - name: "--input" - __merge__: file_common_dataset.yaml - required: true - direction: input - - name: "--input_mod2" - __merge__: file_common_dataset.yaml - direction: input - required: false - - name: "--output" - __merge__: file_common_dataset.yaml - direction: output - required: true - - name: "--output_mod2" - __merge__: file_common_dataset.yaml - direction: output - required: false - test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py +namespace: "datasets/processors" +info: + type: dataset_processor + type_info: + label: Subset + summary: Sample cells and genes randomly. + description: This component subsets the layers, obs and var to create smaller test datasets. +arguments: + - name: "--input" + __merge__: file_common_dataset.yaml + required: true + direction: input + - name: "--input_mod2" + __merge__: file_common_dataset.yaml + direction: input + required: false + - name: "--output" + __merge__: file_common_dataset.yaml + direction: output + required: true + - name: "--output_mod2" + __merge__: file_common_dataset.yaml + direction: output + required: false +test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /common/component_tests/run_and_check_output.py diff --git a/src/datasets/api/comp_processor_svd.yaml b/src/datasets/api/comp_processor_svd.yaml index 91413c2624..d5c0ae8ba8 100644 --- a/src/datasets/api/comp_processor_svd.yaml +++ b/src/datasets/api/comp_processor_svd.yaml @@ -1,45 +1,44 @@ -functionality: - namespace: "datasets/processors" - info: - type: dataset_processor - type_info: - label: SVD - summary: | - Computes a SVD PCA embedding of the normalized data. - description: - The resulting AnnData will contain an embedding in obsm. - arguments: - - name: "--input" - __merge__: file_normalized.yaml - required: true - direction: input - - name: "--input_mod2" - __merge__: file_normalized.yaml - required: false - direction: input - - name: "--input_layer" - type: string - default: "normalized" - description: Which layer to use as input. - - name: "--output" - direction: output - __merge__: file_svd.yaml - required: true - - name: "--output_mod2" - direction: output - __merge__: file_svd.yaml - required: false - - name: "--obsm_embedding" - type: string - default: "X_svd" - description: "In which .obsm slot to store the resulting embedding." 
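# The paired `*_mod2` arguments in the subset and SVD components support an
# optional second modality: when `--input_mod2` is given, a matching
# `--output_mod2` is written. A sketched parameter set (paths illustrative):
#
#   input: dataset_mod1.h5ad
#   input_mod2: dataset_mod2.h5ad
#   output: subset_mod1.h5ad
#   output_mod2: subset_mod2.h5ad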
-    - name: "--num_components"
-      type: integer
-      default: 100
-      description: Number of principal components to compute. Defaults to 100, or 1 - minimum dimension size of selected representation.
-  test_resources:
-    - path: /resources_test/common/pancreas
-      dest: resources_test/common/pancreas
-    - type: python_script
-      path: /src/common/comp_tests/run_and_check_adata.py
+namespace: "datasets/processors"
+info:
+  type: dataset_processor
+  type_info:
+    label: SVD
+    summary: |
+      Computes an SVD embedding of the normalized data.
+    description:
+      The resulting AnnData will contain an embedding in obsm.
+arguments:
+  - name: "--input"
+    __merge__: file_normalized.yaml
+    required: true
+    direction: input
+  - name: "--input_mod2"
+    __merge__: file_normalized.yaml
+    required: false
+    direction: input
+  - name: "--input_layer"
+    type: string
+    default: "normalized"
+    description: Which layer to use as input.
+  - name: "--output"
+    direction: output
+    __merge__: file_svd.yaml
+    required: true
+  - name: "--output_mod2"
+    direction: output
+    __merge__: file_svd.yaml
+    required: false
+  - name: "--obsm_embedding"
+    type: string
+    default: "X_svd"
+    description: "In which .obsm slot to store the resulting embedding."
+  - name: "--num_components"
+    type: integer
+    default: 100
+    description: Number of principal components to compute. Defaults to 100, or 1 - minimum dimension size of selected representation.
+test_resources:
+  - path: /resources_test/common/pancreas
+    dest: resources_test/common/pancreas
+  - type: python_script
+    path: /common/component_tests/run_and_check_output.py
diff --git a/src/datasets/api/file_common_dataset.yaml b/src/datasets/api/file_common_dataset.yaml
index ed7836bf5c..4ca8722aa7 100644
--- a/src/datasets/api/file_common_dataset.yaml
+++ b/src/datasets/api/file_common_dataset.yaml
@@ -1,9 +1,8 @@
 __merge__: file_knn.yaml
 type: file
 example: "resources_test/common/pancreas/dataset.h5ad"
-info:
-  label: "Common dataset"
-  summary: A dataset processed by the common dataset processing pipeline.
-  description: |
-    This dataset contains both raw counts and normalized data matrices,
-    as well as a PCA embedding, HVG selection and a kNN graph.
+label: "Common dataset"
+summary: A dataset processed by the common dataset processing pipeline.
+description: |
+  This dataset contains both raw counts and normalized data matrices,
+  as well as a PCA embedding, HVG selection and a kNN graph.
\ No newline at end of file
diff --git a/src/datasets/api/file_hvg.yaml b/src/datasets/api/file_hvg.yaml
index 697be29e32..47b8f88922 100644
--- a/src/datasets/api/file_hvg.yaml
+++ b/src/datasets/api/file_hvg.yaml
@@ -1,10 +1,11 @@
 __merge__: file_normalized.yaml
 type: file
 example: "resources_test/common/pancreas/hvg.h5ad"
+label: "Dataset+HVG"
+summary: "A normalised dataset with an HVG selection."
 info:
-  label: "Dataset+HVG"
-  summary: "A normalised dataset with a PCA embedding and HVG selection."
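# The recurring file-api change, sketched: `label`, `summary` and
# `description` move from `info:` to the top level, and `info.slots` becomes
# `info.format` with an explicit file type:
#
#   # before                  # after
#   info:                     label: "..."
#     label: "..."            summary: "..."
#     summary: "..."          info:
#     slots:                    format:
#       var: [...]                type: h5ad
#                                 var: [...]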
-  slots:
+  format:
+    type: h5ad
     var:
       - type: boolean
         name: hvg
diff --git a/src/datasets/api/file_knn.yaml b/src/datasets/api/file_knn.yaml
index de7d2b8df5..c2f320e08e 100644
--- a/src/datasets/api/file_knn.yaml
+++ b/src/datasets/api/file_knn.yaml
@@ -1,10 +1,11 @@
 __merge__: file_pca.yaml
 type: file
 example: "resources_test/common/pancreas/knn.h5ad"
+label: "Dataset+HVG+PCA+kNN"
+summary: "A normalised dataset with a PCA embedding, HVG selection and a kNN graph"
 info:
-  label: "Dataset+HVG+PCA+kNN"
-  summary: "A normalised data with a PCA embedding, HVG selection and a kNN graph"
-  slots:
+  format:
+    type: h5ad
     obsp:
       - type: double
         name: knn_distances
diff --git a/src/datasets/api/file_multimodal_dataset.yaml b/src/datasets/api/file_multimodal_dataset.yaml
index daac29d77b..b8ae760225 100644
--- a/src/datasets/api/file_multimodal_dataset.yaml
+++ b/src/datasets/api/file_multimodal_dataset.yaml
@@ -1,14 +1,15 @@
 type: file
 example: "resources_test/common/pancreas/dataset.h5ad"
+label: "Common dataset"
+summary: A dataset processed by the common dataset processing pipeline.
+description: |
+  This dataset contains both raw counts and normalized data matrices,
+  as well as an SVD embedding and an HVG selection.
+
+  The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
 info:
-  label: "Common dataset"
-  summary: A dataset processed by the common dataset processing pipeline.
-  description: |
-    This dataset contains both raw counts and normalized data matrices,
-    as well as a SVD embedding and a HVG selection.
-
-    The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
-  slots:
+  format:
+    type: h5ad
     layers:
       - type: integer
         name: counts
diff --git a/src/datasets/api/file_normalized.yaml b/src/datasets/api/file_normalized.yaml
index ea6f14e9fb..f163e31db9 100644
--- a/src/datasets/api/file_normalized.yaml
+++ b/src/datasets/api/file_normalized.yaml
@@ -1,10 +1,11 @@
 __merge__: file_raw.yaml
 type: file
 example: "resources_test/common/pancreas/normalized.h5ad"
+label: "Normalized dataset"
+summary: "A normalized dataset"
 info:
-  label: "Normalized dataset"
-  summary: "A normalized dataset"
-  slots:
+  format:
+    type: h5ad
     layers:
       - type: double
         name: normalized
diff --git a/src/datasets/api/file_pca.yaml b/src/datasets/api/file_pca.yaml
index daa26618e1..2d2e48f95b 100644
--- a/src/datasets/api/file_pca.yaml
+++ b/src/datasets/api/file_pca.yaml
@@ -1,10 +1,11 @@
 __merge__: file_hvg.yaml
 type: file
 example: "resources_test/common/pancreas/pca.h5ad"
+label: "Dataset+HVG+PCA"
+summary: "A normalised dataset with a PCA embedding"
 info:
-  label: "Dataset+HVG+PCA"
-  summary: "A normalised dataset with a PCA embedding"
-  slots:
+  format:
+    type: h5ad
     obsm:
       - type: double
         name: X_pca
diff --git a/src/datasets/api/file_raw.yaml b/src/datasets/api/file_raw.yaml
index 7ffab3b43e..f42b022a38 100644
--- a/src/datasets/api/file_raw.yaml
+++ b/src/datasets/api/file_raw.yaml
@@ -1,13 +1,14 @@
 type: file
 example: "resources_test/common/pancreas/raw.h5ad"
-info:
-  label: "Raw dataset"
-  summary: An unprocessed dataset as output by a dataset loader.
-  description: |
-    This dataset contains raw counts and metadata as output by a dataset loader.
+label: "Raw dataset"
+summary: An unprocessed dataset as output by a dataset loader.
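# How the file specs in this directory compose via `__merge__`, per the
# headers above (each spec inherits the previous format and adds slots):
#
#   file_raw -> file_normalized -> file_hvg -> file_pca -> file_knn -> file_common_dataset
#   file_normalized -> file_svd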
+description: |
+  This dataset contains raw counts and metadata as output by a dataset loader.
 
-  The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
-  slots:
+  The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
+info:
+  format:
+    type: h5ad
     layers:
       - type: integer
         name: counts
@@ -203,3 +204,15 @@ info:
         description: The organism of the sample in the dataset.
         required: false
         multiple: true
+      - name: dataset_technology
+        type: string
+        description: The technology used to generate the dataset.
+        required: false
+      - name: dataset_organ
+        type: string
+        description: The organ of the sample in the dataset.
+        required: false
+      - name: dataset_tissue
+        type: string
+        description: The tissue of the sample in the dataset.
+        required: false
diff --git a/src/datasets/api/file_spatial_dataset.yaml b/src/datasets/api/file_spatial_dataset.yaml
new file mode 100644
index 0000000000..d7971c52ac
--- /dev/null
+++ b/src/datasets/api/file_spatial_dataset.yaml
@@ -0,0 +1,194 @@
+type: file
+example: "resources_test/common/mouse_brain_coronal/dataset.h5ad"
+label: "Common dataset"
+summary: A dataset processed by the common dataset processing pipeline.
+description: |
+  This dataset contains both raw counts and normalized data matrices.
+info:
+  format:
+    type: h5ad
+    layers:
+      - type: integer
+        name: counts
+        description: Raw counts
+        required: true
+      - type: double
+        name: normalized
+        description: Normalised expression values
+        required: true
+    obs:
+      - type: string
+        name: dataset_id
+        description: Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes.
+        required: false
+      - type: string
+        name: assay
+        description: Type of assay used to generate the cell data, indicating the methodology or technique employed.
+        required: false
+      - type: string
+        name: assay_ontology_term_id
+        description: Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type.
+        required: false
+      - type: string
+        name: cell_type
+        description: Classification of the cell type based on its characteristics and function within the tissue or organism.
+        required: false
+      - type: string
+        name: cell_type_ontology_term_id
+        description: Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification.
+        required: false
+      - type: string
+        name: development_stage
+        description: Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase.
+        required: false
+      - type: string
+        name: development_stage_ontology_term_id
+        description: |
+          Ontology term identifier for the developmental stage, providing a standardized reference to the organism's developmental phase.
+          If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used.
+          If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used.
+          Otherwise, the Uberon (`UBERON:`) ontology is used.
+        required: false
+      - type: string
+        name: disease
+        description: Information on any disease or pathological condition associated with the cell or donor.
+ required: false + - type: string + name: disease_ontology_term_id + description: | + Ontology term identifier for the disease, enabling standardized disease classification and referencing. + Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). + required: false + - type: string + name: donor_id + description: Identifier for the donor from whom the cell sample is obtained. + required: false + - type: boolean + name: is_primary_data + description: Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. + required: false + - type: string + name: organism + description: Organism from which the cell sample is obtained. + required: false + - type: string + name: organism_ontology_term_id + description: | + Ontology term identifier for the organism, providing a standardized reference for the organism. + Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. + required: false + - type: string + name: self_reported_ethnicity + description: Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. + required: false + - type: string + name: self_reported_ethnicity_ontology_term_id + description: | + Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. + required: false + - type: string + name: sex + description: Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. + required: false + - type: string + name: sex_ontology_term_id + description: Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. + required: false + - type: string + name: suspension_type + description: Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. + required: false + - type: string + name: tissue + description: Specific tissue from which the cells were derived, key for context and specificity in cell studies. + required: false + - type: string + name: tissue_ontology_term_id + description: | + Ontology term identifier for the tissue, providing a standardized reference for the tissue type. + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. + required: false + - type: string + name: tissue_general + description: General category or classification of the tissue, useful for broader grouping and comparison of cell data. + required: false + - type: string + name: tissue_general_ontology_term_id + description: | + Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. 
+        required: false
+      - type: string
+        name: batch
+        description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc.
+        required: false
+      - type: integer
+        name: soma_joinid
+        description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell.
+        required: false
+    obsm:
+      - type: double
+        name: spatial
+        description: Spatial coordinates of the cells in the format `x, y`.
+        required: true
+    var:
+      - type: string
+        name: feature_id
+        description: Unique identifier for the feature, usually an ENSEMBL gene id.
+        # TODO: make this required once openproblems_v1 dataloader supports it
+        required: false
+      - type: string
+        name: feature_name
+        description: A human-readable name for the feature, usually a gene symbol.
+        required: true
+      - type: integer
+        name: soma_joinid
+        description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature.
+        required: false
+    uns:
+      - type: string
+        name: dataset_id
+        description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived.
+        required: true
+      - name: dataset_name
+        type: string
+        description: A human-readable name for the dataset.
+        required: true
+      - type: string
+        name: dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - name: dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+        multiple: true
+      - name: dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+        multiple: true
+      - name: dataset_technology
+        type: string
+        description: The technology used to generate the dataset.
+        required: false
+      - name: dataset_organ
+        type: string
+        description: The organ of the sample in the dataset.
+        required: false
+      - name: dataset_tissue
+        type: string
+        description: The tissue of the sample in the dataset.
+        required: false
\ No newline at end of file
diff --git a/src/datasets/api/file_svd.yaml b/src/datasets/api/file_svd.yaml
index 2a727369e3..c9f22b50f7 100644
--- a/src/datasets/api/file_svd.yaml
+++ b/src/datasets/api/file_svd.yaml
@@ -1,10 +1,11 @@
 __merge__: file_normalized.yaml
 type: file
 example: "resources_test/common/pancreas/svd.h5ad"
+label: "Dataset+SVD"
+summary: "A normalised dataset with an SVD embedding"
 info:
-  label: "Dataset+SVD"
-  summary: "A normalised dataset with a SVD embedding"
-  slots:
+  format:
+    type: h5ad
     obsm:
       - type: double
         name: X_svd
diff --git a/src/datasets/loaders/cellxgene_census/config.vsh.yaml b/src/datasets/loaders/cellxgene_census/config.vsh.yaml
deleted file mode 100644
index 667e1c6a6b..0000000000
--- a/src/datasets/loaders/cellxgene_census/config.vsh.yaml
+++ /dev/null
@@ -1,167 +0,0 @@
-functionality:
-  name: cellxgene_census
-  namespace: datasets/loaders
-  description: |
-    Query cells from a CellxGene Census or custom TileDBSoma object.
-    Aside from fetching the cells' RNA counts (`.X`), cell metadata
-    (`.obs`) and gene metadata (`.var`), this component also fetches
-    the dataset metadata and joins it into the cell metadata.
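# A hedged sketch of a parameter set for the (now removed) census loader,
# assembled from the example values documented in the arguments below; the
# dataset_id value is illustrative:
#
#   census_version: "2023-07-25"
#   species: "homo_sapiens"
#   obs_value_filter: "is_primary_data == True and suspension_type == 'cell'"
#   cell_filter_grouping: ["dataset_id", "tissue", "assay", "disease", "cell_type"]
#   cell_filter_minimum_count: 100
#   obs_batch: ["batch"]
#   dataset_id: "cellxgene_census/example"
#   output: output.h5ad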
- argument_groups: - - name: Input database - description: "Open CellxGene Census by version or URI." - arguments: - - name: "--input_uri" - type: string - description: "If specified, a URI containing the Census SOMA objects. If specified, will take precedence over the `--census_version` argument." - required: false - example: "s3://bucket/path" - - name: "--census_version" - description: "Which release of CellxGene census to use. Possible values are \"latest\", \"stable\", or the date of one of the releases (e.g. \"2023-07-25\"). For more information, check the documentation on [Census data releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html)." - type: string - example: "stable" - required: false - - name: Cell query - description: Arguments related to the query. - arguments: - - name: "--species" - type: string - description: The organism to query, usually one of `Homo sapiens` or `Mus musculus`. - required: true - example: "homo_sapiens" - - name: "--obs_value_filter" - type: string - description: "Filter for selecting the `obs` metadata (i.e. cells). Value is a filter query written in the SOMA `value_filter` syntax." - required: true - example: "is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell'" - - name: Filter cells by grouping - description: - arguments: - - name: "--cell_filter_grouping" - type: string - description: | - A subset of 'obs' columns by which to group the cells for filtering. - Only groups surpassing or equal to the `--cell_filter_minimum_count` - threshold will be retained. Take care not to introduce a selection - bias against cells with more fine-grained ontology annotations. - required: false - example: ["dataset_id", "tissue", "assay", "disease", "cell_type"] - multiple: true - - name: "--cell_filter_minimum_count" - type: integer - description: | - A minimum number of cells per group to retain. If `--cell_filter_grouping` - is defined, this parameter should also be provided and vice versa. - required: false - example: 100 - - name: Count filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--cell_filter_min_genes" - type: integer - description: Remove cells with less than this number of genes. - required: false - default: 50 - - name: "--cell_filter_min_counts" - type: integer - description: Remove cells with less than this number of counts. - required: false - default: 0 - - name: "--gene_filter_min_cells" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - default: 5 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - default: 0 - - name: Cell metadata - description: Cell metadata arguments - arguments: - - name: "--obs_batch" - type: string - description: | - Location of where to find the observation batch IDs. - - * If not specified, the `.obs["batch"]` field will not be included. - * If one or more values are specified, the `.obs["batch"]` field will be - set to the concatenated values of the specified fields, separated by - the `obs_batch_separator`. - required: false - multiple: true - multiple_sep: "," - example: ["batch"] - - name: "--obs_batch_separator" - type: string - description: Separator to use when concatenating the values of the `--obs_batch` fields. 
- required: false - default: "+" - - name: Dataset metadata - description: Information about the dataset that will be stored in the `.uns` slot. - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: true - - name: Outputs - description: Output arguments. - arguments: - - name: "--output" - type: file - description: Output h5ad file. - direction: output - required: true - example: output.h5ad - - name: "--output_compression" - type: string - choices: ["gzip", "lzf"] - required: false - example: "gzip" - resources: - - type: python_script - path: script.py - - path: /src/common/helper_functions/setup_logger.py - test_resources: - - type: python_script - path: test.py -platforms: - - type: docker - #image: openproblems/base_python:1.0.0 - image: python:3.11 - setup: - - type: python - packages: - - cellxgene-census - - scanpy - test_setup: - - type: python - packages: - - viashpy - - type: nextflow - directives: - label: [highmem, midcpu] \ No newline at end of file diff --git a/src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml b/src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml deleted file mode 100644 index 7ee4166d9d..0000000000 --- a/src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml +++ /dev/null @@ -1,130 +0,0 @@ -functionality: - name: cellxgene_census_from_source_h5ad - namespace: datasets/loaders - description: | - Query cells from a CellxGene Census or custom TileDBSoma object. - Aside from fetching the cells' RNA counts (`.X`), cell metadata - (`.obs`) and gene metadata (`.var`), this component also fetches - the dataset metadata and joins it into the cell metadata. - argument_groups: - - name: Input - description: Input arguments - arguments: - - name: "--input_id" - type: string - description: | - The dataset ID of the CellxGene Census dataset to query. - required: true - example: "a93eab58-3d82-4b61-8a2f-d7666dcdb7c4" - - name: Count filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--cell_filter_min_genes" - type: integer - description: Remove cells with less than this number of genes. - required: false - default: 50 - - name: "--cell_filter_min_counts" - type: integer - description: Remove cells with less than this number of counts. - required: false - default: 0 - - name: "--gene_filter_min_cells" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - default: 5 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. 
- required: false - default: 0 - - name: Cell metadata - description: Cell metadata arguments - arguments: - - name: "--obs_batch" - type: string - description: | - Location of where to find the observation batch IDs. - - * If not specified, the `.obs["batch"]` field will not be included. - * If one or more values are specified, the `.obs["batch"]` field will be - set to the concatenated values of the specified fields, separated by - the `obs_batch_separator`. - required: false - multiple: true - multiple_sep: "," - example: ["batch"] - - name: "--obs_batch_separator" - type: string - description: Separator to use when concatenating the values of the `--obs_batch` fields. - required: false - default: "+" - - name: Dataset metadata - description: Information about the dataset that will be stored in the `.uns` slot. - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: true - - name: Outputs - description: Output arguments. - arguments: - - name: "--output" - type: file - description: Output h5ad file. - direction: output - required: true - example: output.h5ad - - name: "--output_compression" - type: string - choices: ["gzip", "lzf"] - required: false - example: "gzip" - resources: - - type: python_script - path: script.py - - path: /src/common/helper_functions/setup_logger.py - test_resources: - - type: python_script - path: test.py -platforms: - - type: docker - #image: openproblems/base_python:1.0.0 - image: python:3.11 - setup: - - type: python - packages: - - cellxgene-census - - scanpy - test_setup: - - type: python - packages: - - viashpy - - type: nextflow - directives: - label: [highmem, midcpu] \ No newline at end of file diff --git a/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/config.vsh.yaml b/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/config.vsh.yaml new file mode 100644 index 0000000000..3a73fe0538 --- /dev/null +++ b/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/config.vsh.yaml @@ -0,0 +1,75 @@ +name: openproblems_neurips2021_bmmc +namespace: datasets/loaders/multimodal +description: Fetch a dataset from the OpenProblems NeurIPS2021 competition +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: Processed h5ad file published at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE194122. + required: true + example: GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad + - name: --mod1 + type: string + description: Name of the first modality. + required: true + example: GEX + - name: --mod2 + type: string + description: Name of the second modality. 
+ required: true + example: ADT + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: A unique identifier for the dataset + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Outputs + arguments: + - name: --output_mod1 + __merge__: ../../../api/file_raw.yaml + direction: output + - name: --output_mod2 + __merge__: ../../../api/file_raw.yaml + direction: output +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + # - type: file + # path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] diff --git a/src/datasets/loaders/openproblems_neurips2021_bmmc/script.py b/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/script.py similarity index 98% rename from src/datasets/loaders/openproblems_neurips2021_bmmc/script.py rename to src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/script.py index de62f039f6..eb62dd67e9 100644 --- a/src/datasets/loaders/openproblems_neurips2021_bmmc/script.py +++ b/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/script.py @@ -19,7 +19,7 @@ "output_mod2": "output/mod2.h5ad" } meta = { - "functionality_name": "openproblems_neurips2021_bmmc", + "name": "openproblems_neurips2021_bmmc", "resources_dir": "/tmp/viash_inject_openproblems_neurips2021_bmmc14365472827677740971", } ## VIASH END diff --git a/src/datasets/loaders/openproblems_neurips2021_bmmc/test.py b/src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/test.py similarity index 100% rename from src/datasets/loaders/openproblems_neurips2021_bmmc/test.py rename to src/datasets/loaders/multimodal/openproblems_neurips2021_bmmc/test.py diff --git a/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/config.vsh.yaml b/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/config.vsh.yaml new file mode 100644 index 0000000000..5994e4ccc9 --- /dev/null +++ b/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/config.vsh.yaml @@ -0,0 +1,81 @@ +name: openproblems_neurips2022_pbmc +namespace: datasets/loaders/multimodal +description: Fetch a dataset from the OpenProblems NeurIPS2022 competition +argument_groups: + - name: Inputs + arguments: + - name: --input_mod1 + type: file + description: Processed RNA h5ad file + required: true + example: cite_rna_merged.h5ad + - name: --input_mod2 + type: file + description: Processed ADT or ATAC h5ad file + required: true + example: cite_prot_merged.h5ad + - name: --mod1 + type: string + description: Name of the first modality. + required: true + example: GEX + - name: --mod2 + type: string + description: Name of the second modality. 
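Both NeurIPS loaders split a single processed multimodal h5ad into two per-modality files. A minimal sketch of that split, assuming the per-feature modality is stored in `.var["feature_types"]` (as in the GSE194122 release) and using the output paths from the script's VIASH block:

    import anndata as ad

    adata = ad.read_h5ad("GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad")

    # Subset features by modality; "GEX" and "ADT" correspond to --mod1/--mod2.
    mod1 = adata[:, adata.var["feature_types"] == "GEX"].copy()
    mod2 = adata[:, adata.var["feature_types"] == "ADT"].copy()

    mod1.write_h5ad("output/mod1.h5ad")
    mod2.write_h5ad("output/mod2.h5ad")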
+ required: true + example: ADT + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: A unique identifier for the dataset + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Outputs + arguments: + - name: --output_mod1 + __merge__: ../../../api/file_raw.yaml + direction: output + - name: --output_mod2 + __merge__: ../../../api/file_raw.yaml + direction: output +resources: + - type: python_script + path: script.py +# skip unit test until data is public +# test_resources: +# - type: python_script +# path: test.py +# - type: file +# path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] diff --git a/src/datasets/loaders/openproblems_neurips2022_pbmc/script.py b/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/script.py similarity index 98% rename from src/datasets/loaders/openproblems_neurips2022_pbmc/script.py rename to src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/script.py index d0dd855b55..8940afed26 100644 --- a/src/datasets/loaders/openproblems_neurips2022_pbmc/script.py +++ b/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/script.py @@ -18,7 +18,7 @@ "output_mod2": "output/mod2.h5ad" } meta = { - "functionality_name": "openproblems_neurips2022_pbmc", + "name": "openproblems_neurips2022_pbmc", } ## VIASH END diff --git a/src/datasets/loaders/openproblems_neurips2022_pbmc/test.py b/src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/test.py similarity index 100% rename from src/datasets/loaders/openproblems_neurips2022_pbmc/test.py rename to src/datasets/loaders/multimodal/openproblems_neurips2022_pbmc/test.py diff --git a/src/datasets/loaders/multimodal/openproblems_v1_multimodal/config.vsh.yaml b/src/datasets/loaders/multimodal/openproblems_v1_multimodal/config.vsh.yaml new file mode 100644 index 0000000000..b0afb9311b --- /dev/null +++ b/src/datasets/loaders/multimodal/openproblems_v1_multimodal/config.vsh.yaml @@ -0,0 +1,95 @@ +name: openproblems_v1_multimodal +namespace: datasets/loaders/multimodal +description: Fetch a dataset from OpenProblems v1 +argument_groups: + - name: Inputs + arguments: + - name: --input_id + type: string + description: The ID of the dataset in OpenProblems v1 + required: true + - name: --obs_cell_type + type: string + description: Location of where to find the observation cell types. + - name: --obs_batch + type: string + description: Location of where to find the observation batch IDs. + - name: --obs_tissue + type: string + description: Location of where to find the observation tissue information. + - name: --layer_counts + type: string + description: In which layer to find the counts matrix. Leave undefined to use `.X`. 
+ example: counts + - name: --sparse + type: boolean + default: true + description: Convert layers to a sparse CSR format. + - name: --var_feature_id + type: string + description: Location of where to find the feature IDs. Can be set to index if the feature IDs are the index. + example: gene_ids + - name: --var_feature_name + type: string + description: Location of where to find the feature names. Can be set to index if the feature names are the index. + default: index + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Outputs + arguments: + - name: --output_mod1 + __merge__: ../../../api/file_raw.yaml + direction: output + - name: --output_mod2 + __merge__: ../../../api/file_raw.yaml + direction: output +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: apt + packages: git + - type: docker + run: | + git clone -b 'v0.8.0' --depth 1 https://github.com/openproblems-bio/openproblems.git /opt/openproblems && \ + pip install --no-cache-dir -r /opt/openproblems/docker/openproblems/requirements.txt && \ + pip install --no-cache-dir --editable /opt/openproblems +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] diff --git a/src/datasets/loaders/openproblems_v1_multimodal/script.py b/src/datasets/loaders/multimodal/openproblems_v1_multimodal/script.py similarity index 100% rename from src/datasets/loaders/openproblems_v1_multimodal/script.py rename to src/datasets/loaders/multimodal/openproblems_v1_multimodal/script.py diff --git a/src/datasets/loaders/openproblems_v1_multimodal/test.py b/src/datasets/loaders/multimodal/openproblems_v1_multimodal/test.py similarity index 100% rename from src/datasets/loaders/openproblems_v1_multimodal/test.py rename to src/datasets/loaders/multimodal/openproblems_v1_multimodal/test.py diff --git a/src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml b/src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml deleted file mode 100644 index 96dad30e76..0000000000 --- a/src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml +++ /dev/null @@ -1,74 +0,0 @@ -functionality: - name: "openproblems_neurips2021_bmmc" - namespace: "datasets/loaders" - description: "Fetch a dataset from the OpenProblems NeurIPS2021 competition" - argument_groups: - - name: Inputs - arguments: - - name: "--input" - type: file - description: Processed h5ad file published at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE194122. - required: true - example: GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad - - name: "--mod1" - type: string - description: Name of the first modality. 
- required: true - example: GEX - - name: "--mod2" - type: string - description: Name of the second modality. - required: true - example: ADT - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: "A unique identifier for the dataset" - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Outputs - arguments: - - name: "--output_mod1" - __merge__: ../../api/file_raw.yaml - direction: "output" - - name: "--output_mod2" - __merge__: ../../api/file_raw.yaml - direction: "output" - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py - # - type: file - # path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [highmem, midcpu, midtime] \ No newline at end of file diff --git a/src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml b/src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml deleted file mode 100644 index b2141482f1..0000000000 --- a/src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml +++ /dev/null @@ -1,80 +0,0 @@ -functionality: - name: "openproblems_neurips2022_pbmc" - namespace: "datasets/loaders" - description: "Fetch a dataset from the OpenProblems NeurIPS2022 competition" - argument_groups: - - name: Inputs - arguments: - - name: "--input_mod1" - type: file - description: "Processed RNA h5ad file" - required: true - example: cite_rna_merged.h5ad - - name: "--input_mod2" - type: file - description: "Processed ADT or ATAC h5ad file" - required: true - example: cite_prot_merged.h5ad - - name: "--mod1" - type: string - description: Name of the first modality. - required: true - example: GEX - - name: "--mod2" - type: string - description: Name of the second modality. - required: true - example: ADT - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: "A unique identifier for the dataset" - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: false - - name: Outputs - arguments: - - name: "--output_mod1" - __merge__: ../../api/file_raw.yaml - direction: "output" - - name: "--output_mod2" - __merge__: ../../api/file_raw.yaml - direction: "output" - resources: - - type: python_script - path: script.py - # skip unit test until data is public - # test_resources: - # - type: python_script - # path: test.py - # - type: file - # path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [ highmem, midcpu, midtime] \ No newline at end of file diff --git a/src/datasets/loaders/openproblems_v1/config.vsh.yaml b/src/datasets/loaders/openproblems_v1/config.vsh.yaml deleted file mode 100644 index d3a3ad846f..0000000000 --- a/src/datasets/loaders/openproblems_v1/config.vsh.yaml +++ /dev/null @@ -1,86 +0,0 @@ -__merge__: ../../api/comp_dataset_loader.yaml -functionality: - name: "openproblems_v1" - description: "Fetch a dataset from OpenProblems v1" - argument_groups: - - name: Inputs - arguments: - - name: "--input_id" - type: "string" - description: "The ID of the dataset in OpenProblems v1" - required: true - - name: "--obs_cell_type" - type: "string" - description: "Location of where to find the observation cell types." - - name: "--obs_batch" - type: "string" - description: "Location of where to find the observation batch IDs." - - name: "--obs_tissue" - type: "string" - description: "Location of where to find the observation tissue information." - - name: "--layer_counts" - type: "string" - description: "In which layer to find the counts matrix. Leave undefined to use `.X`." - example: counts - - name: "--sparse" - type: boolean - default: true - description: Convert layers to a sparse CSR format. - - name: "--var_feature_id" - type: "string" - description: "Location of where to find the feature IDs. Can be set to index if the feature IDs are the index." - example: gene_ids - - name: "--var_feature_name" - type: "string" - description: "Location of where to find the feature names. Can be set to index if the feature names are the index." - default: index - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: false - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: apt - packages: git - - type: docker - run: | - git clone -b 'v0.8.0' --depth 1 https://github.com/openproblems-bio/openproblems.git /opt/openproblems && \ - pip install --no-cache-dir -r /opt/openproblems/docker/openproblems/requirements.txt && \ - pip install --no-cache-dir --editable /opt/openproblems - - type: nextflow - directives: - label: [highmem, midcpu , midtime] diff --git a/src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml b/src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml deleted file mode 100644 index 6247ae3bf9..0000000000 --- a/src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml +++ /dev/null @@ -1,94 +0,0 @@ -functionality: - name: "openproblems_v1_multimodal" - namespace: "datasets/loaders" - description: "Fetch a dataset from OpenProblems v1" - argument_groups: - - name: Inputs - arguments: - - name: "--input_id" - type: "string" - description: "The ID of the dataset in OpenProblems v1" - required: true - - name: "--obs_cell_type" - type: "string" - description: "Location of where to find the observation cell types." - - name: "--obs_batch" - type: "string" - description: "Location of where to find the observation batch IDs." - - name: "--obs_tissue" - type: "string" - description: "Location of where to find the observation tissue information." - - name: "--layer_counts" - type: "string" - description: "In which layer to find the counts matrix. Leave undefined to use `.X`." - example: counts - - name: "--sparse" - type: boolean - default: true - description: Convert layers to a sparse CSR format. - - name: "--var_feature_id" - type: "string" - description: "Location of where to find the feature IDs. Can be set to index if the feature IDs are the index." - example: gene_ids - - name: "--var_feature_name" - type: "string" - description: "Location of where to find the feature names. Can be set to index if the feature names are the index." - default: index - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. 
- required: false - - name: Outputs - arguments: - - name: "--output_mod1" - __merge__: ../../api/file_raw.yaml - direction: "output" - - name: "--output_mod2" - __merge__: ../../api/file_raw.yaml - direction: "output" - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: apt - packages: git - - type: docker - run: | - git clone -b 'v0.8.0' --depth 1 https://github.com/openproblems-bio/openproblems.git /opt/openproblems && \ - pip install --no-cache-dir -r /opt/openproblems/docker/openproblems/requirements.txt && \ - pip install --no-cache-dir --editable /opt/openproblems - - type: nextflow - directives: - label: [highmem, midcpu , midtime] diff --git a/src/datasets/loaders/scrnaseq/cellxgene_census/config.vsh.yaml b/src/datasets/loaders/scrnaseq/cellxgene_census/config.vsh.yaml new file mode 100644 index 0000000000..deccdb4219 --- /dev/null +++ b/src/datasets/loaders/scrnaseq/cellxgene_census/config.vsh.yaml @@ -0,0 +1,175 @@ +name: cellxgene_census +namespace: datasets/loaders/scrnaseq +description: | + Query cells from a CellxGene Census or custom TileDBSoma object. + Aside from fetching the cells' RNA counts (`.X`), cell metadata + (`.obs`) and gene metadata (`.var`), this component also fetches + the dataset metadata and joins it into the cell metadata. +argument_groups: + - name: Input database + description: Open CellxGene Census by version or URI. + arguments: + - name: --input_uri + type: string + description: If specified, a URI containing the Census SOMA objects. If specified, + will take precedence over the `--census_version` argument. + required: false + example: s3://bucket/path + - name: --census_version + description: Which release of CellxGene census to use. Possible values are + "latest", "stable", or the date of one of the releases (e.g. "2023-07-25"). + For more information, check the documentation on [Census data + releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html). + type: string + example: stable + required: false + - name: Cell query + description: Arguments related to the query. + arguments: + - name: --species + type: string + description: The organism to query, usually one of `Homo sapiens` or `Mus + musculus`. + required: true + example: homo_sapiens + - name: --obs_value_filter + type: string + description: Filter for selecting the `obs` metadata (i.e. cells). Value is + a filter query written in the SOMA `value_filter` syntax. + required: true + example: is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', + 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell' + - name: Filter cells by grouping + description: + arguments: + - name: --cell_filter_grouping + type: string + description: | + A subset of 'obs' columns by which to group the cells for filtering. + Only groups surpassing or equal to the `--cell_filter_minimum_count` + threshold will be retained. Take care not to introduce a selection + bias against cells with more fine-grained ontology annotations. + required: false + example: [dataset_id, tissue, assay, disease, cell_type] + multiple: true + - name: --cell_filter_minimum_count + type: integer + description: | + A minimum number of cells per group to retain. If `--cell_filter_grouping` + is defined, this parameter should also be provided and vice versa. 
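A sketch of how such a query could be issued directly with the `cellxgene-census` package this component installs; the organism and filter string below are illustrative values, not the component's defaults:

    import cellxgene_census

    # Open a Census release by version; a custom --input_uri would instead be
    # passed as open_soma(uri=...) and takes precedence over --census_version.
    with cellxgene_census.open_soma(census_version="stable") as census:
        adata = cellxgene_census.get_anndata(
            census,
            organism="Homo sapiens",
            obs_value_filter="is_primary_data == True and suspension_type == 'cell'",
        )
    print(adata)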
+ required: false + example: 100 + - name: Count filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --cell_filter_min_genes + type: integer + description: Remove cells with less than this number of genes. + required: false + default: 50 + - name: --cell_filter_min_counts + type: integer + description: Remove cells with less than this number of counts. + required: false + default: 0 + - name: --gene_filter_min_cells + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + default: 5 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + default: 0 + - name: Cell metadata + description: Cell metadata arguments + arguments: + - name: --obs_batch + type: string + description: | + Location of where to find the observation batch IDs. + + * If not specified, the `.obs["batch"]` field will not be included. + * If one or more values are specified, the `.obs["batch"]` field will be + set to the concatenated values of the specified fields, separated by + the `obs_batch_separator`. + required: false + multiple: true + multiple_sep: ',' + example: [batch] + - name: --obs_batch_separator + type: string + description: Separator to use when concatenating the values of the `--obs_batch` + fields. + required: false + default: + + - name: Dataset metadata + description: Information about the dataset that will be stored in the `.uns` slot. + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: true + - name: Outputs + description: Output arguments. + arguments: + - name: --output + type: file + description: Output h5ad file. 
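The grouping, count-filtering and batch arguments above translate to a few lines of pandas/scanpy; a hedged sketch using the documented defaults (the grouping columns and batch fields are illustrative, with `adata` as returned by the query):

    import scanpy as sc

    # --cell_filter_grouping / --cell_filter_minimum_count: keep only cells
    # whose group reaches the minimum size.
    keep = adata.obs.groupby(["dataset_id", "cell_type"])["cell_type"].transform("size") >= 100
    adata = adata[keep.values].copy()

    # Count-based filtering with the defaults documented above.
    sc.pp.filter_cells(adata, min_genes=50)
    sc.pp.filter_genes(adata, min_cells=5)

    # --obs_batch: concatenate the requested obs fields with --obs_batch_separator.
    adata.obs["batch"] = adata.obs[["donor_id", "assay"]].astype(str).agg("+".join, axis=1)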
+ direction: output + required: true + example: output.h5ad + - name: --output_compression + type: string + choices: [gzip, lzf] + required: false + example: gzip +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py +engines: + - type: docker + #image: openproblems/base_python:1.0.0 + image: python:3.11 + setup: + - type: python + packages: + - cellxgene-census + - scanpy + test_setup: + - type: python + packages: + - viashpy +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu] diff --git a/src/datasets/loaders/cellxgene_census/script.py b/src/datasets/loaders/scrnaseq/cellxgene_census/script.py similarity index 97% rename from src/datasets/loaders/cellxgene_census/script.py rename to src/datasets/loaders/scrnaseq/cellxgene_census/script.py index 49c44b6b32..de089d303a 100644 --- a/src/datasets/loaders/cellxgene_census/script.py +++ b/src/datasets/loaders/scrnaseq/cellxgene_census/script.py @@ -1,7 +1,9 @@ -import sys import cellxgene_census import scanpy as sc -import tiledbsoma as soma +import logging + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) ## VIASH START par = { @@ -22,14 +24,8 @@ "output": "output.h5ad", "output_compression": "gzip", } -meta = {"resources_dir": "src/common/helper_functions"} ## VIASH END -sys.path.append(meta["resources_dir"]) - -from setup_logger import setup_logger -logger = setup_logger() - def connect_census(uri, census_version): """ Connect to CellxGene Census or user-provided TileDBSoma object diff --git a/src/datasets/loaders/cellxgene_census/test.py b/src/datasets/loaders/scrnaseq/cellxgene_census/test.py similarity index 100% rename from src/datasets/loaders/cellxgene_census/test.py rename to src/datasets/loaders/scrnaseq/cellxgene_census/test.py diff --git a/src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/config.vsh.yaml b/src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/config.vsh.yaml new file mode 100644 index 0000000000..31a4d4412c --- /dev/null +++ b/src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/config.vsh.yaml @@ -0,0 +1,131 @@ +name: cellxgene_census_from_source_h5ad +namespace: datasets/loaders/scrnaseq +description: | + Query cells from a CellxGene Census or custom TileDBSoma object. + Aside from fetching the cells' RNA counts (`.X`), cell metadata + (`.obs`) and gene metadata (`.var`), this component also fetches + the dataset metadata and joins it into the cell metadata. +argument_groups: + - name: Input + description: Input arguments + arguments: + - name: --input_id + type: string + description: | + The dataset ID of the CellxGene Census dataset to query. + required: true + example: a93eab58-3d82-4b61-8a2f-d7666dcdb7c4 + - name: Count filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --cell_filter_min_genes + type: integer + description: Remove cells with less than this number of genes. + required: false + default: 50 + - name: --cell_filter_min_counts + type: integer + description: Remove cells with less than this number of counts. + required: false + default: 0 + - name: --gene_filter_min_cells + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + default: 5 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. 
+ required: false + default: 0 + - name: Cell metadata + description: Cell metadata arguments + arguments: + - name: --obs_batch + type: string + description: | + Location of where to find the observation batch IDs. + + * If not specified, the `.obs["batch"]` field will not be included. + * If one or more values are specified, the `.obs["batch"]` field will be + set to the concatenated values of the specified fields, separated by + the `obs_batch_separator`. + required: false + multiple: true + multiple_sep: ',' + example: [batch] + - name: --obs_batch_separator + type: string + description: Separator to use when concatenating the values of the `--obs_batch` + fields. + required: false + default: + + - name: Dataset metadata + description: Information about the dataset that will be stored in the `.uns` slot. + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: true + - name: Outputs + description: Output arguments. + arguments: + - name: --output + type: file + description: Output h5ad file. + direction: output + required: true + example: output.h5ad + - name: --output_compression + type: string + choices: [gzip, lzf] + required: false + example: gzip +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py +engines: + - type: docker + #image: openproblems/base_python:1.0.0 + image: python:3.11 + setup: + - type: python + packages: + - cellxgene-census + - scanpy + test_setup: + - type: python + packages: + - viashpy +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu] diff --git a/src/datasets/loaders/cellxgene_census_from_source_h5ad/script.py b/src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/script.py similarity index 96% rename from src/datasets/loaders/cellxgene_census_from_source_h5ad/script.py rename to src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/script.py index 900232e6a4..fb3bf53ac4 100644 --- a/src/datasets/loaders/cellxgene_census_from_source_h5ad/script.py +++ b/src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/script.py @@ -1,7 +1,10 @@ -import sys import cellxgene_census import scanpy as sc import tempfile +import logging + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) ## VIASH START par = { @@ -17,14 +20,8 @@ "output": "output.h5ad", "output_compression": "gzip", } -meta = {"resources_dir": "src/common/helper_functions"} ## VIASH END -sys.path.append(meta["resources_dir"]) - -from setup_logger import setup_logger -logger = setup_logger() - def get_anndata(par): with tempfile.TemporaryDirectory() as tmp: path = tmp + "/source.h5ad" diff --git a/src/datasets/loaders/cellxgene_census_from_source_h5ad/test.py b/src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/test.py 
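The script's `get_anndata` helper (truncated in the hunk above) downloads the dataset's original h5ad by ID; a minimal sketch of that pattern using the real `cellxgene_census.download_source_h5ad` helper, with the example dataset ID documented above:

    import tempfile

    import cellxgene_census
    import scanpy as sc

    def get_anndata(dataset_id: str):
        # Download the source h5ad of one Census dataset, then read it back.
        with tempfile.TemporaryDirectory() as tmp:
            path = f"{tmp}/source.h5ad"
            cellxgene_census.download_source_h5ad(dataset_id, to_path=path)
            return sc.read_h5ad(path)

    adata = get_anndata("a93eab58-3d82-4b61-8a2f-d7666dcdb7c4")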
similarity index 100% rename from src/datasets/loaders/cellxgene_census_from_source_h5ad/test.py rename to src/datasets/loaders/scrnaseq/cellxgene_census_from_source_h5ad/test.py diff --git a/src/datasets/loaders/scrnaseq/openproblems_v1/config.vsh.yaml b/src/datasets/loaders/scrnaseq/openproblems_v1/config.vsh.yaml new file mode 100644 index 0000000000..6ebb63b410 --- /dev/null +++ b/src/datasets/loaders/scrnaseq/openproblems_v1/config.vsh.yaml @@ -0,0 +1,91 @@ +__merge__: ../../../api/comp_dataset_loader.yaml +name: openproblems_v1 +namespace: datasets/loaders/scrnaseq +description: Fetch a dataset from OpenProblems v1 +argument_groups: + - name: Inputs + arguments: + - name: --input_id + type: string + description: The ID of the dataset in OpenProblems v1 + required: true + - name: --obs_cell_type + type: string + description: Location of where to find the observation cell types. + - name: --obs_batch + type: string + description: Location of where to find the observation batch IDs. + - name: --obs_tissue + type: string + description: Location of where to find the observation tissue information. + - name: --layer_counts + type: string + description: In which layer to find the counts matrix. Leave undefined to + use `.X`. + example: counts + - name: --sparse + type: boolean + default: true + description: Convert layers to a sparse CSR format. + - name: --var_feature_id + type: string + description: Location of where to find the feature IDs. Can be set to index + if the feature IDs are the index. + example: gene_ids + - name: --var_feature_name + type: string + description: Location of where to find the feature names. Can be set to index + if the feature names are the index. + default: index + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. 
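The `--layer_counts`, `--sparse` and `--var_feature_*` arguments describe a small post-processing step on the fetched AnnData; a sketch of what that logic might look like (the column names are the examples documented above, and the input path is hypothetical):

    import anndata as ad
    import scipy.sparse as sp

    adata = ad.read_h5ad("dataset.h5ad")  # hypothetical fetched dataset

    # --layer_counts: take counts from a named layer, or from .X when undefined.
    counts = adata.layers["counts"] if "counts" in adata.layers else adata.X

    # --sparse: convert to CSR format.
    if not sp.issparse(counts):
        counts = sp.csr_matrix(counts)

    # --var_feature_id / --var_feature_name: "index" means use the var index.
    adata.var["feature_id"] = adata.var["gene_ids"]
    adata.var["feature_name"] = adata.var_names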
+ required: false +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: apt + packages: git + - type: docker + run: | + git clone -b 'v0.8.0' --depth 1 https://github.com/openproblems-bio/openproblems.git /opt/openproblems && \ + pip install --no-cache-dir -r /opt/openproblems/docker/openproblems/requirements.txt && \ + pip install --no-cache-dir --editable /opt/openproblems +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] diff --git a/src/datasets/loaders/openproblems_v1/script.py b/src/datasets/loaders/scrnaseq/openproblems_v1/script.py similarity index 100% rename from src/datasets/loaders/openproblems_v1/script.py rename to src/datasets/loaders/scrnaseq/openproblems_v1/script.py diff --git a/src/datasets/loaders/openproblems_v1/test.py b/src/datasets/loaders/scrnaseq/openproblems_v1/test.py similarity index 100% rename from src/datasets/loaders/openproblems_v1/test.py rename to src/datasets/loaders/scrnaseq/openproblems_v1/test.py diff --git a/src/datasets/loaders/spatial/tenx_visium/config.vsh.yaml b/src/datasets/loaders/spatial/tenx_visium/config.vsh.yaml new file mode 100644 index 0000000000..b673826a91 --- /dev/null +++ b/src/datasets/loaders/spatial/tenx_visium/config.vsh.yaml @@ -0,0 +1,96 @@ +name: tenx_visium +namespace: datasets/loaders/spatial +description: | + Download a SpaceRanger h5 gene expression file and spatial imaging data from the 10x genomics website (or someplace else). +argument_groups: + - name: Inputs + arguments: + - name: --input_expression + type: string + description: URL to the feature / barcode matrix HDF5 of the 10x dataset. + required: true + - name: --input_spatial + type: string + description: URL to the Spatial imaging data of the 10x dataset. + required: true + - name: Outputs + arguments: + - name: --dataset + type: file + direction: output + description: Output h5ad file + required: true + example: dataset.h5ad + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --spot_filter_min_genes + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: --spot_filter_min_counts + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: --gene_filter_min_spots + type: integer + description: Remove genes expressed in less than this number of cells. 
+ required: false + example: 50 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: --remove_mitochondrial + type: boolean + description: Remove mitochondrial genes? + required: false + +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + +engines: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: + - squidpy +runners: + - type: executable + - type: nextflow diff --git a/src/datasets/loaders/tenx_visium/script.py b/src/datasets/loaders/spatial/tenx_visium/script.py similarity index 98% rename from src/datasets/loaders/tenx_visium/script.py rename to src/datasets/loaders/spatial/tenx_visium/script.py index 100bfde555..2cfa3c9054 100644 --- a/src/datasets/loaders/tenx_visium/script.py +++ b/src/datasets/loaders/spatial/tenx_visium/script.py @@ -18,7 +18,7 @@ "remove_mitochondrial": True } meta = { - "functionality_name": "tenx_visium" + "name": "tenx_visium" } ## VIASH END diff --git a/src/datasets/loaders/tenx_visium/test.py b/src/datasets/loaders/spatial/tenx_visium/test.py similarity index 100% rename from src/datasets/loaders/tenx_visium/test.py rename to src/datasets/loaders/spatial/tenx_visium/test.py diff --git a/src/datasets/loaders/spatial/zenodo/config.vsh.yaml b/src/datasets/loaders/spatial/zenodo/config.vsh.yaml new file mode 100644 index 0000000000..b4e06238a8 --- /dev/null +++ b/src/datasets/loaders/spatial/zenodo/config.vsh.yaml @@ -0,0 +1,88 @@ +name: zenodo +namespace: datasets/loaders/spatial +description: | + Download an Anndata file containing DBiT seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo. +argument_groups: + - name: Inputs + arguments: + - name: --input_data + type: string + description: URL to the Anndata file. + required: true + - name: Outputs + arguments: + - name: --dataset + type: file + direction: output + description: Output h5ad file + required: true + example: dataset.h5ad + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --spot_filter_min_genes + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: --spot_filter_min_counts + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: --gene_filter_min_spots + type: integer + description: Remove genes expressed in less than this number of cells. 
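In scanpy terms the spot/gene filtering above is symmetric with the single-cell case, since spots are observations; a sketch with the documented example thresholds (the mitochondrial check assumes human `MT-` gene symbols, and the input path is hypothetical):

    import scanpy as sc

    adata = sc.read_h5ad("dataset.h5ad")  # hypothetical downloaded dataset

    sc.pp.filter_cells(adata, min_genes=200)  # --spot_filter_min_genes
    sc.pp.filter_genes(adata, min_cells=50)   # --gene_filter_min_spots

    # --remove_mitochondrial: drop genes whose symbol starts with MT-.
    mito = adata.var_names.str.upper().str.startswith("MT-")
    adata = adata[:, ~mito].copy()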
+ required: false + example: 50 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: --remove_mitochondrial + type: boolean + description: Remove mitochondrial genes? + required: false + +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow diff --git a/src/datasets/loaders/zenodo_spatial/script.py b/src/datasets/loaders/spatial/zenodo/script.py similarity index 96% rename from src/datasets/loaders/zenodo_spatial/script.py rename to src/datasets/loaders/spatial/zenodo/script.py index 83aeb86056..7392274a42 100644 --- a/src/datasets/loaders/zenodo_spatial/script.py +++ b/src/datasets/loaders/spatial/zenodo/script.py @@ -5,7 +5,7 @@ # VIASH START par = { "input_data": "ps://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1", - "dataset_id": "zenodo_spatial/mouse_olfactory_bulb_puck_slideseqv2", + "dataset_id": "zenodo/mouse_olfactory_bulb_puck_slideseqv2", "dataset_name": "Mouse Olfactory Bulk Puck", "dataset_url": "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary", "dataset_summary": "Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2", @@ -16,7 +16,7 @@ "remove_mitochondrial": True } meta = { - "functionality_name": "zenodo_spatial" + "name": "zenodo" } # VIASH END diff --git a/src/datasets/loaders/zenodo_spatial/test.py b/src/datasets/loaders/spatial/zenodo/test.py similarity index 97% rename from src/datasets/loaders/zenodo_spatial/test.py rename to src/datasets/loaders/spatial/zenodo/test.py index 07dcd953a8..17a87366ec 100644 --- a/src/datasets/loaders/zenodo_spatial/test.py +++ b/src/datasets/loaders/spatial/zenodo/test.py @@ -3,7 +3,7 @@ import anndata as ad input_data ="https://zenodo.org/records/12784832/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1" -dataset_id = "zenodo_spatial/mouse_olfactory_bulb_puck" +dataset_id = "zenodo/mouse_olfactory_bulb_puck" dataset_name = "mouse_olfactory_bulb_puck" dataset_url = "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" dataset_summary = "Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2" diff --git a/src/datasets/loaders/spatial/zenodo_slidetags/config.vsh.yaml b/src/datasets/loaders/spatial/zenodo_slidetags/config.vsh.yaml new file mode 100644 index 0000000000..0355c8bb64 --- /dev/null +++ b/src/datasets/loaders/spatial/zenodo_slidetags/config.vsh.yaml @@ -0,0 +1,88 @@ +name: zenodo_slidetags +namespace: datasets/loaders/spatial +description: | + Download a compressed file containing gene expression matrix and spatial locations from zenodo. +argument_groups: + - name: Inputs + arguments: + - name: --input_data + type: string + description: URL to the file. 
+ required: true + - name: Outputs + arguments: + - name: --dataset + type: file + direction: output + description: Output h5ad file + required: true + example: dataset.h5ad + - name: Metadata + arguments: + - name: --dataset_id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: --spot_filter_min_genes + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: --spot_filter_min_counts + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: --gene_filter_min_spots + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + example: 50 + - name: --gene_filter_min_counts + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: --remove_mitochondrial + type: boolean + description: Remove mitochondrial genes? + required: false + +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow diff --git a/src/datasets/loaders/zenodo_spatial_slidetags/script.py b/src/datasets/loaders/spatial/zenodo_slidetags/script.py similarity index 96% rename from src/datasets/loaders/zenodo_spatial_slidetags/script.py rename to src/datasets/loaders/spatial/zenodo_slidetags/script.py index 5a8cf212fa..777f7e9e45 100644 --- a/src/datasets/loaders/zenodo_spatial_slidetags/script.py +++ b/src/datasets/loaders/spatial/zenodo_slidetags/script.py @@ -6,7 +6,7 @@ # VIASH START par = { "input_data": "https://zenodo.org/records/12785822/files/slidetag_human_cortex.tar.gz?download=1", - "dataset_id": "zenodo_spatial_slidetags/human_cortex_slidetags", + "dataset_id": "zenodo_slidetags/human_cortex_slidetags", "dataset_name": "slidetag_human_cortex", "dataset_url": "https://www.nature.com/articles/s41586-023-06837-4", "dataset_summary": "Slide-tags enables single-nucleus barcoding for multimodal spatial genomics", @@ -17,7 +17,7 @@ "remove_mitochondrial": True } meta = { - "functionality_name": "zenodo_spatial_slidetags" + "name": "zenodo_slidetags" } # VIASH END diff --git a/src/datasets/loaders/zenodo_spatial_slidetags/test.py b/src/datasets/loaders/spatial/zenodo_slidetags/test.py similarity index 97% rename from src/datasets/loaders/zenodo_spatial_slidetags/test.py rename to src/datasets/loaders/spatial/zenodo_slidetags/test.py index 9f859ebea6..c97203735b 100644 --- a/src/datasets/loaders/zenodo_spatial_slidetags/test.py +++ b/src/datasets/loaders/spatial/zenodo_slidetags/test.py @@ -3,7 +3,7 @@ import anndata as 
ad input_data ="https://zenodo.org/records/12785822/files/slidetag_human_cortex.tar.gz?download=1" -dataset_id = "zenodo_spatial_slidetags/human_cortex" +dataset_id = "zenodo_slidetags/human_cortex" dataset_name = "slidetag_human_cortex" dataset_url = "https://www.nature.com/articles/s41586-023-06837-4" dataset_summary = "Slide-tags enables single-nucleus barcoding for multimodal spatial genomics" diff --git a/src/datasets/loaders/tenx_visium/config.vsh.yaml b/src/datasets/loaders/tenx_visium/config.vsh.yaml deleted file mode 100644 index ba28b32b89..0000000000 --- a/src/datasets/loaders/tenx_visium/config.vsh.yaml +++ /dev/null @@ -1,96 +0,0 @@ -functionality: - name: tenx_visium - namespace: datasets/loaders - description: | - Download a SpaceRanger h5 gene expression file and spatial imaging data from the 10x genomics website (or someplace else). - - argument_groups: - - name: Inputs - arguments: - - name: "--input_expression" - type: string - description: URL to the feature / barcode matrix HDF5 of the 10x dataset. - required: true - - name: "--input_spatial" - type: string - description: URL to the Spatial imaging data of the 10x dataset. - required: true - - name: Outputs - arguments: - - name: "--dataset" - type: file - direction: output - description: Output h5ad file - required: true - example: dataset.h5ad - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitochondrial genes? 
- required: false - - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py - -platforms: - - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 - setup: - - type: python - packages: - - squidpy - - type: nextflow diff --git a/src/datasets/loaders/zenodo_spatial/config.vsh.yaml b/src/datasets/loaders/zenodo_spatial/config.vsh.yaml deleted file mode 100644 index 776b177481..0000000000 --- a/src/datasets/loaders/zenodo_spatial/config.vsh.yaml +++ /dev/null @@ -1,87 +0,0 @@ -functionality: - name: zenodo_spatial - namespace: datasets/loaders - description: | - Download an Anndata file containing DBiT seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo. - argument_groups: - - name: Inputs - arguments: - - name: "--input_data" - type: string - description: URL to the Anndata file. - required: true - - name: Outputs - arguments: - - name: "--dataset" - type: file - direction: output - description: Output h5ad file - required: true - example: dataset.h5ad - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitochondrial genes? - required: false - - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py - -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow diff --git a/src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml b/src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml deleted file mode 100644 index 905be3514c..0000000000 --- a/src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml +++ /dev/null @@ -1,88 +0,0 @@ -functionality: - name: zenodo_spatial_slidetags - namespace: datasets/loaders - description: | - Download a compressed file containing gene expression matrix and spatial locations from zenodo. - - argument_groups: - - name: Inputs - arguments: - - name: "--input_data" - type: string - description: URL to the file. 
- required: true - - name: Outputs - arguments: - - name: "--dataset" - type: file - direction: output - description: Output h5ad file - required: true - example: dataset.h5ad - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitochondrial genes? - required: false - - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test.py - -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow diff --git a/src/datasets/normalization/atac_tfidf/config.vsh.yaml b/src/datasets/normalization/atac_tfidf/config.vsh.yaml index 31319f0958..850b49363b 100644 --- a/src/datasets/normalization/atac_tfidf/config.vsh.yaml +++ b/src/datasets/normalization/atac_tfidf/config.vsh.yaml @@ -1,16 +1,15 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "atac_tfidf" - description: | - Transform peak counts with TF-IDF (Term Frequency - Inverse Document Frequency). +name: atac_tfidf +description: | + Transform peak counts with TF-IDF (Term Frequency - Inverse Document Frequency). - TF: peak counts are normalised by total number of counts per cell DF: total number of counts for each peak IDF: number of cells divided by DF + TF: peak counts are normalised by total number of counts per cell DF: total number of counts for each peak IDF: number of cells divided by DF - By default, log(TF) * log(IDF) is returned. - resources: - - type: python_script - path: script.py -platforms: + By default, log(TF) * log(IDF) is returned. 
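The TF-IDF description above compresses several steps into one line; a numpy/scipy sketch of the default log(TF) * log(IDF) output (the component itself delegates to muon, which also offers scale-factor options), assuming `counts` is a cells-by-peaks CSR matrix with no all-zero peaks:

    import numpy as np
    import scipy.sparse as sp

    # TF: normalise each cell (row) by its total number of counts.
    tf = sp.csr_matrix(counts.multiply(1.0 / counts.sum(axis=1)))

    # DF: total counts per peak; IDF: number of cells divided by DF.
    df = np.asarray(counts.sum(axis=0)).ravel()
    idf = counts.shape[0] / df

    # Default output: log(TF) * log(IDF), using log1p as one common convention.
    tfidf = tf.log1p().multiply(np.log1p(idf)).tocsr()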
+resources: + - type: python_script + path: script.py +engines: - type: docker image: openproblems/base_python:1.0.0 setup: @@ -18,6 +17,8 @@ platforms: packages: - muon - numpy<2 +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/atac_tfidf/script.py b/src/datasets/normalization/atac_tfidf/script.py index ecb772bd64..1d38a8fcc8 100644 --- a/src/datasets/normalization/atac_tfidf/script.py +++ b/src/datasets/normalization/atac_tfidf/script.py @@ -7,7 +7,7 @@ 'output': "output_norm.h5ad" } meta = { - 'functionality_name': "tfidf" + 'name': "tfidf" } ## VIASH END @@ -20,7 +20,7 @@ print("Store output in adata", flush=True) adata.layers[par["layer_output"]] = normalized_counts -adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] +adata.uns["normalization_id"] = par["normalization_id"] or meta['name'] print("Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/normalization/l1_sqrt/config.vsh.yaml b/src/datasets/normalization/l1_sqrt/config.vsh.yaml index 212eadc968..3a3aca8ed0 100644 --- a/src/datasets/normalization/l1_sqrt/config.vsh.yaml +++ b/src/datasets/normalization/l1_sqrt/config.vsh.yaml @@ -1,20 +1,19 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "l1_sqrt" - description: | - Scaled L1 sqrt normalization. +name: l1_sqrt +description: | + Scaled L1 sqrt normalization. - This normalization method causes all cells to have the same sum of values. + This normalization method causes all cells to have the same sum of values. - Steps: + Steps: - * Compute the square root of the counts. - * Apply L1 normalization (rescaled such that the sum of the values of each cell sum to 1). - * Multiply by the median UMI count per cell, causing all cells to have the sum of values. - resources: - - type: python_script - path: script.py -platforms: + * Compute the square root of the counts. + * Apply L1 normalization (rescaled such that the sum of the values of each cell sum to 1). + * Multiply by the median UMI count per cell, causing all cells to have the sum of values. 
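As a plain-NumPy illustration of the three steps listed above (the component itself uses scprep; `counts` is assumed to be a dense cells x genes matrix with no empty cells):

import numpy as np

def l1_sqrt(counts: np.ndarray) -> np.ndarray:
    sqrt_counts = np.sqrt(counts)                              # 1. square root of the counts
    l1 = sqrt_counts / sqrt_counts.sum(axis=1, keepdims=True)  # 2. L1: each cell sums to 1
    median_umi = np.median(counts.sum(axis=1))                 # 3. rescale by the median UMI count
    return l1 * median_umi                                     # every cell ends up with the same total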
+resources: + - type: python_script + path: script.py +engines: - type: docker image: openproblems/base_python:1.0.0 setup: @@ -22,6 +21,8 @@ platforms: packages: - scprep - numpy<2 +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/l1_sqrt/script.py b/src/datasets/normalization/l1_sqrt/script.py index 76c69cf897..9dd5c96505 100644 --- a/src/datasets/normalization/l1_sqrt/script.py +++ b/src/datasets/normalization/l1_sqrt/script.py @@ -8,7 +8,7 @@ 'output': "output_norm.h5ad" } meta = { - 'functionality_name': "l1_sqrt" + 'name': "l1_sqrt" } ## VIASH END @@ -23,7 +23,7 @@ print("Store output in adata", flush=True) adata.layers[par["layer_output"]] = l1_sqrt -adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] +adata.uns["normalization_id"] = par["normalization_id"] or meta['name'] print("Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/normalization/log_cp/config.vsh.yaml b/src/datasets/normalization/log_cp/config.vsh.yaml index 89b2a283f9..d686c54147 100644 --- a/src/datasets/normalization/log_cp/config.vsh.yaml +++ b/src/datasets/normalization/log_cp/config.vsh.yaml @@ -1,18 +1,20 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "log_cp" - description: "Normalize data using Log CP" - resources: - - type: python_script - path: script.py - arguments: - - name: "--n_cp" - type: integer - default: 1e4 - description: "Number of counts per cell. When set to -1, will use None." -platforms: +name: log_cp +links: {} +description: Normalize data using Log CP +resources: + - type: python_script + path: script.py +arguments: + - name: --n_cp + type: integer + default: 1e4 + description: Number of counts per cell. When set to -1, will use None. 
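The `--n_cp` convention maps onto scanpy roughly as follows; a hedged sketch mirroring the component's script (the layer and obs key names here are illustrative, not the component's actual parameter values):

import anndata as ad
import scanpy as sc

def log_cp(adata: ad.AnnData, n_cp: float) -> ad.AnnData:
    # -1 -> target_sum=None, i.e. scanpy normalises to the per-cell median;
    # otherwise each cell is scaled to n_cp total counts before log1p.
    target_sum = None if n_cp == -1 else n_cp
    norm = sc.pp.normalize_total(adata, target_sum=target_sum, inplace=False)
    adata.layers["normalized"] = sc.pp.log1p(norm["X"], copy=True)
    adata.obs["size_factors"] = norm["norm_factor"]
    return adata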
+engines: - type: docker image: openproblems/base_python:1.0.0 +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/log_cp/script.py b/src/datasets/normalization/log_cp/script.py index 39ddf61636..d537ee4a7c 100644 --- a/src/datasets/normalization/log_cp/script.py +++ b/src/datasets/normalization/log_cp/script.py @@ -9,7 +9,7 @@ 'n_cp': 1e6, } meta = { - "functionality_name": "normalize_log_cp10k" + "name": "normalize_log_cp10k" } ## VIASH END @@ -36,7 +36,7 @@ print(">> Store output in adata", flush=True) adata.layers[par["layer_output"]] = lognorm adata.obs[par["obs_size_factors"]] = norm["norm_factor"] -adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] +adata.uns["normalization_id"] = par["normalization_id"] or meta['name'] print(">> Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/normalization/log_scran_pooling/config.vsh.yaml b/src/datasets/normalization/log_scran_pooling/config.vsh.yaml index 4cbf81ff5a..e010c1032b 100644 --- a/src/datasets/normalization/log_scran_pooling/config.vsh.yaml +++ b/src/datasets/normalization/log_scran_pooling/config.vsh.yaml @@ -1,18 +1,20 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "log_scran_pooling" - description: "Normalize data using scran pooling" - resources: - - type: r_script - path: script.R -platforms: +name: log_scran_pooling +links: {} +description: Normalize data using scran pooling +resources: + - type: r_script + path: script.R +engines: - type: docker image: openproblems/base_r:1.0.0 setup: - type: r - cran: [ Matrix, rlang, scran, BiocParallel ] + cran: [Matrix, rlang, scran, BiocParallel] - type: python pip: scanpy +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/log_scran_pooling/script.R b/src/datasets/normalization/log_scran_pooling/script.R index be51e21f38..33bb13e8eb 100644 --- a/src/datasets/normalization/log_scran_pooling/script.R +++ b/src/datasets/normalization/log_scran_pooling/script.R @@ -30,7 +30,7 @@ adata$obs[[par$obs_size_factors]] <- size_factors adata$layers[[par$layer_output]] <- lognorm norm_id <- par[["normalization_id"]] if (is.null(norm_id)) { - norm_id <- meta[["functionality_name"]] + norm_id <- meta[["name"]] } adata$uns[["normalization_id"]] <- norm_id diff --git a/src/datasets/normalization/prot_clr/config.vsh.yaml b/src/datasets/normalization/prot_clr/config.vsh.yaml index 8f6bbe269f..3262ca73b7 100644 --- a/src/datasets/normalization/prot_clr/config.vsh.yaml +++ b/src/datasets/normalization/prot_clr/config.vsh.yaml @@ -1,26 +1,27 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "prot_clr" - description: | - Perform center log ratio (CLR) normalization on input CITE-seq data (Stoeckius et al. 2017). +name: prot_clr +description: | + Perform center log ratio (CLR) normalization on input CITE-seq data (Stoeckius et al. 2017). - The CLR transformation is defined as: + The CLR transformation is defined as: - $$ - x_{\text{clr}} = \log\left(\frac{x}{g(x)}\right) - $$ + $$ + x_{\text{clr}} = \log\left(\frac{x}{g(x)}\right) + $$ - where $\(g(x)\)$ is the geometric mean of the row $\(x\)$. - resources: - - type: python_script - path: script.py -platforms: + where $\(g(x)\)$ is the geometric mean of the row $\(x\)$. 
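Numerically, the definition above reduces to centring each row's log values, since log(x / g(x)) = log(x) - mean(log(x)). A minimal sketch of that identity (illustrative only; the component delegates to muon, and the pseudocount used to keep the log finite is an assumption):

import numpy as np

def clr(x: np.ndarray, pseudocount: float = 1.0) -> np.ndarray:
    # log(x / g(x)) = log(x) - mean(log(x)) per row, with g the geometric mean
    logx = np.log(x + pseudocount)
    return logx - logx.mean(axis=1, keepdims=True)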
+resources: + - type: python_script + path: script.py +engines: - type: docker image: openproblems/base_python:1.0.0 setup: - type: python packages: - muon +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/prot_clr/script.py b/src/datasets/normalization/prot_clr/script.py index 3f0a2fb3fd..4741625935 100644 --- a/src/datasets/normalization/prot_clr/script.py +++ b/src/datasets/normalization/prot_clr/script.py @@ -7,7 +7,7 @@ 'output': "output_norm.h5ad" } meta = { - 'functionality_name': "clr" + 'name': "clr" } ## VIASH END @@ -22,7 +22,7 @@ print("Store output in adata", flush=True) adata.layers[par["layer_output"]] = normalized_counts.X -adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] +adata.uns["normalization_id"] = par["normalization_id"] or meta['name'] print("Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/normalization/sqrt_cp/config.vsh.yaml b/src/datasets/normalization/sqrt_cp/config.vsh.yaml index 4d95636f4c..1e241e89d0 100644 --- a/src/datasets/normalization/sqrt_cp/config.vsh.yaml +++ b/src/datasets/normalization/sqrt_cp/config.vsh.yaml @@ -1,18 +1,19 @@ __merge__: ../../api/comp_normalization.yaml -functionality: - name: "sqrt_cp" - description: "Normalize data using Log Sqrt" - resources: - - type: python_script - path: script.py - arguments: - - name: "--n_cp" - type: integer - default: 1e4 - description: "Number of counts per cell" -platforms: +name: sqrt_cp +description: Normalize data using Log Sqrt +resources: + - type: python_script + path: script.py +arguments: + - name: --n_cp + type: integer + default: 1e4 + description: Number of counts per cell +engines: - type: docker image: openproblems/base_python:1.0.0 +runners: + - type: executable - type: nextflow - directives: + directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/sqrt_cp/script.py b/src/datasets/normalization/sqrt_cp/script.py index 84afdaa19d..d2540a519a 100644 --- a/src/datasets/normalization/sqrt_cp/script.py +++ b/src/datasets/normalization/sqrt_cp/script.py @@ -10,7 +10,7 @@ 'n_cp': 1e6, } meta = { - "functionality_name": "normalize_sqrt_cpm" + "name": "normalize_sqrt_cpm" } ## VIASH END @@ -29,7 +29,7 @@ print(">> Store output in adata", flush=True) adata.layers[par["layer_output"]] = lognorm adata.obs[par["obs_size_factors"]] = norm["norm_factor"] -adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] +adata.uns["normalization_id"] = par["normalization_id"] or meta['name'] print(">> Write data", flush=True) adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/processors/hvg/config.vsh.yaml b/src/datasets/processors/hvg/config.vsh.yaml index aed18c6d38..886fe92f94 100644 --- a/src/datasets/processors/hvg/config.vsh.yaml +++ b/src/datasets/processors/hvg/config.vsh.yaml @@ -1,13 +1,14 @@ __merge__: ../../api/comp_processor_hvg.yaml -functionality: - name: "hvg" - description: "Compute HVG" - resources: - - type: python_script - path: script.py -platforms: +name: hvg +description: Compute HVG +resources: + - type: python_script + path: script.py +engines: - type: docker image: openproblems/base_python:1.0.0 +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/processors/knn/config.vsh.yaml b/src/datasets/processors/knn/config.vsh.yaml index 
9908fe9086..b3cf894420 100644 --- a/src/datasets/processors/knn/config.vsh.yaml +++ b/src/datasets/processors/knn/config.vsh.yaml @@ -1,13 +1,14 @@ __merge__: ../../api/comp_processor_knn.yaml -functionality: - name: "knn" - description: "Compute KNN" - resources: - - type: python_script - path: script.py -platforms: +name: knn +description: Compute KNN +resources: + - type: python_script + path: script.py +engines: - type: docker image: openproblems/base_python:1.0.0 +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/processors/pca/config.vsh.yaml b/src/datasets/processors/pca/config.vsh.yaml index 7f0213b922..b1c1a7ba14 100644 --- a/src/datasets/processors/pca/config.vsh.yaml +++ b/src/datasets/processors/pca/config.vsh.yaml @@ -1,17 +1,18 @@ __merge__: ../../api/comp_processor_pca.yaml -functionality: - name: "pca" - description: "Compute PCA" - resources: - - type: python_script - path: script.py +name: pca +description: Compute PCA +resources: + - type: python_script + path: script.py # test_resources: # - type: python_script # path: test_script.py # - path: "../../../resources_test/common/pancreas" -platforms: +engines: - type: docker image: openproblems/base_python:1.0.0 +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/processors/subsample/config.vsh.yaml b/src/datasets/processors/subsample/config.vsh.yaml index 4e52e93db5..9cc62c3602 100644 --- a/src/datasets/processors/subsample/config.vsh.yaml +++ b/src/datasets/processors/subsample/config.vsh.yaml @@ -1,51 +1,54 @@ __merge__: ../../api/comp_processor_subset.yaml -functionality: - name: "subsample" - description: "Subsample an h5ad file" - arguments: - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. - - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Cell type indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: test_script.py - - path: /resources_test/common/pancreas -platforms: +name: subsample +description: Subsample an h5ad file +arguments: + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being less + because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. 
+ - name: --keep_cell_type_categories + type: string + multiple: true + description: Cell type indexes to be selected + required: false + - name: --keep_batch_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --even + type: boolean_true + description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test_script.py + - path: /resources_test/common/pancreas +engines: - type: docker image: openproblems/base_python:1.0.0 test_setup: - type: python packages: - viashpy +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/processors/subsample/test_script.py b/src/datasets/processors/subsample/test_script.py index 80dde5d383..cb7f90189a 100644 --- a/src/datasets/processors/subsample/test_script.py +++ b/src/datasets/processors/subsample/test_script.py @@ -42,9 +42,9 @@ def test_keep_functionality(run_component): run_component([ "--input", input_path, - "--keep_cell_type_categories", "acinar:beta", - "--keep_batch_categories", "celseq:inDrop4:smarter", - "--keep_features", ":".join(keep_features), + "--keep_cell_type_categories", "acinar;beta", + "--keep_batch_categories", "celseq;inDrop4;smarter", + "--keep_features", ";".join(keep_features), "--output", output_path, "--seed", "123" ]) diff --git a/src/datasets/processors/svd/config.vsh.yaml b/src/datasets/processors/svd/config.vsh.yaml index bbad17f58c..bd71cae4c8 100644 --- a/src/datasets/processors/svd/config.vsh.yaml +++ b/src/datasets/processors/svd/config.vsh.yaml @@ -1,16 +1,17 @@ __merge__: ../../api/comp_processor_svd.yaml -functionality: - name: "svd" - description: "Compute SVD pca reduction" - resources: - - type: python_script - path: script.py -platforms: +name: svd +description: Compute SVD pca reduction +resources: + - type: python_script + path: script.py +engines: - type: docker image: openproblems/base_python:1.0.0 setup: - type: python pypi: [scikit-learn] +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/resource_scripts/cellxgene_census.sh b/src/datasets/resource_scripts/cellxgene_census.sh index 5d6181f91e..62eaff1f34 100755 --- a/src/datasets/resource_scripts/cellxgene_census.sh +++ b/src/datasets/resource_scripts/cellxgene_census.sh @@ -126,7 +126,7 @@ output_normalized: force_null output_pca: force_null output_hvg: force_null output_knn: force_null -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/scrnaseq HERE cat > /tmp/nextflow.config << HERE @@ -145,7 +145,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_cellxgene_census/main.nf \ + --main-script target/nextflow/datasets/workflows/scrnaseq/process_cellxgene_census/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "/tmp/params.yaml" \ diff --git a/src/datasets/resource_scripts/dataset_info.sh b/src/datasets/resource_scripts/dataset_info.sh index 04c032916f..ead3d45506 100755 --- a/src/datasets/resource_scripts/dataset_info.sh +++ b/src/datasets/resource_scripts/dataset_info.sh @@ -5,13 +5,13 @@ DATASETS_DIR="s3://openproblems-data/resources/datasets" cat > 
"/tmp/params.yaml" << HERE param_list: - id: openproblems_v1 - input_states: "$DATASETS_DIR/openproblems_v1/**/log_cp10k/state.yaml" + input_states: "$DATASETS_DIR/scrnaseq/openproblems_v1/**/log_cp10k/state.yaml" rename_keys: 'input:output_dataset' - id: openproblems_v1_multimodal - input_states: "$DATASETS_DIR/openproblems_v1_multimodal/**/log_cp10k/state.yaml" + input_states: "$DATASETS_DIR/multimodal/openproblems_v1_multimodal/**/log_cp10k/state.yaml" rename_keys: 'input:output_mod1' - id: cellxgene_census - input_states: "$DATASETS_DIR/cellxgene_census/**/log_cp10k/state.yaml" + input_states: "$DATASETS_DIR/scrnaseq/cellxgene_census/**/log_cp10k/state.yaml" rename_keys: 'input:output_dataset' settings: '{"output": "dataset_info.yaml"}' output_state: state.yaml diff --git a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh index a306ba2ef8..42c3456b1b 100755 --- a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh +++ b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh @@ -32,13 +32,13 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/multimodal HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf \ + --main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal_test.sh b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal_test.sh index be8444371b..652d39aa0a 100755 --- a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal_test.sh +++ b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal_test.sh @@ -32,12 +32,12 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -publish_dir: resources/datasets/openproblems_neurips2021 +publish_dir: resources/datasets/multimodal/openproblems_neurips2021 HERE export NXF_VER=23.10.1 nextflow run . 
\ - -main-script target/nextflow/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf \ + -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/main.nf \ -profile docker \ -resume \ -params-file "$params_file" diff --git a/src/datasets/resource_scripts/openproblems_neurips2022_pbmc.sh b/src/datasets/resource_scripts/openproblems_neurips2022_pbmc.sh index e3e6783a8e..681d8f3d36 100755 --- a/src/datasets/resource_scripts/openproblems_neurips2022_pbmc.sh +++ b/src/datasets/resource_scripts/openproblems_neurips2022_pbmc.sh @@ -34,7 +34,7 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/multimodal HERE cat > /tmp/nextflow.config << HERE @@ -49,7 +49,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf \ + --main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/main.nf \ --workspace 53907369739130 \ --compute-env 1pK56PjjzeraOOC2LDZvN2 \ --params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/openproblems_v1.sh b/src/datasets/resource_scripts/openproblems_v1.sh index 8d40e57c46..7e5b12c348 100755 --- a/src/datasets/resource_scripts/openproblems_v1.sh +++ b/src/datasets/resource_scripts/openproblems_v1.sh @@ -162,7 +162,7 @@ output_normalized: force_null output_pca: force_null output_hvg: force_null output_knn: force_null -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/scrnaseq HERE cat > /tmp/nextflow.config << HERE @@ -174,7 +174,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_openproblems_v1/main.nf \ + --main-script target/nextflow/datasets/workflows/scrnaseq/process_openproblems_v1/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/openproblems_v1_multimodal.sh b/src/datasets/resource_scripts/openproblems_v1_multimodal.sh index 2d516a8ccb..f8e83f3582 100755 --- a/src/datasets/resource_scripts/openproblems_v1_multimodal.sh +++ b/src/datasets/resource_scripts/openproblems_v1_multimodal.sh @@ -60,7 +60,7 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/multimodal HERE @@ -77,7 +77,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_openproblems_v1_multimodal/main.nf \ + --main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_v1_multimodal/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/openproblems_v1_multimodal_test.sh b/src/datasets/resource_scripts/openproblems_v1_multimodal_test.sh index 268a17cf7d..e3d012e3c5 100755 --- 
a/src/datasets/resource_scripts/openproblems_v1_multimodal_test.sh +++ b/src/datasets/resource_scripts/openproblems_v1_multimodal_test.sh @@ -8,7 +8,7 @@ cd "$REPO_ROOT" export TOWER_WORKSPACE_ID=53907369739130 -OUTPUT_DIR="resources/datasets" +OUTPUT_DIR="resources/datasets/multimodal" if [ ! -d "$OUTPUT_DIR" ]; then mkdir -p "$OUTPUT_DIR" @@ -38,7 +38,7 @@ HERE export NXF_VER=22.04.5 nextflow \ run . \ - -main-script target/nextflow/datasets/workflows/process_openproblems_v1_multimodal/main.nf \ + -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_v1_multimodal/main.nf \ -profile docker \ -resume \ -params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/openproblems_v1_test.sh b/src/datasets/resource_scripts/openproblems_v1_test.sh index a79545f052..dab792fd13 100755 --- a/src/datasets/resource_scripts/openproblems_v1_test.sh +++ b/src/datasets/resource_scripts/openproblems_v1_test.sh @@ -8,7 +8,7 @@ cd "$REPO_ROOT" export TOWER_WORKSPACE_ID=53907369739130 -OUTPUT_DIR="resources/datasets" +OUTPUT_DIR="resources/datasets/scrnaseq" if [ ! -d "$OUTPUT_DIR" ]; then mkdir -p "$OUTPUT_DIR" @@ -42,7 +42,7 @@ HERE export NXF_VER=23.04.2 nextflow run . \ - -main-script target/nextflow/datasets/workflows/process_openproblems_v1/main.nf \ + -main-script target/nextflow/datasets/workflows/scrnaseq/process_openproblems_v1/main.nf \ -profile docker \ -resume \ -params-file "$params_file" \ diff --git a/src/datasets/resource_scripts/tenx_visium.sh b/src/datasets/resource_scripts/tenx_visium.sh index 3e2fb68a61..cc7199c81f 100755 --- a/src/datasets/resource_scripts/tenx_visium.sh +++ b/src/datasets/resource_scripts/tenx_visium.sh @@ -125,7 +125,7 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE # cat > "/tmp/params.yaml" << 'HERE' @@ -253,7 +253,7 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE # cat > "/tmp/params.yaml" << 'HERE' @@ -290,13 +290,13 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_tenx_visium/main.nf \ + --main-script target/nextflow/datasets/workflows/spatial/process_tenx_visium/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "/tmp/params.yaml" \ diff --git a/src/datasets/resource_scripts/zenodo_spatial.sh b/src/datasets/resource_scripts/zenodo.sh similarity index 92% rename from src/datasets/resource_scripts/zenodo_spatial.sh rename to src/datasets/resource_scripts/zenodo.sh index c1386aeb84..dc297492c2 100755 --- a/src/datasets/resource_scripts/zenodo_spatial.sh +++ b/src/datasets/resource_scripts/zenodo.sh @@ -2,7 +2,7 @@ # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: zenodo_spatial/visium/human_heart_myocardial_infarction_1 +# - id: zenodo/visium/human_heart_myocardial_infarction_1 # input_data: "https://zenodo.org/records/13328275/files/10X0018.h5ad?download=1" # dataset_name: 10X Visium - Human Heart
MI 1 # dataset_url: "https://www.nature.com/articles/s41586-022-05060-x" @@ -14,7 +14,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: true -# - id: zenodo_spatial/visium/human_heart_myocardial_infarction_2 +# - id: zenodo/visium/human_heart_myocardial_infarction_2 # input_data: "https://zenodo.org/records/13328275/files/10X009.h5ad?download=1" # dataset_name: 10X Visium - Human Heart MI 2 # dataset_url: "https://www.nature.com/articles/s41586-022-05060-x" @@ -32,13 +32,13 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # remove_mitochondrial: true # HERE # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: zenodo_spatial/dbitseq/mouse_e10_brain +# - id: zenodo/dbitseq/mouse_e10_brain # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E10_brain_gene_25um_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Brain (E10) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -50,7 +50,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: true -# - id: zenodo_spatial/dbitseq/mouse_e10_eye +# - id: zenodo/dbitseq/mouse_e10_eye # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E10_eye_and_nearby_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Eye (E10) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -62,7 +62,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: true -# - id: zenodo_spatial/dbitseq/mouse_e10_whole_body +# - id: zenodo/dbitseq/mouse_e10_whole_body # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E10_whole_gene_best_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Whole Body (E10) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -74,7 +74,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: true -# - id: zenodo_spatial/dbitseq/mouse_e11_lower_body +# - id: zenodo/dbitseq/mouse_e11_lower_body # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E11_lower_body_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Lower Body (E11) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -86,7 +86,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: true -# - id: zenodo_spatial/dbitseq/mouse_e11_1 +# - id: zenodo/dbitseq/mouse_e11_1 # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_GSM4364244_E11-FL-1L_gene_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Whole Body 1 (E11) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -98,7 +98,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: true -# - id: zenodo_spatial/dbitseq/mouse_e11_2 +# - id: zenodo/dbitseq/mouse_e11_2 # input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_GSM4364245_E11-FL-2L_gene_data.h5ad?download=1" # dataset_name: DBiT-seq - Mouse Whole Body 2 (E11) # dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" @@ -116,12 +116,12 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: zenodo_spatial/merfish/human_cortex_1 +# - id: zenodo/merfish/human_cortex_1 # input_data: 
"https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.250.expand.rep1_data.h5ad?download=1" # dataset_name: MERFISH - Human Cortex 1 # dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" @@ -133,7 +133,7 @@ # gene_filter_min_spots: 100 # remove_mitochondrial: false -# - id: zenodo_spatial/merfish/human_cortex_2 +# - id: zenodo/merfish/human_cortex_2 # input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.4000.expand.rep1_data.h5ad?download=1" # dataset_name: MERFISH - Human Cortex 2 # dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" @@ -145,7 +145,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: false -# - id: zenodo_spatial/merfish/human_cortex_3 +# - id: zenodo/merfish/human_cortex_3 # input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.4000.expand.rep2_data.h5ad?download=1" # dataset_name: MERFISH - Human Cortex 3 # dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" @@ -157,7 +157,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: false -# - id: zenodo_spatial/merfish/human_cortex_4 +# - id: zenodo/merfish/human_cortex_4 # input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.4000.expand.rep3_data.h5ad?download=1" # dataset_name: MERFISH - Human Cortex 4 # dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" @@ -169,7 +169,7 @@ # gene_filter_min_spots: 50 # remove_mitochondrial: false -# - id: zenodo_spatial/merfish/mouse_cortex +# - id: zenodo/merfish/mouse_cortex # input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_mouse1.AUD_TEA_VIS.242.unexpand_data.h5ad?download=1" # dataset_name: MERFISH - Mouse Cortex # dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" @@ -187,12 +187,12 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: zenodo_spatial/seqfish/mouse_organogenesis_seqfish +# - id: zenodo/seqfish/mouse_organogenesis # input_data: "https://zenodo.org/records/12785822/files/seqfish.h5ad?download=1" # dataset_name: Seqfish - Mouse Organogenesis # dataset_url: "https://www.nature.com/articles/s41587-021-01006-2" @@ -210,13 +210,13 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # remove_mitochondrial: true # HERE # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: zenodo_spatial/slideseqv2/mouse_olfactory_bulb_puck +# - id: zenodo/slideseqv2/mouse_olfactory_bulb_puck # input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1" # dataset_name: Slide-seqV2 - Mouse Olfactory Bulb Puck # dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" @@ -228,7 +228,7 @@ # gene_filter_min_spots: 500 # remove_mitochondrial: true -# - id: zenodo_spatial/slideseqv2/mouse_cortex +# - id: zenodo/slideseqv2/mouse_cortex # input_data: 
"https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_palla2021squidpy_Slide-seqV2_Mouse_Cortex_data_whole.h5ad?download=1" # dataset_name: Slide-seqV2 - Mouse Cortex # dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" @@ -240,7 +240,7 @@ # gene_filter_min_spots: 500 # remove_mitochondrial: true -# - id: zenodo_spatial/slideseqv2/mouse_cerebellum +# - id: zenodo/slideseqv2/mouse_cerebellum # input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_Slide-seqV2_Mouse_Cerebellum_SCP948_data_whole.h5ad?download=1" # dataset_name: Slide-seqV2 - Mouse Cerebellum # dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" @@ -252,7 +252,7 @@ # gene_filter_min_spots: 500 # remove_mitochondrial: true -# - id: zenodo_spatial/slideseqv2/mouse_hippocampus_puck +# - id: zenodo/slideseqv2/mouse_hippocampus_puck # input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_Slide-seqV2_Mouse_Hippocampus_Puck_200115_08_data_whole.h5ad?download=1" # dataset_name: Slide-seqV2 - Mouse Hippocampus Puck # dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" @@ -264,7 +264,7 @@ # gene_filter_min_spots: 500 # remove_mitochondrial: true -# - id: zenodo_spatial/slideseqv2/mouse_somatosensory_cortex_puck +# - id: zenodo/slideseqv2/mouse_somatosensory_cortex_puck # input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_Slide-seqV2_Mouse_SomatosensoryCortex_Puck_200306_03_data_whole.h5ad?download=1" # dataset_name: Slide-seqV2 - Mouse Somatosensory Cortex Puck # dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" @@ -282,12 +282,12 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE # cat > "/tmp/params.yaml" << 'HERE' # param_list: -# - id: zenodo_spatial/starmap/mouse_brain_2d_zstep10_0 +# - id: zenodo/starmap/mouse_brain_2d_zstep10_0 # input_data: "https://zenodo.org/records/12785822/files/STARmap_Wang2018three_data_2D_zstep10_0_data.h5ad?download=1" # dataset_name: STARmap - Mouse Brain 1 # dataset_url: "https://www.science.org/doi/10.1126/science.aat5691" @@ -299,7 +299,7 @@ # gene_filter_min_spots: 1 # remove_mitochondrial: true -# - id: zenodo_spatial/starmap/mouse_brain_2d_zstep15_0 +# - id: zenodo/starmap/mouse_brain_2d_zstep15_0 # input_data: "https://zenodo.org/records/12785822/files/STARmap_Wang2018three_data_2D_zstep15_0_data.h5ad?download=1" # dataset_name: STARmap - Mouse Brain 2 # dataset_url: "https://www.science.org/doi/10.1126/science.aat5691" @@ -317,12 +317,12 @@ # output_state: '$id/state.yaml' # output_raw: force_null # output_normalized: force_null -# publish_dir: s3://openproblems-data/resources/datasets +# publish_dir: s3://openproblems-data/resources/datasets/spatial # HERE cat > "/tmp/params.yaml" << 'HERE' param_list: - - id: zenodo_spatial/stereoseq/drosophila_embryo_e5_6 + - id: 
zenodo/stereoseq/drosophila_embryo_e5_6 input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_5.6.h5ad?download=1" dataset_name: Stereo-seq - Drosophila embryo E5_6 dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" @@ -334,7 +334,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial/stereoseq/drosophila_embryo_e6_3 + - id: zenodo/stereoseq/drosophila_embryo_e6_3 input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_6.3.h5ad?download=1" dataset_name: Stereo-seq - Drosophila embryo E6_3 dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" @@ -346,7 +346,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial/stereoseq/drosophila_embryo_e7 + - id: zenodo/stereoseq/drosophila_embryo_e7 input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_7.h5ad?download=1" dataset_name: Stereo-seq - Drosophila embryo E7 dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" @@ -358,7 +358,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial/stereoseq/drosophila_embryo_e9_1 + - id: zenodo/stereoseq/drosophila_embryo_e9_1 input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_9.1.h5ad?download=1" dataset_name: Stereo-seq - Drosophila embryo E9_1 dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" @@ -370,7 +370,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial/stereoseq/drosophila_embryo_e10 + - id: zenodo/stereoseq/drosophila_embryo_e10 input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_10.5.h5ad?download=1" dataset_name: Stereo-seq - Drosophila embryo E10 dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" @@ -388,7 +388,7 @@ output_meta: '$id/dataset_metadata.yaml' output_state: '$id/state.yaml' output_raw: force_null output_normalized: force_null -publish_dir: s3://openproblems-data/resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/spatial HERE cat > /tmp/nextflow.config << HERE @@ -407,7 +407,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_zenodo_spatial/main.nf \ + --main-script target/nextflow/datasets/workflows/spatial/process_zenodo/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "/tmp/params.yaml" \ diff --git a/src/datasets/resource_scripts/zenodo_spatial_slidetags.sh b/src/datasets/resource_scripts/zenodo_slidetags.sh similarity index 89% rename from src/datasets/resource_scripts/zenodo_spatial_slidetags.sh rename to src/datasets/resource_scripts/zenodo_slidetags.sh index aa4e7e094b..e35df8edc3 100755 --- a/src/datasets/resource_scripts/zenodo_spatial_slidetags.sh +++ b/src/datasets/resource_scripts/zenodo_slidetags.sh @@ -2,7 +2,7 @@ cat > "/tmp/params.yaml" << 'HERE' param_list: - - id: zenodo_spatial_slidetags/slidetags/human_cortex + - id: zenodo_slidetags/slidetags/human_cortex input_data: 
"https://zenodo.org/records/12785822/files/slidetag_human_cortex.tar.gz?download=1" dataset_name: Slide-tags - Human Cortex dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" @@ -14,7 +14,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial_slidetags/slidetags/human_skin_melanoma + - id: zenodo_slidetags/slidetags/human_skin_melanoma input_data: "https://zenodo.org/records/12785822/files/slidetag_human_skin_melanoma.tar.gz?download=1" dataset_name: Slide-tags - Human Skin Melanoma dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" @@ -26,7 +26,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial_slidetags/slidetags/human_tonsil + - id: zenodo_slidetags/slidetags/human_tonsil input_data: "https://zenodo.org/records/12785822/files/slidetag_human_tonsil.tar.gz?download=1" dataset_name: Slide-tags - Human Tonsil dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" @@ -38,7 +38,7 @@ param_list: gene_filter_min_spots: 50 remove_mitochondrial: true - - id: zenodo_spatial_slidetags/slidetags/mouse_embryo + - id: zenodo_slidetags/slidetags/mouse_embryo input_data: "https://zenodo.org/records/12785822/files/slidetag_mouse_embryo.tar.gz?download=1" dataset_name: Slide-tags - Mouse Embryo dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" @@ -56,7 +56,7 @@ output_meta: '$id/dataset_metadata.yaml' output_state: '$id/state.yaml' output_raw: force_null output_normalized: force_null -publish_dir: resources/datasets +publish_dir: s3://openproblems-data/resources/datasets/spatial HERE cat > /tmp/nextflow.config << HERE @@ -75,7 +75,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_zenodo_spatial_slidetags/main.nf \ + --main-script target/nextflow/datasets/workflows/spatial/process_zenodo_slidetags/main.nf \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "/tmp/params.yaml" \ diff --git a/src/datasets/resource_test_scripts/cxg_immune_cell_atlas.sh b/src/datasets/resource_test_scripts/cxg_immune_cell_atlas.sh index 285dc55ec4..8a9d7de486 100755 --- a/src/datasets/resource_test_scripts/cxg_immune_cell_atlas.sh +++ b/src/datasets/resource_test_scripts/cxg_immune_cell_atlas.sh @@ -1,7 +1,6 @@ #!/bin/bash -DATASET_DIR=resources_test/common - +DATASET_DIR=resources_test/common/scrnaseq mkdir -p $DATASET_DIR @@ -40,7 +39,7 @@ keep_features: '$KEEP_FEATURES' HERE nextflow run . \ - -main-script target/nextflow/datasets/workflows/process_cellxgene_census/main.nf \ + -main-script target/nextflow/datasets/workflows/scrnaseq/process_cellxgene_census/main.nf \ -c src/wf_utils/labels_ci.config \ -profile docker \ -params-file "/tmp/params.yaml" diff --git a/src/datasets/resource_test_scripts/cxg_mouse_pancreas_atlas.sh b/src/datasets/resource_test_scripts/cxg_mouse_pancreas_atlas.sh index 3b5d35ee5c..c90ae39ac7 100755 --- a/src/datasets/resource_test_scripts/cxg_mouse_pancreas_atlas.sh +++ b/src/datasets/resource_test_scripts/cxg_mouse_pancreas_atlas.sh @@ -1,6 +1,6 @@ #!/bin/bash -DATASET_DIR=resources_test/common +DATASET_DIR=resources_test/common/scrnaseq mkdir -p $DATASET_DIR @@ -40,7 +40,7 @@ keep_features: '$KEEP_FEATURES' HERE nextflow run . 
\ - -main-script target/nextflow/datasets/workflows/process_cellxgene_census/main.nf \ + -main-script target/nextflow/datasets/workflows/scrnaseq/process_cellxgene_census/main.nf \ -c src/wf_utils/labels_ci.config \ -profile docker \ -params-file "/tmp/params.yaml" diff --git a/src/datasets/resource_test_scripts/mouse_brain_coronal.sh b/src/datasets/resource_test_scripts/mouse_brain_coronal.sh index 962c4c067d..ed65408dc6 100755 --- a/src/datasets/resource_test_scripts/mouse_brain_coronal.sh +++ b/src/datasets/resource_test_scripts/mouse_brain_coronal.sh @@ -22,7 +22,7 @@ output_meta: '$id/dataset_metadata.yaml' output_state: '$id/state.yaml' output_raw: force_null output_normalized: force_null -publish_dir: resources_test/common +publish_dir: resources_test/common/spatial do_subsample: true spot_filter_min_genes: 200 gene_filter_min_spots: 50 @@ -30,7 +30,7 @@ remove_mitochondrial: true HERE nextflow run . \ - -main-script target/nextflow/datasets/workflows/process_tenx_visium/main.nf \ + -main-script target/nextflow/datasets/workflows/spatial/process_tenx_visium/main.nf \ -c src/wf_utils/labels_ci.config \ -profile docker \ -params-file "/tmp/params.yaml" diff --git a/src/datasets/resource_test_scripts/neurips2021_bmmc.sh b/src/datasets/resource_test_scripts/neurips2021_bmmc.sh index 98644d9dbf..ebd8faf892 100755 --- a/src/datasets/resource_test_scripts/neurips2021_bmmc.sh +++ b/src/datasets/resource_test_scripts/neurips2021_bmmc.sh @@ -38,7 +38,7 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -# publish_dir: s3://openproblems-data/resources_test/common +# publish_dir: s3://openproblems-data/resources_test/common/multimodal HERE # cat > /tmp/nextflow.config << HERE @@ -51,10 +51,10 @@ HERE # HERE nextflow run . \ - -main-script target/nextflow/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf \ + -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/main.nf \ -profile docker \ -resume \ - --publish_dir resources_test/common \ + --publish_dir resources_test/common/multimodal \ -params-file "$params_file" \ -c src/wf_utils/labels.config @@ -68,4 +68,4 @@ nextflow run . \ # --labels predict_modality # run task process dataset components -src/tasks/predict_modality/resources_test_scripts/neurips2021_bmmc.sh \ No newline at end of file +# src/tasks/predict_modality/resources_test_scripts/neurips2021_bmmc.sh \ No newline at end of file diff --git a/src/datasets/resource_test_scripts/neurips2022_pbmc.sh b/src/datasets/resource_test_scripts/neurips2022_pbmc.sh index b62e6f40e1..b3a11eb67e 100755 --- a/src/datasets/resource_test_scripts/neurips2022_pbmc.sh +++ b/src/datasets/resource_test_scripts/neurips2022_pbmc.sh @@ -38,11 +38,11 @@ output_mod2: '$id/dataset_mod2.h5ad' output_meta_mod1: '$id/dataset_metadata_mod1.yaml' output_meta_mod2: '$id/dataset_metadata_mod2.yaml' output_state: '$id/state.yaml' -publish_dir: s3://openproblems-data/resources_test/common +publish_dir: s3://openproblems-data/resources_test/common/multimodal HERE # nextflow run . 
\ -# -main-script target/nextflow/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf \ +# -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/main.nf \ # -profile docker \ # -resume \ # --publish_dir resources_test/common \ @@ -63,7 +63,7 @@ HERE tw launch https://github.com/openproblems-bio/openproblems.git \ --revision main_build \ --pull-latest \ - --main-script target/nextflow/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf \ + --main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/main.nf \ --workspace 53907369739130 \ --compute-env 1pK56PjjzeraOOC2LDZvN2 \ --params-file "$params_file" \ diff --git a/src/datasets/resource_test_scripts/pancreas.sh b/src/datasets/resource_test_scripts/pancreas.sh index fb26f7ef30..3857449636 100755 --- a/src/datasets/resource_test_scripts/pancreas.sh +++ b/src/datasets/resource_test_scripts/pancreas.sh @@ -6,7 +6,7 @@ REPO_ROOT=$(git rev-parse --show-toplevel) # ensure that the command below is run from the root of the repository cd "$REPO_ROOT" -DATASET_DIR=resources_test/common +DATASET_DIR=resources_test/common/scrnaseq set -e @@ -18,7 +18,7 @@ KEEP_FEATURES=`cat $DATASET_DIR/temp_g2m_genes_tirosh_hm.txt $DATASET_DIR/temp_s # download dataset nextflow run . \ - -main-script target/nextflow/datasets/workflows/process_openproblems_v1/main.nf \ + -main-script target/nextflow/datasets/workflows/scrnaseq/process_openproblems_v1/main.nf \ -profile docker \ -c src/wf_utils/labels_ci.config \ -resume \ @@ -55,7 +55,7 @@ nextflow run . \ rm -r $DATASET_DIR/temp_* # run task process dataset components -src/tasks/batch_integration/resources_test_scripts/process.sh -src/tasks/denoising/resources_test_scripts/pancreas.sh -src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh -src/tasks/label_projection/resources_test_scripts/pancreas.sh \ No newline at end of file +# src/tasks/batch_integration/resources_test_scripts/process.sh +# src/tasks/denoising/resources_test_scripts/pancreas.sh +# src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh +# src/tasks/label_projection/resources_test_scripts/pancreas.sh \ No newline at end of file diff --git a/src/datasets/resource_test_scripts/scicar_cell_lines.sh b/src/datasets/resource_test_scripts/scicar_cell_lines.sh index f765744136..f9c9a7b842 100755 --- a/src/datasets/resource_test_scripts/scicar_cell_lines.sh +++ b/src/datasets/resource_test_scripts/scicar_cell_lines.sh @@ -6,7 +6,7 @@ REPO_ROOT=$(git rev-parse --show-toplevel) # ensure that the command below is run from the root of the repository cd "$REPO_ROOT" -DATASET_DIR=resources_test/common +DATASET_DIR=resources_test/common/multimodal set -e @@ -14,7 +14,7 @@ mkdir -p $DATASET_DIR # download dataset nextflow run . 
\ - -main-script target/nextflow/datasets/workflows/process_openproblems_v1_multimodal/main.nf \ + -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_v1_multimodal/main.nf \ -profile docker \ -resume \ --id scicar_cell_lines \ diff --git a/src/datasets/resource_test_scripts/slideseq_test.sh b/src/datasets/resource_test_scripts/slideseq_test.sh deleted file mode 100755 index a9050be40a..0000000000 --- a/src/datasets/resource_test_scripts/slideseq_test.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -set -e - -cat > /tmp/params.yaml << 'HERE' -param_list: - - id: mouse_cerebellum - input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1" - dataset_name: Mouse cerebellum - dataset_url: "..." - dataset_summary: ... - dataset_description: "..." - dataset_reference: ref - dataset_organism: Mus musculus - -normalization_methods: [log_cp10k] -n_obs: 600 -n_vars: 500 -output_dataset: '$id/dataset.h5ad' -output_meta: '$id/dataset_metadata.yaml' -output_state: '$id/state.yaml' -output_raw: force_null -output_normalized: force_null -publish_dir: resources_test/common -do_subsample: true -spot_filter_min_genes: 200 -gene_filter_min_spots: 50 -remove_mitochondrial: true -HERE - -nextflow run . \ - -main-script target/nextflow/datasets/workflows/process_spatial_from_zenodo/main.nf \ - -c src/wf_utils/labels_ci.config \ - -profile docker \ - -params-file "/tmp/params.yaml" - diff --git a/src/datasets/workflows/extract_dataset_info/config.vsh.yaml b/src/datasets/workflows/extract_dataset_info/config.vsh.yaml index 58433db567..6ce06dabfd 100644 --- a/src/datasets/workflows/extract_dataset_info/config.vsh.yaml +++ b/src/datasets/workflows/extract_dataset_info/config.vsh.yaml @@ -1,34 +1,34 @@ -functionality: - name: "extract_dataset_info" - namespace: "datasets/workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input" - __merge__: /src/datasets/api/file_raw.yaml - required: true - direction: input - - name: Filter arguments - arguments: - - name: "--filter_normalization_id" - type: string - required: false - direction: input - description: If defined, only the normalization with this ID will be included in the output. - multiple: true - example: [ log_cp10k ] - - name: Outputs - arguments: - - name: "--output" - type: file - required: true - direction: output - example: dataset_uns.yaml - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - dependencies: - - name: common/extract_metadata -platforms: +name: extract_dataset_info +namespace: datasets/workflows +argument_groups: + - name: Inputs + arguments: + - name: --input + __merge__: /src/datasets/api/file_raw.yaml + required: true + direction: input + - name: Filter arguments + arguments: + - name: --filter_normalization_id + type: string + required: false + direction: input + description: If defined, only the normalization with this ID will be included + in the output. 
+ multiple: true + example: [log_cp10k] + - name: Outputs + arguments: + - name: --output + type: file + required: true + direction: output + example: dataset_uns.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf +dependencies: + - name: utils/extract_uns_metadata +runners: - type: nextflow diff --git a/src/datasets/workflows/extract_dataset_meta/config.vsh.yaml b/src/datasets/workflows/extract_dataset_meta/config.vsh.yaml index 26041b1039..6ed62ee13b 100644 --- a/src/datasets/workflows/extract_dataset_meta/config.vsh.yaml +++ b/src/datasets/workflows/extract_dataset_meta/config.vsh.yaml @@ -1,25 +1,24 @@ -functionality: - name: "extract_dataset_meta" - namespace: "datasets/workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input" - __merge__: /src/datasets/api/file_raw.yaml - required: true - direction: input - - name: Outputs - arguments: - - name: "--output" - type: file - required: true - direction: output - example: meta.yaml - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - dependencies: - - name: common/extract_metadata -platforms: +name: extract_dataset_meta +namespace: datasets/workflows +argument_groups: + - name: Inputs + arguments: + - name: --input + __merge__: /src/datasets/api/file_raw.yaml + required: true + direction: input + - name: Outputs + arguments: + - name: --output + type: file + required: true + direction: output + example: meta.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf +dependencies: + - name: utils/extract_uns_metadata +runners: - type: nextflow diff --git a/src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/config.vsh.yaml b/src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/config.vsh.yaml new file mode 100644 index 0000000000..0af8ba07e0 --- /dev/null +++ b/src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/config.vsh.yaml @@ -0,0 +1,138 @@ +name: process_openproblems_neurips2021_bmmc +namespace: datasets/workflows/multimodal +description: | + Fetch and process Neurips 2021 multimodal datasets +argument_groups: + - name: Inputs + arguments: + - name: --id + type: string + description: The ID of the dataset + required: true + - name: --input + type: file + description: Path to the input dataset + required: true + - name: --mod1 + type: string + description: Name of the first modality. + required: true + example: GEX + - name: --mod2 + type: string + description: Name of the second modality. + required: true + example: ADT + - name: Metadata + arguments: + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. 
It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. + - name: --keep_cell_type_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --keep_batch_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --even + type: boolean_true + description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run. + - name: Outputs + arguments: + - name: --output_mod1 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_mod2 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_meta_mod1 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod1.yaml + - name: --output_meta_mod2 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod2.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/multimodal/openproblems_neurips2021_bmmc + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/normalization/prot_clr + - name: datasets/normalization/atac_tfidf + - name: datasets/processors/subsample + - name: datasets/processors/svd + - name: datasets/processors/hvg + - name: utils/extract_uns_metadata + - name: utils/decompress_gzip + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf b/src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/main.nf similarity index 100% rename from src/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf rename to src/datasets/workflows/multimodal/process_openproblems_neurips2021_bmmc/main.nf diff --git a/src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/config.vsh.yaml b/src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/config.vsh.yaml new file mode 100644 index 0000000000..ec21d83b94 --- /dev/null +++ b/src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/config.vsh.yaml @@ -0,0 +1,144 @@ +name: process_openproblems_neurips2022_pbmc +namespace: datasets/workflows/multimodal +description: | + Fetch and process Neurips 2022 multimodal datasets +argument_groups: + - name: Inputs + arguments: + - name: --id + type: string + description: The ID of the dataset + required: true + - name: --input_mod1 + type: file + description: Processed RNA h5ad file + required: true + example: cite_rna_merged.h5ad + - name: --input_mod2 + type: file + 
description: Processed ADT or ATAC h5ad file + required: true + example: cite_prot_merged.h5ad + - name: --mod1 + type: string + description: Name of the first modality. + required: true + example: GEX + - name: --mod2 + type: string + description: Name of the second modality. + required: true + example: ADT + - name: Metadata + arguments: + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. + - name: --keep_cell_type_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --keep_batch_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --even + type: boolean_true + description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run. 
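As a sketch of how this relocated workflow can be exercised, in the style of the resource test scripts elsewhere in this diff: the `-main-script` path below follows the new `datasets/workflows/multimodal` layout recorded in the renames above, while the input h5ad paths and all dataset metadata are placeholders.

#!/bin/bash
set -e

# Placeholder inputs; substitute real merged CITE-seq h5ad files.
cat > /tmp/params.yaml << 'HERE'
id: openproblems_neurips2022_pbmc
input_mod1: resources_test/common/cite_rna_merged.h5ad
input_mod2: resources_test/common/cite_prot_merged.h5ad
mod1: GEX
mod2: ADT
dataset_name: NeurIPS 2022 PBMC
dataset_url: "..."
dataset_summary: ...
dataset_description: "..."
normalization_methods: [log_cp10k]
output_mod1: '$id/dataset_mod1.h5ad'
output_mod2: '$id/dataset_mod2.h5ad'
output_meta_mod1: '$id/dataset_metadata_mod1.yaml'
output_meta_mod2: '$id/dataset_metadata_mod2.yaml'
publish_dir: resources_test/common
HERE

nextflow run . \
  -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/main.nf \
  -profile docker \
  -params-file /tmp/params.yaml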
+ - name: Outputs + arguments: + - name: --output_mod1 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_mod2 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_meta_mod1 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod1.yaml + - name: --output_meta_mod2 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod2.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/multimodal/openproblems_neurips2022_pbmc + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/normalization/prot_clr + - name: datasets/normalization/atac_tfidf + - name: datasets/processors/subsample + - name: datasets/processors/svd + - name: datasets/processors/hvg + - name: utils/extract_uns_metadata + - name: utils/decompress_gzip + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf b/src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/main.nf similarity index 100% rename from src/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf rename to src/datasets/workflows/multimodal/process_openproblems_neurips2022_pbmc/main.nf diff --git a/src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/config.vsh.yaml b/src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/config.vsh.yaml new file mode 100644 index 0000000000..ccec1c920c --- /dev/null +++ b/src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/config.vsh.yaml @@ -0,0 +1,165 @@ +name: process_openproblems_v1_multimodal +namespace: datasets/workflows/multimodal +description: | + Fetch and process legacy OpenProblems v1 multimodal datasets +argument_groups: + - name: Inputs + arguments: + - name: --id + type: string + description: Unique identifier of the dataset. + required: true + - name: --input_id + type: string + description: The ID of the dataset in OpenProblems v1 + required: true + - name: --obs_cell_type + type: string + description: Location of where to find the observation cell types. + - name: --obs_batch + type: string + description: Location of where to find the observation batch IDs. + - name: --obs_tissue + type: string + description: Location of where to find the observation tissue information. + - name: --layer_counts + type: string + description: In which layer to find the counts matrix. Leave undefined to + use `.X`. + example: counts + - name: --sparse + type: boolean + default: true + description: Convert layers to a sparse CSR format. + - name: --var_feature_id + type: string + description: Location of where to find the feature IDs. Can be set to index + if the feature IDs are the index. + example: gene_ids + - name: --var_feature_name + type: string + description: Location of where to find the feature names. Can be set to index + if the feature names are the index. + default: index + - name: --mod1 + type: string + description: Name of the first modality. + required: true + example: GEX + - name: --mod2 + type: string + description: Name of the second modality. 
+ required: true + example: ADT + - name: Metadata + arguments: + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: false + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. + - name: --keep_cell_type_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --keep_batch_categories + type: string + multiple: true + description: Categories indexes to be selected + required: false + - name: --even + type: boolean_true + description: Subsample evenly from different batches + - name: --seed + type: integer + description: A seed for the subsampling. + example: 123 + - name: Normalization + arguments: + - name: --normalization_methods + type: string + multiple: true + choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling] + default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt] + description: Which normalization methods to run. 
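The change to the multimodal resource test script at the top of this diff already invokes this workflow for the `scicar_cell_lines` dataset. Spelled out with a params file, that invocation looks roughly as follows; only the dataset ID and the `-main-script` path are taken from the script itself, the modality names and metadata values are placeholders.

#!/bin/bash
set -e

cat > /tmp/params.yaml << 'HERE'
id: scicar_cell_lines
input_id: scicar_cell_lines  # placeholder; the OpenProblems v1 dataset ID
mod1: GEX
mod2: ATAC  # placeholder; use the modality names of the actual dataset
dataset_name: sci-CAR cell lines
dataset_url: "..."
dataset_summary: ...
dataset_description: "..."
normalization_methods: [log_cp10k]
output_mod1: '$id/dataset_mod1.h5ad'
output_mod2: '$id/dataset_mod2.h5ad'
output_meta_mod1: '$id/dataset_metadata_mod1.yaml'
output_meta_mod2: '$id/dataset_metadata_mod2.yaml'
publish_dir: resources_test/common
HERE

nextflow run . \
  -main-script target/nextflow/datasets/workflows/multimodal/process_openproblems_v1_multimodal/main.nf \
  -profile docker \
  -resume \
  -params-file /tmp/params.yaml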
+ - name: Outputs + arguments: + - name: --output_mod1 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_mod2 + direction: output + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: --output_meta_mod1 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod1.yaml + - name: --output_meta_mod2 + direction: output + type: file + description: Dataset metadata + example: dataset_metadata_mod2.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/multimodal/openproblems_v1_multimodal + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/normalization/prot_clr + - name: datasets/normalization/atac_tfidf + - name: datasets/processors/subsample + - name: datasets/processors/svd + - name: datasets/processors/hvg + - name: utils/extract_uns_metadata + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_v1_multimodal/main.nf b/src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/main.nf similarity index 100% rename from src/datasets/workflows/process_openproblems_v1_multimodal/main.nf rename to src/datasets/workflows/multimodal/process_openproblems_v1_multimodal/main.nf diff --git a/src/datasets/workflows/process_cellxgene_census/config.vsh.yaml b/src/datasets/workflows/process_cellxgene_census/config.vsh.yaml deleted file mode 100644 index 3e1fd5263b..0000000000 --- a/src/datasets/workflows/process_cellxgene_census/config.vsh.yaml +++ /dev/null @@ -1,201 +0,0 @@ -functionality: - name: process_cellxgene_census - namespace: datasets/workflows - description: | - Fetch and process datasets originating from the CELLxGENE census. - argument_groups: - - name: Input database - description: "Open CellxGene Census by version or URI." - arguments: - - name: "--input_uri" - type: string - description: "If specified, a URI containing the Census SOMA objects. If specified, will take precedence over the `--census_version` argument." - required: false - example: "s3://bucket/path" - - name: "--census_version" - description: "Which release of CellxGene census to use. Possible values are \"latest\", \"stable\", or the date of one of the releases (e.g. \"2023-07-25\"). For more information, check the documentation on [Census data releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html)." - type: string - example: "stable" - required: false - - name: Cell query - description: Arguments related to the query. - arguments: - - name: "--species" - type: string - description: The organism to query, usually one of `Homo sapiens` or `Mus musculus`. - required: false - default: "homo_sapiens" - multiple: false - - name: "--obs_value_filter" - type: string - description: "Filter for selecting the `obs` metadata (i.e. cells). Value is a filter query written in the SOMA `value_filter` syntax." 
- required: false - example: "is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell'" - - name: Cell filter - description: Filter the cells based on a minimum cell count per specified group - arguments: - - name: "--cell_filter_grouping" - type: string - description: | - A subset of 'obs' columns by which to group the cells for filtering. - Only groups surpassing or equal to the `--cell_filter_minimum_count` - threshold will be retained. Take care not to introduce a selection - bias against cells with more fine-grained ontology annotations. - required: false - example: ["dataset_id", "tissue", "assay", "disease", "cell_type"] - multiple: true - - name: "--cell_filter_minimum_count" - type: double - description: | - A minimum number of cells per group to retain. If `--cell_filter_grouping` - is defined, this parameter should also be provided and vice versa. - required: false - example: 100 - - name: Cell metadata - description: Cell metadata arguments - arguments: - - name: "--obs_batch" - type: string - description: | - Location of where to find the observation batch IDs. - - * If not specified, the `.obs["batch"]` field will not be included. - * If one or more values are specified, the `.obs["batch"]` field will be - set to the concatenated values of the specified fields, separated by - the `obs_batch_separator`. - required: false - multiple: true - multiple_sep: "," - example: ["batch"] - - name: "--obs_batch_separator" - type: string - description: Separator to use when concatenating the values of the `--obs_batch` fields. - required: false - default: "+" - - name: Dataset metadata - description: Information about the dataset that will be stored in the `.uns` slot. - arguments: - - name: "--id" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: true - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. 
- - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." - - name: Outputs - arguments: - - name: "--output_dataset" - __merge__: /src/datasets/api/file_common_dataset.yaml - direction: "output" - required: true - - name: "--output_meta" - direction: "output" - type: file - description: "Dataset metadata" - default: "dataset_metadata.yaml" - - name: "--output_raw" - __merge__: /src/datasets/api/file_raw.yaml - direction: "output" - required: false - - name: "--output_normalized" - __merge__: /src/datasets/api/file_normalized.yaml - direction: "output" - required: false - - name: "--output_pca" - __merge__: /src/datasets/api/file_pca.yaml - direction: "output" - required: false - - name: "--output_hvg" - __merge__: /src/datasets/api/file_hvg.yaml - direction: "output" - required: false - - name: "--output_knn" - __merge__: /src/datasets/api/file_knn.yaml - direction: "output" - required: false - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/cellxgene_census - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/processors/subsample - - name: datasets/processors/pca - - name: datasets/processors/hvg - - name: datasets/processors/knn - - name: common/extract_metadata - # test_resources: - # - type: nextflow_script - # path: main.nf - # entrypoint: test_wf -platforms: - - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_neurips2021_bmmc/config.vsh.yaml b/src/datasets/workflows/process_openproblems_neurips2021_bmmc/config.vsh.yaml deleted file mode 100644 index 8d3ca51d0b..0000000000 --- a/src/datasets/workflows/process_openproblems_neurips2021_bmmc/config.vsh.yaml +++ /dev/null @@ -1,137 +0,0 @@ -functionality: - name: process_openproblems_neurips2021_bmmc - namespace: datasets/workflows - description: | - Fetch and process Neurips 2021 multimodal datasets - argument_groups: - - name: Inputs - arguments: - - name: "--id" - type: "string" - description: "The ID of the dataset" - required: true - - name: "--input" - type: "file" - description: "Path to the input dataset" - required: true - - name: "--mod1" - type: string - description: Name of the first modality. - required: true - example: GEX - - name: "--mod2" - type: string - description: Name of the second modality. - required: true - example: ADT - - name: Metadata - arguments: - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. 
- required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. - - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." 
- - name: Outputs - arguments: - - name: "--output_mod1" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_mod2" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_meta_mod1" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod1.yaml" - - name: "--output_meta_mod2" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod2.yaml" - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/openproblems_neurips2021_bmmc - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/normalization/prot_clr - - name: datasets/normalization/atac_tfidf - - name: datasets/processors/subsample - - name: datasets/processors/svd - - name: datasets/processors/hvg - - name: common/extract_metadata - - name: common/decompress_gzip - # test_resources: - # - type: nextflow_script - # path: main.nf - # entrypoint: test_wf -platforms: - - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_neurips2022_pbmc/config.vsh.yaml b/src/datasets/workflows/process_openproblems_neurips2022_pbmc/config.vsh.yaml deleted file mode 100644 index 96bcc3ee2c..0000000000 --- a/src/datasets/workflows/process_openproblems_neurips2022_pbmc/config.vsh.yaml +++ /dev/null @@ -1,143 +0,0 @@ -functionality: - name: process_openproblems_neurips2022_pbmc - namespace: datasets/workflows - description: | - Fetch and process Neurips 2022 multimodal datasets - argument_groups: - - name: Inputs - arguments: - - name: "--id" - type: "string" - description: "The ID of the dataset" - required: true - - name: "--input_mod1" - type: file - description: "Processed RNA h5ad file" - required: true - example: cite_rna_merged.h5ad - - name: "--input_mod2" - type: file - description: "Processed ADT or ATAC h5ad file" - required: true - example: cite_prot_merged.h5ad - - name: "--mod1" - type: string - description: Name of the first modality. - required: true - example: GEX - - name: "--mod2" - type: string - description: Name of the second modality. - required: true - example: ADT - - name: Metadata - arguments: - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. 
It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. - - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." - - name: Outputs - arguments: - - name: "--output_mod1" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_mod2" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_meta_mod1" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod1.yaml" - - name: "--output_meta_mod2" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod2.yaml" - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/openproblems_neurips2022_pbmc - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/normalization/prot_clr - - name: datasets/normalization/atac_tfidf - - name: datasets/processors/subsample - - name: datasets/processors/svd - - name: datasets/processors/hvg - - name: common/extract_metadata - - name: common/decompress_gzip - # test_resources: - # - type: nextflow_script - # path: main.nf - # entrypoint: test_wf -platforms: - - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_v1/config.vsh.yaml b/src/datasets/workflows/process_openproblems_v1/config.vsh.yaml deleted file mode 100644 index fb0cd73a65..0000000000 --- a/src/datasets/workflows/process_openproblems_v1/config.vsh.yaml +++ /dev/null @@ -1,163 +0,0 @@ -functionality: - name: process_openproblems_v1 - namespace: datasets/workflows - description: | - Fetch and process legacy OpenProblems v1 datasets - argument_groups: - - name: Inputs - arguments: - - name: "--id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--input_id" - type: "string" - description: "The ID of the dataset in OpenProblems v1" - required: true - - name: "--obs_cell_type" - type: "string" - description: "Location of where to find the observation cell types." - - name: "--obs_batch" - type: "string" - description: "Location of where to find the observation batch IDs." - - name: "--obs_tissue" - type: "string" - description: "Location of where to find the observation tissue information." - - name: "--layer_counts" - type: "string" - description: "In which layer to find the counts matrix. Leave undefined to use `.X`." 
- example: counts - - name: "--sparse" - type: boolean - default: true - description: Convert layers to a sparse CSR format. - - name: "--var_feature_id" - type: "string" - description: "Location of where to find the feature IDs. Can be set to index if the feature IDs are the index." - example: gene_ids - - name: "--var_feature_name" - type: "string" - description: "Location of where to find the feature names. Can be set to index if the feature names are the index." - default: index - - name: Metadata - arguments: - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. - - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." 
- - name: Outputs - arguments: - - name: "--output_dataset" - __merge__: /src/datasets/api/file_common_dataset.yaml - direction: "output" - required: true - - name: "--output_meta" - direction: "output" - type: file - description: "Dataset metadata" - default: "dataset_metadata.yaml" - - name: "--output_raw" - __merge__: /src/datasets/api/file_raw.yaml - direction: "output" - required: false - - name: "--output_normalized" - __merge__: /src/datasets/api/file_normalized.yaml - direction: "output" - required: false - - name: "--output_pca" - __merge__: /src/datasets/api/file_pca.yaml - direction: "output" - required: false - - name: "--output_hvg" - __merge__: /src/datasets/api/file_hvg.yaml - direction: "output" - required: false - - name: "--output_knn" - __merge__: /src/datasets/api/file_knn.yaml - direction: "output" - required: false - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/openproblems_v1 - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/processors/subsample - - name: datasets/processors/pca - - name: datasets/processors/hvg - - name: datasets/processors/knn - - name: common/extract_metadata - # test_resources: - # - type: nextflow_script - # path: main.nf - # entrypoint: test_wf -platforms: - - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_v1_multimodal/config.vsh.yaml b/src/datasets/workflows/process_openproblems_v1_multimodal/config.vsh.yaml deleted file mode 100644 index 58b045cc3b..0000000000 --- a/src/datasets/workflows/process_openproblems_v1_multimodal/config.vsh.yaml +++ /dev/null @@ -1,161 +0,0 @@ -functionality: - name: process_openproblems_v1_multimodal - namespace: datasets/workflows - description: | - Fetch and process legacy OpenProblems v1 multimodal datasets - argument_groups: - - name: Inputs - arguments: - - name: "--id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--input_id" - type: "string" - description: "The ID of the dataset in OpenProblems v1" - required: true - - name: "--obs_cell_type" - type: "string" - description: "Location of where to find the observation cell types." - - name: "--obs_batch" - type: "string" - description: "Location of where to find the observation batch IDs." - - name: "--obs_tissue" - type: "string" - description: "Location of where to find the observation tissue information." - - name: "--layer_counts" - type: "string" - description: "In which layer to find the counts matrix. Leave undefined to use `.X`." - example: counts - - name: "--sparse" - type: boolean - default: true - description: Convert layers to a sparse CSR format. - - name: "--var_feature_id" - type: "string" - description: "Location of where to find the feature IDs. Can be set to index if the feature IDs are the index." - example: gene_ids - - name: "--var_feature_name" - type: "string" - description: "Location of where to find the feature names. Can be set to index if the feature names are the index." - default: index - - name: "--mod1" - type: string - description: Name of the first modality. - required: true - example: GEX - - name: "--mod2" - type: string - description: Name of the second modality. - required: true - example: ADT - - name: Metadata - arguments: - - name: "--dataset_name" - type: string - description: Nicely formatted name. 
- required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--keep_features" - type: string - multiple: true - description: A list of genes to keep. - - name: "--keep_cell_type_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--keep_batch_categories" - type: "string" - multiple: true - description: "Categories indexes to be selected" - required: false - - name: "--even" - type: "boolean_true" - description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." 
- - name: Outputs - arguments: - - name: "--output_mod1" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_mod2" - direction: "output" - __merge__: /src/datasets/api/file_multimodal_dataset.yaml - - name: "--output_meta_mod1" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod1.yaml" - - name: "--output_meta_mod2" - direction: "output" - type: file - description: "Dataset metadata" - example: "dataset_metadata_mod2.yaml" - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/openproblems_v1_multimodal - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/normalization/prot_clr - - name: datasets/normalization/atac_tfidf - - name: datasets/processors/subsample - - name: datasets/processors/svd - - name: datasets/processors/hvg - - name: common/extract_metadata - # test_resources: - # - type: nextflow_script - # path: main.nf - # entrypoint: test_wf -platforms: - - type: nextflow diff --git a/src/datasets/workflows/process_tenx_visium/config.vsh.yaml b/src/datasets/workflows/process_tenx_visium/config.vsh.yaml deleted file mode 100644 index 91a2867820..0000000000 --- a/src/datasets/workflows/process_tenx_visium/config.vsh.yaml +++ /dev/null @@ -1,142 +0,0 @@ -functionality: - name: process_tenx_visium - namespace: datasets/workflows - description: | - Download and process datasets originating from 10x Genomics. - argument_groups: - - name: Input - arguments: - - name: "--input_expression" - type: string - description: URL to the feature / barcode matrix HDF5. - required: true - - name: "--input_spatial" - type: string - description: URL to the Spatial imaging data. - required: true - - name: Outputs - arguments: - - name: "--output_dataset" - type: file - direction: output - description: Output h5ad file - required: true - __merge__: /src/datasets/api/file_raw.yaml - - name: "--output_meta" - direction: "output" - type: file - description: "Dataset metadata" - default: "dataset_metadata.yaml" - - name: Metadata - arguments: - - name: "--id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. 
- required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitovhondrial genes? - required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - # - name: "--keep_features" - # type: string - # multiple: true - # description: A list of genes to keep. - # - name: "--keep_cell_type_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--keep_batch_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--even" - # type: "boolean_true" - # description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/tenx_visium - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/processors/subsample - - name: common/extract_metadata -platforms: - - type: nextflow \ No newline at end of file diff --git a/src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml b/src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml deleted file mode 100644 index 45b938b716..0000000000 --- a/src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml +++ /dev/null @@ -1,138 +0,0 @@ -functionality: - name: process_zenodo_spatial - namespace: datasets/workflows - description: | - Download and process DBiT seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo. - argument_groups: - - name: Input - arguments: - - name: "--input_data" - type: string - description: URL to the Anndata file. - required: true - - name: Outputs - arguments: - - name: "--output_dataset" - type: file - direction: output - description: Output h5ad file - required: true - __merge__: /src/datasets/api/file_raw.yaml - - name: "--output_meta" - direction: "output" - type: file - description: "Dataset metadata" - default: "dataset_metadata.yaml" - - name: Metadata - arguments: - - name: "--id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. 
- required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitovhondrial genes? - required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 600 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. - default: 500 - # - name: "--keep_features" - # type: string - # multiple: true - # description: A list of genes to keep. - # - name: "--keep_cell_type_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--keep_batch_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--even" - # type: "boolean_true" - # description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." 
- resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/zenodo_spatial - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/processors/subsample - - name: common/extract_metadata -platforms: - - type: nextflow \ No newline at end of file diff --git a/src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml b/src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml deleted file mode 100644 index 23934fe161..0000000000 --- a/src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml +++ /dev/null @@ -1,138 +0,0 @@ -functionality: - name: process_zenodo_spatial_slidetags - namespace: datasets/workflows - description: | - Download and process slide tags datasets originating from Zenodo. - argument_groups: - - name: Input - arguments: - - name: "--input_data" - type: string - description: URL to the Anndata file. - required: true - - name: Outputs - arguments: - - name: "--output_dataset" - type: file - direction: output - description: Output h5ad file - required: true - __merge__: /src/datasets/api/file_raw.yaml - - name: "--output_meta" - direction: "output" - type: file - description: "Dataset metadata" - default: "dataset_metadata.yaml" - - name: Metadata - arguments: - - name: "--id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: false - - name: Gene or spot filtering - description: Arguments related to filtering cells and genes by counts. - arguments: - - name: "--spot_filter_min_genes" - type: integer - description: Remove spots with less than this number of genes. - required: false - example: 200 - - name: "--spot_filter_min_counts" - type: integer - description: Remove spots with less than this number of counts. - required: false - - name: "--gene_filter_min_spots" - type: integer - description: Remove genes expressed in less than this number of cells. - required: false - example: 50 - - name: "--gene_filter_min_counts" - type: integer - description: Remove genes with less than this number of counts. - required: false - - name: "--remove_mitochondrial" - type: boolean - description: Remove mitovhondrial genes? - required: false - - name: Sampling options - arguments: - - name: "--do_subsample" - type: boolean - default: false - description: "Whether or not to subsample the dataset" - - name: "--n_obs" - type: integer - description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. - default: 600 - - name: "--n_vars" - type: integer - description: Maximum number of variables to be kept. 
It might end up being less because empty cells / genes are removed. - default: 500 - # - name: "--keep_features" - # type: string - # multiple: true - # description: A list of genes to keep. - # - name: "--keep_cell_type_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--keep_batch_categories" - # type: "string" - # multiple: true - # description: "Categories indexes to be selected" - # required: false - # - name: "--even" - # type: "boolean_true" - # description: Subsample evenly from different batches - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - - name: Normalization - arguments: - - name: "--normalization_methods" - type: string - multiple: true - choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] - default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] - description: "Which normalization methods to run." - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: datasets/loaders/zenodo_spatial_slidetags - - name: datasets/normalization/log_cp - - name: datasets/normalization/log_scran_pooling - - name: datasets/normalization/sqrt_cp - - name: datasets/normalization/l1_sqrt - - name: datasets/processors/subsample - - name: common/extract_metadata -platforms: - - type: nextflow \ No newline at end of file diff --git a/src/datasets/workflows/scrnaseq/process_cellxgene_census/config.vsh.yaml b/src/datasets/workflows/scrnaseq/process_cellxgene_census/config.vsh.yaml new file mode 100644 index 0000000000..63ae6e1fa6 --- /dev/null +++ b/src/datasets/workflows/scrnaseq/process_cellxgene_census/config.vsh.yaml @@ -0,0 +1,209 @@ +name: process_cellxgene_census +namespace: datasets/workflows/scrnaseq +description: | + Fetch and process datasets originating from the CELLxGENE census. +argument_groups: + - name: Input database + description: Open CellxGene Census by version or URI. + arguments: + - name: --input_uri + type: string + description: If specified, a URI containing the Census SOMA objects. If specified, + will take precedence over the `--census_version` argument. + required: false + example: s3://bucket/path + - name: --census_version + description: Which release of CellxGene census to use. Possible values are + "latest", "stable", or the date of one of the releases (e.g. "2023-07-25"). + For more information, check the documentation on [Census data + releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html). + type: string + example: stable + required: false + - name: Cell query + description: Arguments related to the query. + arguments: + - name: --species + type: string + description: The organism to query, usually one of `Homo sapiens` or `Mus + musculus`. + required: false + default: homo_sapiens + multiple: false + - name: --obs_value_filter + type: string + description: Filter for selecting the `obs` metadata (i.e. cells). Value is + a filter query written in the SOMA `value_filter` syntax. 
+ required: false + example: is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell' + - name: Cell filter + description: Filter the cells based on a minimum cell count per specified group + arguments: + - name: --cell_filter_grouping + type: string + description: | + A subset of 'obs' columns by which to group the cells for filtering. + Only groups surpassing or equal to the `--cell_filter_minimum_count` + threshold will be retained. Take care not to introduce a selection + bias against cells with more fine-grained ontology annotations. + required: false + example: [dataset_id, tissue, assay, disease, cell_type] + multiple: true + - name: --cell_filter_minimum_count + type: double + description: | + A minimum number of cells per group to retain. If `--cell_filter_grouping` + is defined, this parameter should also be provided and vice versa. + required: false + example: 100 + - name: Cell metadata + description: Cell metadata arguments + arguments: + - name: --obs_batch + type: string + description: | + Location of where to find the observation batch IDs. + + * If not specified, the `.obs["batch"]` field will not be included. + * If one or more values are specified, the `.obs["batch"]` field will be + set to the concatenated values of the specified fields, separated by + the `obs_batch_separator`. + required: false + multiple: true + multiple_sep: ',' + example: [batch] + - name: --obs_batch_separator + type: string + description: Separator to use when concatenating the values of the `--obs_batch` + fields. + required: false + default: + + - name: Dataset metadata + description: Information about the dataset that will be stored in the `.uns` slot. + arguments: + - name: --id + type: string + description: Nicely formatted name. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. + required: true + - name: --dataset_organism + type: string + description: The organism of the dataset. + required: true + - name: Sampling options + arguments: + - name: --do_subsample + type: boolean + default: false + description: Whether or not to subsample the dataset + - name: --n_obs + type: integer + description: Maximum number of observations to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --n_vars + type: integer + description: Maximum number of variables to be kept. It might end up being + less because empty cells / genes are removed. + default: 500 + - name: --keep_features + type: string + multiple: true + description: A list of genes to keep. 
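The SOMA `value_filter` syntax used by `--obs_value_filter` is easiest to read in a full invocation. A minimal sketch, reusing the example filter from this config; the dataset ID, metadata, and output locations are placeholders, and the target path assumes this component builds under the new `scrnaseq` namespace.

#!/bin/bash
set -e

cat > /tmp/params.yaml << 'HERE'
id: cellxgene_census_example
census_version: stable
species: homo_sapiens
obs_value_filter: "is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell'"
obs_batch: [dataset_id]
dataset_name: CELLxGENE Census example
dataset_url: "..."
dataset_summary: ...
dataset_description: "..."
dataset_organism: Homo sapiens
normalization_methods: [log_cp10k]
output_dataset: '$id/dataset.h5ad'
output_meta: '$id/dataset_metadata.yaml'
publish_dir: resources_test/common
HERE

nextflow run . \
  -main-script target/nextflow/datasets/workflows/scrnaseq/process_cellxgene_census/main.nf \
  -profile docker \
  -params-file /tmp/params.yaml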
+      - name: --keep_cell_type_categories
+        type: string
+        multiple: true
+        description: Indexes of the cell type categories to be selected.
+        required: false
+      - name: --keep_batch_categories
+        type: string
+        multiple: true
+        description: Indexes of the batch categories to be selected.
+        required: false
+      - name: --even
+        type: boolean_true
+        description: Subsample evenly from different batches
+      - name: --seed
+        type: integer
+        description: A seed for the subsampling.
+        example: 123
+  - name: Normalization
+    arguments:
+      - name: --normalization_methods
+        type: string
+        multiple: true
+        choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling]
+        default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt]
+        description: Which normalization methods to run.
+  - name: Outputs
+    arguments:
+      - name: --output_dataset
+        __merge__: /src/datasets/api/file_common_dataset.yaml
+        direction: output
+        required: true
+      - name: --output_meta
+        direction: output
+        type: file
+        description: Dataset metadata
+        default: dataset_metadata.yaml
+      - name: --output_raw
+        __merge__: /src/datasets/api/file_raw.yaml
+        direction: output
+        required: false
+      - name: --output_normalized
+        __merge__: /src/datasets/api/file_normalized.yaml
+        direction: output
+        required: false
+      - name: --output_pca
+        __merge__: /src/datasets/api/file_pca.yaml
+        direction: output
+        required: false
+      - name: --output_hvg
+        __merge__: /src/datasets/api/file_hvg.yaml
+        direction: output
+        required: false
+      - name: --output_knn
+        __merge__: /src/datasets/api/file_knn.yaml
+        direction: output
+        required: false
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+  - path: /common/nextflow_helpers/helper.nf
+dependencies:
+  - name: datasets/loaders/scrnaseq/cellxgene_census
+  - name: datasets/normalization/log_cp
+  - name: datasets/normalization/log_scran_pooling
+  - name: datasets/normalization/sqrt_cp
+  - name: datasets/normalization/l1_sqrt
+  - name: datasets/processors/subsample
+  - name: datasets/processors/pca
+  - name: datasets/processors/hvg
+  - name: datasets/processors/knn
+  - name: utils/extract_uns_metadata
+# test_resources:
+#   - type: nextflow_script
+#     path: main.nf
+#     entrypoint: test_wf
+runners:
+  - type: nextflow
diff --git a/src/datasets/workflows/process_cellxgene_census/main.nf b/src/datasets/workflows/scrnaseq/process_cellxgene_census/main.nf
similarity index 100%
rename from src/datasets/workflows/process_cellxgene_census/main.nf
rename to src/datasets/workflows/scrnaseq/process_cellxgene_census/main.nf
diff --git a/src/datasets/workflows/scrnaseq/process_openproblems_v1/config.vsh.yaml b/src/datasets/workflows/scrnaseq/process_openproblems_v1/config.vsh.yaml
new file mode 100644
index 0000000000..4d8a85cf1b
--- /dev/null
+++ b/src/datasets/workflows/scrnaseq/process_openproblems_v1/config.vsh.yaml
@@ -0,0 +1,167 @@
+name: process_openproblems_v1
+namespace: datasets/workflows/scrnaseq
+description: |
+  Fetch and process legacy OpenProblems v1 datasets.
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --id
+        type: string
+        description: Unique identifier of the dataset.
+        required: true
+      - name: --input_id
+        type: string
+        description: The ID of the dataset in OpenProblems v1.
+        required: true
+      - name: --obs_cell_type
+        type: string
+        description: Location of where to find the observation cell types.
+      - name: --obs_batch
+        type: string
+        description: Location of where to find the observation batch IDs.
+      - name: --obs_tissue
+        type: string
+        description: Location of where to find the observation tissue information.
+      - name: --layer_counts
+        type: string
+        description: In which layer to find the counts matrix. Leave undefined to
+          use `.X`.
+        example: counts
+      - name: --sparse
+        type: boolean
+        default: true
+        description: Convert layers to a sparse CSR format.
+      - name: --var_feature_id
+        type: string
+        description: Location of where to find the feature IDs. Can be set to `index`
+          if the feature IDs are the index.
+        example: gene_ids
+      - name: --var_feature_name
+        type: string
+        description: Location of where to find the feature names. Can be set to `index`
+          if the feature names are the index.
+        default: index
+  - name: Metadata
+    arguments:
+      - name: --dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: true
+      - name: --dataset_url
+        type: string
+        description: Link to the original source of the dataset.
+        required: false
+      - name: --dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+      - name: --dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: --dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: --dataset_organism
+        type: string
+        description: The organism of the dataset.
+        required: false
+  - name: Sampling options
+    arguments:
+      - name: --do_subsample
+        type: boolean
+        default: false
+        description: Whether or not to subsample the dataset
+      - name: --n_obs
+        type: integer
+        description: Maximum number of observations to be kept. It might end up being
+          less because empty cells / genes are removed.
+        default: 500
+      - name: --n_vars
+        type: integer
+        description: Maximum number of variables to be kept. It might end up being
+          less because empty cells / genes are removed.
+        default: 500
+      - name: --keep_features
+        type: string
+        multiple: true
+        description: A list of genes to keep.
+      - name: --keep_cell_type_categories
+        type: string
+        multiple: true
+        description: Indexes of the cell type categories to be selected.
+        required: false
+      - name: --keep_batch_categories
+        type: string
+        multiple: true
+        description: Indexes of the batch categories to be selected.
+        required: false
+      - name: --even
+        type: boolean_true
+        description: Subsample evenly from different batches
+      - name: --seed
+        type: integer
+        description: A seed for the subsampling.
+        example: 123
+  - name: Normalization
+    arguments:
+      - name: --normalization_methods
+        type: string
+        multiple: true
+        choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling]
+        default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt]
+        description: Which normalization methods to run.
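+  # NOTE (editor): for orientation, an illustrative scanpy-style definition of
+  # the default choice `log_cp10k`; the actual implementations live in the
+  # datasets/normalization/* components listed under `dependencies`:
+  #
+  #   import scanpy as sc
+  #   sc.pp.normalize_total(adata, target_sum=1e4)  # counts per 10k
+  #   sc.pp.log1p(adata)                            # natural log(1 + x)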
+ - name: Outputs + arguments: + - name: --output_dataset + __merge__: /src/datasets/api/file_common_dataset.yaml + direction: output + required: true + - name: --output_meta + direction: output + type: file + description: Dataset metadata + default: dataset_metadata.yaml + - name: --output_raw + __merge__: /src/datasets/api/file_raw.yaml + direction: output + required: false + - name: --output_normalized + __merge__: /src/datasets/api/file_normalized.yaml + direction: output + required: false + - name: --output_pca + __merge__: /src/datasets/api/file_pca.yaml + direction: output + required: false + - name: --output_hvg + __merge__: /src/datasets/api/file_hvg.yaml + direction: output + required: false + - name: --output_knn + __merge__: /src/datasets/api/file_knn.yaml + direction: output + required: false +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: datasets/loaders/scrnaseq/openproblems_v1 + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: datasets/processors/pca + - name: datasets/processors/hvg + - name: datasets/processors/knn + - name: utils/extract_uns_metadata + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +runners: + - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_v1/main.nf b/src/datasets/workflows/scrnaseq/process_openproblems_v1/main.nf similarity index 100% rename from src/datasets/workflows/process_openproblems_v1/main.nf rename to src/datasets/workflows/scrnaseq/process_openproblems_v1/main.nf diff --git a/src/datasets/workflows/spatial/process_tenx_visium/config.vsh.yaml b/src/datasets/workflows/spatial/process_tenx_visium/config.vsh.yaml new file mode 100644 index 0000000000..4cb867151c --- /dev/null +++ b/src/datasets/workflows/spatial/process_tenx_visium/config.vsh.yaml @@ -0,0 +1,143 @@ +name: process_tenx_visium +namespace: datasets/workflows/spatial +description: | + Download and process datasets originating from 10x Genomics. +argument_groups: + - name: Input + arguments: + - name: --input_expression + type: string + description: URL to the feature / barcode matrix HDF5. + required: true + - name: --input_spatial + type: string + description: URL to the Spatial imaging data. + required: true + - name: Outputs + arguments: + - name: --output_dataset + type: file + direction: output + description: Output h5ad file + required: true + __merge__: /src/datasets/api/file_spatial_dataset.yaml + - name: --output_meta + direction: output + type: file + description: Dataset metadata + default: dataset_metadata.yaml + - name: Metadata + arguments: + - name: --id + type: string + description: Unique identifier of the dataset. + required: true + - name: --dataset_name + type: string + description: Nicely formatted name. + required: true + - name: --dataset_url + type: string + description: Link to the original source of the dataset. + required: false + - name: --dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: --dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: --dataset_description + type: string + description: Long description of the dataset. 
+        required: true
+      - name: --dataset_organism
+        type: string
+        description: The organism of the dataset.
+        required: false
+  - name: Gene or spot filtering
+    description: Arguments related to filtering cells and genes by counts.
+    arguments:
+      - name: --spot_filter_min_genes
+        type: integer
+        description: Remove spots with fewer than this number of genes.
+        required: false
+        example: 200
+      - name: --spot_filter_min_counts
+        type: integer
+        description: Remove spots with fewer than this number of counts.
+        required: false
+      - name: --gene_filter_min_spots
+        type: integer
+        description: Remove genes expressed in fewer than this number of spots.
+        required: false
+        example: 50
+      - name: --gene_filter_min_counts
+        type: integer
+        description: Remove genes with fewer than this number of counts.
+        required: false
+      - name: --remove_mitochondrial
+        type: boolean
+        description: Remove mitochondrial genes?
+        required: false
+  - name: Sampling options
+    arguments:
+      - name: --do_subsample
+        type: boolean
+        default: false
+        description: Whether or not to subsample the dataset
+      - name: --n_obs
+        type: integer
+        description: Maximum number of observations to be kept. It might end up being
+          less because empty cells / genes are removed.
+        default: 500
+      - name: --n_vars
+        type: integer
+        description: Maximum number of variables to be kept. It might end up being
+          less because empty cells / genes are removed.
+        default: 500
+      # - name: "--keep_features"
+      #   type: string
+      #   multiple: true
+      #   description: A list of genes to keep.
+      # - name: "--keep_cell_type_categories"
+      #   type: "string"
+      #   multiple: true
+      #   description: "Indexes of the cell type categories to be selected"
+      #   required: false
+      # - name: "--keep_batch_categories"
+      #   type: "string"
+      #   multiple: true
+      #   description: "Indexes of the batch categories to be selected"
+      #   required: false
+      # - name: "--even"
+      #   type: "boolean_true"
+      #   description: Subsample evenly from different batches
+      - name: --seed
+        type: integer
+        description: A seed for the subsampling.
+        example: 123
+  - name: Normalization
+    arguments:
+      - name: --normalization_methods
+        type: string
+        multiple: true
+        choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling]
+        default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt]
+        description: Which normalization methods to run.
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+  - path: /common/nextflow_helpers/helper.nf
+dependencies:
+  - name: datasets/loaders/spatial/tenx_visium
+  - name: datasets/normalization/log_cp
+  - name: datasets/normalization/log_scran_pooling
+  - name: datasets/normalization/sqrt_cp
+  - name: datasets/normalization/l1_sqrt
+  - name: datasets/processors/subsample
+  - name: utils/extract_uns_metadata
+runners:
+  - type: nextflow
diff --git a/src/datasets/workflows/process_tenx_visium/main.nf b/src/datasets/workflows/spatial/process_tenx_visium/main.nf
similarity index 100%
rename from src/datasets/workflows/process_tenx_visium/main.nf
rename to src/datasets/workflows/spatial/process_tenx_visium/main.nf
diff --git a/src/datasets/workflows/spatial/process_zenodo/config.vsh.yaml b/src/datasets/workflows/spatial/process_zenodo/config.vsh.yaml
new file mode 100644
index 0000000000..746f83ed08
--- /dev/null
+++ b/src/datasets/workflows/spatial/process_zenodo/config.vsh.yaml
@@ -0,0 +1,139 @@
+name: process_zenodo
+namespace: datasets/workflows/spatial
+description: |
+  Download and process DBiT-seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo.
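+# NOTE (editor): an illustrative invocation via the Nextflow runner; the URL,
+# IDs, and publish directory are placeholders, not values from this config:
+#
+#   nextflow run target/nextflow/datasets/workflows/spatial/process_zenodo/main.nf \
+#     --input_data "https://zenodo.org/record/<id>/files/dataset.h5ad" \
+#     --id "zenodo/<dataset>" \
+#     --output_dataset dataset.h5ad \
+#     --publish_dir output/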
+argument_groups:
+  - name: Input
+    arguments:
+      - name: --input_data
+        type: string
+        description: URL to the AnnData file.
+        required: true
+  - name: Outputs
+    arguments:
+      - name: --output_dataset
+        type: file
+        direction: output
+        description: Output h5ad file
+        required: true
+        __merge__: /src/datasets/api/file_spatial_dataset.yaml
+      - name: --output_meta
+        direction: output
+        type: file
+        description: Dataset metadata
+        default: dataset_metadata.yaml
+  - name: Metadata
+    arguments:
+      - name: --id
+        type: string
+        description: Unique identifier of the dataset.
+        required: true
+      - name: --dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: true
+      - name: --dataset_url
+        type: string
+        description: Link to the original source of the dataset.
+        required: false
+      - name: --dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+      - name: --dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: --dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: --dataset_organism
+        type: string
+        description: The organism of the dataset.
+        required: false
+  - name: Gene or spot filtering
+    description: Arguments related to filtering cells and genes by counts.
+    arguments:
+      - name: --spot_filter_min_genes
+        type: integer
+        description: Remove spots with fewer than this number of genes.
+        required: false
+        example: 200
+      - name: --spot_filter_min_counts
+        type: integer
+        description: Remove spots with fewer than this number of counts.
+        required: false
+      - name: --gene_filter_min_spots
+        type: integer
+        description: Remove genes expressed in fewer than this number of spots.
+        required: false
+        example: 50
+      - name: --gene_filter_min_counts
+        type: integer
+        description: Remove genes with fewer than this number of counts.
+        required: false
+      - name: --remove_mitochondrial
+        type: boolean
+        description: Remove mitochondrial genes?
+        required: false
+  - name: Sampling options
+    arguments:
+      - name: --do_subsample
+        type: boolean
+        default: false
+        description: Whether or not to subsample the dataset
+      - name: --n_obs
+        type: integer
+        description: Maximum number of observations to be kept. It might end up being
+          less because empty cells / genes are removed.
+        default: 600
+      - name: --n_vars
+        type: integer
+        description: Maximum number of variables to be kept. It might end up being
+          less because empty cells / genes are removed.
+        default: 500
+      # - name: "--keep_features"
+      #   type: string
+      #   multiple: true
+      #   description: A list of genes to keep.
+      # - name: "--keep_cell_type_categories"
+      #   type: "string"
+      #   multiple: true
+      #   description: "Indexes of the cell type categories to be selected"
+      #   required: false
+      # - name: "--keep_batch_categories"
+      #   type: "string"
+      #   multiple: true
+      #   description: "Indexes of the batch categories to be selected"
+      #   required: false
+      # - name: "--even"
+      #   type: "boolean_true"
+      #   description: Subsample evenly from different batches
+      - name: --seed
+        type: integer
+        description: A seed for the subsampling.
+        example: 123
+  - name: Normalization
+    arguments:
+      - name: --normalization_methods
+        type: string
+        multiple: true
+        choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling]
+        default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt]
+        description: Which normalization methods to run.
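+  # NOTE (editor): the "Gene or spot filtering" arguments above correspond to
+  # standard scanpy-style filtering; a hedged sketch (threshold values are
+  # illustrative, not defaults):
+  #
+  #   import scanpy as sc
+  #   sc.pp.filter_cells(adata, min_genes=200)   # --spot_filter_min_genes
+  #   sc.pp.filter_cells(adata, min_counts=500)  # --spot_filter_min_counts
+  #   sc.pp.filter_genes(adata, min_cells=50)    # --gene_filter_min_spots
+  #   sc.pp.filter_genes(adata, min_counts=10)   # --gene_filter_min_counts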
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+  - path: /common/nextflow_helpers/helper.nf
+dependencies:
+  - name: datasets/loaders/spatial/zenodo
+  - name: datasets/normalization/log_cp
+  - name: datasets/normalization/log_scran_pooling
+  - name: datasets/normalization/sqrt_cp
+  - name: datasets/normalization/l1_sqrt
+  - name: datasets/processors/subsample
+  - name: utils/extract_uns_metadata
+runners:
+  - type: nextflow
diff --git a/src/datasets/workflows/process_zenodo_spatial/main.nf b/src/datasets/workflows/spatial/process_zenodo/main.nf
similarity index 99%
rename from src/datasets/workflows/process_zenodo_spatial/main.nf
rename to src/datasets/workflows/spatial/process_zenodo/main.nf
index a5893c0ab4..6343cdc277 100644
--- a/src/datasets/workflows/process_zenodo_spatial/main.nf
+++ b/src/datasets/workflows/spatial/process_zenodo/main.nf
@@ -49,7 +49,7 @@ workflow run_wf {
     }
 
     // fetch data from legacy openproblems
-    | zenodo_spatial.run(
+    | zenodo.run(
       fromState: [
         "input_data": "input_data",
         "dataset_id": "id",
diff --git a/src/datasets/workflows/spatial/process_zenodo_slidetags/config.vsh.yaml b/src/datasets/workflows/spatial/process_zenodo_slidetags/config.vsh.yaml
new file mode 100644
index 0000000000..63d92591b1
--- /dev/null
+++ b/src/datasets/workflows/spatial/process_zenodo_slidetags/config.vsh.yaml
@@ -0,0 +1,139 @@
+name: process_zenodo_slidetags
+namespace: datasets/workflows/spatial
+description: |
+  Download and process Slide-tags datasets originating from Zenodo.
+argument_groups:
+  - name: Input
+    arguments:
+      - name: --input_data
+        type: string
+        description: URL to the AnnData file.
+        required: true
+  - name: Outputs
+    arguments:
+      - name: --output_dataset
+        type: file
+        direction: output
+        description: Output h5ad file
+        required: true
+        __merge__: /src/datasets/api/file_spatial_dataset.yaml
+      - name: --output_meta
+        direction: output
+        type: file
+        description: Dataset metadata
+        default: dataset_metadata.yaml
+  - name: Metadata
+    arguments:
+      - name: --id
+        type: string
+        description: Unique identifier of the dataset.
+        required: true
+      - name: --dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: true
+      - name: --dataset_url
+        type: string
+        description: Link to the original source of the dataset.
+        required: false
+      - name: --dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+      - name: --dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: --dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: --dataset_organism
+        type: string
+        description: The organism of the dataset.
+        required: false
+  - name: Gene or spot filtering
+    description: Arguments related to filtering cells and genes by counts.
+    arguments:
+      - name: --spot_filter_min_genes
+        type: integer
+        description: Remove spots with fewer than this number of genes.
+        required: false
+        example: 200
+      - name: --spot_filter_min_counts
+        type: integer
+        description: Remove spots with fewer than this number of counts.
+        required: false
+      - name: --gene_filter_min_spots
+        type: integer
+        description: Remove genes expressed in fewer than this number of spots.
+        required: false
+        example: 50
+      - name: --gene_filter_min_counts
+        type: integer
+        description: Remove genes with fewer than this number of counts.
+        required: false
+      - name: --remove_mitochondrial
+        type: boolean
+        description: Remove mitochondrial genes?
+        required: false
+  - name: Sampling options
+    arguments:
+      - name: --do_subsample
+        type: boolean
+        default: false
+        description: Whether or not to subsample the dataset
+      - name: --n_obs
+        type: integer
+        description: Maximum number of observations to be kept. It might end up being
+          less because empty cells / genes are removed.
+        default: 600
+      - name: --n_vars
+        type: integer
+        description: Maximum number of variables to be kept. It might end up being
+          less because empty cells / genes are removed.
+        default: 500
+      # - name: "--keep_features"
+      #   type: string
+      #   multiple: true
+      #   description: A list of genes to keep.
+      # - name: "--keep_cell_type_categories"
+      #   type: "string"
+      #   multiple: true
+      #   description: "Indexes of the cell type categories to be selected"
+      #   required: false
+      # - name: "--keep_batch_categories"
+      #   type: "string"
+      #   multiple: true
+      #   description: "Indexes of the batch categories to be selected"
+      #   required: false
+      # - name: "--even"
+      #   type: "boolean_true"
+      #   description: Subsample evenly from different batches
+      - name: --seed
+        type: integer
+        description: A seed for the subsampling.
+        example: 123
+  - name: Normalization
+    arguments:
+      - name: --normalization_methods
+        type: string
+        multiple: true
+        choices: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt, log_scran_pooling]
+        default: [log_cp10k, log_cpm, sqrt_cp10k, sqrt_cpm, l1_sqrt]
+        description: Which normalization methods to run.
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+  - path: /common/nextflow_helpers/helper.nf
+dependencies:
+  - name: datasets/loaders/spatial/zenodo_slidetags
+  - name: datasets/normalization/log_cp
+  - name: datasets/normalization/log_scran_pooling
+  - name: datasets/normalization/sqrt_cp
+  - name: datasets/normalization/l1_sqrt
+  - name: datasets/processors/subsample
+  - name: utils/extract_uns_metadata
+runners:
+  - type: nextflow
diff --git a/src/datasets/workflows/process_zenodo_spatial_slidetags/main.nf b/src/datasets/workflows/spatial/process_zenodo_slidetags/main.nf
similarity index 98%
rename from src/datasets/workflows/process_zenodo_spatial_slidetags/main.nf
rename to src/datasets/workflows/spatial/process_zenodo_slidetags/main.nf
index 2bb6b9300a..e2f43188a9 100644
--- a/src/datasets/workflows/process_zenodo_spatial_slidetags/main.nf
+++ b/src/datasets/workflows/spatial/process_zenodo_slidetags/main.nf
@@ -49,7 +49,7 @@ workflow run_wf {
     }
 
     // fetch data from legacy openproblems
-    | zenodo_spatial_slidetags.run(
+    | zenodo_slidetags.run(
      fromState: [
        "input_data": "input_data",
        "dataset_id": "id",
diff --git a/src/project/create_component/config.vsh.yaml b/src/project/create_component/config.vsh.yaml
new file mode 100644
index 0000000000..ee7c884791
--- /dev/null
+++ b/src/project/create_component/config.vsh.yaml
@@ -0,0 +1,64 @@
+name: create_component
+namespace: project
+description: |
+  Create a new component
+usage:
+  create_component --type method --language r --name foo
+  create_component --type metric --language python --name bar
+argument_groups:
+  - name: Inputs
+    arguments:
+      - type: file
+        name: --input
+        direction: input
+        description: Path to the root of the project.
+        default: "."
+      - type: file
+        name: --api_file
+        description: |
+          Which API file to use. Defaults to `src/api/comp_<type>.yaml`.
In tasks with different subtypes of method, this location might not exist and you might need
+          to manually specify a different API file to inherit from.
+        must_exist: false
+        default: src/api/comp_${VIASH_PAR_TYPE}.yaml
+  - name: Arguments
+    arguments:
+      - type: string
+        name: --type
+        example: metric
+        description: The type of component to create. Typically one of 'method', 'control_method' or 'metric'.
+      - type: string
+        name: --language
+        description: Which scripting language to use. Options are 'python', 'r'.
+        default: python
+        choices: [python, r]
+      - type: string
+        name: --name
+        example: new_comp
+        description: Name of the new method, formatted in snake case.
+  - name: Outputs
+    arguments:
+      - type: file
+        name: --output
+        direction: output
+        description: Path to the component directory. Suggested location is `src/<type>s/<name>`.
+        default: src/${VIASH_PAR_TYPE}s/${VIASH_PAR_NAME}
+resources:
+  - type: python_script
+    path: script.py
+test_resources:
+  - type: python_script
+    path: test.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    test_setup:
+      - type: apt
+        packages: git
+      - type: docker
+        run: |
+          git clone https://github.com/openproblems-bio/task_template.git /opt/task_template
+runners:
+  - type: executable
+  - type: nextflow
+
diff --git a/src/common/create_component/script.py b/src/project/create_component/script.py
similarity index 73%
rename from src/common/create_component/script.py
rename to src/project/create_component/script.py
index 8c954a66d4..bc1dcdc410 100644
--- a/src/common/create_component/script.py
+++ b/src/project/create_component/script.py
@@ -3,6 +3,8 @@
 import sys
 import os
 import re
+from openproblems.utils import strip_margin
+from openproblems.project import read_nested_yaml, find_project_root
 
 ## VIASH START
 par = {
@@ -10,23 +12,17 @@
   "type": "method",
   "language": "python",
   "name": "new_comp",
-  "output": "src/tasks/denoising/methods/new_comp",
-  "api_file": "src/tasks/denoising/api/comp_method.yaml",
+  "output": "src/methods/new_comp",
+  "api_file": "src/api/comp_method.yaml",
   "viash_yaml": "_viash.yaml"
 }
 ## VIASH END
 
-# import helper function
-sys.path.append(meta["resources_dir"])
-from read_and_merge_yaml import read_and_merge_yaml
-
-def strip_margin(text: str) -> str:
-  return re.sub("(^|\n)[ \t]*\|", "\\1", text)
-
 def create_config(par, component_type, pretty_name, script_path) -> str:
+  general_str = generate_general_info(par, component_type, pretty_name)
   info_str = generate_info(par, component_type, pretty_name)
   resources_str = generate_resources(par, script_path)
-  docker_platform = generate_docker_platform(par)
+  docker_engine = generate_docker_engine(par)
 
   return strip_margin(f'''\
     |# The API specifies which type of component this is.
     |# - A unit test
     |__merge__: {os.path.relpath(par["api_file"], par["output"])}
     |
-    |functionality:
-    |  # A unique identifier for your component (required).
-    |  # Can contain only lowercase letters or underscores.
-    |  name: {par["name"]}
+    |{general_str}
+    |
     |
-    |  # Metadata for your component
-    |  info:
+    |# Metadata for your component
+    |info:
     |{info_str}
-    |  # Component-specific parameters (optional)
-    |  # arguments:
-    |  #   - name: "--n_neighbors"
-    |  #     type: "integer"
-    |  #     default: 5
-    |  #     description: Number of neighbors to use.
+ |# Component-specific parameters (optional) + |# arguments: + |# - name: "--n_neighbors" + |# type: "integer" + |# default: 5 + |# description: Number of neighbors to use. | - | # Resources required to run the component - | resources: + |# Resources required to run the component + |resources: |{resources_str} - |platforms: + |engines: | # Specifications for the Docker image for this component. - |{docker_platform} + |{docker_engine} + |runners: | # This platform allows running the component natively - | - type: native + | - type: executable | # Allows turning the component into a Nextflow module / pipeline. | - type: nextflow | directives: - | label: [midtime,midmem, midcpu] + | label: [midtime,midmem,midcpu] |''' ) -def generate_info(par, component_type, pretty_name) -> str: - """Generate the functionality info for a component.""" +def generate_general_info(par, component_type, pretty_name) -> str: + """Generate the general info for a method.""" + str = strip_margin(f'''\ + |# A unique identifier for your component (required). + |# Can contain only lowercase letters or underscores. + |name: {par["name"]} + |''') if component_type in ["method", "control_method"]: - str = strip_margin(f'''\ - | # A relatively short label, used when rendering visualisarions (required) - | label: {pretty_name} - | # A one sentence summary of how this method works (required). Used when - | # rendering summary tables. - | summary: "FILL IN: A one sentence summary of this method." - | # A multi-line description of how this component works (required). Used - | # when rendering reference documentation. - | description: | - | FILL IN: A (multi-line) description of how this method works. - | # Which normalisation method this component prefers to use (required). - | preferred_normalization: log_cp10k + str += strip_margin(f'''\ + |# A relatively short label, used when rendering visualisations (required) + |label: {pretty_name} + |# A one sentence summary of how this method works (required). Used when + |# rendering summary tables. + |summary: "FILL IN: A one sentence summary of this method." + |# A multi-line description of how this component works (required). Used + |# when rendering reference documentation. + |description: | + | FILL IN: A (multi-line) description of how this method works. |''') if component_type == "method": str += strip_margin(f'''\ - | # A reference key from the bibtex library at src/common/library.bib (required). - | reference: bibtex_reference_key - | # URL to the documentation for this method (required). - | documentation_url: https://url.to/the/documentation - | # URL to the code repository for this method (required). - | repository_url: https://github.com/organisation/repository + |# references: + |# doi: + |# - 10.1000/xx.123456.789 + |# bibtex: + |# - | + |# @article{{foo, + |# title={{Foo}}, + |# author={{Bar}}, + |# journal={{Baz}}, + |# year={{2024}} + |# }} + |links: + | # URL to the documentation for this method (required). + | documentation: https://url.to/the/documentation + | # URL to the code repository for this method (required). + | repository: https://github.com/organisation/repository |''') + return str + +def generate_info(par, component_type, pretty_name) -> str: + """Generate the info for a component.""" + if component_type in ["method", "control_method"]: + str = strip_margin(f'''\ + | # Which normalisation method this component prefers to use (required). 
+ | preferred_normalization: log_cp10k + |''') return str elif component_type == "metric": return strip_margin(f'''\ - | metrics: + | metrics: | # A unique identifier for your metric (required). | # Can contain only lowercase letters or underscores. - | name: {par["name"]} + | - name: {par["name"]} | # A relatively short label, used when rendering visualisarions (required) | label: {pretty_name} | # A one sentence summary of how this metric works (required). Used when @@ -107,12 +124,22 @@ def generate_info(par, component_type, pretty_name) -> str: | # when rendering reference documentation. | description: | | FILL IN: A (multi-line) description of how this metric works. - | # A reference key from the bibtex library at src/common/library.bib (required). - | reference: bibtex_reference_key - | # URL to the documentation for this metric (required). - | documentation_url: https://url.to/the/documentation - | # URL to the code repository for this metric (required). - | repository_url: https://github.com/organisation/repository + | # references: + | # doi: + | # - 10.1000/xx.123456.789 + | # bibtex: + | # - | + | # @article{{foo, + | # title={{Foo}}, + | # author={{Bar}}, + | # journal={{Baz}}, + | # year={{2024}} + | # }} + | links: + | # URL to the documentation for this metric (required). + | documentation: https://url.to/the/documentation + | # URL to the code repository for this metric (required). + | repository: https://github.com/organisation/repository | # The minimum possible value for this metric (required) | min: 0 | # The maximum possible value for this metric (required) @@ -123,36 +150,36 @@ def generate_info(par, component_type, pretty_name) -> str: def generate_resources(par, script_path) -> str: - """Add the script to the functionality resources.""" + """Add the script to the resources.""" if par["language"] == "python": type_str = "python_script" elif par["language"] == "r": type_str = "r_script" return strip_margin(f'''\ - | # The script of your component (required) - | - type: {type_str} - | path: {script_path} - | # Additional resources your script needs (optional) - | # - type: file - | # path: weights.pt + | # The script of your component (required) + | - type: {type_str} + | path: {script_path} + | # Additional resources your script needs (optional) + | # - type: file + | # path: weights.pt |''') -def generate_docker_platform(par) -> str: - """Set up the docker platform for Python.""" +def generate_docker_engine(par) -> str: + """Set up the docker engine for Python.""" if par["language"] == "python": image_str = "openproblems/base_python:1.0.0" setup_type = "python" - package_example = "scib==1.1.5" + package_example = "numpy<2" elif par["language"] == "r": image_str = "openproblems/base_r:1.0.0" setup_type = "r" - package_example = "tidyverse" + package_example = "tibble" return strip_margin(f'''\ | - type: docker | image: {image_str} | # Add custom dependencies here (optional). For more information, see - | # https://viash.io/reference/config/platforms/docker/#setup . + | # https://viash.io/reference/config/engines/docker/#setup . 
| # setup: | # - type: {setup_type} | # packages: {package_example} @@ -160,7 +187,7 @@ def generate_docker_platform(par) -> str: def set_par_values(config) -> None: """Adds values to each of the arguments in a config file.""" - args = config['functionality']['arguments'] + args = config['arguments'] for argi, arg in enumerate(args): key = re.sub("^-*", "", arg['name']) @@ -169,14 +196,14 @@ def set_par_values(config) -> None: value = arg.get("default", arg.get("example", "...")) elif arg.get("direction", "input") == "input": key_strip = key.replace("input_", "") - value = f'resources_test/{par["task"]}/pancreas/{key_strip}.h5ad' + value = f'resources_test/.../{key_strip}.h5ad' else: key_strip = key.replace("output_", "") value = f'{key_strip}.h5ad' # store key and value - config['functionality']['arguments'][argi]["key"] = key - config['functionality']['arguments'][argi]["value"] = value + config['arguments'][argi]["key"] = key + config['arguments'][argi]["value"] = value def look_for_adata_arg(args, uns_field): """Look for an argument that has a .uns[uns_field] in its info.slots.""" @@ -200,7 +227,7 @@ def write_output_python(arg, copy_from_adata, is_metric): if is_metric: value = f"{copy_from_adata}.uns['{slot['name']}']" else: - value = "meta['functionality_name']" + value = "meta['name']" else: value = group_name + "_" + slot["name"] inner.append(f"'{slot['name']}': {value}") @@ -229,7 +256,7 @@ def write_output_r(arg, copy_from_adata, is_metric): if is_metric: value = f"{copy_from_adata}$uns[[\"{slot['name']}\"]]" else: - value = "meta[[\"functionality_name\"]]" + value = "meta[[\"name\"]]" else: value = group_name + "_" + slot["name"] inner.append(f"{slot['name']} = {value}") @@ -246,7 +273,7 @@ def write_output_r(arg, copy_from_adata, is_metric): ) def create_python_script(par, config, type): - args = config['functionality']['arguments'] + args = config['arguments'] # create the arguments of the par string par_string = ",\n ".join(f"'{arg['key']}': '{arg['value']}'" for arg in args) @@ -298,7 +325,7 @@ def create_python_script(par, config, type): | {par_string} |}} |meta = {{ - | 'functionality_name': '{par["name"]}' + | 'name': '{par["name"]}' |}} |## VIASH END | @@ -313,7 +340,7 @@ def create_python_script(par, config, type): return script def create_r_script(par, api_spec, type): - args = api_spec['functionality']['arguments'] + args = api_spec['arguments'] # create the arguments of the par string par_string = ",\n ".join(f'{arg["key"]} = "{arg["value"]}"' for arg in args) @@ -363,7 +390,7 @@ def create_r_script(par, api_spec, type): | {par_string} |) |meta <- list( - | functionality_name = "{par["name"]}" + | name = "{par["name"]}" |) |## VIASH END | @@ -377,25 +404,6 @@ def create_r_script(par, api_spec, type): return script -# def read_viash_config(file): -# file = file.absolute() - -# # read in config -# command = ["viash", "config", "view", str(file)] - -# # Execute the command and capture the output -# output = subprocess.check_output( -# command, -# universal_newlines=True, -# cwd=str(file.parent) -# ) - -# # Parse the output as YAML -# config = yaml.load(output) - -# return config - - def main(par): ####### CHECK INPUTS ####### print("Check inputs", flush=True) @@ -417,8 +425,7 @@ def main(par): ## CHECK API FILE print("Check API file", flush=True) api_file = Path(par["api_file"]) - viash_yaml = Path(par["viash_yaml"]) - project_dir = viash_yaml.parent + project_dir = find_project_root(api_file) if not api_file.exists(): comp_types = 
[x.with_suffix("").name.removeprefix("comp_") for x in api_file.parent.glob("**/comp_*.y*ml")]
     list.sort(comp_types)
 
@@ -429,12 +436,12 @@
 
   ## READ API FILE
   print("Read API file", flush=True)
-  api = read_and_merge_yaml(api_file)
-  comp_type = api.get("functionality", {}).get("info", {}).get("type", {})
+  api = read_nested_yaml(api_file)
+  comp_type = api.get("info", {}).get("type", {})
 
   if not comp_type:
     sys.exit(strip_margin(f"""\
       |Error: API file is incorrectly formatted.
-      |  Reason: Could not find component type at `.functionality.info.type`.'
+      |  Reason: Could not find component type at `.info.type`.'
       |  Please fix the formatting of the API file."""))
 
   ####### CREATE OUTPUT DIR #######
@@ -473,4 +480,4 @@
 
 if __name__ == "__main__":
-  main(par)
+  main(par)
\ No newline at end of file
diff --git a/src/project/create_component/test.py b/src/project/create_component/test.py
new file mode 100644
index 0000000000..6e9236d61b
--- /dev/null
+++ b/src/project/create_component/test.py
@@ -0,0 +1,51 @@
+import subprocess
+from os import path
+import yaml
+
+## VIASH START
+meta = {
+    'executable': 'foo'
+}
+## VIASH END
+
+task_template = "/opt/task_template"
+output_path = f"{task_template}/src/methods/test_method"
+
+assert path.exists(task_template), "Task template does not exist"
+
+cmd = [
+    meta['executable'],
+    '--type', 'method',
+    '--name', 'test_method',
+    '--language', 'python',
+    '--api_file', 'src/api/comp_method.yaml',
+    '--output', 'src/methods/test_method'
+]
+
+print('>> Running the script as test', flush=True)
+# capture the combined output so it can be re-printed below
+out = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, cwd=task_template, text=True)
+
+if out.stdout:
+    print(out.stdout)
+
+if out.returncode:
+    print(f"script: '{cmd}' exited with an error.")
+    exit(out.returncode)
+
+print('>> Checking whether output files exist', flush=True)
+assert path.exists(output_path), "Output dir does not exist"
+
+conf_f = path.join(output_path, 'config.vsh.yaml')
+assert path.exists(conf_f), "Config file does not exist"
+
+script_f = path.join(output_path, "script.py")
+assert path.exists(script_f), "Script file does not exist"
+
+print('>> Checking file contents', flush=True)
+with open(conf_f) as f:
+    conf_data = yaml.safe_load(f)
+
+assert conf_data['name'] == 'test_method', "Name should be equal to 'test_method'"
+
+print('All checks succeeded!', flush=True)
+
diff --git a/src/project/fetch_run_work_dir/config.vsh.yaml b/src/project/fetch_run_work_dir/config.vsh.yaml
new file mode 100644
index 0000000000..1a210ba842
--- /dev/null
+++ b/src/project/fetch_run_work_dir/config.vsh.yaml
@@ -0,0 +1,23 @@
+name: fetch_run_work_dir
+namespace: project
+description: Fetches a task execution directory from an S3 bucket and syncs the input data.
+arguments:
+  - type: string
+    name: --input
+    required: true
+    direction: input
+    example: s3://openproblems-work/work/65/369e661a583488755685199c87fadf
+    description: The S3 path to the task execution directory to fetch.
+  - type: file
+    name: --output
+    required: true
+    direction: output
+    description: Output directory containing the fetched task execution files.
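+# NOTE (editor): illustrative usage with the executable runner; the S3 path is
+# the documented example above, not a live work directory:
+#
+#   viash run src/project/fetch_run_work_dir/config.vsh.yaml -- \
+#     --input s3://openproblems-work/work/65/369e661a583488755685199c87fadf \
+#     --output debug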
+resources:
+  - type: bash_script
+    path: script.sh
+engines:
+  - type: docker
+    image: amazon/aws-cli:latest
+runners:
+  - type: executable
diff --git a/src/project/fetch_run_work_dir/script.sh b/src/project/fetch_run_work_dir/script.sh
new file mode 100644
index 0000000000..c25d59f2d1
--- /dev/null
+++ b/src/project/fetch_run_work_dir/script.sh
@@ -0,0 +1,24 @@
+## VIASH START
+par_input=s3://openproblems-nextflow/work/f6/8565066aee4771cc2790b92b4ac660
+par_aws_profile=op
+par_aws_credentials=~/.aws/credentials
+par_output=debug
+## VIASH END
+
+if [ -n "$par_aws_credentials" ]; then
+  export AWS_SHARED_CREDENTIALS_FILE="$par_aws_credentials"
+fi
+
+if [ -n "$par_aws_profile" ]; then
+  export AWS_PROFILE="$par_aws_profile"
+fi
+
+aws s3 sync "$par_input" "$par_output"
+
+cd "$par_output"
+
+# Replace the miniconda aws with the system aws
+sed -i 's#/home/ec2-user/miniconda/bin/aws#aws#g' .command.run
+# sed -i 's# s3 cp # s3 sync #g' .command.run
+
+bash .command.run nxf_stage
diff --git a/src/project/render_readme/config.vsh.yaml b/src/project/render_readme/config.vsh.yaml
new file mode 100644
index 0000000000..37ad2a86cf
--- /dev/null
+++ b/src/project/render_readme/config.vsh.yaml
@@ -0,0 +1,50 @@
+name: render_readme
+namespace: project
+description: |
+  Render the task README
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --input
+        type: file
+        description: Path to the root directory
+        default: "."
+        required: false
+  - name: Outputs
+    arguments:
+      - type: file
+        name: --output
+        direction: output
+        description: Path to the rendered README file. Suggested location is `README.md`.
+        default: README.md
+        required: false
+resources:
+  - type: r_script
+    path: script.R
+test_resources:
+  - type: r_script
+    path: test.R
+engines:
+  - type: docker
+    image: openproblems/base_r:1.0.0
+    setup:
+      - type: r
+        cran:
+          - processx
+        github:
+          - openproblems-bio/core/packages/r/openproblems.utils
+          - openproblems-bio/core/packages/r/openproblems
+          - openproblems-bio/core/packages/r/openproblems.docs
+      - type: apt
+        packages: [jq, curl]
+      - type: docker
+        # download and install quarto-*-linux-amd64.deb from latest release
+        run: |
+          release_info=$(curl -s https://api.github.com/repos/quarto-dev/quarto-cli/releases/latest) && \
+          download_url=$(printf "%s" "$release_info" | jq -r '.assets[] | select(.name | test("quarto-.*-linux-amd64.deb")) | .browser_download_url') && \
+          curl -sL "$download_url" -o /opt/quarto.deb && \
+          dpkg -i /opt/quarto.deb && \
+          rm /opt/quarto.deb
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/project/render_readme/script.R b/src/project/render_readme/script.R
new file mode 100644
index 0000000000..dcd394df4d
--- /dev/null
+++ b/src/project/render_readme/script.R
@@ -0,0 +1,35 @@
+requireNamespace("openproblems.docs", quietly = TRUE)
+requireNamespace("processx", quietly = TRUE)
+
+## VIASH START
+par <- list(
+  "input" = "path/to/input",
+  "output" = "path/to/input/README.md"
+)
+## VIASH END
+
+cat("Read task metadata\n")
+metadata <- openproblems.docs::read_task_metadata(par$input)
+
+cat("Render README.qmd content\n")
+qmd_content <- openproblems.docs::render_task_readme_qmd(metadata)
+
+cat("Write README.qmd to file\n")
+if (!dir.exists(meta$temp_dir)) {
+  dir.create(meta$temp_dir, recursive = TRUE)
+}
+qmd_file <- tempfile(
+  pattern = "README_",
+  fileext = ".qmd",
+  tmpdir = meta$temp_dir
+)
+writeLines(qmd_content, qmd_file)
+
+cat("Render README.qmd to README.md\n")
+out <- processx::run(
+  command = "quarto",
+  args = c("render", 
qmd_file, "--output", "-"), + echo = TRUE +) + +writeLines(out$stdout, par$output) diff --git a/src/project/render_readme/test.R b/src/project/render_readme/test.R new file mode 100644 index 0000000000..96b029d1c5 --- /dev/null +++ b/src/project/render_readme/test.R @@ -0,0 +1,27 @@ +requireNamespace("assertthat", quietly = TRUE) + +## VIASH START +## VIASH END + +input <- system.file("extdata", "example_project", "api", package = "openproblems.docs") + +output_path <- "output.md" + +cat(">> Running the script as test\n") +out <- processx::run( + meta[["executable"]], + args = c("--input", input, "--output", output_path) +) + +cat(">> Checking whether output files exist\n") +assertthat::assert_that(file.exists(output_path)) + +cat(">> Checking file contents\n") +lines <- readLines(output_path) +assertthat::assert_that(any(grepl("# Template", lines))) +assertthat::assert_that(any(grepl("## Description", lines))) +# assertthat::assert_that(any(grepl("## Authors", lines))) +assertthat::assert_that(any(grepl("flowchart TB", lines))) +assertthat::assert_that(any(grepl("## File format:", lines))) + +cat("All checks succeeded!\n") diff --git a/src/project/sync_resources/config.vsh.yaml b/src/project/sync_resources/config.vsh.yaml new file mode 100644 index 0000000000..908f1a7d8c --- /dev/null +++ b/src/project/sync_resources/config.vsh.yaml @@ -0,0 +1,56 @@ +name: sync_resources +namespace: project +description: Sync test resources to the local filesystem +usage: | + sync_resources + sync_resources --input _viash.yaml --output . +argument_groups: + - name: Inputs + arguments: + - name: "--input" + alternatives: ["-i"] + type: file + description: "Path to the _viash.yaml project configuration file." + default: _viash.yaml + - name: Outputs + arguments: + - name: "--output" + alternatives: ["-o"] + type: file + default: . + direction: output + description: "Path to the directory where the resources will be synced to." + - name: Arguments + arguments: + - name: "--quiet" + type: boolean_true + description: "Displays the operations that would be performed using the specified command without actually running them." + - name: "--dryrun" + type: boolean_true + description: "Does not display the operations performed from the specified command." + - name: "--delete" + type: boolean_true + description: "Files that exist in the destination but not in the source are deleted during sync." + - name: "--exclude" + type: "string" + multiple: true + description: Exclude all files or objects from the command that matches the specified pattern. +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh +engines: + - type: docker + image: "amazon/aws-cli:2.17.11" + setup: + - type: yum + packages: [wget] + - type: docker + run : | + wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && \ + chmod +x /usr/bin/yq +runners: + - type: executable + - type: nextflow diff --git a/src/project/sync_resources/script.sh b/src/project/sync_resources/script.sh new file mode 100644 index 0000000000..29afb29b9e --- /dev/null +++ b/src/project/sync_resources/script.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +## VIASH START +par_input='_viash.yaml' +par_output='.' +## VIASH END + +extra_params=( ) + +if [ "$par_quiet" == "true" ]; then + extra_params+=( "--quiet" ) +fi +if [ "$par_dryrun" == "true" ]; then + extra_params+=( "--dryrun" ) +fi +if [ "$par_delete" == "true" ]; then + extra_params+=( "--delete" ) +fi + +if [ ! 
-z ${par_exclude+x} ]; then + IFS=";" + for var in $par_exclude; do + unset IFS + extra_params+=( "--exclude" "$var" ) + done +fi + +function sync_s3() { + local s3_path="$1" + local dest_path="$2" + AWS_EC2_METADATA_DISABLED=true \ + aws s3 sync \ + "$s3_path" \ + "$dest_path" \ + --no-sign-request \ + "${extra_params[@]}" +} + +yq e \ + '.info.test_resources[] | "{type: " + (.type // "s3") + ", path: " + .path + ", dest: " + .dest + "}"' \ + "${par_input}" | \ + while read -r line; do + type=$(echo "$line" | yq e '.type') + path=$(echo "$line" | yq e '.path') + dest=$(echo "$line" | yq e '.dest') + + echo "Syncing '$path' to '$dest'..." + + if [ "$type" == "s3" ]; then + sync_s3 "$path" "$par_output/$dest" + fi + done diff --git a/src/common/sync_test_resources/run_test.sh b/src/project/sync_resources/test.sh similarity index 52% rename from src/common/sync_test_resources/run_test.sh rename to src/project/sync_resources/test.sh index 67f2504531..55034d720d 100755 --- a/src/common/sync_test_resources/run_test.sh +++ b/src/project/sync_resources/test.sh @@ -3,10 +3,18 @@ ## VIASH START ## VIASH END +cat > _viash.yaml << EOM +info: + test_resources: + - type: s3 + path: s3://openproblems-data/resources_test/common/pancreas + dest: foo +EOM + echo ">> Run aws s3 sync" -./$meta_functionality_name \ - --input s3://openproblems-data/resources_test/common/pancreas \ - --output foo \ +"$meta_executable" \ + --input _viash.yaml \ + --output . \ --quiet echo ">> Check whether the right files were copied" diff --git a/src/project/upgrade_config/config.vsh.yaml b/src/project/upgrade_config/config.vsh.yaml new file mode 100644 index 0000000000..1c3d75346d --- /dev/null +++ b/src/project/upgrade_config/config.vsh.yaml @@ -0,0 +1,38 @@ +name: upgrade_config +namespace: project +description: | + Upgrade a component config from viash version 0.8 to version 0.9. +usage: + viash run upgrade_config/config.vsh.yaml -- --input method --output foo + +arguments: + - type: file + name: --input + direction: input + description: Path to the input config. + example: input.vsh.yaml + - type: file + name: --output + description: Path to the output config. 
+    example: output.vsh.yaml
+    direction: output
+
+resources:
+  - type: python_script
+    path: script.py
+  # - path: library.bib
+
+test_resources:
+  - type: python_script
+    path: test.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    setup:
+      - type: python
+        packages: ruamel.yaml
+runners:
+  - type: executable
+  - type: nextflow
+
diff --git a/src/project/upgrade_config/script.py b/src/project/upgrade_config/script.py
new file mode 100644
index 0000000000..c04af6c5c7
--- /dev/null
+++ b/src/project/upgrade_config/script.py
@@ -0,0 +1,91 @@
+import ruamel.yaml
+# import re
+
+## VIASH START
+par = {
+    "input": "input.vsh.yaml",
+    "output": "output.vsh.yaml",
+}
+## VIASH END
+
+yaml = ruamel.yaml.YAML()
+
+# Set indentation rules
+yaml.indent(mapping=2, sequence=4, offset=2)
+
+# Load input config
+with open(par["input"], "r") as file:
+    data = yaml.load(file)
+
+transformed_yaml_content = ruamel.yaml.CommentedMap()
+
+# Add __merge__, if necessary
+if "__merge__" in data:
+    transformed_yaml_content["__merge__"] = data["__merge__"]
+
+# Remove .functionality
+if "functionality" in data:
+    # make sure these fields are defined even when .functionality.info is absent
+    label = summary = description = reference = repository = documentation = None
+    if "info" in data["functionality"]:
+        info_content = data["functionality"]["info"]
+        label = info_content.pop("label", None)
+        summary = info_content.pop("summary", None)
+        description = info_content.pop("description", None)
+        reference = info_content.pop("reference", None)
+        repository = info_content.pop("repository_url", None)
+        documentation = info_content.pop("documentation_url", None)
+
+        # Remove 'info' if it becomes empty
+        if not info_content:
+            data["functionality"].pop("info")
+
+    updated_functionality = ruamel.yaml.CommentedMap()
+    updated_functionality["name"] = data["functionality"].pop("name")
+
+    # Move out of info
+    if label is not None:
+        updated_functionality["label"] = label
+    if summary is not None:
+        updated_functionality["summary"] = summary
+    if description is not None:
+        updated_functionality["description"] = description
+
+    # Fetch doi using reference key
+    if reference is not None:
+        updated_functionality["references"] = {}
+        # with open(f"library.bib", "r") as file:
+        #     bib = file.read()
+        # entry_pattern = r"(@\w+{[^}]*" + reference + r"[^}]*}(.|\n)*?)(?=@)"
+        # bib_entry = re.search(entry_pattern, bib)
+        # if bib_entry:
+        #     doi_pattern = r"(?=[Dd][Oo][Ii]\s*=\s*{([^,}]+)})"
+        #     entry_doi = re.search(doi_pattern, bib_entry.group(1))
+        #     updated_functionality["references"]["doi"] = entry_doi.group(1)
+        updated_functionality["references"]["bibtex"] = reference
+
+    # Add links
+    updated_functionality["links"] = {}
+    if repository is not None:
+        updated_functionality["links"]["repository"] = repository
+    if documentation is not None:
+        updated_functionality["links"]["documentation"] = documentation
+
+    # Add remaining contents from .functionality
+    updated_functionality.update(data["functionality"])
+
+    transformed_yaml_content.update(updated_functionality)
+
+# Mapping platforms to engines and runners
+transformed_yaml_content["engines"] = []
+transformed_yaml_content["runners"] = []
+for platform in data["platforms"]:
+    if platform["type"] == "docker":
+        transformed_yaml_content["engines"].append(platform)
+    elif platform["type"] == "nextflow":
+        transformed_yaml_content["runners"].append(platform)
+
+# Insert `type: executable` into runners
+transformed_yaml_content["runners"].insert(0, {"type": "executable"})
+
+# Write the transformed YAML to a new file
+with open(par["output"], 'w') as file:
+    yaml.dump(transformed_yaml_content, file)
\ No newline at end 
of file
diff --git a/src/project/upgrade_config/test.py b/src/project/upgrade_config/test.py
new file mode 100644
index 0000000000..19bc5084bc
--- /dev/null
+++ b/src/project/upgrade_config/test.py
@@ -0,0 +1,64 @@
+from openproblems.utils import strip_margin
+from os import path
+import subprocess
+import yaml
+
+test_data = strip_margin(f'''\
+  |functionality:
+  |  name: "phate"
+  |  info:
+  |    label: PHATE
+  |    summary: Preserving trajectories in a dataset by using heat diffusion potential.
+  |    description: |
+  |      PHATE uses the potential of heat diffusion to preserve trajectories in a dataset via a diffusion process.
+  |    reference: "moon2019visualizing"
+  |    repository_url: "https://github.com/KrishnaswamyLab/PHATE"
+  |    documentation_url: "https://github.com/KrishnaswamyLab/PHATE#readme"
+  |    preferred_normalization: sqrt_cp10k
+  |  # component specific arguments
+  |  arguments:
+  |    - name: '--n_pca_dims'
+  |      type: integer
+  |      description: Number of principal components of PCA to use.
+  |  resources:
+  |    - type: python_script
+  |      path: script.py
+  |platforms:
+  |  - type: docker
+  |    image: ghcr.io/openproblems-bio/base_python:1.0.4
+  |  - type: nextflow
+  |    directives:
+  |      label: [midtime, highmem, highcpu]
+  |'''
+)
+
+input = "input.vsh.yaml"
+with open(input, "w") as file:
+  file.write(test_data)
+
+output = "output.vsh.yaml"
+
+cmd = [
+  meta['executable'],
+  '--input', input,
+  '--output', output
+]
+
+print('>> Running the script as test', flush=True)
+out = subprocess.run(cmd, stderr=subprocess.STDOUT)
+
+if out.returncode:
+  print(f"script: '{cmd}' exited with an error.")
+  exit(out.returncode)
+
+print('>> Checking whether output files exist', flush=True)
+assert path.exists(output), "Output file does not exist"
+
+print('>> Checking file contents', flush=True)
+with open(output) as f:
+  conf_data = yaml.safe_load(f)
+
+assert "functionality" not in conf_data, ".functionality not removed"
+assert "engines" in conf_data, ".platforms not updated"
+
+print('All checks succeeded!', flush=True)
diff --git a/src/reporting/generate_qc/config.vsh.yaml b/src/reporting/generate_qc/config.vsh.yaml
new file mode 100644
index 0000000000..2fd9786e9c
--- /dev/null
+++ b/src/reporting/generate_qc/config.vsh.yaml
@@ -0,0 +1,49 @@
+name: generate_qc
+namespace: reporting
+description: Generate task QC metrics
+arguments:
+  - name: --task_info
+    type: file
+    description: Task info file
+    example: resources_test/openproblems/task_results_v3/processed/task_info.json
+  - name: --method_info
+    type: file
+    description: Method info file
+    example: resources_test/openproblems/task_results_v3/processed/method_info.json
+  - name: --metric_info
+    type: file
+    description: Metric info file
+    example: resources_test/openproblems/task_results_v3/processed/metric_info.json
+  - name: --dataset_info
+    type: file
+    description: Dataset info file
+    example: resources_test/openproblems/task_results_v3/processed/dataset_info.json
+  - name: --results
+    type: file
+    description: Results file
+    example: resources_test/openproblems/task_results_v3/processed/results.json
+  - name: --output
+    type: file
+    direction: output
+    default: output.json
+    description: Output json
+    info:
+      format:
+        type: json
+        # TODO: add schema
+resources:
+  - type: python_script
+    path: script.py
+test_resources:
+  - type: python_script
+    path: /common/component_tests/run_and_check_output.py
+  - path: /resources_test/openproblems/task_results_v3
+    dest: resources_test/openproblems/task_results_v3
+engines:
+  - type: docker
+    image: 
openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/generate_qc/script.py b/src/reporting/generate_qc/script.py similarity index 94% rename from src/common/process_task_results/generate_qc/script.py rename to src/reporting/generate_qc/script.py index f15a877522..685cc6436e 100644 --- a/src/common/process_task_results/generate_qc/script.py +++ b/src/reporting/generate_qc/script.py @@ -2,6 +2,14 @@ import numpy as np ## VIASH START +par = { + "task_info": "resources_test/openproblems/task_results_v3/processed/task_info.json", + "method_info": "resources_test/openproblems/task_results_v3/processed/method_info.json", + "metric_info": "resources_test/openproblems/task_results_v3/processed/metric_info.json", + "dataset_info": "resources_test/openproblems/task_results_v3/processed/dataset_info.json", + "results": "resources_test/openproblems/task_results_v3/processed/results.json", + "output": "output.json" +} ## VIASH END EXPECTED_TASK_FIELDS = ["task_id", "task_name", "task_summary", "task_description"] @@ -57,7 +65,7 @@ def add_qc( def percent_missing(list_of_dicts, field): are_missing = [] for item in list_of_dicts: - if field == 'paper_reference' and item.get('is_baseline', False): + if field == "paper_reference" and item.get("is_baseline", False): are_missing.append(0.0) elif field in item and item[field] is not None: are_missing.append(0.0) @@ -124,7 +132,7 @@ def percent_missing(list_of_dicts, field): # turn results into long format for easier processing results_long = [ { - "task_id": x["task_id"], + "task_id": task_id, "method_id": x["method_id"], "dataset_id": x["dataset_id"], "metric_id": metric["metric_id"], diff --git a/src/reporting/get_dataset_info/config.vsh.yaml b/src/reporting/get_dataset_info/config.vsh.yaml new file mode 100644 index 0000000000..0c32dd5557 --- /dev/null +++ b/src/reporting/get_dataset_info/config.vsh.yaml @@ -0,0 +1,37 @@ +name: get_dataset_info +namespace: reporting +description: Extract dataset info and convert to expected format for website results +arguments: + - name: --input + type: file + description: A yaml file + required: true + example: resources_test/openproblems/task_results_v3/raw/dataset_uns.yaml + - name: --output + type: file + direction: output + default: output.json + description: Output json + info: + format: + type: json + # TODO: add schema +resources: + - type: r_script + path: script.R +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/openproblems/task_results_v3 + dest: resources_test/openproblems/task_results_v3 +engines: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ purrr, yaml, rlang, processx ] +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/get_dataset_info/script.R b/src/reporting/get_dataset_info/script.R similarity index 89% rename from src/common/process_task_results/get_dataset_info/script.R rename to src/reporting/get_dataset_info/script.R index a2c5317c05..797fdb1ad6 100644 --- a/src/common/process_task_results/get_dataset_info/script.R +++ b/src/reporting/get_dataset_info/script.R @@ -5,8 +5,8 @@ library(rlang, warn.conflicts = FALSE) ## VIASH START par <- list( - input = "output/label_projection/dataset_uns.yaml", - output = "output/dataset_info.json" + input = 
"resources_test/openproblems/task_results_v3/raw/dataset_uns.yaml", + output = "resources_test/openproblems/task_results_v3/processed/dataset_info.json" ) ## VIASH END @@ -20,7 +20,6 @@ outputs <- map(datasets, function(dataset) { # construct v1 format out <- list( - "task_id" = par$task_id, "dataset_id" = dataset$dataset_id, "dataset_name" = dataset$dataset_name, "dataset_summary" = dataset$dataset_summary, diff --git a/src/reporting/get_method_info/config.vsh.yaml b/src/reporting/get_method_info/config.vsh.yaml new file mode 100644 index 0000000000..0a4e980329 --- /dev/null +++ b/src/reporting/get_method_info/config.vsh.yaml @@ -0,0 +1,37 @@ +name: get_method_info +namespace: reporting +description: Extract method info +arguments: + - name: --input + type: file + description: A yaml file + required: true + example: resources_test/openproblems/task_results_v3/raw/method_configs.yaml + - name: --output + type: file + direction: output + default: output.json + description: Output json + info: + format: + type: json + # TODO: add schema +resources: + - type: r_script + path: script.R +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/openproblems/task_results_v3 + dest: resources_test/openproblems/task_results_v3 +engines: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ purrr, yaml, rlang, processx ] +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/get_method_info/script.R b/src/reporting/get_method_info/script.R similarity index 79% rename from src/common/process_task_results/get_method_info/script.R rename to src/reporting/get_method_info/script.R index a332413b69..2d20cbf0cf 100644 --- a/src/common/process_task_results/get_method_info/script.R +++ b/src/reporting/get_method_info/script.R @@ -5,8 +5,8 @@ library(rlang, warn.conflicts = FALSE) ## VIASH START par <- list( - input = "output/temp/method_configs.yaml", - output = "output/test/method_info.json" + input = "resources_test/openproblems/task_results_v3/raw/method_configs.yaml", + output = "resources_test/openproblems/task_results_v3/processed/method_info.json" ) ## VIASH END @@ -31,13 +31,17 @@ outputs <- map(configs, function(config) { info$task_id <- gsub("/.*", "", config$namespace) info$id <- config$name info$namespace <- config$namespace + info$label <- config$label %||% info$label + info$summary <- config$summary %||% info$summary + info$description <- config$description %||% info$description info$commit_sha <- build_info$git_commit %||% "missing-sha" info$code_version <- "missing-version" - info$implementation_url <- paste0( - build_info$git_remote, "/blob/", - build_info$git_commit, "/", - info$config_path - ) + info$implementation_url <- paste0( + build_info$git_remote, "/blob/", + build_info$git_commit, "/", + info$config_path + ) + info$type_info <- NULL # ↑ this could be used as the new format diff --git a/src/reporting/get_metric_info/config.vsh.yaml b/src/reporting/get_metric_info/config.vsh.yaml new file mode 100644 index 0000000000..0eb2fb22c0 --- /dev/null +++ b/src/reporting/get_metric_info/config.vsh.yaml @@ -0,0 +1,37 @@ +name: get_metric_info +namespace: reporting +description: Extract metric info +arguments: + - name: --input + type: file + description: A yaml file + required: true + example: resources_test/openproblems/task_results_v3/raw/metric_configs.yaml + - name: --output + type: file + direction: output + 
default: output.json + description: Output json + info: + format: + type: json + # TODO: add schema +resources: + - type: r_script + path: script.R +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/openproblems/task_results_v3 + dest: resources_test/openproblems/task_results_v3 +engines: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ purrr, yaml, rlang, processx ] +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/get_metric_info/script.R b/src/reporting/get_metric_info/script.R similarity index 92% rename from src/common/process_task_results/get_metric_info/script.R rename to src/reporting/get_metric_info/script.R index 5ef8f6b04b..0b3aff3088 100644 --- a/src/common/process_task_results/get_metric_info/script.R +++ b/src/reporting/get_metric_info/script.R @@ -5,8 +5,8 @@ library(rlang, warn.conflicts = FALSE) ## VIASH START par <- list( - input = "output/temp/metric_configs.yaml", - output = "output/metric_info.json" + input = "resources_test/openproblems/task_results_v3/raw/metric_configs.yaml", + output = "resources_test/openproblems/task_results_v3/processed/metric_info.json" ) ## VIASH END @@ -31,6 +31,7 @@ outputs <- map(configs, function(config) { info$config_path <- gsub(".*/src/", "src/", build_info$config) info$task_id <- gsub("/.*", "", config$namespace) info$id <- info$name + info$name <- NULL info$component_id <- config$name info$namespace <- config$namespace info$commit_sha <- build_info$git_commit %||% "missing-sha" diff --git a/src/reporting/get_results/config.vsh.yaml b/src/reporting/get_results/config.vsh.yaml new file mode 100644 index 0000000000..e0191cf2b0 --- /dev/null +++ b/src/reporting/get_results/config.vsh.yaml @@ -0,0 +1,65 @@ +name: get_results +namespace: reporting +description: Extract execution info +argument_groups: + - name: Inputs + arguments: + - name: --input_scores + type: file + description: Scores file + example: resources_test/openproblems/task_results_v3/raw/score_uns.yaml + - name: --input_execution + type: file + description: Nextflow log file + example: resources_test/openproblems/task_results_v3/raw/trace.txt + - name: --input_dataset_info + type: file + description: Method info file + example: resources_test/openproblems/task_results_v3/processed/dataset_info.json + - name: --input_method_info + type: file + description: Method info file + example: resources_test/openproblems/task_results_v3/processed/method_info.json + - name: --input_metric_info + type: file + description: Metric info file + example: resources_test/openproblems/task_results_v3/processed/metric_info.json + - name: Outputs + arguments: + - name: --output_results + type: file + direction: output + description: Output json + default: results.json + info: + format: + type: json + # TODO: add schema + - name: --output_metric_execution_info + type: file + direction: output + description: Output metric execution info + default: metric_execution_info.json + info: + format: + type: json + # TODO: add schema +resources: + - type: r_script + path: script.R +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/openproblems/task_results_v3 + dest: resources_test/openproblems/task_results_v3 +engines: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ purrr, yaml, rlang, dplyr, tidyr, readr, 
lubridate, dynutils, processx ] +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/get_results/script.R b/src/reporting/get_results/script.R similarity index 88% rename from src/common/process_task_results/get_results/script.R rename to src/reporting/get_results/script.R index 822562aa18..9f8459cffb 100644 --- a/src/common/process_task_results/get_results/script.R +++ b/src/reporting/get_results/script.R @@ -9,16 +9,16 @@ library(purrr, warn.conflicts = FALSE) library(rlang, warn.conflicts = FALSE) ## VIASH START -dir <- "work/c1/6660ea0cc6155d7e13fa341d16057b/_viash_par" par <- list( - task_id = "task_1", - input_scores = paste0(dir, "/input_scores_1/score_uns.yaml"), - input_execution = paste0(dir, "/input_execution_1/trace.txt"), - input_dataset_info = paste0(dir, "/input_dataset_info_1/output.json"), - input_method_info = paste0(dir, "/input_method_info_1/output.json"), - input_metric_info = paste0(dir, "/input_metric_info_1/output.json"), - output_results = "output/results.json", - output_metric_execution_info = "output/metric_execution_info.json" + # inputs + input_scores = "resources_test/openproblems/task_results_v3/raw/score_uns.yaml", + input_execution = "resources_test/openproblems/task_results_v3/raw/trace.txt", + input_dataset_info = "resources_test/openproblems/task_results_v3/processed/dataset_info.json", + input_method_info = "resources_test/openproblems/task_results_v3/processed/method_info.json", + input_metric_info = "resources_test/openproblems/task_results_v3/processed/metric_info.json", + # outputs + output_results = "resources_test/openproblems/task_results_v3/processed/results.json", + output_metric_execution_info = "resources_test/openproblems/task_results_v3/processed/metric_execution_info.json" ) ## VIASH END @@ -74,7 +74,7 @@ raw_scores <- x[c("dataset_id", "method_id", "metric_ids", "metric_values")] )) }, error = function(e) { - message("Encountered error while reading scores: ", e$message) + message("Encountered error while reading scores.\n Error: ", e$message, "\n Data: ", paste(paste0(names(x), "=", x), collapse = ", ")) NULL }) }) diff --git a/src/reporting/get_task_info/config.vsh.yaml b/src/reporting/get_task_info/config.vsh.yaml new file mode 100644 index 0000000000..6c87e0fd5c --- /dev/null +++ b/src/reporting/get_task_info/config.vsh.yaml @@ -0,0 +1,37 @@ +name: get_task_info +namespace: reporting +description: Extract task info +arguments: + - name: --input + type: file + description: A yaml file + required: true + example: resources_test/openproblems/task_results_v3/raw/task_info.yaml + - name: --output + type: file + direction: output + default: output.json + description: Output json + info: + format: + type: json + # TODO: add schema +resources: + - type: r_script + path: script.R +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/openproblems/task_results_v3 + dest: resources_test/openproblems/task_results_v3 +engines: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ purrr, yaml, rlang, processx ] +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/get_task_info/script.R b/src/reporting/get_task_info/script.R similarity index 60% rename from src/common/process_task_results/get_task_info/script.R rename to src/reporting/get_task_info/script.R index 
71f1cb777a..d6096aae87 100644 --- a/src/common/process_task_results/get_task_info/script.R +++ b/src/reporting/get_task_info/script.R @@ -5,8 +5,8 @@ library(rlang, warn.conflicts = FALSE) ## VIASH START par <- list( - input = "output/temp/task_info.yaml", - output = "output/test/task_info.json" + input = "resources_test/openproblems/task_results_v3/raw/task_info.yaml", + output = "resources_test/openproblems/task_results_v3/processed/task_info.json" ) ## VIASH END @@ -14,13 +14,25 @@ info <- yaml::yaml.load_file(par$input) # ↑ this could be used as the new format # construct v1 format +repo <- + if ("name" %in% names(info) && "organization" %in% names(info)) { + paste0(info$organization, "/", info$name) + } else { + "openproblems-bio/openproblems" + } +description <- + if ("motivation" %in% names(info)) { + paste0(info$motivation, "\n\n", info$description) + } else { + info$description + } out <- list( task_id = info$name, commit_sha = NA_character_, task_name = info$label, task_summary = info$summary, - task_description = paste0(info$motivation, "\n\n", info$description), - repo = "openproblems-bio/openproblems", + task_description = description, + repo = repo, authors = info$authors ) diff --git a/src/reporting/process_dataset_metadata/config.vsh.yaml b/src/reporting/process_dataset_metadata/config.vsh.yaml new file mode 100644 index 0000000000..bb8cba8c8d --- /dev/null +++ b/src/reporting/process_dataset_metadata/config.vsh.yaml @@ -0,0 +1,28 @@ +name: process_dataset_metadata +namespace: reporting +description: >- + This workflow transforms the meta information of the datasets into a format + that can be used by the website. +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + direction: input + example: meta.yaml + - name: Outputs + arguments: + - name: "--output" + type: file + required: true + direction: output + default: meta.json +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf +dependencies: + - name: utils/yaml_to_json +runners: + - type: nextflow \ No newline at end of file diff --git a/src/common/process_dataset_metadata/run/main.nf b/src/reporting/process_dataset_metadata/main.nf similarity index 100% rename from src/common/process_dataset_metadata/run/main.nf rename to src/reporting/process_dataset_metadata/main.nf diff --git a/src/common/process_dataset_metadata/run/run.sh b/src/reporting/process_dataset_metadata/run.sh similarity index 94% rename from src/common/process_dataset_metadata/run/run.sh rename to src/reporting/process_dataset_metadata/run.sh index 27ea225ed3..f31e6cc744 100644 --- a/src/common/process_dataset_metadata/run/run.sh +++ b/src/reporting/process_dataset_metadata/run.sh @@ -37,7 +37,7 @@ for LOADER in $(aws s3 ls $DATASET_DIR); do # start the NXF_VER=23.10.0 nextflow run . \ - -main-script target/nextflow/common/process_dataset_metadata/run/main.nf \ + -main-script target/nextflow/reporting/process_dataset_metadata/main.nf \ -profile docker \ -c src/wf_utils/labels_ci.config \ --id "extract_metadata" \ diff --git a/src/reporting/process_task_results/config.vsh.yaml b/src/reporting/process_task_results/config.vsh.yaml new file mode 100644 index 0000000000..60b687615b --- /dev/null +++ b/src/reporting/process_task_results/config.vsh.yaml @@ -0,0 +1,90 @@ +name: process_task_results +namespace: reporting +description: >- + This workflow transforms the meta information of the results into a format + that can be used by the website. 
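The `repo` and `description` fallbacks added above keep older `task_info.yaml` files working: only when the yaml carries `name` plus `organization` (or a `motivation` section) is the richer form used. The same logic rendered in Python for quick reference (hypothetical `info` dict; the component itself is R):

info = {"name": "denoising", "organization": "openproblems-bio",
        "motivation": "Why the task matters.", "description": "What it does."}

# fall back to the monorepo when the yaml predates per-task repositories
repo = (f"{info['organization']}/{info['name']}"
        if "name" in info and "organization" in info
        else "openproblems-bio/openproblems")

# prepend the motivation section when present
description = (f"{info['motivation']}\n\n{info['description']}"
               if "motivation" in info
               else info["description"])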
diff --git a/src/reporting/process_dataset_metadata/config.vsh.yaml b/src/reporting/process_dataset_metadata/config.vsh.yaml
new file mode 100644
index 0000000000..bb8cba8c8d
--- /dev/null
+++ b/src/reporting/process_dataset_metadata/config.vsh.yaml
@@ -0,0 +1,28 @@
+name: process_dataset_metadata
+namespace: reporting
+description: >-
+  This workflow transforms the meta information of the datasets into a format
+  that can be used by the website.
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: "--input"
+        type: file
+        required: true
+        direction: input
+        example: meta.yaml
+  - name: Outputs
+    arguments:
+      - name: "--output"
+        type: file
+        required: true
+        direction: output
+        default: meta.json
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+dependencies:
+  - name: utils/yaml_to_json
+runners:
+  - type: nextflow
\ No newline at end of file
diff --git a/src/common/process_dataset_metadata/run/main.nf b/src/reporting/process_dataset_metadata/main.nf
similarity index 100%
rename from src/common/process_dataset_metadata/run/main.nf
rename to src/reporting/process_dataset_metadata/main.nf
diff --git a/src/common/process_dataset_metadata/run/run.sh b/src/reporting/process_dataset_metadata/run.sh
similarity index 94%
rename from src/common/process_dataset_metadata/run/run.sh
rename to src/reporting/process_dataset_metadata/run.sh
index 27ea225ed3..f31e6cc744 100644
--- a/src/common/process_dataset_metadata/run/run.sh
+++ b/src/reporting/process_dataset_metadata/run.sh
@@ -37,7 +37,7 @@ for LOADER in $(aws s3 ls $DATASET_DIR); do
 
   # start the run
   NXF_VER=23.10.0 nextflow run . \
-    -main-script target/nextflow/common/process_dataset_metadata/run/main.nf \
+    -main-script target/nextflow/reporting/process_dataset_metadata/main.nf \
    -profile docker \
     -c src/wf_utils/labels_ci.config \
     --id "extract_metadata" \
diff --git a/src/reporting/process_task_results/config.vsh.yaml b/src/reporting/process_task_results/config.vsh.yaml
new file mode 100644
index 0000000000..60b687615b
--- /dev/null
+++ b/src/reporting/process_task_results/config.vsh.yaml
@@ -0,0 +1,90 @@
+name: process_task_results
+namespace: reporting
+description: >-
+  This workflow transforms the meta information of the results into a format
+  that can be used by the website.
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: "--input_scores"
+        type: file
+        required: true
+        direction: input
+        description: A yaml file containing the scores of each of the methods
+        example: score_uns.yaml
+      - name: "--input_method_configs"
+        type: file
+        required: true
+        direction: input
+        example: method_configs.yaml
+      - name: "--input_metric_configs"
+        type: file
+        required: true
+        direction: input
+        example: metric_configs.yaml
+      - name: "--input_dataset_info"
+        type: file
+        required: true
+        direction: input
+        example: dataset_info.yaml
+      - name: "--input_execution"
+        type: file
+        required: true
+        direction: input
+        example: trace.txt
+      - name: "--input_task_info"
+        type: file
+        required: true
+        direction: input
+        example: task_info.yaml
+  - name: Outputs
+    arguments:
+      - name: "--output_scores"
+        type: file
+        required: true
+        direction: output
+        description: A json file containing the scores of each of the methods
+        default: results.json
+      - name: "--output_method_info"
+        type: file
+        required: true
+        direction: output
+        default: method_info.json
+      - name: "--output_metric_info"
+        type: file
+        required: true
+        direction: output
+        default: metric_info.json
+      - name: "--output_dataset_info"
+        type: file
+        required: true
+        direction: output
+        default: dataset_info.json
+      - name: "--output_task_info"
+        type: file
+        required: true
+        direction: output
+        default: task_info.json
+      - name: "--output_qc"
+        type: file
+        required: true
+        direction: output
+        default: quality_control.json
+      - name: "--output_metric_execution_info"
+        type: file
+        required: true
+        direction: output
+        default: metric_execution_info.json
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+dependencies:
+  - name: reporting/get_results
+  - name: reporting/get_method_info
+  - name: reporting/get_metric_info
+  - name: reporting/get_dataset_info
+  - name: reporting/get_task_info
+  - name: reporting/generate_qc
+runners:
+  - type: nextflow
\ No newline at end of file
diff --git a/src/common/process_task_results/run/main.nf b/src/reporting/process_task_results/main.nf
similarity index 87%
rename from src/common/process_task_results/run/main.nf
rename to src/reporting/process_task_results/main.nf
index dadbcfa1f6..a5154770e2 100644
--- a/src/common/process_task_results/run/main.nf
+++ b/src/reporting/process_task_results/main.nf
@@ -1,10 +1,3 @@
-// workflow auto {
-//   findStates(params, meta.config)
-//     | meta.workflow.run(
-//       auto: [publish: "state"]
-//     )
-// }
-
 workflow run_wf {
   take:
   input_ch
@@ -12,9 +5,8 @@ workflow run_wf {
   main:
   output_ch = input_ch
 
-    | get_task_info.run(
-      key: "task_info",
+    | get_task_info.run(
       fromState: [
        "input": "input_task_info"
      ],
      toState: ["output_task": "output"]
@@ -29,7 +21,6 @@
     | get_method_info.run(
       fromState: [
         "input": "input_method_configs",
-        "task_id" : "task_id"
       ],
       toState: ["output_method": "output"]
     )
@@ -37,14 +28,12 @@
     | get_metric_info.run(
       fromState: [
         "input": "input_metric_configs",
-        "task_id" : "task_id"
       ],
       toState: ["output_metric": "output"]
     )
 
     | get_dataset_info.run(
       fromState: [
-        "task_id" : "task_id",
         "input": "input_dataset_info",
       ],
       toState: ["output_dataset": "output"]
@@ -52,7 +41,6 @@
     | get_results.run(
       fromState: [
-        "task_id": "task_id",
         "input_scores": "input_scores",
         "input_execution": "input_execution",
         "input_dataset_info": "output_dataset",
diff --git a/src/common/process_task_results/run/run_nf_tower_test.sh b/src/reporting/process_task_results/run_nf_tower_test.sh
similarity index 93%
rename from src/common/process_task_results/run/run_nf_tower_test.sh
rename to src/reporting/process_task_results/run_nf_tower_test.sh
index ca74e357a1..4357edf314 100644
--- a/src/common/process_task_results/run/run_nf_tower_test.sh
+++ b/src/reporting/process_task_results/run_nf_tower_test.sh
@@ -31,7 +31,7 @@ HERE
 tw launch https://github.com/openproblems-bio/openproblems.git \
   --revision main_build \
   --pull-latest \
-  --main-script target/nextflow/common/workflows/transform_meta/main.nf \
+  --main-script target/nextflow/reporting/process_task_results/main.nf \
   --workspace 53907369739130 \
   --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
   --params-file /tmp/params.yaml \
diff --git a/src/common/process_task_results/run/run_test.sh b/src/reporting/process_task_results/run_test.sh
similarity index 94%
rename from src/common/process_task_results/run/run_test.sh
rename to src/reporting/process_task_results/run_test.sh
index 762785b754..58172e811f 100755
--- a/src/common/process_task_results/run/run_test.sh
+++ b/src/reporting/process_task_results/run_test.sh
@@ -24,7 +24,7 @@ for TASK in "denoising" "dimensionality_reduction" "batch_integration" "label_projection"; do
 
   # start the run
   NXF_VER=23.10.0 nextflow run . \
-    -main-script target/nextflow/common/process_task_results/run/main.nf \
+    -main-script target/nextflow/reporting/process_task_results/main.nf \
     -profile docker \
     -resume \
     -c src/wf_utils/labels_ci.config \
diff --git a/src/utils/decompress_gzip/config.vsh.yaml b/src/utils/decompress_gzip/config.vsh.yaml
new file mode 100644
index 0000000000..b96d6f604f
--- /dev/null
+++ b/src/utils/decompress_gzip/config.vsh.yaml
@@ -0,0 +1,26 @@
+name: decompress_gzip
+namespace: utils
+arguments:
+  - name: --input
+    type: file
+    description: Input file
+    example: /path/to/file.gz
+  - name: --output
+    type: file
+    description: Output file
+    example: /path/to/file
+    direction: output
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+engines:
+  - type: docker
+    image: ubuntu:latest
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, lowmem, lowcpu]
diff --git a/src/common/decompress_gzip/script.sh b/src/utils/decompress_gzip/script.sh
similarity index 100%
rename from src/common/decompress_gzip/script.sh
rename to src/utils/decompress_gzip/script.sh
diff --git a/src/common/decompress_gzip/test.sh b/src/utils/decompress_gzip/test.sh
similarity index 100%
rename from src/common/decompress_gzip/test.sh
rename to src/utils/decompress_gzip/test.sh
diff --git a/src/utils/extract_uns_metadata/config.vsh.yaml b/src/utils/extract_uns_metadata/config.vsh.yaml
new file mode 100644
index 0000000000..e415b25eac
--- /dev/null
+++ b/src/utils/extract_uns_metadata/config.vsh.yaml
@@ -0,0 +1,45 @@
+name: extract_uns_metadata
+namespace: utils
+description: Extract .uns metadata from an h5ad file and write it to a yaml file.
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --input
+        type: file
+        required: true
+        description: A h5ad file.
+      - name: --schema
+        type: file
+        required: false
+        description: An optional schema with which to annotate the output
+      - name: --uns_length_cutoff
+        type: integer
+        required: false
+        description: The maximum length of the .uns metadata to extract. If a value in uns is a list or a dictionary with more elements than the provided cutoff, it will not be extracted.
+        default: 10
+  - name: Output
+    arguments:
+      - name: --output
+        type: file
+        required: true
+        description: A yaml file containing the metadata.
+        example: output_meta.yaml
+        direction: output
+resources:
+  - type: python_script
+    path: script.py
+test_resources:
+  - path: /resources_test/common/pancreas
+  - type: python_script
+    path: test.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    test_setup:
+      - type: python
+        packages: viashpy
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, midmem, midcpu]
diff --git a/src/common/extract_metadata/script.py b/src/utils/extract_uns_metadata/script.py
similarity index 88%
rename from src/common/extract_metadata/script.py
rename to src/utils/extract_uns_metadata/script.py
index 7a55b50e21..5d759b60a6 100644
--- a/src/common/extract_metadata/script.py
+++ b/src/utils/extract_uns_metadata/script.py
@@ -21,6 +21,14 @@
 print("Load schema", flush=True)
 with open(par["schema"], "r") as f:
     schema = yaml.safe_load(f)
+
+    schema_info = schema.get("info") or {}
+    assert schema_info, "Schema must contain an 'info' field"
+
+    schema_info_format = schema_info.get("format") or {}
+    assert schema_info_format, "Schema must contain a '.info.format' field"
+
+    assert schema_info_format.get("type") == "h5ad", ".info.format.type must be 'h5ad'"
 else:
     schema = None
@@ -114,7 +122,8 @@ def get_structure_dtype(obj) -> str:
 def get_structure_schema_info(struct, key) -> dict:
     if schema is None:
         return {}
-    struct_args = schema.get("info", {}).get("slots", {}).get(struct, {})
+
+    struct_args = schema_info_format.get(struct, {})
     if struct_args is None:
         return {}
     if struct == "X":
@@ -149,10 +158,15 @@ def get_structure(adata, struct):
     # see if the schema has information about this struct
     schema_info = get_structure_schema_info(struct, key)
-    if schema_info.get("description"):
-        out["description"] = schema_info.get("description")
-    if schema_info.get("type"):
-        out["schema_type"] = schema_info.get("type")
+    copy = {
+        "description": "description",
+        "summary": "summary",
+        "label": "label",
+        "schema_type": "type"
+    }
+    for k, v in copy.items():
+        if schema_info.get(v):
+            out[k] = schema_info.get(v)
 
     output.append(out)
 
@@ -176,16 +190,15 @@ def get_file_creation_time(path: str) -> str:
     creation_time = creation_time.strftime('%d-%m-%Y')
     return str(creation_time)
 
-
 print("Extract metadata from object", flush=True)
 # Extract metadata about the adata object
 uns = {}
 for key, val in adata.uns.items():
     if is_atomic(val):
         uns[key] = to_atomic(val)
-    elif is_list_of_atomics(val) and len(val) <= 10:
+    elif is_list_of_atomics(val) and len(val) <= par["uns_length_cutoff"]:
         uns[key] = to_list_of_atomics(val)
-    elif is_dict_of_atomics(val) and len(val) <= 10:
+    elif is_dict_of_atomics(val) and len(val) <= par["uns_length_cutoff"]:
         uns[key] = to_dict_of_atomics(val)
 
 uns["file_size"] = get_file_size(par["input"])
diff --git a/src/utils/extract_uns_metadata/test.py b/src/utils/extract_uns_metadata/test.py
new file mode 100644
index 0000000000..1884bbd47a
--- /dev/null
+++ b/src/utils/extract_uns_metadata/test.py
@@ -0,0 +1,57 @@
+import sys
+import pytest
+import yaml
+
+## VIASH START
+## VIASH END
+
+input_path = meta["resources_dir"] + "/pancreas/dataset.h5ad"
+
+@pytest.fixture
+def file_raw(tmp_path):
+    file_raw_content = {
+        "type": "file",
+        "label": "Raw dataset",
+        "summary": "An unprocessed dataset as output by a dataset loader.",
+        "description": "This dataset contains raw counts and metadata as output by a dataset loader.",
+        "info": {
+            "format": {
+                "type": "h5ad",
+                "layers": [
+                    {
+                        "type": "integer",
+                        "name": "counts",
+                        "description": "Raw counts",
+                        "required": True
+                    }
+                ],
+                "obs": [
+                    {
+                        "type": "string",
+                        "name": "celltype",
+                        "description": "Classification of the cell type based on its characteristics and function within the tissue or organism.",
+                        "required": True
+                    }
+                ]
+            }
+        }
+    }
+    file_raw_path = tmp_path / "file_raw.yaml"
+    with open(file_raw_path, "w") as f:
+        f.write(yaml.dump(file_raw_content))
+
+    return file_raw_path
+
+def test_run(run_component, file_raw, tmp_path):
+    output_path = tmp_path / "meta.yaml"
+
+    run_component([
+        "--input", input_path,
+        "--schema", str(file_raw),
+        "--output", str(output_path),
+    ])
+
+    assert output_path.exists(), "Output path does not exist"
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))
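The `uns_length_cutoff` parameter introduced above bounds which `.uns` entries survive extraction: atomic values always pass, while lists and dicts of atomics pass only up to the cutoff. A standalone illustration of that filtering rule (toy data; the real script additionally coerces values with its `to_atomic`-style helpers):

uns = {
    "organism": "human",            # atomic: always kept
    "batches": ["b1", "b2", "b3"],  # short list: kept
    "gene_ids": list(range(500)),   # longer than the cutoff: dropped
}
cutoff = 10
kept = {
    key: val for key, val in uns.items()
    if not isinstance(val, (list, dict)) or len(val) <= cutoff
}
assert sorted(kept) == ["batches", "organism"]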
"info": { + "format": { + "type": "h5ad", + "layers": [ + { + "type": "integer", + "name": "counts", + "description": "Raw counts", + "required": True + } + ], + "obs": [ + { + "type": "string", + "name": "celltype", + "description": "Classification of the cell type based on its characteristics and function within the tissue or organism.", + "required": True + } + ] + } + } + } + file_raw_path = tmp_path / "file_raw.yaml" + with open(file_raw_path, "w") as f: + f.write(yaml.dump(file_raw_content)) + + return file_raw_path + +def test_run(run_component, file_raw, tmp_path): + output_path = tmp_path / "meta.yaml" + + run_component([ + "--input", input_path, + "--schema", str(file_raw), + "--output", str(output_path), + ]) + + assert output_path.exists(), "Output path does not exist" + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/utils/yaml_to_json/config.vsh.yaml b/src/utils/yaml_to_json/config.vsh.yaml new file mode 100644 index 0000000000..6b09ed9e33 --- /dev/null +++ b/src/utils/yaml_to_json/config.vsh.yaml @@ -0,0 +1,24 @@ +name: yaml_to_json +namespace: utils +summary: Convert a YAML file to a JSON file +description: | + This script converts a YAML file to a JSON file. +arguments: + - type: file + name: --input + required: true + description: A YAML file. + - type: file + name: --output + required: true + description: A JSON file. + direction: output +resources: + - type: python_script + path: script.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow diff --git a/src/common/process_task_results/yaml_to_json/script.py b/src/utils/yaml_to_json/script.py similarity index 100% rename from src/common/process_task_results/yaml_to_json/script.py rename to src/utils/yaml_to_json/script.py diff --git a/src/validation/check_dataset_with_schema/config.vsh.yaml b/src/validation/check_dataset_with_schema/config.vsh.yaml new file mode 100644 index 0000000000..e071554bf0 --- /dev/null +++ b/src/validation/check_dataset_with_schema/config.vsh.yaml @@ -0,0 +1,47 @@ +name: check_dataset_with_schema +namespace: validation +summary: Check the format of a file against a schema +description: Checks if the file has the necessary data structures as defined in a schema. +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + required: true + description: An input file. Can be an .h5ad, .parquet, .csv, or .tsv file. + - name: --schema + type: file + required: true + description: A schema file for the input object. + - name: Arguments + arguments: + - name: --stop_on_error + type: boolean + default: false + description: Whether or not to stop with exit code 1 if the input file does not adhere to the schema. + - name: Output + arguments: + - name: --output + type: file + required: true + description: If specified, this file will contain a structured log of which checks succeeded (or not). 
+ example: checks.json + direction: output +resources: + - type: python_script + path: script.py +test_resources: + - path: /resources_test/common/pancreas + - type: python_script + path: test.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + test_setup: + - type: python + packages: viashpy +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/validation/check_dataset_with_schema/script.py b/src/validation/check_dataset_with_schema/script.py new file mode 100644 index 0000000000..1768287fa9 --- /dev/null +++ b/src/validation/check_dataset_with_schema/script.py @@ -0,0 +1,93 @@ +import anndata as ad +import pandas as pd +import yaml +import json + +## VIASH START +par = { + 'input': 'work/d4/f4fabc8aa4f2308841d4ab57bcff62/_viash_par/input_1/dataset.h5ad', + 'schema': 'work/d4/f4fabc8aa4f2308841d4ab57bcff62/_viash_par/schema_1/schema.yaml', + 'stop_on_error': False, + 'output': 'work/d4/f4fabc8aa4f2308841d4ab57bcff62/out.yaml', +} +## VIASH END + +# TODO: need to refactor to reuse the same helper functions as in 'run_and_check_output.py'. + +def check_h5ad_struct(struc, struc_fields, adata_slot): + missing = [] + if struc == "X": + struc_fields["name"] = "X" + struc_fields = [struc_fields] + for obj in struc_fields: + adata_data = adata_slot.get(obj['name']) if struc != 'X' else adata_slot + if obj.get('required') and adata_data is None: + missing.append(obj['name']) + # todo: check types + return missing + +def check_df_columns(df, columns): + missing = [] + for col in columns: + if col not in df.columns: + missing.append(col) + return missing + +print("Load schema", flush=True) +with open(par["schema"], "r") as f: + schema = yaml.safe_load(f) + +schema_info = schema.get("info") +assert schema_info, "Schema must contain an 'info' field" + +schema_info_format = schema_info.get("format") +assert schema_info_format, "Schema must contain a '.info.format' field" + +format_type = schema_info_format.get("type") +assert format_type == "h5ad", ".info.format.type must be 'h5ad'" + +# create output data structure +out = { + "exit_code": 0, + "error": {}, + "data_schema": "ok" +} + +print('Load data', flush=True) +if format_type == "h5ad": + data = ad.read_h5ad(par['input']) +elif format_type == "csv": + data = pd.read_csv(par['input']) +elif format_type == "tsv": + data = pd.read_csv(par['input'], sep="\t") +elif format_type == "parquet": + data = pd.read_parquet(par['input']) +else: + raise ValueError(f"Unknown .info.format.type '{format_type}'") + +out = { + "exit_code": 0, + "error": {}, + "data_schema": "ok" +} +print("Check file against schema", flush=True) +if format_type == "h5ad": + for struc, struc_fields in schema_info_format.items(): + if struc == "type": + continue + print("Checking slot", struc, flush=True) + missing = check_h5ad_struct(struc, struc_fields, getattr(data, struc)) + if missing: + print(f"Dataset is missing {struc} {missing}", flush=True) + out['exit_code'] = 1 + out['data_schema'] = 'not ok' + out['error'][struc] = missing +elif format_type in ["csv", "tsv", "parquet"]: + columns = schema_info_format.get("columns") or [] + missing = check_df_columns(data, columns) + +with open(par["output"], "w") as f: + json.dump(out, f, indent=2) + +if par['stop_on_error']: + exit(out['exit_code']) diff --git a/src/common/check_dataset_schema/test.py b/src/validation/check_dataset_with_schema/test.py similarity index 95% rename from src/common/check_dataset_schema/test.py rename to 
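The tabular branch relies on `check_df_columns`, which only verifies column presence (type checks are still a TODO in the component). A self-contained sketch of that check on hypothetical data:

import pandas as pd

def check_df_columns(df, columns):
    # report required columns that the data frame lacks
    return [col for col in columns if col not in df.columns]

df = pd.DataFrame({"dataset_id": ["d1"], "value": [0.9]})
print(check_df_columns(df, ["dataset_id", "method_id", "value"]))
# -> ['method_id']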
diff --git a/src/common/check_dataset_schema/test.py b/src/validation/check_dataset_with_schema/test.py
similarity index 95%
rename from src/common/check_dataset_schema/test.py
rename to src/validation/check_dataset_with_schema/test.py
index 1e7b5eb1e9..384f9d149d 100644
--- a/src/common/check_dataset_schema/test.py
+++ b/src/validation/check_dataset_with_schema/test.py
@@ -16,9 +16,10 @@ def schema(tmp_path):
 type: file
 description: "A preprocessed dataset"
 example: "preprocessed.h5ad"
+label: "Preprocessed dataset"
 info:
-  label: "Preprocessed dataset"
-  slots:
+  format:
+    type: h5ad
     layers:
       - type: integer
         name: counts
@@ -39,9 +40,10 @@ def error_schema(tmp_path):
 type: file
 description: "A preprocessed dataset"
 example: "preprocessed.h5ad"
+label: "Preprocessed dataset"
 info:
-  label: "Preprocessed dataset"
-  slots:
+  format:
+    type: h5ad
     X:
       type: double
       description: Normalized expression values
diff --git a/src/validation/check_yaml_with_schema/config.vsh.yaml b/src/validation/check_yaml_with_schema/config.vsh.yaml
new file mode 100644
index 0000000000..e7f9112c46
--- /dev/null
+++ b/src/validation/check_yaml_with_schema/config.vsh.yaml
@@ -0,0 +1,28 @@
+name: check_yaml_with_schema
+namespace: validation
+summary: Check the format of a YAML file against a schema
+description: Checks if the YAML file has the necessary data structures as defined in a schema.
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --input
+        type: file
+        required: true
+        description: A yaml file.
+      - name: --schema
+        type: file
+        required: true
+        description: A schema file for the yaml file.
+resources:
+  - type: python_script
+    path: script.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    setup:
+      - type: python
+        pypi:
+          - jsonschema
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/common/check_yaml_schema/script.py b/src/validation/check_yaml_with_schema/script.py
similarity index 97%
rename from src/common/check_yaml_schema/script.py
rename to src/validation/check_yaml_with_schema/script.py
index 2058832bb2..ee0e101611 100644
--- a/src/common/check_yaml_schema/script.py
+++ b/src/validation/check_yaml_with_schema/script.py
@@ -7,9 +7,6 @@
   'input': 'src/tasks/batch_integration/methods/bbknn/config.vsh.yaml',
   'schema': 'src/common/api/schema_task_method.yaml'
 }
-meta = {
-  'functionality_name': 'foo',
-}
 ## VIASH END
 
 def yaml_to_dict(file_path):
diff --git a/src/wf_utils/ProfilesHelper.config b/src/wf_utils/ProfilesHelper.config
deleted file mode 100644
index 35442065c6..0000000000
--- a/src/wf_utils/ProfilesHelper.config
+++ /dev/null
@@ -1,64 +0,0 @@
-process.container = 'nextflow/bash:latest'
-
-// detect tempdir
-tempDir = java.nio.file.Paths.get(
-  System.getenv('NXF_TEMP') ?:
-    System.getenv('VIASH_TEMP') ?:
-    System.getenv('TEMPDIR') ?:
-    System.getenv('TMPDIR') ?:
-    '/tmp'
-).toAbsolutePath()
-
-profiles {
-  no_publish {
-    process {
-      withName: '.*' {
-        publishDir = [
-          enabled: false
-        ]
-      }
-    }
-  }
-  mount_temp {
-    docker.temp = tempDir
-    podman.temp = tempDir
-    charliecloud.temp = tempDir
-  }
-  docker {
-    docker.enabled = true
-    // docker.userEmulation = true
-    singularity.enabled = false
-    podman.enabled = false
-    shifter.enabled = false
-    charliecloud.enabled = false
-  }
-  singularity {
-    singularity.enabled = true
-    singularity.autoMounts = true
-    docker.enabled = false
-    podman.enabled = false
-    shifter.enabled = false
-    charliecloud.enabled = false
-  }
-  podman {
-    podman.enabled = true
-    docker.enabled = false
-    singularity.enabled = false
-    shifter.enabled = false
-    charliecloud.enabled = false
-  }
-  shifter {
-    shifter.enabled = true
-    docker.enabled = false
-    singularity.enabled = false
-    podman.enabled = false
-    charliecloud.enabled = false
-  }
-  charliecloud {
-    charliecloud.enabled = true
-    docker.enabled = false
-    singularity.enabled = false
-    podman.enabled = false
-    shifter.enabled = false
-  }
-}
diff --git a/src/wf_utils/helper.nf b/src/wf_utils/helper.nf
deleted file mode 100644
index 7b3acd5b1c..0000000000
--- a/src/wf_utils/helper.nf
+++ /dev/null
@@ -1,14 +0,0 @@
-Map findArgumentSchema(Map config, String argument_id) {
-  def argument_groups =
-    (config.functionality.argument_groups ?: []) +
-    [
-      arguments: config.functionality.arguments ?: []
-    ]
-
-  def schema_value = argument_groups.findResult{ gr ->
-    gr.arguments.find { arg ->
-      arg.name == ("--" + argument_id)
-    }
-  }
-  return schema_value
-}
diff --git a/src/wf_utils/labels.config b/src/wf_utils/labels.config
deleted file mode 100644
index 9a29d57c48..0000000000
--- a/src/wf_utils/labels.config
+++ /dev/null
@@ -1,11 +0,0 @@
-process {
-  withLabel: lowmem { memory = 20.Gb }
-  withLabel: lowcpu { cpus = 5 }
-  withLabel: midmem { memory = 50.Gb }
-  withLabel: midcpu { cpus = 15 }
-  withLabel: highmem { memory = 100.Gb }
-  withLabel: highcpu { cpus = 30 }
-  withLabel: lowtime { time = 1.h }
-  withLabel: midtime { time = 4.h }
-  withLabel: hightime { time = 8.h }
-}
diff --git a/src/wf_utils/labels_ci.config b/src/wf_utils/labels_ci.config
deleted file mode 100644
index 5161976609..0000000000
--- a/src/wf_utils/labels_ci.config
+++ /dev/null
@@ -1,11 +0,0 @@
-process {
-  withLabel: lowmem { memory = 5.Gb }
-  withLabel: lowcpu { cpus = 2 }
-  withLabel: midmem { memory = 5.Gb }
-  withLabel: midcpu { cpus = 2 }
-  withLabel: highmem { memory = 5.Gb }
-  withLabel: highcpu { cpus = 2 }
-  withLabel: lowtime { time = 1.h }
-  withLabel: midtime { time = 4.h }
-  withLabel: hightime { time = 8.h }
-}
diff --git a/src/wf_utils/labels_tw.config b/src/wf_utils/labels_tw.config
deleted file mode 100644
index 93a076367b..0000000000
--- a/src/wf_utils/labels_tw.config
+++ /dev/null
@@ -1,76 +0,0 @@
-process {
-  executor = 'awsbatch'
-
-  // Default disk space
-  disk = 50.GB
-
-  // Retry for exit codes that have something to do with memory issues
-  errorStrategy = { task.attempt < 3 && task.exitStatus in (137) ? 'retry' : 'ignore' }
-  maxRetries = 3
-  maxMemory = null
-
-  // Resource labels
-  withLabel: lowcpu { cpus = 5 }
-  withLabel: midcpu { cpus = 15 }
-  withLabel: highcpu { cpus = 30 }
-  withLabel: lowmem {
-    memory = { get_memory( 20.GB * task.attempt ) }
-    disk = { 50.GB * task.attempt }
-  }
-  withLabel: midmem {
-    memory = { get_memory( 50.GB * task.attempt ) }
-    disk = { 100.GB * task.attempt }
-  }
-  withLabel: highmem {
-    memory = { get_memory( 100.GB * task.attempt ) }
-    disk = { 200.GB * task.attempt }
-  }
-  withLabel: lowsharedmem {
-    containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.05)}" : ""}
-  }
-  withLabel: midsharedmem {
-    containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.1)}" : ""}
-  }
-  withLabel: highsharedmem {
-    containerOptions = { workflow.containerEngine != 'singularity' ? "--shm-size ${String.format("%.0f",task.memory.mega * 0.25)}" : ""}
-  }
-  withLabel: gpu {
-    accelerator = 1
-    containerOptions = { workflow.containerEngine == "singularity" ? '--nv':
-      ( workflow.containerEngine == "docker" ? '--gpus all': null ) }
-  }
-
-  // make sure publishstates gets enough disk space and memory
-  withName:'.*publishStatesProc' {
-    memory = '16GB'
-    disk = '100GB'
-  }
-}
-
-def get_memory(to_compare) {
-  if (!process.containsKey("maxMemory") || !process.maxMemory) {
-    return to_compare
-  }
-
-  try {
-    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
-      return process.maxMemory
-    }
-    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) {
-      return max_memory as nextflow.util.MemoryUnit
-    }
-    else {
-      return to_compare
-    }
-  } catch (all) {
-    println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
-    System.exit(1)
-  }
-}
-
-// set tracing file
-trace {
-  enabled = true
-  overwrite = true
-  file = "${params.publish_dir}/trace.txt"
-}