diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000000..9a8a64b4d2 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,24 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: [bug] +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000..a49eab2f6b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: true \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000000..c17d3c0dfb --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: [enhancement] +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. 
diff --git a/.github/ISSUE_TEMPLATE/new_task.yml b/.github/ISSUE_TEMPLATE/new_task.yml new file mode 100644 index 0000000000..1b275f4227 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/new_task.yml @@ -0,0 +1,31 @@ +name: New task +description: Start creating a new benchmarking task in OpenProblems +labels: [task] +body: + - type: markdown + attributes: + value: Thanks for choosing OpenProblems. Please check the [OpenProblems tasks](https://github.com/openproblems-bio/openproblems-v2/issues?q=label%3Atask+) to see whether a similar task has already been created. If you haven't already, please review the documentation on [how to create a new task](https://openproblems.bio/documentation/create_task/). + - type: textarea + attributes: + label: Task motivation + description: Explain the motivation behind your proposed task. Describe the biological or computational problem you aim to address and why it’s important. Discuss the current state of research in this area and any gaps or challenges that your task could help address. This section should convince readers of the significance and relevance of your task. + - type: textarea + attributes: + label: Task description + description: Provide a clear and concise description of your task, detailing the specific problem it aims to solve. Outline the input data types, the expected output, and any assumptions or constraints. Be sure to explain any terminology or concepts that are essential for understanding the task. + - type: textarea + attributes: + label: Proposed ground-truth in datasets + description: Describe the datasets you plan to use for your task. OpenProblems offers a standard set of datasets (See [“Common datasets”](https://openproblems.bio/documentation/reference/openproblems-v2/src-datasets.html)) which you can peruse through. Explain how these datasets will provide the ground-truth for evaluating the methods implemented in your task. 
If possible, include references or links to the datasets to facilitate reproducibility. + - type: textarea + attributes: + label: Initial set of methods to implement + description: List the initial set of methods you plan to implement for your task. Briefly describe each method’s core ideas and algorithms, and explain why you think they are suitable for your task. Consider including both established and cutting-edge methods to provide a comprehensive benchmarking of the state-of-the-art. + - type: textarea + attributes: + label: Proposed control methods + description: Outline the control methods you propose for your task. These methods serve as a starting point to test the relative accuracy of new methods in the task and as quality control for the defined metrics. Include both positive controls, which are methods with known outcomes resulting in the best possible metric values, and negative controls, which are simple, naive, or random methods that do not rely on sophisticated techniques or domain knowledge. Explain the rationale for your chosen controls. + - type: textarea + attributes: + label: Proposed Metrics + description: Describe the metrics you propose for evaluating the performance of methods in your task. Explain the rationale for selecting these metrics and how they will accurately assess the methods’ success in addressing the task’s challenges. Consider including multiple metrics to capture different aspects of method performance. \ No newline at end of file diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000..c48e62a4ac --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,24 @@ +## Describe your changes + +## Issue ticket number and link +Closes #xxxx (Replace xxxx with the GitHub issue number) + +## Checklist before requesting a review +- [ ] I have performed a self-review of my code + +- Check the correct box. 
Does this PR contain: + - [ ] Breaking changes + - [ ] New functionality + - [ ] Major changes + - [ ] Minor changes + - [ ] Bug fixes + +- [ ] Proposed changes are described in the CHANGELOG.md + +- [ ] CI Tests succeed and look good! + +## Requirements after merging + +- [ ] Need to regenerate `common/` resources + +- [ ] Need to regenerate task-specific resources. Specify: \ No newline at end of file diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000..909637159a --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" \ No newline at end of file diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml new file mode 100644 index 0000000000..b8e85c22e2 --- /dev/null +++ b/.github/workflows/integration-test.yml @@ -0,0 +1,163 @@ +name: integration test + +on: workflow_dispatch + +jobs: + # phase 1 + list: + env: + s3_bucket: s3://openproblems-data/resources_test/ + runs-on: ubuntu-latest + + outputs: + component_matrix: ${{ steps.set_matrix.outputs.components }} + workflow_matrix: ${{ steps.set_matrix.outputs.workflows }} + cache_key: ${{ steps.cache.outputs.cache_key }} + + steps: + - uses: actions/checkout@v4 + + - uses: viash-io/viash-actions/setup@v5 + + - uses: viash-io/viash-actions/project/sync-and-cache-s3@v5 + id: cache + with: + s3_bucket: $s3_bucket + dest_path: resources_test + cache_key_prefix: resources_test__ + + - name: Remove target folder from .gitignore + run: | + # allow publishing the target folder + sed -i 's#^/target/$##g' .gitignore + + - uses: viash-io/viash-actions/ns-build@v5 + with: + config_mod: .functionality.version := 'integration_build' + parallel: true + + - name: Deploy to target branch + uses: peaceiris/actions-gh-pages@v4 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: . 
+ publish_branch: integration_build + exclude_assets: '' + + - id: ns_list + uses: viash-io/viash-actions/ns-list@v5 + with: + platform: docker + src: src + format: json + + - id: set_matrix + run: | + echo "components=$(jq -c '[ .[] | + { + "name": (.functionality.namespace + (.platforms | map(select(.type == "docker"))[0].namespace_separator) + .functionality.name), + "config": .info.config, + "dir": .info.config | capture("^(?<dir>.*\/)").dir + } + ]' ${{ steps.ns_list.outputs.output_file }} )" >> $GITHUB_OUTPUT + + echo "workflows=$(jq -c '[ .[] | . as $config | (.functionality.test_resources // [])[] | select(.type == "nextflow_script", .entrypoint) | + { + "name": ($config.functionality.namespace + "/" + $config.functionality.name), + "main_script": (($config.info.config | capture("^(?<dir>.*\/)").dir) + "/" + .path), + "entry": .entrypoint, + "config": $config.info.config + } + ] | unique' ${{ steps.ns_list.outputs.output_file }} )" >> $GITHUB_OUTPUT + + # phase 2 + build: + needs: list + + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + component: ${{ fromJson(needs.list.outputs.component_matrix) }} + + steps: + # Remove unnecessary files to free up space. Otherwise, we get 'no space left on device.' 
+ - uses: data-intuitive/reclaim-the-bytes@v2 + + - uses: actions/checkout@v4 + + - uses: viash-io/viash-actions/setup@v5 + + - name: Build container + uses: viash-io/viash-actions/ns-build@v5 + with: + config_mod: .functionality.version := 'integration_build' + setup: build + src: ${{ matrix.component.dir }} + + - name: Login to container registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ secrets.GTHB_USER }} + password: ${{ secrets.GTHB_PAT }} + + - name: Push container + uses: viash-io/viash-actions/ns-build@v5 + with: + config_mod: .functionality.version := 'integration_build' + platform: docker + src: ${{ matrix.component.dir }} + setup: push + + ################################### + # phase 3 + integration_test: + needs: [ build, list ] + if: "${{ needs.list.outputs.workflow_matrix != '[]' }}" + + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + component: ${{ fromJson(needs.list.outputs.workflow_matrix) }} + + steps: + # Remove unnecessary files to free up space. Otherwise, we get 'no space left on device.' + - uses: data-intuitive/reclaim-the-bytes@v2 + + - uses: actions/checkout@v4 + + - uses: viash-io/viash-actions/setup@v5 + + - uses: nf-core/setup-nextflow@v2.0.0 + + # build target dir + # use containers from integration_build branch, hopefully these are available + - name: Build target dir + uses: viash-io/viash-actions/ns-build@v5 + with: + config_mod: ".functionality.version := 'integration_build'" + parallel: true + + # use cache + - name: Cache resources data + uses: actions/cache@v4 + timeout-minutes: 5 + with: + path: resources_test + key: ${{ needs.list.outputs.cache_key }} + fail-on-cache-miss: true + + - name: Run integration test + timeout-minutes: 45 + run: | + # todo: replace with viash test command + export NXF_VER=22.04.5 + nextflow run . 
\ + -main-script "${{ matrix.component.main_script }}" \ + -entry "${{ matrix.component.entry }}" \ + -profile docker,mount_temp,no_publish \ + -c workflows/utils/labels_ci.config diff --git a/.github/workflows/main-build.yml b/.github/workflows/main-build.yml new file mode 100644 index 0000000000..efdf563065 --- /dev/null +++ b/.github/workflows/main-build.yml @@ -0,0 +1,110 @@ +name: main build + +on: + push: + branches: [ 'main' ] + +jobs: + # phase 1 + list: + runs-on: ubuntu-latest + + outputs: + component_matrix: ${{ steps.set_matrix.outputs.matrix }} + cache_key: ${{ steps.cache.outputs.cache_key }} + + steps: + - uses: actions/checkout@v4 + + - uses: viash-io/viash-actions/setup@v5 + + - name: Remove target folder from .gitignore + run: | + # allow publishing the target folder + sed -i 's#^/target/$##g' .gitignore + + - uses: viash-io/viash-actions/ns-build@v5 + with: + config_mod: .functionality.version := 'main_build' + parallel: true + + # - name: Build nextflow schemas + # uses: viash-io/viash-actions/pro/build-nextflow-schemas@v4 + # with: + # workflows: src + # components: src + # viash_pro_token: ${{ secrets.GTHB_PAT }} + # tools_version: 'main_build' + + # - name: Build parameter files + # uses: viash-io/viash-actions/pro/build-nextflow-params@v4 + # with: + # workflows: src + # components: src + # viash_pro_token: ${{ secrets.GTHB_PAT }} + # tools_version: 'main_build' + + - name: Deploy to target branch + uses: peaceiris/actions-gh-pages@v4 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: . 
+ publish_branch: main_build + + - id: ns_list + uses: viash-io/viash-actions/ns-list@v5 + with: + platform: docker + src: src + format: json + + - id: set_matrix + run: | + echo "matrix=$(jq -c '[ .[] | + { + "name": (.functionality.namespace + "/" + .functionality.name), + "dir": .info.config | capture("^(?<dir>.*\/)").dir + } + ]' ${{ steps.ns_list.outputs.output_file }} )" >> $GITHUB_OUTPUT + + # phase 2 + build: + needs: list + + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + component: ${{ fromJson(needs.list.outputs.component_matrix) }} + + steps: + # Remove unnecessary files to free up space. Otherwise, we get 'no space left on device.' + - uses: data-intuitive/reclaim-the-bytes@v2 + + - uses: actions/checkout@v4 + + - uses: viash-io/viash-actions/setup@v5 + + - name: Build container + uses: viash-io/viash-actions/ns-build@v5 + with: + config_mod: .functionality.version := 'main_build' + platform: docker + src: ${{ matrix.component.dir }} + setup: build + + - name: Login to container registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ secrets.GTHB_USER }} + password: ${{ secrets.GTHB_PAT }} + + - name: Push container + uses: viash-io/viash-actions/ns-build@v5 + with: + config_mod: .functionality.version := 'main_build' + platform: docker + src: ${{ matrix.component.dir }} + setup: push \ No newline at end of file diff --git a/.github/workflows/release-build.yml b/.github/workflows/release-build.yml new file mode 100644 index 0000000000..4d4b18c2f8 --- /dev/null +++ b/.github/workflows/release-build.yml @@ -0,0 +1,223 @@ +name: release build + +on: + workflow_dispatch: + inputs: + version_tag: + description: Version tag + required: true + +jobs: + # phase 1 + list: + env: + s3_bucket: s3://openproblems-data/resources_test/ + runs-on: ubuntu-latest + + outputs: + component_matrix: ${{ steps.set_matrix.outputs.components }} + workflow_matrix: ${{ steps.set_matrix.outputs.workflows }} + cache_key: ${{ 
steps.cache.outputs.cache_key }} + + steps: + - uses: actions/checkout@v4 + + - uses: viash-io/viash-actions/setup@v5 + + - uses: viash-io/viash-actions/project/sync-and-cache-s3@v5 + id: cache + with: + s3_bucket: $s3_bucket + dest_path: resources_test + cache_key_prefix: resources_test__ + + - name: Remove target folder from .gitignore + run: | + # allow publishing the target folder + sed -i 's#^/target/$##g' .gitignore + + - uses: viash-io/viash-actions/ns-build@v5 + with: + config_mod: ".functionality.version := '${{ github.event.inputs.version_tag }}'" + parallel: true + + - name: Build nextflow schemas + uses: viash-io/viash-actions/pro/build-nextflow-schemas@v5 + with: + workflows: src + components: src + viash_pro_token: ${{ secrets.GTHB_PAT }} + tools_version: 'main_build' + + - name: Build parameter files + uses: viash-io/viash-actions/pro/build-nextflow-params@v5 + with: + workflows: src + components: src + viash_pro_token: ${{ secrets.GTHB_PAT }} + tools_version: 'main_build' + + - name: Deploy to target branch + uses: peaceiris/actions-gh-pages@v4 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: . 
+ publish_branch: release + full_commit_message: "Deploy for release ${{ github.event.inputs.version_tag }} from ${{ github.sha }}" + + - id: ns_list_components + uses: viash-io/viash-actions/ns-list@v5 + with: + platform: docker + src: src + format: json + + - id: ns_list_workflows + uses: viash-io/viash-actions/ns-list@v5 + with: + src: workflows + format: json + + - id: set_matrix + run: | + echo "components=$(jq -c '[ .[] | + { + "name": (.functionality.namespace + "/" + .functionality.name), + "config": .info.config, + "dir": .info.config | capture("^(?<dir>.*\/)").dir + } + ]' ${{ steps.ns_list_components.outputs.output_file }} )" >> $GITHUB_OUTPUT + + echo "workflows=$(jq -c '[ .[] | + { + "name": (.functionality.namespace + "/" + .functionality.name), + "main_script": (.info.config | capture("^(?<dir>.*\/)").dir + "/" + .functionality.test_resources[].path), + "entry": .functionality.test_resources[].entrypoint + } + ]' ${{ steps.ns_list_workflows.outputs.output_file }} )" >> $GITHUB_OUTPUT + + # phase 2 + build: + needs: list + + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + component: ${{ fromJson(needs.list.outputs.component_matrix) }} + + steps: + # Remove unnecessary files to free up space. Otherwise, we get 'no space left on device.' 
+ - uses: data-intuitive/reclaim-the-bytes@v2 + + - uses: actions/checkout@v4 + + - uses: viash-io/viash-actions/setup@v5 + + - name: Build container + uses: viash-io/viash-actions/ns-build@v5 + with: + config_mod: .functionality.version := 'main_build' + platform: docker + src: ${{ matrix.component.dir }} + setup: build + + - name: Login to container registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ secrets.GTHB_USER }} + password: ${{ secrets.GTHB_PAT }} + + - name: Push container + uses: viash-io/viash-actions/ns-build@v5 + with: + config_mod: .functionality.version := '${{ github.event.inputs.version_tag }}' + platform: docker + src: ${{ matrix.component.dir }} + setup: push + + ###################################3 + # phase 3 + integration_test: + needs: [ build, list ] + if: "${{ needs.list.outputs.workflow_matrix != '[]' }}" + + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + component: ${{ fromJson(needs.list.outputs.workflow_matrix) }} + + steps: + # Remove unnecessary files to free up space. Otherwise, we get 'no space left on device.' + - uses: data-intuitive/reclaim-the-bytes@v2 + + - uses: actions/checkout@v4 + + - uses: viash-io/viash-actions/setup@v5 + + - uses: nf-core/setup-nextflow@v2.0.0 + + # build target dir + # use containers from release branch, hopefully these are available + - name: Build target dir + uses: viash-io/viash-actions/ns-build@v5 + with: + config_mod: ".functionality.version := '${{ github.event.inputs.version_tag }}'" + parallel: true + + # use cache + - name: Cache resources data + uses: actions/cache@v4 + timeout-minutes: 5 + with: + path: resources_test + key: ${{ needs.list.outputs.cache_key }} + + - name: Run integration test + timeout-minutes: 45 + run: | + # todo: replace with viash test command + export NXF_VER=22.04.5 + nextflow run . 
\ + -main-script "${{ matrix.component.main_script }}" \ + -entry ${{ matrix.component.entry }} \ + -profile docker,mount_temp,no_publish \ + -c workflows/utils/labels_ci.config + + ###################################3 + # phase 4 + component_test: + needs: [ build, list ] + if: ${{ needs.list.outputs.matrix != '[]' && needs.list.outputs.matrix != '' }} + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + component: ${{ fromJson(needs.list.outputs.component_matrix) }} + + steps: + - uses: actions/checkout@v4 + + - uses: viash-io/viash-actions/setup@v5 + + # use cache + - name: Cache resources data + uses: actions/cache@v4 + timeout-minutes: 5 + with: + path: resources_test + key: ${{ needs.list.outputs.cache_key }} + + - name: Test component + timeout-minutes: 30 + run: | + viash test \ + --config_mod ".functionality.version := '${{ github.event.inputs.version_tag }}'" \ + "${{ matrix.component.config }}" \ + --cpus 2 \ + --memory "5gb" \ No newline at end of file diff --git a/.github/workflows/viash-test.yml b/.github/workflows/viash-test.yml new file mode 100644 index 0000000000..b6bab576ed --- /dev/null +++ b/.github/workflows/viash-test.yml @@ -0,0 +1,88 @@ +name: viash test + +on: + pull_request: + push: + branches: [ main ] + +jobs: + + # phase 1 + list: + env: + s3_bucket: s3://openproblems-data/resources_test/ + runs-on: ubuntu-latest + + outputs: + matrix: ${{ steps.set_matrix.outputs.matrix }} + cache_key: ${{ steps.cache.outputs.cache_key }} + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - uses: viash-io/viash-actions/setup@v5 + + - uses: viash-io/viash-actions/project/sync-and-cache-s3@v5 + id: cache + with: + s3_bucket: $s3_bucket + dest_path: resources_test + cache_key_prefix: resources_test__ + + - id: ns_list + uses: viash-io/viash-actions/ns-list@v5 + with: + platform: docker + format: json + + - id: ns_list_filtered + uses: viash-io/viash-actions/project/detect-changed-components@v5 + with: + input_file: 
"${{ steps.ns_list.outputs.output_file }}" + + - id: set_matrix + run: | + echo "matrix=$(jq -c '[ .[] | + { + "name": (.functionality.namespace + "/" + .functionality.name), + "config": .info.config + } + ]' ${{ steps.ns_list_filtered.outputs.output_file }} )" >> $GITHUB_OUTPUT + + # phase 2 + viash_test: + needs: list + if: ${{ needs.list.outputs.matrix != '[]' && needs.list.outputs.matrix != '' }} + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + component: ${{ fromJson(needs.list.outputs.matrix) }} + + steps: + # Remove unnecessary files to free up space. Otherwise, we get 'no space left on device.' + - uses: data-intuitive/reclaim-the-bytes@v2 + + - uses: actions/checkout@v4 + + - uses: viash-io/viash-actions/setup@v5 + + # use cache + - name: Cache resources data + uses: actions/cache@v4 + timeout-minutes: 10 + with: + path: resources_test + key: ${{ needs.list.outputs.cache_key }} + + - name: Run test + timeout-minutes: 30 + run: | + VIASH_TEMP=$RUNNER_TEMP/viash viash test \ + "${{ matrix.component.config }}" \ + --cpus 2 \ + --memory "5gb" + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..c19f926ba4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,30 @@ +README.html +README_files/ +*.DS_Store +*__pycache__ +*.h5ad + +# IDE ignores +/.idea/ + +# repo specific ignores +output_bash + +# R specific ignores +.Rhistory +.Rproj.user +*.Rproj + +# viash specific ignores +docker_output/ +/target/ +check_results/ +log.txt +.viash* +/resources/ +/resources_test/ + +# nextflow specific ignores +/.nextflow* +/work +output diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000..e662fc6472 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,10 @@ +{ + "yaml.schemas": { + "src/common/schemas/api_component.yaml": "src/**/api/comp_*.yaml", + "src/common/schemas/api_file.yaml": "src/**/api/file_*.yaml", + "src/common/schemas/task_info.yaml": "src/**/api/task_info.yaml", + 
"src/common/schemas/task_method.yaml": "src/tasks/**/methods/**/config.vsh.yaml", + "src/common/schemas/task_control_method.yaml": "src/tasks/**/control_methods/**/config.vsh.yaml", + "src/common/schemas/task_metric.yaml": "src/tasks/**/metrics/**/config.vsh.yaml" + } +} \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000000..139379d4d2 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,418 @@ + +# openproblems-v2 0.1.0 + +## general + +### NEW FUNCTIONALITY + +* Updated all current tasks in v2 to latest changes in OP v1 (PR #214) + +### MAJOR CHANGES + +* Relocate task directories to new `src/tasks/` location (PR #142). + +* Update Docker images to our base images; `ghcr.io/openproblems-bio/base-python` + and `ghcr.io/openproblems-bio/base-r` (PR #168). + +* Update batch integration docker images to OpenProblems base images (PR #171). + +* Changed default normalization CPM to CP10k (PR #214) + +### MINOR CHANGES + +* Update test scripts (PR #143). + +* Update "baseline" to "control" (PR #146). + +* Add task image thumbnails to api (PR #231). + +### BUG FIXES + +* `dimensionality_reduction/methods/tsne`: Use GitHub version of MulticoreTSNE. + +* `label_projection/methods/seurat_transferdata`: Temporarily disable component as it appears to not be working (PR #206). + +* Remove the ns-list action for workflows in integration test (PR #208) + + +## common + +### NEW FUNCTIONALITY + +* `extract_scores`: Summarise a metrics output tsv. + +* Created test data `resources_test/pancreas` with `src/common/resources_test_scripts/pancreas.sh`. + +* `get_api_info`: Extract api info from tasks. + +* `get_method_info`: Extract method info from config yaml. + +* `get_metric_info`: Extract metric info from config yaml. + +* `get_results`: Extract benchmark scores. + +* `get_task_info`: Extract task info. + +* `comp_tests`: Common unit tests that can be used by all tasks. 
+ +* `check_dataset_schema`: Check if the dataset used has the required fields defined in the api `file_*.yaml` files. + +* `Create_component`: Creates a template folder with a viash config and script file depending on the task api. + +### MINOR CHANGES + +* Refactor and standardize metric and method info fields (PR #99). + +* Add url check to method and metric unit test (PR #160). + +* Add library.bib file check to component unit test (PR #167) + +* Lower bound scvi-tools to 1.1.0 for all methods using the package (PR #416). + +### BUG FIXES + +* fix typos in metric and common definition schemas (PR #212) + +## migration + +### NEW FUNCTIONALITY + +* `list_git_shas`: create list of latest commit hashes of all files in repo. + +* `check_migration_status`: compare git shas from methods with v1 + +## datasets + +### NEW FUNCTIONALITY + +* `workflows/process_openproblems_v1`: Fetch and process legacy OpenProblems v1 datasets, whilst adding extra information to the `.uns`. + +* `normalization/log_cpm`: A log CPM normalization method. + +* `normalization/log_scran_pooling`: A log scran pooling normalization method. + +* `normalization/sqrt_cpm`: A sqrt CPM normalization method. + +* `normalization/l1_sqrt`: A scaled L1 sqrt normalization. Extracted from Alra method in the denoising task from v1 + +* `subsample`: Subsample an h5ad file. Allows keeping observations from specific batches and celltypes, + also allows keeping certain features. + +* `resources_test_scripts`: Scripts to create test_resources for local development with "pancreas", "pancreas_tasks" and "multimodal". + +### V1 MIGRATION + +* `loaders/openproblems_v1`: Fetch a dataset from OpenProblems v1, whilst adding extra information to the `.uns`. + +* `loaders/openproblems_v1_multimodal`: Fetch a multimodal dataset from OpenProblems v1, whilst adding extra information to the `.uns`. 
+ +## batch_integration + +### MINOR CHANGES + +* Updated `methods/scanvi` to use `counts` as `preferred_normalization` + +* Updated `methods/scvi` to use `counts` as `preferred_normalization` + +### NEW FUNCTIONALITY + +* `api/file_*`: Created file format specifications for the h5ad files throughout the pipeline. + +* `api/comp_*`: Created an api definition for the process, method and metric components. + +* `process_dataset`: Added a component for processing common datasets into task-ready dataset objects. + +* `resources_test/label_projection/pancreas` with `src/tasks/label_projection/resources_test_scripts/pancreas.sh`. + +* `workflows/run`: Added nf-tower test script (PR #205). + +* `metrics/lisi`: Added a component for cLISI and iLISI graph metrics from scib (PR #213). + +### V1 MIGRATION + +* Removed the separate subtask-specific subfolders. The "subtask" is added to the config. + +* `control_methods/no_integration`: Migrated from v1. + +* `control_methods/perfect_integration`: Migrated from v1, renaming "random embedding" to "perfect integration". + +* `control_methods/random_integration`: Migrated from v1. + +* `methods/bbknn`: Migrated from v1 graph. + +* `methods/combat`: Migrated from v1 feature. + +* `methods/scanorama_embed`: Migrated from v1 embedding. + +* `methods/scanorama_feature`: Migrated from v1 feature. + +* `methods/scvi`: Migrated from v1 embedding. + +* `metrics/asw_batch`: Migrated from v1 embedding. + +* `metrics/asw_label`: Migrated from v1 embedding. + +* `metrics/cell_cycle_conservation`: Migrated from v1 embedding. + +* `metrics/clustering_overlap`: Migrated from v1 graph NMI & ARI. + +* `metrics/graph_connectivity`: Migrated from v1 graph. + +* `metrics/hvg_overlap`: Migrated from v1 feature. + +* `metrics/isolated_label_asw`: Migrated from v1 embedding. + +* `metrics/isolated_label_f1`: Migrated from v1 graph. + +* `metrics/kbet`: Migrated from v1 embedding. + +* `metrics/pcr`: Migrated from v1 embedding. 
+ +### MINOR CHANGES + +* Removed the `.uns["output_type"]` field from output anndata in methods and control methods. (PR #205) + +## label_projection + +### MINOR CHANGES + +* Updated `methods/scanvi` to use `counts` as `preferred_normalization` + +### NEW FUNCTIONALITY + +* `api/file_*`: Created file format specifications for the h5ad files throughout the pipeline. + +* `api/comp_*`: Created an api definition for the process, method and metric components. + +* `process_dataset`: Added a component for processing common datasets into task-ready dataset objects. + +* `resources_test/label_projection/pancreas` with `src/tasks/label_projection/resources_test_scripts/pancreas.sh`. + +* `workflows/run`: Added nf-tower test script. (PR #205) + +### V1 MIGRATION + +* `methods/knn`: Migrated from v1. + +* `methods/logistic_regression`: Migrated from v1. + +* `methods/mlp`: Migrated from v1. + +* `methods/scanvi`: Migrated and adapted from v1. + +* `methods/scanvi_scarches`: Migrated and adapted from v1. + +* `methods/seurat_transferdata`: Migrated and adapted from v1. + +* `methods/xgboost`: Migrated from v1. + +* `control_methods/majority_vote`: Migrated from v1. + +* `control_methods/random_labels`: Migrated from v1. + +* `control_methods/true_labels`: Migrated from v1. + +* `metric/accuracy`: Migrated from v1. + +* `metric/f1`: Migrated from v1. + +## denoising + +### NEW FUNCTIONALITY + +* `api/file_*`: Created file format specifications for the h5ad files throughout the pipeline. + +* `api/comp_*`: Created an api definition for the split, method and metric components. + +* `process_dataset`: Added a component for processing common datasets into task-ready dataset objects. + +* `resources_test/denoising/pancreas` with `src/tasks/denoising/resources_test_scripts/pancreas.sh`. + +* `workflows/run`: Added nf-tower test script. (PR #205) + +### V1 MIGRATION + +* `control_methods/no_denoising`: Migrated from v1. 
Extracted from baseline method + +* `control_methods/perfect_denoising`: Migrated from v1. Extracted from baseline method + +* `methods/alra`: Migrated from v1. Changed from Python to R and uses lg_cpm normalised data instead of L1 sqrt + +* `methods/dca`: Migrated and adapted from v1. + +* `methods/knn_smoothing`: Migrated and adapted from v1. + +* `methods/magic`: Migrated from v1. + +* `metrics/mse`: Migrated from v1. + +* `metrics/poisson`: Migrated from v1. + +### Changes from V1 + +* Anndata layers are used to store data instead of obsm + +* Extended the use of sparse data in methods unless it was not possible + +* process_dataset also removes unnecessary data from train and test datasets not needed by the methods and metrics. + +## Dimensionality reduction + +### New functionality +* `api/file_*`: Created file format specifications for the h5ad files throughout the pipeline. + +* `api/comp_*`: Created an api definition for the split, control method, method and metric components. + +* `process_dataset`: Added a component for processing common datasets into task-ready dataset objects. + +* `control_methods`: Added a component for baseline methods specifically. + +* `resources_test/dimensionality_reduction/pancreas` with `src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh`. + +* Added `variant` key to config files to store variants (different input parameters) of every component. + +* `workflows/run`: Added nf-tower test script. (PR #205) + +### V1 migration +* `control_methods/true_features`: Migrated from v1. Extracted from baseline method `True Features`. + +* `control_methods/random_features`: Migrated from v1. Extracted from baseline method `Random Features`. + +* `methods/umap`: Migrated from v1. + +* `methods/ivis`: Migrated from v1. + +* `methods/tsne`: Migrated and adapted from v1. + +* `methods/densmap`: Migrated and adapted from v1. + +* `methods/phate`: Migrated from v1. + +* `methods/pca`: Migrated from v1. 
+ +* `methods/neuralee`: Migrated from v1. + +* `metrics/distance_correlation`: Migrated from v1, but will likely be removed. + +* `metrics/trustworthiness`: Migrated from v1, but will likely be removed. + +* `metrics/density_preservation`: Migrated from v1. + +* `metrics/coranking`: Migrated from v1. This script was originally called `nn_ranking.py` and was written in Python. + +### Changes from V1 + +* Raw counts and normalized expression data is stored in `.layers["counts"]` and `.layers["normalized"]`, respectively, + instead of in `.X`. + +* A `process_dataset` has been implemented to make a distinction between the data a method is allowed to see + (here called the train data) and what a metric is allowed to see (here called the test data). + +* `methods/ivis` had originally been removed from the v1 (temporarily) but has been added back to the v2. + +* The metrics as defined in the `nn_ranking.py` script have been documented and refactored into an R + component `metrics/coranking`. + +* `metrics/rmse` should be removed because RMSE metrics don't really make sense here. + +* `metrics/trustworthiness` should be removed because it is already included in `metrics/coranking`. + +* `methods/simlr`: Added new SIMLR method. + +* `metrics/clustering_performance`: Added new metric to assess clustering on the reduced dimensional embeddings using NMI and ARI. + + +## match_modalities (PR #201) + +### New functionality + +* `api/file_*`: Created a file format specifications for the h5ad files throughout the pipeline. + +* `api/comp_*`: Created an api definition for the split, control method, method and metric components. + +* `process_dataset`: Added a component for processing common datasets into task-ready dataset objects. + +* `control_methods`: Added a component for baseline methods specifically. + +* `resources_test/dimensionality_reduction/pancreas` with `src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh`.
+ +* Added `variant` key to config files to store variants (different input parameters) of every component. + +* `workflows/run`: Added nf-tower test script. + +### V1 migration + +* `control_methods/true_features`: Migrated from v1. Extracted from baseline method `True Features`. + +* `control_methods/random_features`: Migrated from v1. Extracted from baseline method `Random Features`. + +* `methods/harmonic_alignment`: Migrated from v1. + +* `methods/mnn`: Migrated from v1. + +* `methods/procrustes`: Migrated from v1. + +* `metrics/knn_auc`: Migrated from v1. + +* `metrics/mse`: Migrated from v1. + +### Changes from V1 + +* `methods/scot`: Add new scot method. + +* Raw counts and normalized expression data is stored in `.layers["counts"]` and `.layers["normalized"]`, respectively, + instead of in `.X`. + +* The methods and metrics now take 2 modal datasets as input instead of 1. + + +## spatial_decomposition (PR #365) + +### NEW FUNCTIONALITY + +* `api/file_*`: Created a file format specifications for the h5ad files throughout the pipeline. + +* `api/comp_*`: Created an api definition for the process, method and metric components. + +* `dataset_simulator`: Added a component to simulate spatial datasets with the required ground-truth. + +* `process_dataset`: Added a component for processing common datasets into task-ready dataset objects. + +* `resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas` with `src/tasks/spatial_decomposition/resources_test_scripts/cxg_mouse_pancreas_atlas.sh`. + +* Added `variant` key to config files to store the different input parameter sets for components. + +* `workflows/run`: Added nf-tower test script. + +### V1 MIGRATION + +* `methods/cell2location`: Migrated from v1. + +* `methods/destvi`: Migrated from v1. + +* `methods/nmfreg`: Migrated from v1. + +* `methods/nnls`: Migrated and adapted from v1. + +* `methods/rctd`: Migrated and adapted from v1. + +* `methods/seurat`: Migrated and adapted from v1. 
+ +* `methods/stereoscope`: Migrated from v1. + +* `methods/tangram`: Migrated from v1. + +* `methods/vanillanmf`: Migrated from v1. + +* `control_methods/random_proportions`: Migrated from v1. + +* `control_methods/true_proportions`: Migrated from v1. + +* `metric/r2`: Migrated from v1. + +### Changes from V1 + +* Raw counts and normalized expression data is stored in `.layers["counts"]` and `.layers["normalized"]`, respectively, + instead of in `.X`. + +* A `process_dataset` has been implemented to make a distinction between the data that a method or metric is allowed to see. diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000..45d257b29a --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,133 @@ + +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +[INSERT CONTACT METHOD]. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. 
Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. + +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000..a57b23cbb0 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,639 @@ +# Contributing to OpenProblems + + +- [Code of conduct](#code-of-conduct) +- [Requirements](#requirements) +- [Quick start](#quick-start) +- [Project structure](#project-structure) +- [Adding a Viash component](#adding-a-viash-component) +- [Running a component from CLI](#running-a-component-from-cli) +- [Building a component](#building-a-component) +- [Unit testing a component](#unit-testing-a-component) +- [More information](#more-information) +- [Branch Naming Conventions](#branch-naming-conventions) + +[OpenProblems](https://openproblems.bio) is a community effort, and +everyone is welcome to contribute. 
This project is hosted on +[github.com/openproblems-bio/openproblems-v2](https://github.com/openproblems-bio/openproblems-v2). +You can find a full in depth guide on how to contribute to this project +on the [OpenProblems website](https://openproblems.bio/documentation/). + +## Code of conduct + +We as members, contributors, and leaders pledge to make participation in +our community a harassment-free experience for everyone, regardless of +age, body size, visible or invisible disability, ethnicity, sex +characteristics, gender identity and expression, level of experience, +education, socio-economic status, nationality, personal appearance, +race, caste, color, religion, or sexual identity and orientation. + +We pledge to act and interact in ways that contribute to an open, +welcoming, diverse, inclusive, and healthy community. + +Our full [Code of Conduct](CODE_OF_CONDUCT.md) is adapted from the +[Contributor Covenant](https://www.contributor-covenant.org), version +2.1. + +## Requirements + +To use this repository, please install the following dependencies: + +- Bash +- Java (Java 11 or higher) +- Docker (Instructions [here](https://docs.docker.com/get-docker/)) +- Nextflow (Optional, though [very easy to + install](https://www.nextflow.io/index.html#GetStarted)) + +## Quick start + +The `src/` folder contains modular software components for running a +modality alignment benchmark. Running the full pipeline is quite easy. + +**Step 0, fetch Viash and Nextflow** + +``` bash +mkdir $HOME/bin +curl -fsSL get.viash.io | bash -s -- --bin $HOME/bin --tools false +curl -s https://get.nextflow.io | bash; mv nextflow $HOME/bin +``` + +Make sure that Viash and Nextflow are on the \$PATH by checking whether +the following commands work: + +``` bash +viash -v +nextflow -v +``` + + viash 0.8.0 (c) 2020 Data Intuitive + nextflow version 23.04.1.5866 + +**Step 1, download test resources:** by running the following command. 
+ +``` bash +viash run src/common/sync_test_resources/config.vsh.yaml +``` + + Completed 256.0 KiB/7.2 MiB (302.6 KiB/s) with 6 file(s) remaining + Completed 512.0 KiB/7.2 MiB (595.8 KiB/s) with 6 file(s) remaining + Completed 768.0 KiB/7.2 MiB (880.3 KiB/s) with 6 file(s) remaining + Completed 1.0 MiB/7.2 MiB (1.1 MiB/s) with 6 file(s) remaining + Completed 1.2 MiB/7.2 MiB (1.3 MiB/s) with 6 file(s) remaining + ... + +**Step 2, build all the components:** in the `src/` folder as standalone +executables in the `target/` folder. Use the `-q 'xxx'` parameter to +build a subset of components in the repository. + +``` bash +viash ns build --query 'label_projection|common' --parallel --setup cachedbuild +``` + + In development mode with 'dev'. + Exporting process_dataset (label_projection) =docker=> target/docker/label_projection/process_dataset + Exporting accuracy (label_projection/metrics) =docker=> target/docker/label_projection/metrics/accuracy + Exporting random_labels (label_projection/control_methods) =docker=> target/docker/label_projection/control_methods/random_labels + [notice] Building container 'label_projection/control_methods_random_labels:dev' with Dockerfile + [notice] Building container 'common/data_processing_dataset_concatenate:dev' with Dockerfile + [notice] Building container 'label_projection/metrics_accuracy:dev' with Dockerfile + ... + +Viash will build a whole namespace (`ns`) into executables and Nextflow +pipelines into the `target/docker` and `target/nextflow` folders +respectively. By adding the `-q/--query` flag, you can filter which +components to build using a regex. By adding the `--parallel` flag, +these components are built in parallel (otherwise it will take a really +long time). The flag `--setup cachedbuild` will automatically start +building Docker containers for each of these methods. + +The command might take a while to run, since it is building a docker +container for each of the components. 
+ +**Step 3, run the pipeline with nextflow.** To do so, run the bash +script located at +`src/tasks/label_projection/workflows/run/run_test.sh`: + +``` bash +src/tasks/label_projection/workflows/run/run_test.sh +``` + + N E X T F L O W ~ version 22.04.5 + Launching `src/tasks/label_projection/workflows/run/main.nf` [pensive_turing] DSL2 - revision: 16b7b0c332 + executor > local (28) + [f6/f89435] process > run_wf:run_methods:true_labels:true_labels_process (pancreas.true_labels) [100%] 1 of 1 ✔ + [ed/d674a2] process > run_wf:run_methods:majority_vote:majority_vote_process (pancreas.majority_vote) [100%] 1 of 1 ✔ + [15/f0a427] process > run_wf:run_methods:random_labels:random_labels_process (pancreas.random_labels) [100%] 1 of 1 ✔ + [02/969d05] process > run_wf:run_methods:knn:knn_process (pancreas.knn) [100%] 1 of 1 ✔ + [90/5fdf9a] process > run_wf:run_methods:mlp:mlp_process (pancreas.mlp) [100%] 1 of 1 ✔ + [c7/dee2e5] process > run_wf:run_methods:logistic_regression:logistic_regression_process (pancreas.logistic_regression) [100%] 1 of 1 ✔ + [83/3ba0c9] process > run_wf:run_methods:scanvi:scanvi_process (pancreas.scanvi) [100%] 1 of 1 ✔ + [e3/2c298e] process > run_wf:run_methods:seurat_transferdata:seurat_transferdata_process (pancreas.seurat_transferdata) [100%] 1 of 1 ✔ + [d6/7212ab] process > run_wf:run_methods:xgboost:xgboost_process (pancreas.xgboost) [100%] 1 of 1 ✔ + [b6/7dc1a7] process > run_wf:run_metrics:accuracy:accuracy_process (pancreas.scanvi) [100%] 9 of 9 ✔ + [be/7d4da4] process > run_wf:run_metrics:f1:f1_process (pancreas.scanvi) [100%] 9 of 9 ✔ + [89/dcd77a] process > run_wf:aggregate_results:extract_scores:extract_scores_process (combined) [100%] 1 of 1 ✔ + +## Project structure + +High level overview: . ├── bin Helper scripts for building the project +and developing a new component. ├── resources_test Datasets for testing +components. If you don’t have this folder, run **Step 1** above.
├── src +Source files for each component in the pipeline. │ ├── common Common +processing components. │ ├── datasets Components and pipelines for +building the ‘Common datasets’ │ ├── label_projection Source files +related to the ‘Label projection’ task. │ └── … Other tasks. └── target +Executables generated by viash based on the components listed under +`src/`. ├── docker Bash executables which can be used from a terminal. +└── nextflow Nextflow modules which can be used as a standalone pipeline +or as part of a bigger pipeline. + +Detailed overview of a task folder (e.g. `src/tasks/label_projection`): + + src/tasks/label_projection/ + ├── api Specs for the components in this task. + ├── control_methods Control methods which serve as quality control checks for the benchmark. + ├── docs Task documentation + ├── methods Label projection method components. + ├── metrics Label projection metric components. + ├── resources_scripts The scripts needed to run the benchmark. + ├── resources_test_scripts The scripts needed to generate the test resources (which are needed for unit testing). + ├── process_dataset A component that masks a common dataset for use in the benchmark + └── workflows The benchmarking workflow. + +Detailed overview of the `src/datasets` folder: + + src/datasets/ + ├── api Specs for the data loaders and normalisation methods. + ├── loaders Components for ingesting datasets from a source. + ├── normalization Normalization method components. + ├── processors Other preprocessing components (e.g. HVG and PCA). + ├── resource_scripts The scripts needed to generate the common datasets. + ├── resource_test_scripts The scripts needed to generate the test resources (which are needed for unit testing). + └── workflows The workflow which generates the common datasets. + +## Adding a Viash component + +[Viash](https://viash.io) allows you to create pipelines in Bash or +Nextflow by wrapping Python, R, or Bash scripts into reusable +components. 
+ +You can start creating a new component by [creating a Viash +component](https://viash.io/guide/component/creation/docker.html). + +For example, to create a new Python-based method named `foo`, create a +Viash config at +`src/tasks/label_projection/methods/foo/config.vsh.yaml`: + +``` yaml +__merge__: ../../api/comp_method.yaml +functionality: + name: "foo" + namespace: "label_projection/methods" + # A multiline description of your method. + description: "Todo: fill in" + info: + type: method + + # a short label of your method + label: Foo + + # A multiline description of your method. + description: "Todo: fill in" + + # A short summary of the method description. + summary: "Todo: fill in" + + # Add the bibtex reference to the "src/common/library.bib" if it is not already there. + reference: "cover1967nearest" + + repository_url: "https://github.com/openproblems-bio/openproblems-v2" + documentation_url: "https://openproblems.bio/documentation/" + preferred_normalization: log_cp10k + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.2 + setup: + - type: python + packages: [scikit-learn] + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] +``` + +And create a script at +`src/tasks/label_projection/methods/foo/script.py`: + +``` python +import anndata as ad +import numpy as np + +## VIASH START +# This code-block will automatically be replaced by Viash at runtime. 
+par = { + 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', + 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'foo' +} +## VIASH END + +print("Load data", flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print("Create predictions", flush=True) +input_test.obs["label_pred"] = "foo" + +print("Add method name to uns", flush=True) +input_test.uns["method_id"] = meta["functionality_name"] + +print("Write output to file", flush=True) +input_test.write_h5ad(par["output"], compression="gzip") +``` + +## Running a component from CLI + +You can view the interface of the executable by running the executable +with the `-h` or `--help` parameter. + +``` bash +viash run src/tasks/label_projection/methods/foo/config.vsh.yaml -- --help +``` + + foo dev + + Todo: fill in + + Arguments: + --input_train + type: file, required parameter, file must exist + example: resources_test/label_projection/pancreas/train.h5ad + + --input_test + type: file, required parameter, file must exist + example: resources_test/label_projection/pancreas/test.h5ad + + --output + type: file, required parameter, output, file must exist + example: resources_test/label_projection/pancreas/prediction.h5ad + +Before running a new component, you will need to create the docker +container: + +``` bash +viash run src/tasks/label_projection/methods/foo/config.vsh.yaml -- ---setup cachedbuild +``` + + [notice] Building container 'ghcr.io/openproblems-bio/label_projection/methods/foo:dev' with Dockerfile + +You can **run the component** as follows: + +``` bash +viash run src/tasks/label_projection/methods/foo/config.vsh.yaml -- \ + --input_train resources_test/label_projection/pancreas/train.h5ad \ + --input_test resources_test/label_projection/pancreas/test.h5ad \ + --output resources_test/label_projection/pancreas/prediction.h5ad +``` + + Load data +
Create predictions + Add method name to uns + Write output to file + +## Building a component + +`viash` has several helper functions to help you quickly develop a +component. + +With **`viash build`**, you can turn the component into a standalone +executable. This standalone executable you can give to somebody else, +and they will be able to run it, provided that they have Bash and Docker +installed. + +``` bash +viash build src/tasks/label_projection/methods/foo/config.vsh.yaml \ + -o target/docker/label_projection/methods/foo +``` + +> [!NOTE] +> +> The `viash ns build` command does a much better job of setting up a +> collection of components. + +You can now view the same interface of the executable by running the +executable with the `-h` parameter. + +``` bash +target/docker/label_projection/methods/foo/foo -h +``` + + foo dev + + Todo: fill in + + Arguments: + --input_train + type: file, required parameter, file must exist + example: resources_test/label_projection/pancreas/train.h5ad + + --input_test + type: file, required parameter, file must exist + example: resources_test/label_projection/pancreas/test.h5ad + + --output + type: file, required parameter, output, file must exist + example: resources_test/label_projection/pancreas/prediction.h5ad + +Or **run the component** as follows: + +``` bash +target/docker/label_projection/methods/foo/foo \ + --input_train resources_test/label_projection/pancreas/train.h5ad \ + --input_test resources_test/label_projection/pancreas/test.h5ad \ + --output resources_test/label_projection/pancreas/prediction.h5ad +``` + + Load data + Create predictions + Add method name to uns + Write output to file + +## Unit testing a component + +The [method API +specifications](src/tasks/label_projection/api/comp_method.yaml) come +with a generic unit test for free. This means you can unit test your +component using the **`viash test`** command.
+ +``` bash +viash test src/tasks/label_projection/methods/foo/config.vsh.yaml +``` + + Running tests in temporary directory: '/tmp/viash_test_foo11070556749764805852' + ==================================================================== + +/tmp/viash_test_foo11070556749764805852/build_executable/foo ---verbosity 6 ---setup cachedbuild + [notice] Building container 'ghcr.io/openproblems-bio/label_projection/methods/foo:test' with Dockerfile + [info] Running 'docker build -t ghcr.io/openproblems-bio/label_projection/methods/foo:test /tmp/viash_test_foo11070556749764805852/build_executable -f /tmp/viash_test_foo11070556749764805852/build_executable/tmp/dockerbuild-foo-VMKj2u/Dockerfile' + #0 building with "default" instance using docker driver + + #1 [internal] load build definition from Dockerfile + #1 transferring dockerfile: 658B done + #1 DONE 0.1s + + #2 [internal] load .dockerignore + #2 transferring context: 2B done + #2 DONE 0.1s + + #3 [internal] load metadata for ghcr.io/openproblems-bio/base_python:1.0.2 + #3 DONE 0.3s + + #4 [1/2] FROM ghcr.io/openproblems-bio/base_python:1.0.2@sha256:65a577a3de37665b7a65548cb33c9153b6881742345593d33fe02919c8d66a20 + #4 DONE 0.0s + + #5 [2/2] RUN pip install --upgrade pip && pip install --upgrade --no-cache-dir "scikit-learn" + #5 CACHED + + #6 exporting to image + #6 exporting layers done + #6 writing image sha256:b5c134ce2ab91a0e616d7362f6bd168e6494c4a1bd7c643d62d7ad65d8678c5b done + #6 naming to ghcr.io/openproblems-bio/label_projection/methods/foo:test 0.0s done + #6 DONE 0.0s + ==================================================================== + +/tmp/viash_test_foo11070556749764805852/test_check_method_config/test_executable + Load config data + Check general fields + Check info fields + Check platform fields + All checks succeeded! 
+ ==================================================================== + +/tmp/viash_test_foo11070556749764805852/test_run_and_check_adata/test_executable + >> Running test 'run' + >> Checking whether input files exist + >> Running script as test + Load data + Create predictions + Add method name to uns + Write output to file + >> Checking whether output file exists + >> Reading h5ad files and checking formats + Reading and checking input_train + AnnData object with n_obs × n_vars = 326 × 500 + obs: 'label', 'batch' + var: 'hvg', 'hvg_score' + uns: 'dataset_id', 'normalization_id' + obsm: 'X_pca' + layers: 'counts', 'normalized' + Reading and checking input_test + AnnData object with n_obs × n_vars = 174 × 500 + obs: 'batch' + var: 'hvg', 'hvg_score' + uns: 'dataset_id', 'normalization_id' + obsm: 'X_pca' + layers: 'counts', 'normalized' + Reading and checking output + AnnData object with n_obs × n_vars = 174 × 500 + obs: 'batch', 'label_pred' + var: 'hvg', 'hvg_score' + uns: 'dataset_id', 'method_id', 'normalization_id' + obsm: 'X_pca' + layers: 'counts', 'normalized' + All checks succeeded! + ==================================================================== + SUCCESS! All 2 out of 2 test scripts succeeded! + Cleaning up temporary directory + +Let’s introduce a bug in the script and try running the test again. For +instance: + +``` python +import anndata as ad +import numpy as np + +## VIASH START +# This code-block will automatically be replaced by Viash at runtime. 
+par = { + 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', + 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'foo' +} +## VIASH END + +print("Load data", flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print("Not creating any predictions!!!", flush=True) +# input_test.obs["label_pred"] = "foo" + +print("Not adding method name to uns!!!", flush=True) +# input_test.uns["method_id"] = meta["functionality_name"] + +print("Write output to file", flush=True) +input_test.write_h5ad(par["output"], compression="gzip") +``` + +If we now run the test, we should get an error since we didn’t create +all of the required output slots. + +``` bash +viash test src/tasks/label_projection/methods/foo/config.vsh.yaml +``` + + Running tests in temporary directory: '/tmp/viash_test_foo11839237358029204600' + ==================================================================== + +/tmp/viash_test_foo11839237358029204600/build_executable/foo ---verbosity 6 ---setup cachedbuild + [notice] Building container 'ghcr.io/openproblems-bio/label_projection/methods/foo:test' with Dockerfile + [info] Running 'docker build -t ghcr.io/openproblems-bio/label_projection/methods/foo:test /tmp/viash_test_foo11839237358029204600/build_executable -f /tmp/viash_test_foo11839237358029204600/build_executable/tmp/dockerbuild-foo-gPvc8b/Dockerfile' + #0 building with "default" instance using docker driver + + #1 [internal] load build definition from Dockerfile + #1 transferring dockerfile: 658B done + #1 DONE 0.1s + + #2 [internal] load .dockerignore + #2 transferring context: 2B done + #2 DONE 0.1s + + #3 [internal] load metadata for ghcr.io/openproblems-bio/base_python:1.0.2 + #3 DONE 0.3s + + #4 [1/2] FROM ghcr.io/openproblems-bio/base_python:1.0.2@sha256:65a577a3de37665b7a65548cb33c9153b6881742345593d33fe02919c8d66a20 + #4 DONE 0.0s + + #5 [2/2] 
RUN pip install --upgrade pip && pip install --upgrade --no-cache-dir "scikit-learn" + #5 CACHED + + #6 exporting to image + #6 exporting layers done + #6 writing image sha256:939f5846475192d821898f663f15872432e7a2c9033b38ac9b9522155270daf4 done + #6 naming to ghcr.io/openproblems-bio/label_projection/methods/foo:test 0.0s done + #6 DONE 0.0s + ==================================================================== + +/tmp/viash_test_foo11839237358029204600/test_check_method_config/test_executable + Load config data + Check general fields + Check info fields + Check platform fields + All checks succeeded! + ==================================================================== + +/tmp/viash_test_foo11839237358029204600/test_run_and_check_adata/test_executable + >> Running test 'run' + >> Checking whether input files exist + >> Running script as test + Load data + Not creating any predictions!!! + Not adding method name to uns!!! + Write output to file + >> Checking whether output file exists + >> Reading h5ad files and checking formats + Reading and checking input_train + AnnData object with n_obs × n_vars = 326 × 500 + obs: 'label', 'batch' + var: 'hvg', 'hvg_score' + uns: 'dataset_id', 'normalization_id' + obsm: 'X_pca' + layers: 'counts', 'normalized' + Reading and checking input_test + AnnData object with n_obs × n_vars = 174 × 500 + obs: 'batch' + var: 'hvg', 'hvg_score' + uns: 'dataset_id', 'normalization_id' + obsm: 'X_pca' + layers: 'counts', 'normalized' + Reading and checking output + AnnData object with n_obs × n_vars = 174 × 500 + Traceback (most recent call last): + obs: 'batch' + var: 'hvg', 'hvg_score' + File "/viash_automount/tmp/viash_test_foo11839237358029204600/test_run_and_check_adata/tmp//viash-run-foo-22aQh6.py", line 138, in + uns: 'dataset_id', 'normalization_id' + run_and_check(argset_args, cmd) + obsm: 'X_pca' + File "/viash_automount/tmp/viash_test_foo11839237358029204600/test_run_and_check_adata/tmp//viash-run-foo-22aQh6.py", line 81, in 
run_and_check + layers: 'counts', 'normalized' + check_slots(adata, arg) + File "/viash_automount/tmp/viash_test_foo11839237358029204600/test_run_and_check_adata/tmp//viash-run-foo-22aQh6.py", line 48, in check_slots + assert slot_item["name"] in struc_x,\ + AssertionError: File 'output.h5ad' is missing slot .obs['label_pred'] + ==================================================================== + ERROR! Only 1 out of 2 test scripts succeeded! + Unexpected error occurred! If you think this is a bug, please post + create an issue at https://github.com/viash-io/viash/issues containing + a reproducible example and the stack trace below. + + viash - 0.8.0 + Stacktrace: + java.lang.RuntimeException: Only 1 out of 2 test scripts succeeded! + at io.viash.ViashTest$.apply(ViashTest.scala:134) + at io.viash.Main$.mainCLI(Main.scala:253) + at io.viash.Main$.mainCLIOrVersioned(Main.scala:123) + at io.viash.Main$.main(Main.scala:58) + at io.viash.Main.main(Main.scala) + +## More information + +The [Viash reference docs](https://viash.io/reference/config/) page +provides information on all of the available fields in a Viash config, +and the [Guide](https://viash.io/guide/) will help you get started with +creating components from scratch. + + + +## Branch Naming Conventions + +### Category + +A git branch should start with a category. Pick one of these: feature, +bugfix, hotfix, test, or doc. + +- `feature` is for adding, refactoring or removing a feature +- `bugfix` is for fixing a bug +- `hotfix` is for changing code with a temporary solution and/or without + following the usual process (usually because of an emergency) +- `test` is for experimenting outside of an issue/ticket +- `doc` is for adding, changing or removing documentation + +### Reference + +After the category, there should be a “`/`” followed by the reference of +the issue/ticket/task you are working on. If there’s no reference, just +add no-ref. With task it is meant as benchmarking task +e.g.
batch_integration + +### Description + +After the reference, there should be another “`/`” followed by a +description which sums up the purpose of this specific branch. This +description should be short and “kebab-cased”. + +By default, you can use the title of the issue/ticket you are working +on. Just replace any special character by “`-`”. + +### To sum up, follow this pattern when branching: + +``` bash +git branch <category>/<reference>/<description-in-kebab-case> +``` + +### Examples + +- You need to add, refactor or remove a feature: + `git branch feature/issue-42/create-new-button-component` +- You need to fix a bug: + `git branch bugfix/issue-342/button-overlap-form-on-mobile` +- You need to fix a bug really fast (possibly with a temporary + solution): `git branch hotfix/no-ref/registration-form-not-working` +- You need to experiment outside of an issue/ticket: + `git branch test/no-ref/refactor-components-with-atomic-design` + +### References + +- [a-simplified-convention-for-naming-branches-and-commits-in-git](https://dev.to/varbsan/a-simplified-convention-for-naming-branches-and-commits-in-git-il4) diff --git a/CONTRIBUTING.qmd b/CONTRIBUTING.qmd new file mode 100644 index 0000000000..6b6e33ae07 --- /dev/null +++ b/CONTRIBUTING.qmd @@ -0,0 +1,401 @@ +--- +title: Contributing to OpenProblems +format: gfm +toc: true +toc-depth: 2 +engine: knitr +--- + +[OpenProblems](https://openproblems.bio) is a community effort, and everyone is welcome to contribute. This project is hosted on [github.com/openproblems-bio/openproblems-v2](https://github.com/openproblems-bio/openproblems-v2). You can find a full in depth guide on how to contribute to this project on the [OpenProblems website](https://openproblems.bio/documentation/). 
+ +## Code of conduct {#code-of-conduct} + +We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, caste, color, religion, or sexual identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. + +Our full [Code of Conduct](CODE_OF_CONDUCT.md) is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.1. + + +## Requirements + +To use this repository, please install the following dependencies: + +* Bash +* Java (Java 11 or higher) +* Docker (Instructions [here](https://docs.docker.com/get-docker/)) +* Nextflow (Optional, though [very easy to install](https://www.nextflow.io/index.html#GetStarted)) + +## Quick start + +The `src/` folder contains modular software components for running a modality alignment benchmark. Running the full pipeline is quite easy. + +**Step 0, fetch Viash and Nextflow** + +```bash +mkdir $HOME/bin +curl -fsSL get.viash.io | bash -s -- --bin $HOME/bin --tools false +curl -s https://get.nextflow.io | bash; mv nextflow $HOME/bin +``` + +Make sure that Viash and Nextflow are on the $PATH by checking whether the following commands work: + +```{bash} +viash -v +nextflow -v +``` + +**Step 1, download test resources:** by running the following command. 
+ +```bash +viash run src/common/sync_test_resources/config.vsh.yaml +``` + + Completed 256.0 KiB/7.2 MiB (302.6 KiB/s) with 6 file(s) remaining + Completed 512.0 KiB/7.2 MiB (595.8 KiB/s) with 6 file(s) remaining + Completed 768.0 KiB/7.2 MiB (880.3 KiB/s) with 6 file(s) remaining + Completed 1.0 MiB/7.2 MiB (1.1 MiB/s) with 6 file(s) remaining + Completed 1.2 MiB/7.2 MiB (1.3 MiB/s) with 6 file(s) remaining + ... + +**Step 2, build all the components:** in the `src/` folder as standalone executables in the `target/` folder. Use the `-q 'xxx'` parameter to build a subset of components in the repository. + +```bash +viash ns build --query 'label_projection|common' --parallel --setup cachedbuild +``` + + In development mode with 'dev'. + Exporting process_dataset (label_projection) =docker=> target/docker/label_projection/process_dataset + Exporting accuracy (label_projection/metrics) =docker=> target/docker/label_projection/metrics/accuracy + Exporting random_labels (label_projection/control_methods) =docker=> target/docker/label_projection/control_methods/random_labels + [notice] Building container 'label_projection/control_methods_random_labels:dev' with Dockerfile + [notice] Building container 'common/data_processing_dataset_concatenate:dev' with Dockerfile + [notice] Building container 'label_projection/metrics_accuracy:dev' with Dockerfile + ... + +Viash will build a whole namespace (`ns`) into executables and Nextflow pipelines into the `target/docker` and `target/nextflow` folders respectively. +By adding the `-q/--query` flag, you can filter which components to build using a regex. +By adding the `--parallel` flag, these components are built in parallel (otherwise it will take a really long time). +The flag `--setup cachedbuild` will automatically start building Docker containers for each of these methods. + +The command might take a while to run, since it is building a docker container for each of the components. 
+ +**Step 3, run the pipeline with nextflow.** To do so, run the bash script located at `src/tasks/label_projection/workflows/run/run_test.sh`: + +```bash +src/tasks/label_projection/workflows/run/run_test.sh +``` + + N E X T F L O W ~ version 22.04.5 + Launching `src/tasks/label_projection/workflows/run/main.nf` [pensive_turing] DSL2 - revision: 16b7b0c332 + executor > local (28) + [f6/f89435] process > run_wf:run_methods:true_labels:true_labels_process (pancreas.true_labels) [100%] 1 of 1 ✔ + [ed/d674a2] process > run_wf:run_methods:majority_vote:majority_vote_process (pancreas.majority_vote) [100%] 1 of 1 ✔ + [15/f0a427] process > run_wf:run_methods:random_labels:random_labels_process (pancreas.random_labels) [100%] 1 of 1 ✔ + [02/969d05] process > run_wf:run_methods:knn:knn_process (pancreas.knn) [100%] 1 of 1 ✔ + [90/5fdf9a] process > run_wf:run_methods:mlp:mlp_process (pancreas.mlp) [100%] 1 of 1 ✔ + [c7/dee2e5] process > run_wf:run_methods:logistic_regression:logistic_regression_process (pancreas.logistic_regression) [100%] 1 of 1 ✔ + [83/3ba0c9] process > run_wf:run_methods:scanvi:scanvi_process (pancreas.scanvi) [100%] 1 of 1 ✔ + [e3/2c298e] process > run_wf:run_methods:seurat_transferdata:seurat_transferdata_process (pancreas.seurat_transferdata) [100%] 1 of 1 ✔ + [d6/7212ab] process > run_wf:run_methods:xgboost:xgboost_process (pancreas.xgboost) [100%] 1 of 1 ✔ + [b6/7dc1a7] process > run_wf:run_metrics:accuracy:accuracy_process (pancreas.scanvi) [100%] 9 of 9 ✔ + [be/7d4da4] process > run_wf:run_metrics:f1:f1_process (pancreas.scanvi) [100%] 9 of 9 ✔ + [89/dcd77a] process > run_wf:aggregate_results:extract_scores:extract_scores_process (combined) [100%] 1 of 1 ✔ + +## Project structure + +High level overview: + . + ├── bin Helper scripts for building the project and developing a new component. + ├── resources_test Datasets for testing components. If you don't have this folder, run **Step 1** above. 
+ ├── src Source files for each component in the pipeline. + │ ├── common Common processing components. + │ ├── datasets Components and pipelines for building the 'Common datasets' + │ ├── label_projection Source files related to the 'Label projection' task. + │ └── ... Other tasks. + └── target Executables generated by viash based on the components listed under `src/`. + ├── docker Bash executables which can be used from a terminal. + └── nextflow Nextflow modules which can be used as a standalone pipeline or as part of a bigger pipeline. + +Detailed overview of a task folder (e.g. `src/tasks/label_projection`): + + src/tasks/label_projection/ + ├── api Specs for the components in this task. + ├── control_methods Control methods which serve as quality control checks for the benchmark. + ├── docs Task documentation + ├── methods Label projection method components. + ├── metrics Label projection metric components. + ├── resources_scripts The scripts needed to run the benchmark. + ├── resources_test_scripts The scripts needed to generate the test resources (which are needed for unit testing). + ├── process_dataset A component that masks a common dataset for use in the benchmark + └── workflows The benchmarking workflow. + + +Detailed overview of the `src/datasets` folder: + + src/datasets/ + ├── api Specs for the data loaders and normalisation methods. + ├── loaders Components for ingesting datasets from a source. + ├── normalization Normalization method components. + ├── processors Other preprocessing components (e.g. HVG and PCA). + ├── resource_scripts The scripts needed to generate the common datasets. + ├── resource_test_scripts The scripts needed to generate the test resources (which are needed for unit testing). + └── workflows The workflow which generates the common datasets. + +## Adding a Viash component + +[Viash](https://viash.io) allows you to create pipelines +in Bash or Nextflow by wrapping Python, R, or Bash scripts into reusable components. 
+ + +You can start creating a new component by [creating a Viash component](https://viash.io/guide/component/creation/docker.html). + + +```{bash, include=FALSE} + +mkdir -p src/tasks/label_projection/methods/foo + +cat > src/tasks/label_projection/methods/foo/config.vsh.yaml << HERE +__merge__: ../../api/comp_method.yaml +functionality: + name: "foo" + namespace: "label_projection/methods" + # A multiline description of your method. + description: "Todo: fill in" + info: + type: method + + # a short label of your method + label: Foo + + # A multiline description of your method. + description: "Todo: fill in" + + # A short summary of the method description. + summary: "Todo: fill in" + + # Add the bibtex reference to the "src/common/library.bib" if it is not already there. + reference: "cover1967nearest" + + repository_url: "https://github.com/openproblems-bio/openproblems-v2" + documentation_url: "https://openproblems.bio/documentation/" + preferred_normalization: log_cp10k + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: [scikit-learn] + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] +HERE + +cat > src/tasks/label_projection/methods/foo/script.py << HERE +import anndata as ad +import numpy as np + +## VIASH START +# This code-block will automatically be replaced by Viash at runtime. 
+par = { + 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', + 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'foo' +} +## VIASH END + +print("Load data", flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print("Create predictions", flush=True) +input_test.obs["label_pred"] = "foo" + +print("Add method name to uns", flush=True) +input_test.uns["method_id"] = meta["functionality_name"] + +print("Write output to file", flush=True) +input_test.write_h5ad(par["output"], compression="gzip") +HERE +``` + +For example, to create a new Python-based method named `foo`, create a Viash config at `src/tasks/label_projection/methods/foo/config.vsh.yaml`: + +```{embed lang="yaml"} +src/tasks/label_projection/methods/foo/config.vsh.yaml +``` + +And create a script at `src/tasks/label_projection/methods/foo/script.py`: + +```{embed lang="python"} +src/tasks/label_projection/methods/foo/script.py +``` + + +## Running a component from CLI + +You can view the interface of the executable by running the executable with the `-h` or `--help` parameter. + +```{bash} +viash run src/tasks/label_projection/methods/foo/config.vsh.yaml -- --help +``` + +Before running a new component, you will need to create the docker container: + +```{bash} +viash run src/tasks/label_projection/methods/foo/config.vsh.yaml -- ---setup cachedbuild + +``` + +You can **run the component** as follows: + +```{bash} +viash run src/tasks/label_projection/methods/foo/config.vsh.yaml -- \ + --input_train resources_test/label_projection/pancreas/train.h5ad \ + --input_test resources_test/label_projection/pancreas/test.h5ad \ + --output resources_test/label_projection/pancreas/prediction.h5ad +``` + +## Building a component + +`viash` has several helper functions to help you quickly develop a component. 
+ +With **`viash build`**, you can turn the component into a standalone executable. +This standalone executable you can give to somebody else, and they will be able to +run it, provided that they have Bash and Docker installed. + +```{bash} +viash build src/tasks/label_projection/methods/foo/config.vsh.yaml \ + -o target/docker/label_projection/methods/foo +``` + +:::{.callout-note} +The `viash ns build` command does a much better job of setting up +a collection of components. +::: + +You can now view the same interface of the executable by running the executable with the `-h` parameter. + +```{bash} +target/docker/label_projection/methods/foo/foo -h +``` + +Or **run the component** as follows: + +```{bash} +target/docker/label_projection/methods/foo/foo \ + --input_train resources_test/label_projection/pancreas/train.h5ad \ + --input_test resources_test/label_projection/pancreas/test.h5ad \ + --output resources_test/label_projection/pancreas/prediction.h5ad +``` + + +## Unit testing a component + +The [method API specifications](src/tasks/label_projection/api/comp_method.yaml) come with a generic unit test for free. +This means you can unit test your component using the **`viash test`** command. + +```{bash} +viash test src/tasks/label_projection/methods/foo/config.vsh.yaml +``` + +```{bash include=FALSE} +cat > src/tasks/label_projection/methods/foo/script.py << HERE +import anndata as ad +import numpy as np + +## VIASH START +# This code-block will automatically be replaced by Viash at runtime. 
+par = { + 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', + 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'foo' +} +## VIASH END + +print("Load data", flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print("Not creating any predictions!!!", flush=True) +# input_test.obs["label_pred"] = "foo" + +print("Not adding method name to uns!!!", flush=True) +# input_test.uns["method_id"] = meta["functionality_name"] + +print("Write output to file", flush=True) +input_test.write_h5ad(par["output"], compression="gzip") +HERE +``` + +Let's introduce a bug in the script and try running the test again. For instance: + +```{embed lang="python"} +src/tasks/label_projection/methods/foo/script.py +``` + +If we now run the test, we should get an error since we didn't create all of the required output slots. + +```{bash error=TRUE} +viash test src/tasks/label_projection/methods/foo/config.vsh.yaml +``` + + +## More information + +The [Viash reference docs](https://viash.io/reference/config/) page provides information on all of the available fields in a Viash config, and the [Guide](https://viash.io/guide/) will help you get started with creating components from scratch. + + + + +```{bash, echo=FALSE} +rm -r src/tasks/label_projection/methods/foo target/docker/label_projection/methods/foo +``` + +## Branch Naming Conventions + +### Category + +A git branch should start with a category. Pick one of these: feature, bugfix, hotfix, test, or doc. 
+ +* `feature` is for adding, refactoring or removing a feature +* `bugfix` is for fixing a bug +* `hotfix` is for changing code with a temporary solution and/or without following the usual process (usually because of an emergency) +* `test` is for experimenting outside of an issue/ticket +* `doc` is for adding, changing or removing documentation + +### Reference + +After the category, there should be a "`/`" followed by the reference of the issue/ticket/task you are working on. If there's no reference, just add no-ref. With task it is meant as benchmarking task e.g. batch_integration + +### Description + +After the reference, there should be another "`/`" followed by a description which sums up the purpose of this specific branch. This description should be short and "kebab-cased". + +By default, you can use the title of the issue/ticket you are working on. Just replace any special character by "`-`". + +### To sum up, follow this pattern when branching: + +```bash +git branch <category>/<reference>/<description-in-kebab-case> +``` + +### Examples + +* You need to add, refactor or remove a feature: `git branch feature/issue-42/create-new-button-component` +* You need to fix a bug: `git branch bugfix/issue-342/button-overlap-form-on-mobile` +* You need to fix a bug really fast (possibly with a temporary solution): `git branch hotfix/no-ref/registration-form-not-working` +* You need to experiment outside of an issue/ticket: `git branch test/no-ref/refactor-components-with-atomic-design` + +### References + +* [a-simplified-convention-for-naming-branches-and-commits-in-git](https://dev.to/varbsan/a-simplified-convention-for-naming-branches-and-commits-in-git-il4) \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..c7a5f287cb --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 OpenProblems + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal 
+in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000000..daff800cdd --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +[![](https://openproblems.bio/images/heros/home_hero_text.png)](https://openproblems.bio) + +------ + +Open Problems is a living, extensible, community-guided benchmarking platform. 
+ +Useful links: + +* [Introduction to Open Problems](https://openproblems.bio) +* [Our benchmarks](https://openproblems.bio/results) +* [Our datasets](https://openproblems.bio/datasets) +* [Our team and community](https://openproblems.bio/team) +* [Planned and past events](https://openproblems.bio/events) +* [How to contribute](https://openproblems.bio/documentation) diff --git a/_viash.yaml b/_viash.yaml new file mode 100644 index 0000000000..0f0a8fa8f6 --- /dev/null +++ b/_viash.yaml @@ -0,0 +1,14 @@ +viash_version: 0.8.0 + +source: src +target: target + +config_mods: | + .functionality.version := 'dev' + .platforms[.type == 'docker'].target_registry := 'ghcr.io' + .platforms[.type == 'docker'].target_organization := 'openproblems-bio' + .platforms[.type == 'docker'].target_image_source := 'https://github.com/openproblems-bio/openproblems-v2' + .platforms[.type == "nextflow"].directives.tag := "$id" + .platforms[.type == "nextflow"].auto.simplifyOutput := false + .platforms[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } + .platforms[.type == "nextflow"].config.script := "process.errorStrategy = 'ignore'" \ No newline at end of file diff --git a/main.nf b/main.nf new file mode 100644 index 0000000000..fd40518830 --- /dev/null +++ b/main.nf @@ -0,0 +1,3 @@ +workflow { + print("This is a dummy placeholder for pipeline execution. 
Please use the corresponding nf files for running pipelines.") +} diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000000..6402ebf273 --- /dev/null +++ b/nextflow.config @@ -0,0 +1 @@ +process.container = 'nextflow/bash:latest' diff --git a/src/common/check_dataset_schema/config.vsh.yaml b/src/common/check_dataset_schema/config.vsh.yaml new file mode 100644 index 0000000000..08449c3e7d --- /dev/null +++ b/src/common/check_dataset_schema/config.vsh.yaml @@ -0,0 +1,45 @@ +functionality: + name: check_dataset_schema + namespace: common + description: Checks if the dataset has the necessary slots that are predefined in a schema. + argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + required: true + description: A h5ad file. + - name: --schema + type: file + required: true + description: A schema file for the h5ad object. + - name: Arguments + arguments: + - name: --stop_on_error + type: boolean + default: false + description: Whether or not to stop with exit code 1 if the input file does not adhere to the schema. + - name: Output + arguments: + - name: --output + type: file + required: true + description: If specified, this file will contain a structured log of which checks succeeded (or not). 
+ example: checks.json + direction: output + resources: + - type: python_script + path: script.py + test_resources: + - path: /resources_test/common/pancreas + - type: python_script + path: test.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + test_setup: + - type: python + packages: viashpy + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/common/check_dataset_schema/script.py b/src/common/check_dataset_schema/script.py new file mode 100644 index 0000000000..cd84f9cdcf --- /dev/null +++ b/src/common/check_dataset_schema/script.py @@ -0,0 +1,60 @@ +import anndata as ad +import yaml +import json + +## VIASH START +par = { + 'input': 'work/d4/f4fabc8aa4f2308841d4ab57bcff62/_viash_par/input_1/dataset.h5ad', + 'schema': 'work/d4/f4fabc8aa4f2308841d4ab57bcff62/_viash_par/schema_1/schema.yaml', + 'stop_on_error': False, + 'output': 'work/d4/f4fabc8aa4f2308841d4ab57bcff62/out.yaml', +} +## VIASH END + +def check_structure(slot, slot_info, adata_slot): + missing = [] + if slot == "X": + slot_info["name"] = "X" + slot_info = [slot_info] + for obj in slot_info: + adata_data = adata_slot.get(obj['name']) if slot != 'X' else adata_slot + if obj.get('required') and adata_data is None: + missing.append(obj['name']) + # todo: check types + return missing + +print('Load data', flush=True) +adata = ad.read_h5ad(par['input']) + +# create data structure +out = { + "exit_code": 0, + "error": {}, + "data_schema": "ok" +} + +print("Check AnnData against schema", flush=True) +with open(par["schema"], "r") as f: + data_struct = yaml.safe_load(f) + +def_slots = data_struct['info']['slots'] + +out = { + "exit_code": 0, + "error": {}, + "data_schema": "ok" +} +for slot in def_slots: + print("Checking slot", slot, flush=True) + missing = check_structure(slot, def_slots[slot], getattr(adata, slot)) + if missing: + print(f"Dataset is missing {slot} {missing}", flush=True) + out['exit_code'] = 1 + out['data_schema'] = 'not ok' + 
out['error'][slot] = missing + +with open(par["output"], "w") as f: + json.dump(out, f, indent=2) + +if par['stop_on_error']: + exit(out['exit_code']) diff --git a/src/common/check_dataset_schema/test.py b/src/common/check_dataset_schema/test.py new file mode 100644 index 0000000000..1e7b5eb1e9 --- /dev/null +++ b/src/common/check_dataset_schema/test.py @@ -0,0 +1,98 @@ +import sys +import re +import pytest +import json +import subprocess + +## VIASH START +## VIASH END + +input_path = meta["resources_dir"] + "/pancreas/dataset.h5ad" + +@pytest.fixture +def schema(tmp_path): + schema = tmp_path / "schema.yaml" + schema.write_text(""" +type: file +description: "A preprocessed dataset" +example: "preprocessed.h5ad" +info: + label: "Preprocessed dataset" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true +""") + return schema + +@pytest.fixture +def error_schema(tmp_path): + schema = tmp_path / "schema.yaml" + schema.write_text(""" +type: file +description: "A preprocessed dataset" +example: "preprocessed.h5ad" +info: + label: "Preprocessed dataset" + slots: + X: + type: double + description: Normalized expression values + required: true + layers: + - type: integer + name: counts + description: Raw counts + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: error_test + description: "A made up uns variable to test if error is picked up" + required: true + """) + return schema + +def test_run(run_component, tmp_path, schema): + output_path = tmp_path / "checks.json" + + run_component([ + "--input", input_path, + "--schema", str(schema), + "--output", str(output_path) + ]) + + assert output_path.exists(), "Output path does not exist" + +def test_error(run_component, tmp_path, error_schema): + output_checks = 
tmp_path / "checks.json" + + with pytest.raises(subprocess.CalledProcessError) as err: + run_component([ + "--input", input_path, + "--schema", str(error_schema), + "--stop_on_error", "true", + "--output", str(output_checks) + ]) + assert err.value.exitcode > 0 + + assert output_checks.exists(), "Output checks file does not exist" + + with open(output_checks, "r") as f: + out = json.load(f) + assert out["exit_code"] > 0 + assert out["data_schema"] == "not ok" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/common/check_yaml_schema/config.vsh.yaml b/src/common/check_yaml_schema/config.vsh.yaml new file mode 100644 index 0000000000..b87bec5429 --- /dev/null +++ b/src/common/check_yaml_schema/config.vsh.yaml @@ -0,0 +1,26 @@ +functionality: + name: check_yaml_schema + namespace: common + description: Checks if a YAML file adheres to a custom schema file. + argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + required: true + description: A yaml file. + - name: --schema + type: file + required: true + description: A schema file for the yaml file. 
+ resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - jsonschema + - type: nextflow diff --git a/src/common/check_yaml_schema/script.py b/src/common/check_yaml_schema/script.py new file mode 100644 index 0000000000..2058832bb2 --- /dev/null +++ b/src/common/check_yaml_schema/script.py @@ -0,0 +1,59 @@ +import jsonschema +import yaml +from pathlib import Path + +## VIASH START +par = { + 'input': 'src/tasks/batch_integration/methods/bbknn/config.vsh.yaml', + 'schema': 'src/common/api/schema_task_method.yaml' +} +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +def yaml_to_dict(file_path): + with open(file_path, 'r') as stream: + try: + return yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + +def load_schemas(schema_dir): + schema_files = list(schema_dir.glob("./**/schema_*.yaml")) + + schemas = {} + for file in schema_files: + schema = yaml_to_dict(file) + schemas[file.absolute()] = schema + + return schemas + +def create_validator(schema_name, schemas): + schema_store = {} + for name, value in schemas.items(): + schema_store[f"file://{name}"] = value + + # Setting the first schema as the main schema + + main_schema = schemas[schema_name] + resolver = jsonschema.RefResolver( + base_uri=f"file://{schema_name}", + referrer=main_schema, + store=schema_store + ) + + return jsonschema.Draft7Validator(main_schema, resolver=resolver) + +print(">> Read input yaml", flush=True) +input_yaml_file = Path(par["input"]) +with open(input_yaml_file, 'r') as f: + input_yaml = yaml.safe_load(f) + +print(">> Read schema(s)", flush=True) +schema_yaml_file = Path(par["schema"]) +schemas = load_schemas(schema_yaml_file.parent) + +print(">> Validate input yaml against schema", flush=True) +validator = create_validator(schema_yaml_file.absolute(), schemas) +validator.validate(input_yaml) diff --git a/src/common/comp_tests/check_get_info.py 
b/src/common/comp_tests/check_get_info.py new file mode 100644 index 0000000000..a00f1d702d --- /dev/null +++ b/src/common/comp_tests/check_get_info.py @@ -0,0 +1,37 @@ +import subprocess +from os import path +import json + +## VIASH START +## VIASH END + +input_path = meta["resources_dir"] + "/test_file.yaml" +task_id = "denoising" +output_path = "output.json" + +cmd = [ + meta['executable'], + "--input", input_path, + "--task_id", task_id, + "--output", output_path, +] + +print(">> Running script as test", flush=True) +out = subprocess.run(cmd, stderr=subprocess.STDOUT) + +if out.stdout: + print(out.stdout) + +if out.returncode: + print(f"script: '{cmd}' exited with an error.") + exit(out.returncode) + +print(">> Checking whether output file exists", flush=True) +assert path.exists(output_path), "Output does not exist" + +print(">> Reading json file", flush=True) +with open(output_path, 'r') as f: + out = json.load(f) + print(out) + +print("All checks succeeded!", flush=True) \ No newline at end of file diff --git a/src/common/comp_tests/check_method_config.py b/src/common/comp_tests/check_method_config.py new file mode 100644 index 0000000000..a30111d648 --- /dev/null +++ b/src/common/comp_tests/check_method_config.py @@ -0,0 +1,132 @@ +import yaml + +## VIASH START +meta = { + "config" : "foo" +} +## VIASH END + + +NAME_MAXLEN = 50 + +SUMMARY_MAXLEN = 400 + +DESCRIPTION_MAXLEN = 5000 + +_MISSING_DOIS = ["vandermaaten2008visualizing", "hosmer2013applied"] + +TIME_LABELS = ["lowtime", "midtime", "hightime", "veryhightime"] +MEM_LABELS = ["lowmem", "midmem", "highmem"] +CPU_LABELS = ["lowcpu", "midcpu", "highcpu"] + +def _load_bib(): + with open(f"{meta['resources_dir']}/library.bib", "r") as file: + return file.read() + +def check_url(url): + import requests + from urllib3.util.retry import Retry + from requests.adapters import HTTPAdapter + + # configure retry strategy + session = requests.Session() + retry = Retry(connect=3, backoff_factor=0.5) + adapter = 
HTTPAdapter(max_retries=retry) + session.mount('http://', adapter) + session.mount('https://', adapter) + + get = session.head(url) + + if get.ok or get.status_code == 429: # 429 rejected, too many requests + return True + else: + return False + +def search_ref_bib(reference): + import re + bib = _load_bib() + + entry_pattern = r"(@\w+{[^}]*" + reference + r"[^}]*}(.|\n)*?)(?=@)" + + bib_entry = re.search(entry_pattern, bib) + + if bib_entry: + + type_pattern = r"@(.*){" + reference + doi_pattern = r"(?=[Dd][Oo][Ii]\s*=\s*{([^,}]+)})" + + entry_type = re.search(type_pattern, bib_entry.group(1)) + + if not (entry_type.group(1) == "misc" or reference in _MISSING_DOIS): + entry_doi = re.search(doi_pattern, bib_entry.group(1)) + assert entry_doi.group(1), "doi not found in bibtex reference" + url = f"https://doi.org/{entry_doi.group(1)}" + assert check_url(url), f"{url} is not reachable, ref= {reference}." + + return True + + else: + return False + +print("Load config data", flush=True) +with open(meta["config"], "r") as file: + config = yaml.safe_load(file) + +print("Check general fields", flush=True) +assert len(config["functionality"]["name"]) <= NAME_MAXLEN, f"Component id (.functionality.name) should not exceed {NAME_MAXLEN} characters." +assert "namespace" in config["functionality"] is not None, "namespace not a field or is empty" + +print("Check info fields", flush=True) +info = config['functionality']['info'] +assert "type" in info, "type not an info field" +info_types = ["method", "control_method"] +assert info["type"] in info_types , f"got {info['type']} expected one of {info_types}" +assert "label" in info is not None, "label not an info field or is empty" +assert "summary" in info is not None, "summary not an info field or is empty" +assert "FILL IN:" not in info["summary"], "Summary not filled in" +assert len(info["summary"]) <= SUMMARY_MAXLEN, f"Component id (.functionality.info.summary) should not exceed {SUMMARY_MAXLEN} characters." 
+assert "description" in info is not None, "description not an info field or is empty" +assert "FILL IN:" not in info["description"], "description not filled in" +assert len(info["description"]) <= DESCRIPTION_MAXLEN, f"Component id (.functionality.info.description) should not exceed {DESCRIPTION_MAXLEN} characters." +if info["type"] == "method": + assert "reference" in info, "reference not an info field" + bib = _load_bib() + if info["reference"]: + reference = info["reference"] + if not isinstance(reference, list): + reference = [reference] + for ref in reference: + assert search_ref_bib(ref), f"reference {ref} not added to library.bib" + assert "documentation_url" in info is not None, "documentation_url not an info field or is empty" + assert "repository_url" in info is not None, "repository_url not an info field or is empty" + assert check_url(info["documentation_url"]), f"{info['documentation_url']} is not reachable" + assert check_url(info["repository_url"]), f"{info['repository_url']} is not reachable" + +if "variants" in info: + arg_names = [arg["name"].replace("--", "") for arg in config["functionality"]["arguments"]] + ["preferred_normalization"] + + for paramset_id, paramset in info["variants"].items(): + if paramset: + for arg_id in paramset: + assert arg_id in arg_names, f"Argument '{arg_id}' in `.functionality.info.variants['{paramset_id}']` is not an argument in `.functionality.arguments`." + +assert "preferred_normalization" in info, "preferred_normalization not an info field" +norm_methods = ["log_cpm", "log_cp10k", "counts", "log_scran_pooling", "sqrt_cpm", "sqrt_cp10k", "l1_sqrt"] +assert info["preferred_normalization"] in norm_methods, "info['preferred_normalization'] not one of '" + "', '".join(norm_methods) + "'." 
def check_url(url, timeout=30):
    """Return True when `url` answers an HTTP HEAD request.

    Args:
        url: The URL to probe.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the test indefinitely.

    Returns:
        True if the response status is OK or 429 (too many requests: the
        server exists but is rate limiting us); False otherwise, including
        when the request itself fails (connection error, timeout, bad URL).
    """
    import requests
    from urllib3.util.retry import Retry
    from requests.adapters import HTTPAdapter

    # configure retry strategy
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)

    # the context manager closes the pooled connections when done
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        try:
            response = session.head(url, timeout=timeout)
        except requests.RequestException as err:
            # report the cause, then let the caller's assertion message fire
            print(f"Request to {url} failed: {err}", flush=True)
            return False

    # 429 rejected, too many requests
    return bool(response.ok or response.status_code == 429)
def check_metric(metric: Dict[str, object]) -> None:
    """Validate a single entry of `.functionality.info.metrics`.

    Required fields: name, label, summary, description, min, max, maximize.
    Optional fields, checked only when filled in: reference,
    documentation_url, repository_url.

    Args:
        metric: One metric definition as loaded from the component config.

    Raises:
        AssertionError: On the first check that fails.
    """
    def is_filled(field):
        # `min: 0` and `maximize: false` are legitimate values, so only a
        # missing key, None and "" count as empty.
        return metric.get(field) not in (None, "")

    assert is_filled("name"), "name not a field or is empty"
    assert len(metric["name"]) <= NAME_MAXLEN, f"Component id (.functionality.info.metrics.metric.name) should not exceed {NAME_MAXLEN} characters."
    assert is_filled("label"), "label not a field in metric or is empty"
    assert is_filled("summary"), "summary not a field in metric or is empty"
    assert "FILL IN:" not in metric["summary"], "Summary not filled in"
    assert len(metric["summary"]) <= SUMMARY_MAXLEN, f"Component id (.functionality.info.metrics.metric.summary) should not exceed {SUMMARY_MAXLEN} characters."
    assert is_filled("description"), "description not a field in metric or is empty"
    assert len(metric["description"]) <= DESCRIPTION_MAXLEN, f"Component id (.functionality.info.metrics.metric.description) should not exceed {DESCRIPTION_MAXLEN} characters."
    assert "FILL IN:" not in metric["description"], "description not filled in"

    # reference is optional; a single key or a list of keys is accepted
    references = metric.get("reference")
    if references:
        if not isinstance(references, list):
            references = [references]
        for ref in references:
            assert search_ref_bib(ref), f"reference {ref} not added to library.bib"

    # the URLs are optional as well; only probe the ones that are filled in
    for url_field in ("documentation_url", "repository_url"):
        url = metric.get(url_field)
        if url:
            assert check_url(url), f"{url} is not reachable"

    assert is_filled("min"), "min not a field in metric or is empty"
    assert is_filled("max"), "max not a field in metric or is empty"
    assert is_filled("maximize"), "maximize not a field in metric or is empty"
    # floats are accepted too: an unquoted `.inf` / `-.inf` is loaded as a
    # float by the YAML parser, not as a string
    assert isinstance(metric['min'], (int, float, str)), "min is not a number or string (-.inf)"
    assert isinstance(metric['max'], (int, float, str)), "max is not a number or string (+.inf)"
    assert isinstance(metric['maximize'], bool), "maximize is not a bool"
def check_slots(adata, arg):
    """Assert that `adata` has every required slot listed in `arg["info"]["slots"]`.

    For the "X" entry the spec is a single mapping and the attribute itself
    must not be None. For every other entry the spec is a list of slot
    descriptions and each required slot name must be contained in the
    attribute of the same name. A slot is required unless its description
    says `required: False`.

    Raises:
        AssertionError: Naming the file (`arg["value"]`) and the missing slot.
    """
    slot_spec = arg["info"].get("slots", {})

    for struc_name in slot_spec:
        slot_items = slot_spec[struc_name]
        container = getattr(adata, struc_name)

        if struc_name == "X":
            if slot_items.get("required", True):
                assert container is not None,\
                    f"File '{arg['value']}' is missing slot .{struc_name}"
            continue

        for item in slot_items:
            if not item.get("required", True):
                continue
            assert item["name"] in container,\
                f"File '{arg['value']}' is missing slot .{struc_name}['{item['name']}']"
resource file + if arg["type"] == "file": + if arg["direction"] == "input": + value = f"{meta['resources_dir']}/{arg['example'][0]}" + else: + value = f"{clean_name}.h5ad" + new_arg["value"] = value + elif "test_default" in arg_info: + new_arg["value"] = arg_info["test_default"] + + arguments.append(new_arg) + + +if "test_setup" not in config["functionality"]["info"]: + argument_sets = {"run": arguments} +else: + test_setup = config["functionality"]["info"]["test_setup"] + argument_sets = {} + for name, test_instance in test_setup.items(): + new_arguments = [] + for arg in arguments: + new_arg = arg.copy() + if arg["clean_name"] in test_instance: + val = test_instance[arg["clean_name"]] + if new_arg["type"] == "file" and new_arg["direction"] == "input": + val = f"{meta['resources_dir']}/{val}" + new_arg["value"] = val + new_arguments.append(new_arg) + argument_sets[name] = new_arguments + +for argset_name, argset_args in argument_sets.items(): + print(f">> Running test '{argset_name}'", flush=True) + # construct command + cmd = [ meta["executable"] ] + for arg in argset_args: + if "value" in arg: + value = arg["value"] + if arg["multiple"] and isinstance(value, list): + value = arg["multiple_sep"].join(value) + cmd.extend([arg["name"], str(value)]) + + run_and_check(argset_args, cmd) \ No newline at end of file diff --git a/src/common/create_component/config.vsh.yaml b/src/common/create_component/config.vsh.yaml new file mode 100644 index 0000000000..5c829462ad --- /dev/null +++ b/src/common/create_component/config.vsh.yaml @@ -0,0 +1,71 @@ +functionality: + name: create_component + namespace: common + description: | + Create a component Viash component. + + Usage: + ``` + bin/create_component --task denoising --type method --language r --name foo + bin/create_component --task denoising --type metric --language python --name bar + ``` + arguments: + - type: string + name: --task + description: Which task the component will be added to. 
+ example: denoising + - type: string + name: --type + example: metric + description: The type of component to create. Typically must be one of 'method', 'control_method' or 'metric'. + - type: string + name: --language + description: Which scripting language to use. Options are 'python', 'r'. + default: python + choices: [python, r] + - type: string + name: --name + example: new_comp + description: Name of the new method, formatted in snake case. + - type: file + name: --output + direction: output + # required: true + description: Path to the component directory. Suggested location is `src//s/`. + default: src/tasks/${VIASH_PAR_TASK}/${VIASH_PAR_TYPE}s/${VIASH_PAR_NAME} + - type: file + name: --api_file + description: | + Which API file to use. Defaults to `src//api/comp_.yaml`. + In tasks with different subtypes of method, this location might not exist and you might need + to manually specify a different API file to inherit from. + must_exist: false + # required: true + default: src/tasks/${VIASH_PAR_TASK}/api/comp_${VIASH_PAR_TYPE}.yaml + - type: file + name: --viash_yaml + description: | + Path to the project config file. Needed for knowing the relative location of a file to the project root. 
def strip_margin(text: str) -> str:
    """Remove the leading `|` margin from every line of `text`.

    On each line, any spaces or tabs followed by a `|` at the start of the line
    are dropped; a `|` elsewhere in a line is left alone. This allows indented
    multi-line string templates in the code to produce unindented output.
    """
    # Raw strings: "\|" in a normal string literal is an invalid escape
    # sequence, which newer Python versions warn about.
    return re.sub(r"(^|\n)[ \t]*\|", r"\1", text)
+ | name: {par["name"]} + | + | # Metadata for your component + | info: + |{info_str} + | # Component-specific parameters (optional) + | # arguments: + | # - name: "--n_neighbors" + | # type: "integer" + | # default: 5 + | # description: Number of neighbors to use. + | + | # Resources required to run the component + | resources: + |{resources_str} + |platforms: + | # Specifications for the Docker image for this component. + |{docker_platform} + | # This platform allows running the component natively + | - type: native + | # Allows turning the component into a Nextflow module / pipeline. + | - type: nextflow + | directives: + | label: [midtime,midmem, midcpu] + |''' + ) + +def generate_info(par, component_type, pretty_name) -> str: + """Generate the functionality info for a component.""" + if component_type in ["method", "control_method"]: + str = strip_margin(f'''\ + | # A relatively short label, used when rendering visualisarions (required) + | label: {pretty_name} + | # A one sentence summary of how this method works (required). Used when + | # rendering summary tables. + | summary: "FILL IN: A one sentence summary of this method." + | # A multi-line description of how this component works (required). Used + | # when rendering reference documentation. + | description: | + | FILL IN: A (multi-line) description of how this method works. + | # Which normalisation method this component prefers to use (required). + | preferred_normalization: log_cp10k + |''') + if component_type == "method": + str += strip_margin(f'''\ + | # A reference key from the bibtex library at src/common/library.bib (required). + | reference: bibtex_reference_key + | # URL to the documentation for this method (required). + | documentation_url: https://url.to/the/documentation + | # URL to the code repository for this method (required). 
+ | repository_url: https://github.com/organisation/repository + |''') + return str + elif component_type == "metric": + return strip_margin(f'''\ + | metrics: + | # A unique identifier for your metric (required). + | # Can contain only lowercase letters or underscores. + | name: {par["name"]} + | # A relatively short label, used when rendering visualisarions (required) + | label: {pretty_name} + | # A one sentence summary of how this metric works (required). Used when + | # rendering summary tables. + | summary: "FILL IN: A one sentence summary of this metric." + | # A multi-line description of how this component works (required). Used + | # when rendering reference documentation. + | description: | + | FILL IN: A (multi-line) description of how this metric works. + | # A reference key from the bibtex library at src/common/library.bib (required). + | reference: bibtex_reference_key + | # URL to the documentation for this metric (required). + | documentation_url: https://url.to/the/documentation + | # URL to the code repository for this metric (required). 
+ | repository_url: https://github.com/organisation/repository + | # The minimum possible value for this metric (required) + | min: 0 + | # The maximum possible value for this metric (required) + | max: 1 + | # Whether a higher value represents a 'better' solution (required) + | maximize: true + |''') + + +def generate_resources(par, script_path) -> str: + """Add the script to the functionality resources.""" + if par["language"] == "python": + type_str = "python_script" + elif par["language"] == "r": + type_str = "r_script" + + return strip_margin(f'''\ + | # The script of your component (required) + | - type: {type_str} + | path: {script_path} + | # Additional resources your script needs (optional) + | # - type: file + | # path: weights.pt + |''') + +def generate_docker_platform(par) -> str: + """Set up the docker platform for Python.""" + if par["language"] == "python": + image_str = "openproblems/base_python:1.0.0" + setup_type = "python" + package_example = "scib==1.1.5" + elif par["language"] == "r": + image_str = "openproblems/base_r:1.0.0" + setup_type = "r" + package_example = "tidyverse" + return strip_margin(f'''\ + | - type: docker + | image: {image_str} + | # Add custom dependencies here (optional). For more information, see + | # https://viash.io/reference/config/platforms/docker/#setup . 
def look_for_adata_arg(args, uns_field):
    """Return the key of the first argument that declares `.uns[uns_field]`.

    Args:
        args: Argument dicts; each may carry `info.slots.uns`, a list of slot
            descriptions with a "name" entry. Reads `arg["key"]` of the match.
        uns_field: Name of the `.uns` slot to look for.

    Returns:
        The "key" of the first matching argument, or "adata" when no argument
        declares the slot.
    """
    for arg in args:
        # `or` also covers fields that are present but null in the YAML
        info = arg.get("info") or {}
        slots = info.get("slots") or {}
        uns_slots = slots.get("uns") or []
        if any(slot.get("name") == uns_field for slot in uns_slots):
            return arg["key"]
    return "adata"
{outer_values} + |) + |{arg["key"]}.write_h5ad(par['{arg["key"]}'], compression='gzip')''' + ) + +def write_output_r(arg, copy_from_adata, is_metric): + """Create code for writing the output h5ad files.""" + slots = arg.get("info", {}).get("slots", {}) + outer = [] + for group_name, slots in slots.items(): + inner = [] + for slot in slots: + if group_name == "uns" and slot["name"] in ["dataset_id", "normalization_id"]: + value = f"{copy_from_adata}$uns[[\"{slot['name']}\"]]" + elif group_name == "uns" and slot["name"] == "method_id": + if is_metric: + value = f"{copy_from_adata}$uns[[\"{slot['name']}\"]]" + else: + value = "meta[[\"functionality_name\"]]" + else: + value = group_name + "_" + slot["name"] + inner.append(f"{slot['name']} = {value}") + inner_values = ',\n '.join(inner) + outer.append(f"{group_name} = list(\n {inner_values}\n )") + outer_values = ',\n '.join(outer) + return strip_margin( + f'''\ + |cat("Write {arg["key"]} AnnData to file\\n") + |{arg["key"]} <- anndata::AnnData( + | {outer_values} + |) + |{arg["key"]}$write_h5ad(par[["{arg["key"]}"]], compression = "gzip")''' + ) + +def create_python_script(par, config, type): + args = config['functionality']['arguments'] + + # create the arguments of the par string + par_string = ",\n ".join(f"'{arg['key']}': '{arg['value']}'" for arg in args) + + # create code for reading the input h5ad file + read_h5ad_string = "\n".join( + f"{arg['key']} = ad.read_h5ad(par['{arg['key']}'])" + for arg in args + if arg['type'] == "file" + and arg.get('direction', "input") == "input" + ) + + # determine which adata to copy from + copy_from_adata = look_for_adata_arg(args, "method_id" if type == "metric" else "dataset_id") + + # create code for writing the output h5ad files + write_h5ad_string = "\n".join( + write_output_python(arg, copy_from_adata, type == "metric") + for arg in args + if arg["type"] == "file" + and arg.get("direction", "input") == "output" + ) + + if type == 'metric': + processing_string = 
strip_margin(f'''\ + |print('Compute metrics', flush=True) + |# metric_ids and metric_values can have length > 1 + |# but should be of equal length + |uns_metric_ids = [ '{par['name']}' ] + |uns_metric_values = [ 0.5 ]''') + else: + processing_string = strip_margin(f'''\ + |print('Preprocess data', flush=True) + |# ... preprocessing ... + | + |print('Train model', flush=True) + |# ... train model ... + | + |print('Generate predictions', flush=True) + |# ... generate predictions ...''') + + script = strip_margin(f'''\ + |import anndata as ad + | + |## VIASH START + |# Note: this section is auto-generated by viash at runtime. To edit it, make changes + |# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. + |par = {{ + | {par_string} + |}} + |meta = {{ + | 'functionality_name': '{par["name"]}' + |}} + |## VIASH END + | + |print('Reading input files', flush=True) + |{read_h5ad_string} + | + |{processing_string} + | + |{write_h5ad_string} + |''') + + return script + +def create_r_script(par, api_spec, type): + args = api_spec['functionality']['arguments'] + + # create the arguments of the par string + par_string = ",\n ".join(f'{arg["key"]} = "{arg["value"]}"' for arg in args) + + # create helpers for reading the h5ad file + read_h5ad_string = "\n".join( + f'{arg["key"]} <- anndata::read_h5ad(par[["{arg["key"]}"]])' + for arg in args + if arg['type'] == "file" + and arg.get("direction", "input") == "input" + ) + + # determine which adata to copy from + copy_from_adata = look_for_adata_arg(args, "method_id" if type == "metric" else "dataset_id") + + # create code for writing the output h5ad files + write_h5ad_string = "\n".join( + write_output_r(arg, copy_from_adata, type == "metric") + for arg in args + if arg["type"] == "file" + and arg.get("direction", "input") == "output" + ) + + if type == 'metric': + processing_string = strip_margin(f'''\ + |cat("Compute metrics\\n") + |# metric_ids and metric_values can have length > 1 + |# but should be of 
def main(par):
    """Create a new component directory holding a config and a script template.

    Validates the component name and language, reads the API file to find the
    component type, then writes `config.vsh.yaml` and `script.py` / `script.R`
    into `par["output"]`.

    Args:
        par: Parameter dict. This function reads the keys "name", "language",
            "api_file", "viash_yaml" and "output"; the whole dict is also
            passed on to the template helpers.

    Raises:
        AssertionError: If the name is malformed or longer than 50 characters.
        SystemExit: If the language is not recognised, the API file does not
            exist, or the API file has no `.functionality.info.type`.
    """
    ####### CHECK INPUTS #######
    print("Check inputs", flush=True)
    # fullmatch: the whole name has to fit the pattern, not just a prefix of it
    assert re.fullmatch("[a-z][a-z0-9_]*", par["name"]), "Name should match the regular expression '[a-z][a-z0-9_]*'. Example: 'my_component'."
    assert len(par['name']) <= 50, "Method name should be at most 50 characters."

    pretty_name = re.sub("_", " ", par['name']).title()

    ####### CHECK LANGUAGE #######
    print("Check language", flush=True)
    # check language and determine script path
    if par["language"] == "python":
        script_path = "script.py"
    elif par["language"] == "r":
        script_path = "script.R"
    else:
        sys.exit(f"Unrecognized language parameter '{par['language']}'.")

    ## CHECK API FILE
    print("Check API file", flush=True)
    api_file = Path(par["api_file"])
    viash_yaml = Path(par["viash_yaml"])
    project_dir = viash_yaml.parent
    if not api_file.exists():
        # list the component types for which an API file does exist
        comp_types = sorted(
            x.with_suffix("").name.removeprefix("comp_")
            for x in api_file.parent.glob("**/comp_*.y*ml")
        )
        try:
            shown_path = api_file.relative_to(project_dir)
        except ValueError:
            # api_file is not located under the project dir; show it as given
            # rather than hiding the real error behind a path error
            shown_path = api_file
        sys.exit(strip_margin(f"""\
            |Error: Invalid --type argument.
            | Reason: Could not find API file at '{shown_path}'.
            | Possible values for --type: {', '.join(comp_types)}."""))

    ## READ API FILE
    print("Read API file", flush=True)
    api = read_and_merge_yaml(api_file)
    comp_type = api.get("functionality", {}).get("info", {}).get("type", {})
    if not comp_type:
        sys.exit(strip_margin("""\
            |Error: API file is incorrectly formatted.
            | Reason: Could not find component type at `.functionality.info.type`.
            | Please fix the formatting of the API file."""))

    ####### CREATE OUTPUT DIR #######
    print("Create output dir", flush=True)
    out_dir = Path(par["output"])
    # parents=True: intermediate directories may not exist yet
    out_dir.mkdir(parents=True, exist_ok=True)

    ####### CREATE CONFIG #######
    print("Create config", flush=True)
    config_file = out_dir / "config.vsh.yaml"

    # get config template
    config_str = create_config(par, comp_type, pretty_name, script_path)

    with open(config_file, "w") as f:
        f.write(config_str)

    ####### CREATE SCRIPT #######
    print("Create script", flush=True)
    script_file = out_dir / script_path

    # set reasonable values
    set_par_values(api)

    # the language was validated above, so it is either "python" or "r"
    if par["language"] == "python":
        script_out = create_python_script(par, api, comp_type)
    else:
        script_out = create_r_script(par, api, comp_type)

    # write script
    with open(script_file, "w") as f:
        f.write(script_out)

    print("Done!", flush=True)
END + +opv2 = f"{meta['resources_dir']}/openproblems-v2" +output_path = f"{opv2}/src/tasks/label_projection/methods/test_method" + +cmd = [ + meta['executable'], + '--task', 'label_projection', + '--type', 'method', + '--name', 'test_method', + '--language', 'python' +] + +print('>> Running the script as test', flush=True) +out = subprocess.run(cmd, stderr=subprocess.STDOUT, cwd=opv2) + +if out.stdout: + print(out.stdout) + +if out.returncode: + print(f"script: '{cmd}' exited with an error.") + exit(out.returncode) + +print('>> Checking whether output files exist', flush=True) +assert os.path.exists(output_path), "Output dir does not exist" + +conf_f = path.join(output_path, 'config.vsh.yaml') +assert os.path.exists(conf_f), "Config file does not exist" + +script_f = path.join(output_path, "script.py") +assert os.path.exists(script_f), "Script file does not exist" + +print('>> Checking file contents', flush=True) +yaml = YAML(typ='safe', pure=True) +with open(conf_f) as f: + conf_data = yaml.load(f) + +assert conf_data['functionality']['name'] == 'test_method', "Name should be equal to 'test_method'" +# assert conf_data['platforms'][0]['image'] == 'python:3.10', "Python image should be equal to python:3.10" + + +print('All checks succeeded!', flush=True) + diff --git a/src/common/create_task_readme/config.vsh.yaml b/src/common/create_task_readme/config.vsh.yaml new file mode 100644 index 0000000000..d268974ce8 --- /dev/null +++ b/src/common/create_task_readme/config.vsh.yaml @@ -0,0 +1,69 @@ +functionality: + name: create_task_readme + namespace: common + description: | + Create a README for the task. + argument_groups: + - name: Inputs + arguments: + - type: string + name: --task + description: Which task the component will be added to. + example: denoising + required: false + - type: file + name: --task_dir + description: Path to the task directory. 
+ default: src/tasks/${VIASH_PAR_TASK} + required: false + - type: file + name: --viash_yaml + description: | + Path to the project config file. Needed for knowing the relative location of a file to the project root. + default: "_viash.yaml" + - type: string + name: --github_url + description: | + URL to the GitHub repository. Needed for linking to the source code. + default: "https://github.com/openproblems-bio/openproblems-v2/tree/main/" + - name: Outputs + arguments: + - type: file + name: --output + direction: output + description: Path to the component directory. Suggested location is `src/tasks//README.md`. + default: src/tasks/${VIASH_PAR_TASK}/README.md + resources: + - type: r_script + path: script.R + - path: /src/common/helper_functions/read_and_merge_yaml.R + - path: /src/common/helper_functions/read_api_files.R + - path: /src/common/helper_functions/strip_margin.R + test_resources: + - type: r_script + path: test.R + - path: /src + dest: openproblems-v2/src + - path: /_viash.yaml + dest: openproblems-v2/_viash.yaml +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + packages: [dplyr, purrr, rlang, glue, yaml, fs, cli, igraph, rmarkdown, processx] + - type: apt + packages: [jq, curl] + - type: docker + # download and install quarto-*-linux-amd64.deb from latest release + run: | + release_info=$(curl -s https://api.github.com/repos/quarto-dev/quarto-cli/releases/latest) && \ + download_url=$(printf "%s" "$release_info" | jq -r '.assets[] | select(.name | test("quarto-.*-linux-amd64.deb")) | .browser_download_url') && \ + curl -sL "$download_url" -o /opt/quarto.deb && \ + dpkg -i /opt/quarto.deb && \ + rm /opt/quarto.deb + - type: native + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] + diff --git a/src/common/create_task_readme/render_all.sh b/src/common/create_task_readme/render_all.sh new file mode 100755 index 0000000000..e44195c1ed --- /dev/null +++ b/src/common/create_task_readme/render_all.sh 
@@ -0,0 +1,10 @@ +#!/bin/bash + +set -e + +TASK_IDS=`ls src/tasks` + +for task_id in $TASK_IDS; do + echo ">> Processing $task_id" + viash run src/common/create_task_readme/config.vsh.yaml -- --task $task_id +done \ No newline at end of file diff --git a/src/common/create_task_readme/script.R b/src/common/create_task_readme/script.R new file mode 100644 index 0000000000..55388ea7ed --- /dev/null +++ b/src/common/create_task_readme/script.R @@ -0,0 +1,134 @@ +library(rlang, quietly = TRUE, warn.conflicts = FALSE) +library(purrr, quietly = TRUE, warn.conflicts = FALSE) +library(dplyr, quietly = TRUE, warn.conflicts = FALSE) + +## VIASH START +par <- list( + "task" = "batch_integration", + "task_dir" = "src/tasks/batch_integration", + "output" = "src/tasks/batch_integration/README.md", + "viash_yaml" = "_viash.yaml", + "github_url" = "https://github.com/openproblems-bio/openproblems-v2/tree/main/" +) +meta <- list( + "resources_dir" = "src/common/helper_functions", + "temp_dir" = "temp/" +) +## VIASH END + +if (is.null(par$task) && is.null(par$task_dir)) { + stop("Either 'task' or 'task_dir' must be provided") +} +if (is.null(par$viash_yaml)) { + stop("Argument 'viash_yaml' must be provided") +} +if (is.null(par$output)) { + stop("Argument 'output' must be provided") +} + +# import helper function +source(paste0(meta["resources_dir"], "/read_and_merge_yaml.R")) +source(paste0(meta["resources_dir"], "/strip_margin.R")) +source(paste0(meta["resources_dir"], "/read_api_files.R")) + +cat("Read task info\n") +task_api <- read_task_api(par[["task_dir"]]) + +# determine ordering +root <- .task_graph_get_root(task_api) + +r_graph <- render_task_graph(task_api, root) + +cat("Render API details\n") +order <- names(igraph::bfs(task_api$task_graph, root)$order) +r_details <- map_chr( + order, + function(file_name) { + if (file_name %in% names(task_api$comp_specs)) { + render_component(task_api$comp_specs[[file_name]]) + } else { + render_file(task_api$file_specs[[file_name]]) + } 
+ } +) + +cat("Render authors\n") +authors_str <- + if (nrow(task_api$authors) > 0) { + paste0( + "\n## Authors & contributors\n\n", + task_api$authors %>% knitr::kable() %>% paste(collapse = "\n"), + "\n" + ) + } else { + "" + } +readme_str <- + if (is.null(task_api$task_info$readme) || is.na(task_api$task_info$readme)) { + "" + } else { + paste0( + "\n## README\n\n", + task_api$task_info$readme, + "\n" + ) + } + +cat("Generate qmd content\n") +relative_path <- par[["task_dir"]] %>% + gsub(paste0(dirname(par[["viash_yaml"]]), "/*"), "", .) %>% + gsub("/*$", "", .) +source_url <- paste0(par[["github_url"]], relative_path) +qmd_content <- strip_margin(glue::glue(" + §--- + §title: \"{task_api$task_info$label}\" + §format: gfm + §--- + § + § + § + §{task_api$task_info$summary} + § + §Path to source: [`{relative_path}`]({source_url}) + § + §{readme_str} + § + §## Motivation + § + §{task_api$task_info$motivation} + § + §## Description + § + §{task_api$task_info$description} + §{authors_str} + §## API + § + §{r_graph} + § + §{paste(r_details, collapse = '\n\n')} + § + §"), symbol = "§") + +cat("Write README.qmd to file\n") +qmd_file <- tempfile( + pattern = "README_", + fileext = ".qmd", + tmpdir = meta$temp_dir +) + +if (!dir.exists(meta$temp_dir)) { + dir.create(meta$temp_dir, recursive = TRUE) +} +writeLines(qmd_content, qmd_file) + +cat("Render README.qmd to README.md\n") +out <- processx::run( + command = "quarto", + args = c("render", qmd_file, "--output", "-"), + echo = TRUE +) + +writeLines(out$stdout, par$output) diff --git a/src/common/create_task_readme/test.R b/src/common/create_task_readme/test.R new file mode 100644 index 0000000000..9af1fe9738 --- /dev/null +++ b/src/common/create_task_readme/test.R @@ -0,0 +1,30 @@ +requireNamespace("assertthat", quietly = TRUE) + +## VIASH START +## VIASH END + +opv2 <- paste0(meta$resources_dir, "/openproblems-v2") +output_path <- "output.md" + +cat(">> Running the script as test\n") +system(paste( + 
meta["executable"], + "--task", "label_projection", + "--output", output_path, + "--task_dir", paste0(opv2, "/src/tasks/label_projection"), + "--viash_yaml", paste0(opv2, "/_viash.yaml") +)) + +cat(">> Checking whether output files exist\n") +assertthat::assert_that(file.exists(output_path)) + +cat(">> Checking file contents\n") +lines <- readLines(output_path) +assertthat::assert_that(any(grepl("# Label projection", lines))) +assertthat::assert_that(any(grepl("# Description", lines))) +assertthat::assert_that(any(grepl("# Motivation", lines))) +assertthat::assert_that(any(grepl("# Authors", lines))) +assertthat::assert_that(any(grepl("flowchart LR", lines))) +assertthat::assert_that(any(grepl("# File format:", lines))) + +cat("All checks succeeded!\n") diff --git a/src/common/decompress_gzip/config.vsh.yaml b/src/common/decompress_gzip/config.vsh.yaml new file mode 100644 index 0000000000..2716dc554d --- /dev/null +++ b/src/common/decompress_gzip/config.vsh.yaml @@ -0,0 +1,25 @@ +functionality: + name: decompress_gzip + namespace: common + arguments: + - name: --input + type: file + description: Input file + example: /path/to/file.gz + - name: --output + type: file + description: Output file + example: /path/to/file + direction: output + resources: + - type: bash_script + path: script.sh + test_resources: + - type: bash_script + path: test.sh +platforms: + - type: docker + image: ubuntu:latest + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] diff --git a/src/common/decompress_gzip/script.sh b/src/common/decompress_gzip/script.sh new file mode 100644 index 0000000000..f0486b6068 --- /dev/null +++ b/src/common/decompress_gzip/script.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +gunzip "$par_input" -c > "$par_output" \ No newline at end of file diff --git a/src/common/decompress_gzip/test.sh b/src/common/decompress_gzip/test.sh new file mode 100644 index 0000000000..17bb20afbf --- /dev/null +++ b/src/common/decompress_gzip/test.sh @@ -0,0 +1,22 @@ 
+#!/bin/bash + +set -e + +## VIASH START +## VIASH END + +echo "> Creating test file" +echo "Foo bar" > uncompressed.txt + +echo "> Compressing file" +gzip uncompressed.txt -c > compressed.txt.gz + +echo "> Decompressing file" +"$meta_executable" \ + --input "compressed.txt.gz" \ + --output "decompressed.txt" + +echo "> Comparing files" +diff uncompressed.txt decompressed.txt + +echo "> Test succeeded!" \ No newline at end of file diff --git a/src/common/extract_metadata/config.vsh.yaml b/src/common/extract_metadata/config.vsh.yaml new file mode 100644 index 0000000000..76e73cb975 --- /dev/null +++ b/src/common/extract_metadata/config.vsh.yaml @@ -0,0 +1,40 @@ +functionality: + name: extract_metadata + namespace: common + description: Extract the metadata from an h5ad file. + argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + required: true + description: A h5ad file. + - name: --schema + type: file + required: false + description: An optional schema with which to annotate the output + - name: Output + arguments: + - name: --output + type: file + required: true + description: A yaml file containing the metadata. 
+ example: output_meta.yaml + direction: output + resources: + - type: python_script + path: script.py + test_resources: + - path: /resources_test/common/pancreas + - path: /src/datasets/api/file_raw.yaml + - type: python_script + path: test.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + test_setup: + - type: python + packages: viashpy + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/common/extract_metadata/script.py b/src/common/extract_metadata/script.py new file mode 100644 index 0000000000..7a55b50e21 --- /dev/null +++ b/src/common/extract_metadata/script.py @@ -0,0 +1,206 @@ +import anndata as ad +import yaml +import numpy as np +import pandas as pd +import scipy +import os +import datetime + +## VIASH START +par = { + 'input': 'resources_test/common/pancreas/dataset.h5ad', + 'schema': 'src/datasets/api/file_raw.yaml', + 'output': 'output/meta.yaml', +} +## VIASH END + +print('Load data', flush=True) +adata = ad.read_h5ad(par['input']).copy() + +if par["schema"]: + print("Load schema", flush=True) + with open(par["schema"], "r") as f: + schema = yaml.safe_load(f) +else: + schema = None + +#################################################################################################### +## Helper functions for extracting the dataset metadata in uns ## +#################################################################################################### +def is_atomic(obj): + return isinstance(obj, str) or isinstance(obj, int) or isinstance(obj, bool) or isinstance(obj, float) + +def to_atomic(obj): + if isinstance(obj, np.float64): + return float(obj) + elif isinstance(obj, np.int64): + return int(obj) + elif isinstance(obj, np.bool_): + return bool(obj) + elif isinstance(obj, np.str_): + return str(obj) + return obj + +def is_list_of_atomics(obj): + if not isinstance(obj, (list,pd.core.series.Series,np.ndarray)): + return False + return all(is_atomic(elem) for elem in obj) + +def 
to_list_of_atomics(obj): + if isinstance(obj, pd.core.series.Series): + obj = obj.to_numpy() + if isinstance(obj, np.ndarray): + obj = obj.tolist() + return [to_atomic(elem) for elem in obj] + +def is_dict_of_atomics(obj): + if not isinstance(obj, dict): + return False + return all(is_atomic(elem) for _, elem in obj.items()) + +def to_dict_of_atomics(obj): + return {k: to_atomic(v) for k, v in obj.items()} + + +#################################################################################################### +## Helper functions for extracting metadata about the used data structures ## +#################################################################################################### +def get_structure_shape(obj) -> list: + if isinstance(obj, np.ndarray): + return list(obj.shape) + elif scipy.sparse.issparse(obj): + return list(obj.shape) + elif isinstance(obj, pd.core.frame.DataFrame): + return list(obj.shape) + elif isinstance(obj, pd.core.series.Series): + return list(obj.shape) + elif isinstance(obj, list): + return [len(obj)] + elif isinstance(obj, dict): + return [len(obj)] + elif is_atomic(obj): + return [1] + return None + +def get_structure_type(obj) -> str: + # return one of: atomic, dataFrame, vector, dict, denseMatrix, sparseMatrix + if is_atomic(obj): + return "atomic" + elif isinstance(obj, (list,pd.core.series.Series)): + return "vector" + elif isinstance(obj, dict): + return "dict" + elif isinstance(obj, pd.core.frame.DataFrame): + return "dataframe" + elif scipy.sparse.issparse(obj): + return "sparsematrix" + elif isinstance(obj, np.ndarray): + return "densematrix" + return "other: " + str(type(obj)) + +def get_structure_dtype(obj) -> str: + if isinstance(obj, np.ndarray): + return obj.dtype.name + elif isinstance(obj, pd.core.series.Series): + return obj.dtype.name + elif isinstance(obj, pd.core.frame.DataFrame): + return [dtype.name for dtype in obj.dtypes] + elif scipy.sparse.issparse(obj): + return obj.dtype.name + elif is_atomic(obj): + 
return type(obj).__name__ + return None + +def get_structure_schema_info(struct, key) -> dict: + if schema is None: + return {} + struct_args = schema.get("info", {}).get("slots", {}).get(struct, {}) + if struct_args is None: + return {} + if struct == "X": + return struct_args + + # look for item with the correct name + struct_results = [x for x in struct_args if x.get("name") == key] + + # return None if no match is found + if len(struct_results) != 1: + return {} + + return struct_results[0] + +def get_structure(adata, struct): + adata_struct = getattr(adata, struct) + + # turn `adata_struct` into a dict for `X` + if (struct == "X"): + adata_struct = {"X": adata_struct} if adata_struct is not None else {} + + output = [] + + for key, value in adata_struct.items(): + out = { + "name": key, + "type": get_structure_type(value), + "shape": get_structure_shape(value), + "dtype": get_structure_dtype(value), + } + + # see if the schema has information about this struct + schema_info = get_structure_schema_info(struct, key) + + if schema_info.get("description"): + out["description"] = schema_info.get("description") + if schema_info.get("type"): + out["schema_type"] = schema_info.get("type") + + output.append(out) + + return output + +#################################################################################################### +## Other helper functions ## +#################################################################################################### + +def get_file_size(path: str) -> int: + """Get the file size in bytes of the file at the given path.""" + return os.path.getsize(path) + +def get_file_creation_time(path: str) -> str: + """Get the creation time of the file at the given path.""" + # Get file creation time + creation_time = os.path.getctime(path) + # Convert creation time from seconds since epoch to a readable timestamp + creation_time = datetime.datetime.fromtimestamp(creation_time) + # Format the datetime object as 'DD-MM-YYYY' + creation_time 
= creation_time.strftime('%d-%m-%Y') + return str(creation_time) + + +print("Extract metadata from object", flush=True) +# Extract metadata about the adata object +uns = {} +for key, val in adata.uns.items(): + if is_atomic(val): + uns[key] = to_atomic(val) + elif is_list_of_atomics(val) and len(val) <= 10: + uns[key] = to_list_of_atomics(val) + elif is_dict_of_atomics(val) and len(val) <= 10: + uns[key] = to_dict_of_atomics(val) + +uns["file_size"] = get_file_size(par["input"]) +uns["date_created"] = get_file_creation_time(par["input"]) + +# Extract metadata about the data structures +structure = { + struct: get_structure(adata, struct) + for struct + in ["X", "obs", "var", "obsp", "varp", "obsm", "varm", "layers", "uns"] +} + +# ¢reate metadata object +meta = {"uns": uns, "structure": structure} + +print("Write metadata to file", flush=True) +with open(par["output"], "w") as f: + yaml.dump(meta, f, indent=2) diff --git a/src/common/extract_metadata/test.py b/src/common/extract_metadata/test.py new file mode 100644 index 0000000000..8af023d8f6 --- /dev/null +++ b/src/common/extract_metadata/test.py @@ -0,0 +1,26 @@ +import sys +import re +import pytest +import json +import subprocess + +## VIASH START +## VIASH END + +input_path = meta["resources_dir"] + "/pancreas/dataset.h5ad" +schema_path = meta["resources_dir"] + "/file_raw.yaml" + +def test_run(run_component, tmp_path): + output_path = tmp_path / "meta.yaml" + + run_component([ + "--input", input_path, + "--schema", schema_path, + "--output", str(output_path), + ]) + + assert output_path.exists(), "Output path does not exist" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/common/extract_scores/config.vsh.yaml b/src/common/extract_scores/config.vsh.yaml new file mode 100644 index 0000000000..72270b7a95 --- /dev/null +++ b/src/common/extract_scores/config.vsh.yaml @@ -0,0 +1,35 @@ +functionality: + name: "extract_scores" + status: disabled + namespace: "common" + 
description: "Extract evaluation data frame on output" + arguments: + - name: "--input" + alternatives: ["-i"] + type: "file" + multiple: true + default: "input.h5ad" + description: "Input h5ad files containing metadata and metrics in adata.uns" + - name: "--column_names" + type: "string" + multiple: true + default: [ "dataset_id", "method_id", "metric_ids", "metric_values" ] + description: "Which fields from adata.uns to extract and store as a data frame." + - name: "--output" + alternatives: ["-o"] + type: "file" + direction: "output" + default: "output.tsv" + description: "Output tsv" + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ tidyverse ] + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] diff --git a/src/common/extract_scores/script.R b/src/common/extract_scores/script.R new file mode 100644 index 0000000000..6b540380ab --- /dev/null +++ b/src/common/extract_scores/script.R @@ -0,0 +1,30 @@ +cat("Loading dependencies\n") +library(anndata, warn.conflicts = FALSE) +options(tidyverse.quiet = TRUE) +library(tidyverse) +library(assertthat) + +## VIASH START +par <- list( + input = "resources_test/label_projection/pancreas/knn_accuracy.h5ad", + output = "scores.tsv" +) +inp <- par$input[[1]] +## VIASH END + +cat("Reading input h5ad files\n") +scores <- map_df(par$input, function(inp) { + cat("Reading '", inp, "'\n", sep = "") + ad <- read_h5ad(inp) + + for (uns_name in par$column_names) { + assert_that( + uns_name %in% names(ad$uns), + msg = paste0("File ", inp, " must contain `uns['", uns_name, "']`") + ) + } + + data.frame(ad$uns[par$column_names]) +}) + +write_tsv(scores, par$output) diff --git a/src/common/helper_functions/read_and_merge_yaml.R b/src/common/helper_functions/read_and_merge_yaml.R new file mode 100644 index 0000000000..932d3feb92 --- /dev/null +++ b/src/common/helper_functions/read_and_merge_yaml.R @@ -0,0 +1,144 @@ +#' Read a Viash 
YAML +#' +#' If the YAML contains a "__merge__" key anywhere in the yaml, +#' the path specified in that YAML will be read and the two +#' lists will be merged. This is a recursive procedure. +#' +#' @param path Path to Viash YAML +read_and_merge_yaml <- function(path, project_path = .ram_find_project(path)) { + path <- normalizePath(path, mustWork = FALSE) + data <- tryCatch({ + suppressWarnings(yaml::read_yaml(path)) + }, error = function(e) { + stop("Could not read ", path, ". Error: ", e) + }) + .ram_process_merge(data, data, path, project_path) +} + +.ram_find_project <- function(path) { + path <- normalizePath(path, mustWork = FALSE) + check <- paste0(dirname(path), "/_viash.yaml") + if (file.exists(check)) { + dirname(check) + } else if (check == "//_viash.yaml") { + NULL + } else { + .ram_find_project(dirname(check)) + } +} + +.ram_is_named_list <- function(obj) { + is.null(obj) || (is.list(obj) && (length(obj) == 0 || !is.null(names(obj)))) +} + +.ram_process_merge <- function(data, root_data, path, project_path) { + if (.ram_is_named_list(data)) { + # check whether children have `__merge__` entries + processed_data <- lapply(data, function(dat) { + .ram_process_merge(dat, root_data, path, project_path) + }) + processed_data <- lapply(names(data), function(nm) { + dat <- data[[nm]] + .ram_process_merge(dat, root_data, path, project_path) + }) + names(processed_data) <- names(data) + + # if current element has __merge__, read list2 yaml and combine with data + new_data <- + if ("__merge__" %in% names(processed_data) && !.ram_is_named_list(processed_data$`__merge__`)) { + new_data_path <- .ram_resolve_path( + path = processed_data$`__merge__`, + project_path = project_path, + parent_path = dirname(path) + ) + read_and_merge_yaml(new_data_path, project_path) + } else if ("$ref" %in% names(processed_data) && !.ram_is_named_list(processed_data$`$ref`)) { + ref_parts <- strsplit(processed_data$`$ref`, "#")[[1]] + + # resolve the path in $ref + x <- + if 
(ref_parts[[1]] == "") { + root_data + } else { + new_data_path <- .ram_resolve_path( + path = ref_parts[[1]], + project_path = project_path, + parent_path = dirname(path) + ) + new_data_path <- normalizePath(new_data_path, mustWork = FALSE) + + # read in the new data + tryCatch({ + suppressWarnings(yaml::read_yaml(new_data_path)) + }, error = function(e) { + stop("Could not read ", new_data_path, ". Error: ", e) + }) + } + x_root <- x + + + # Navigate the path and retrieve the referenced data + ref_path_parts <- unlist(strsplit(ref_parts[[2]], "/")) + for (part in ref_path_parts) { + if (part == "") { + next + } else if (part %in% names(x)) { + x <- x[[part]] + } else { + stop("Could not find ", processed_data$`$ref`, " in ", path) + } + } + + # postprocess the new data + if (ref_parts[[1]] == "") { + x + } else { + .ram_process_merge(x, x_root, new_data_path, project_path) + } + } else { + list() + } + + .ram_deep_merge(new_data, processed_data) + } else if (is.list(data)) { + lapply(data, function(dat) { + .ram_process_merge(dat, root_data, path, project_path) + }) + } else { + data + } +} + +.ram_resolve_path <- function(path, project_path, parent_path) { + ifelse( + grepl("^/", path), + paste0(project_path, "/", path), + fs::path_abs(path, parent_path) + ) +} + +.ram_deep_merge <- function(list1, list2) { + if (.ram_is_named_list(list1) && .ram_is_named_list(list2)) { + # if list1 and list2 are objects, recursively merge + keys <- unique(c(names(list1), names(list2))) + out <- lapply(keys, function(key) { + if (key %in% names(list1)) { + if (key %in% names(list2)) { + .ram_deep_merge(list1[[key]], list2[[key]]) + } else { + list1[[key]] + } + } else { + list2[[key]] + } + }) + names(out) <- keys + out + } else if (is.list(list1) && is.list(list2)) { + # if list1 and list2 are both lists, append + c(list1, list2) + } else { + # else override list1 with list2 + list2 + } +} \ No newline at end of file diff --git 
a/src/common/helper_functions/read_and_merge_yaml.py b/src/common/helper_functions/read_and_merge_yaml.py new file mode 100644 index 0000000000..b74995aed1 --- /dev/null +++ b/src/common/helper_functions/read_and_merge_yaml.py @@ -0,0 +1,52 @@ +def read_and_merge_yaml(path): + """Read a Viash YAML + + If the YAML contains a "__merge__" key anywhere in the yaml, + the path specified in that YAML will be read and the two + lists will be merged. This is a recursive procedure. + + Arguments: + path -- Path to the Viash YAML""" + from ruamel.yaml import YAML + + yaml = YAML(typ='safe', pure=True) + + with open(path, 'r') as stream: + data = yaml.load(stream) + return _ram_process_merge(data, path) + +def _ram_deep_merge(dict1, dict2): + if isinstance(dict1, dict) and isinstance(dict2, dict): + keys = set(list(dict1.keys()) + list(dict2.keys())) + out = {} + for key in keys: + if key in dict1: + if key in dict2: + out[key] = _ram_deep_merge(dict1[key], dict2[key]) + else: + out[key] = dict1[key] + else: + out[key] = dict2[key] + return out + elif isinstance(dict1, list) and isinstance(dict2, list): + return dict1 + dict2 + else: + return dict2 + +def _ram_process_merge(data, path): + import os + if isinstance(data, dict): + processed_data = {k: _ram_process_merge(v, path) for k, v in data.items()} + + if "__merge__" in processed_data: + new_data_path = os.path.join(os.path.dirname(path), processed_data["__merge__"]) + new_data = read_and_merge_yaml(new_data_path) + else: + new_data = {} + + return _ram_deep_merge(new_data, processed_data) + elif isinstance(data, list): + return [_ram_process_merge(dat, path) for dat in data] + else: + return data + diff --git a/src/common/helper_functions/read_anndata_partial.py b/src/common/helper_functions/read_anndata_partial.py new file mode 100644 index 0000000000..efbea0592d --- /dev/null +++ b/src/common/helper_functions/read_anndata_partial.py @@ -0,0 +1,77 @@ +import warnings +from pathlib import Path +import anndata as ad 
+import h5py +from scipy.sparse import csr_matrix +from anndata.experimental import read_elem, sparse_dataset + + +def read_anndata( + file: str, + backed: bool = False, + **kwargs +) -> ad.AnnData: + """ + Read anndata file + :param file: path to anndata file in h5ad format + :param kwargs: AnnData parameter to group mapping + """ + assert Path(file).exists(), f'File not found: {file}' + + f = h5py.File(file, 'r') + kwargs = {x: x for x in f} if not kwargs else kwargs + if len(f.keys()) == 0: + return ad.AnnData() + # check if keys are available + for name, slot in kwargs.items(): + if slot not in f: + warnings.warn( + f'Cannot find "{slot}" for AnnData parameter `{name}` from "{file}"' + ) + adata = read_partial(f, backed=backed, **kwargs) + if not backed: + f.close() + + return adata + + +def read_partial( + group: h5py.Group, + backed: bool = False, + force_sparse_types: [str, list] = None, + **kwargs +) -> ad.AnnData: + """ + Partially read h5py groups + :params group: file group + :params force_sparse_types: encoding types to convert to sparse_dataset via csr_matrix + :params backed: read sparse matrix as sparse_dataset + :params **kwargs: dict of slot_name: slot, by default use all available slot for the h5py file + :return: AnnData object + """ + if force_sparse_types is None: + force_sparse_types = [] + elif isinstance(force_sparse_types, str): + force_sparse_types = [force_sparse_types] + slots = {} + if backed: + print('Read as backed sparse matrix...') + + for slot_name, slot in kwargs.items(): + print(f'Read slot "{slot}", store as "{slot_name}"...') + if slot not in group: + warnings.warn(f'Slot "{slot}" not found, skip...') + slots[slot_name] = None + else: + elem = group[slot] + iospec = ad._io.specs.get_spec(elem) + if iospec.encoding_type in ("csr_matrix", "csc_matrix") and backed: + slots[slot_name] = sparse_dataset(elem) + elif iospec.encoding_type in force_sparse_types: + slots[slot_name] = csr_matrix(read_elem(elem)) + if backed: + 
slots[slot_name] = sparse_dataset(slots[slot_name]) + else: + slots[slot_name] = read_elem(elem) + return ad.AnnData(**slots) + diff --git a/src/common/helper_functions/read_api_files.R b/src/common/helper_functions/read_api_files.R new file mode 100644 index 0000000000..be602b58c4 --- /dev/null +++ b/src/common/helper_functions/read_api_files.R @@ -0,0 +1,493 @@ + +anndata_struct_names <- c("obs", "var", "obsm", "obsp", "varm", "varp", "layers", "uns") + +read_file_spec <- function(path) { + spec <- read_and_merge_yaml(path) + out <- list( + info = read_file_info(spec, path) + ) + if (out$info$file_type == "h5ad" || "slots" %in% names(spec$info)) { + out$info$file_type <- "h5ad" + out$slots <- read_anndata_slots(spec, path) + } + if (out$info$file_type == "csv" || out$info$file_type == "tsv" || out$info$file_type == "parquet") { + out$columns <- read_tabular_columns(spec, path) + } + out +} +read_file_info <- function(spec, path) { + # TEMP: make it readable + spec$info$slots <- NULL + df <- list_as_tibble(spec) + if (list_contains_tibble(spec$info)) { + df <- dplyr::bind_cols(df, list_as_tibble(spec$info)) + } + df$file_name <- basename(path) %>% gsub("\\.yaml", "", .) + df$description <- df$description %||% NA_character_ %>% as.character + df$summary <- df$summary %||% NA_character_ %>% as.character + as_tibble(df) +} +read_anndata_slots <- function(spec, path) { + map_df( + anndata_struct_names, + function(struct_name, slot) { + slot <- spec$info$slots[[struct_name]] + if (is.null(slot)) return(NULL) + df <- map_df(slot, as.data.frame) + df$struct <- struct_name + df$file_name <- basename(path) %>% gsub("\\.yaml", "", .) + df$required <- df$required %||% TRUE %|% TRUE + df$multiple <- df$multiple %||% FALSE %|% FALSE + as_tibble(df) + } + ) +} +read_tabular_columns <- function(spec, path) { + map_df( + spec$info$columns, + function(column) { + df <- list_as_tibble(column) + df$file_name <- basename(path) %>% gsub("\\.yaml", "", .) 
+ df$required <- df$required %||% TRUE %|% TRUE + df$multiple <- df$multiple %||% FALSE %|% FALSE + as_tibble(df) + } + ) +} + +format_file_format <- function(spec) { + if (spec$info$file_type == "h5ad") { + example <- spec$slots %>% + group_by(struct) %>% + summarise( + str = paste0(unique(struct), ": ", paste0("'", name, "'", collapse = ", ")) + ) %>% + arrange(match(struct, anndata_struct_names)) + + c(" AnnData object", paste0(" ", example$str)) + } else if (spec$info$file_type == "csv" || spec$info$file_type == "tsv" || spec$info$file_type == "parquet") { + example <- spec$columns %>% + summarise( + str = paste0("'", name, "'", collapse = ", ") + ) + + c(" Tabular data", paste0(" ", example$str)) + } else { + "" + } +} + +format_file_format_as_kable <- function(spec) { + if (spec$info$file_type == "h5ad") { + spec$slots %>% + mutate( + tag_str = pmap_chr(lst(required), function(required) { + out <- c() + if (!required) { + out <- c(out, "Optional") + } + if (length(out) == 0) { + "" + } else { + paste0("(_", paste(out, collapse = ", "), "_) ") + } + }) + ) %>% + transmute( + Slot = paste0("`", struct, "[\"", name, "\"]`"), + Type = paste0("`", type, "`"), + Description = paste0( + tag_str, + description %>% gsub(" *\n *", " ", .) %>% gsub("\\. *$", "", .), + "." + ) + ) %>% + knitr::kable() + } else if (spec$info$file_type == "csv" || spec$info$file_type == "tsv" || spec$info$file_type == "parquet") { + spec$columns %>% + mutate( + tag_str = pmap_chr(lst(required), function(required) { + out <- c() + if (!required) { + out <- c(out, "Optional") + } + if (length(out) == 0) { + "" + } else { + paste0("(_", paste(out, collapse = ", "), "_) ") + } + }) + ) %>% + transmute( + Column = paste0("`", name, "`"), + Type = paste0("`", type, "`"), + Description = paste0( + tag_str, + description %>% gsub(" *\n *", " ", .) %>% gsub("\\. *$", "", .), + "." 
+ ) + ) %>% + knitr::kable() + } else { + "" + } +} + +list_contains_tibble <- function(li) { + is.list(li) && any(sapply(li, is.atomic)) +} + +list_as_tibble <- function(li) { + as.data.frame(li[sapply(li, is.atomic)], check.names = FALSE) +} + +read_comp_spec <- function(path) { + spec_yaml <- read_and_merge_yaml(path) + list( + info = read_comp_info(spec_yaml, path), + args = read_comp_args(spec_yaml, path) + ) +} + +read_comp_info <- function(spec_yaml, path) { + # TEMP: make it readable + spec_yaml$functionality$arguments <- NULL + spec_yaml$functionality$argument_groups <- NULL + + df <- list_as_tibble(spec_yaml$functionality) + if (nrow(df) == 0) { + df <- data.frame(a = 1)[, integer(0)] + } + if (list_contains_tibble(spec_yaml$functionality$info)) { + df <- dplyr::bind_cols(df, list_as_tibble(spec_yaml$functionality$info)) + } + if (list_contains_tibble(spec_yaml$functionality$info$type_info)) { + df <- dplyr::bind_cols(df, list_as_tibble(spec_yaml$functionality$info$type_info)) + } + df$file_name <- basename(path) %>% gsub("\\.yaml", "", .) + as_tibble(df) +} + +read_comp_args <- function(spec_yaml, path) { + arguments <- spec_yaml$functionality$arguments + for (arg_group in spec_yaml$functionality$argument_groups) { + arguments <- c(arguments, arg_group$arguments) + } + map_df(arguments, function(arg) { + df <- list_as_tibble(arg) + if (list_contains_tibble(arg$info)) { + df <- dplyr::bind_cols(df, list_as_tibble(arg$info)) + } + df$file_name <- basename(path) %>% gsub("\\.yaml", "", .) + df$arg_name <- gsub("^-*", "", arg$name) + df$direction <- df$direction %||% "input" %|% "input" + df$parent <- df$`__merge__` %||% NA_character_ %>% basename() %>% gsub("\\.yaml", "", .) 
+ df$required <- df$required %||% FALSE %|% FALSE + df$default <- df$default %||% NA_character_ %>% as.character + df$example <- df$example %||% NA_character_ %>% as.character + df$description <- df$description %||% NA_character_ %>% as.character + df$summary <- df$summary %||% NA_character_ %>% as.character + df + }) +} + +format_comp_args_as_tibble <- function(spec) { + if (nrow(spec$args) == 0) return("") + spec$args %>% + mutate( + tag_str = pmap_chr(lst(required, direction), function(required, direction) { + out <- c() + if (!required) { + out <- c(out, "Optional") + } + if (direction == "output") { + out <- c(out, "Output") + } + if (length(out) == 0) { + "" + } else { + paste0("(_", paste(out, collapse = ", "), "_) ") + } + }) + ) %>% + transmute( + Name = paste0("`--", arg_name, "`"), + Type = paste0("`", type, "`"), + Description = paste0( + tag_str, + (summary %|% description) %>% gsub(" *\n *", " ", .) %>% gsub("\\. *$", "", .), + ".", + ifelse(!is.na(default), paste0(" Default: `", default, "`."), "") + ) + ) %>% + knitr::kable() +} + +# path <- "src/datasets/api/comp_processor_knn.yaml" +render_component <- function(spec) { + if (is.character(spec)) { + spec <- read_comp_spec(spec) + } + + strip_margin(glue::glue(" + §## Component type: {spec$info$label} + § + §Path: [`src/{spec$info$namespace}`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/{spec$info$namespace}) + § + §{spec$info$summary} + § + §Arguments: + § + §:::{{.small}} + §{paste(format_comp_args_as_tibble(spec), collapse = '\n')} + §::: + § + §"), symbol = "§") +} + +# path <- "src/datasets/api/file_pca.yaml" +render_file <- function(spec) { + if (is.character(spec)) { + spec <- read_file_spec(spec) + } + + if (!"label" %in% names(spec$info)) { + spec$info$label <- basename(spec$info$example) + } + + example <- + if (is.null(spec$info$example) || is.na(spec$info$example)) { + "" + } else { + paste0("Example file: `", spec$info$example, "`") + } + + description <- + if 
(is.null(spec$info$description) || is.na(spec$info$description)) { + "" + } else { + paste0("Description:\n\n", spec$info$description) + } + + strip_margin(glue::glue(" + §## File format: {spec$info$label} + § + §{spec$info$summary %||% ''} + § + §{example} + § + §{description} + § + §Format: + § + §:::{{.small}} + §{paste(format_file_format(spec), collapse = '\n')} + §::: + § + §Slot description: + § + §:::{{.small}} + §{paste(format_file_format_as_kable(spec), collapse = '\n')} + §::: + § + §"), symbol = "§") +} + +# path <- "src/tasks/denoising" +read_task_api <- function(path) { + cli::cli_inform("Looking for project root") + project_path <- .ram_find_project(path) + api_dir <- paste0(path, "/api") + + cli::cli_inform("Reading task info") + task_info_yaml <- list.files(api_dir, pattern = "task_info.ya?ml", full.names = TRUE) + assertthat::assert_that(length(task_info_yaml) == 1) + task_info <- read_and_merge_yaml(task_info_yaml, project_path) + + cli::cli_inform("Reading task authors") + authors <- map_df(task_info$authors, function(aut) { + aut$roles <- paste(aut$roles, collapse = ", ") + list_as_tibble(aut) + }) + + cli::cli_inform("Reading component yamls") + comp_yamls <- list.files(api_dir, pattern = "comp_.*\\.ya?ml", full.names = TRUE) + comps <- map(comp_yamls, read_comp_spec) + comp_info <- map_df(comps, "info") + comp_args <- map_df(comps, "args") + names(comps) <- basename(comp_yamls) %>% gsub("\\..*$", "", .) + + cli::cli_inform("Reading file yamls") + file_yamls <- .ram_resolve_path( + path = na.omit(unique(comp_args$`__merge__`)), + project_path = project_path, + parent_path = api_dir + ) + files <- map(file_yamls, read_file_spec) + names(files) <- basename(file_yamls) %>% gsub("\\..*$", "", .) 
+ file_info <- map_df(files, "info") + file_slots <- map_df(files, "slots") + + cli::cli_inform("Generating task graph") + task_graph <- create_task_graph(file_info, comp_info, comp_args) + + list( + task_info = task_info, + file_specs = files, + file_info = file_info, + file_slots = file_slots, + comp_specs = comps, + comp_info = comp_info, + comp_args = comp_args, + task_graph = task_graph, + authors = authors + ) +} + + +create_task_graph <- function(file_info, comp_info, comp_args) { + clean_id <- function(id) { + gsub("graph", "graaf", id) + } + nodes <- + bind_rows( + file_info %>% + mutate(id = file_name, label = label, is_comp = FALSE), + comp_info %>% + mutate(id = file_name, label = label, is_comp = TRUE) + ) %>% + select(id, label, everything()) %>% + mutate(str = paste0( + " ", + clean_id(id), + ifelse(is_comp, "[/\"", "(\""), + label, + ifelse(is_comp, "\"/]", "\")") + )) + edges <- bind_rows( + comp_args %>% + filter(type == "file", direction == "input") %>% + mutate( + from = parent, + to = file_name, + arrow = "---" + ), + comp_args %>% + filter(type == "file", direction == "output") %>% + mutate( + from = file_name, + to = parent, + arrow = "-->" + ) + ) %>% + select(from, to, everything()) %>% + mutate(str = paste0(" ", clean_id(from), arrow, clean_id(to))) + + igraph::graph_from_data_frame( + edges, + vertices = nodes, + directed = TRUE + ) +} + +.task_graph_get_root <- function(task_api) { + root <- names(which(igraph::degree(task_api$task_graph, mode = "in") == 0)) + if (length(root) > 1) { + warning( + "There should probably only be one node with in-degree equal to 0.\n", + " Nodes with in-degree == 0: ", paste(root, collapse = ", ") + ) + } + root[[1]] +} + +render_task_graph <- function(task_api, root = .task_graph_get_root(task_api)) { + order <- names(igraph::bfs(task_api$task_graph, root)$order) + + vdf <- igraph::as_data_frame(task_api$task_graph, "vertices") %>% + arrange(match(name, order)) + edf <- 
igraph::as_data_frame(task_api$task_graph, "edges") %>% + arrange(match(from, order), match(to, order)) + + strip_margin(glue::glue(" + §```mermaid + §flowchart LR + §{paste(vdf$str, collapse = '\n')} + §{paste(edf$str, collapse = '\n')} + §``` + §"), symbol = "§") +} + + + +# Recursive function to process each property with indentation +.render_example_process_property <- function(prop, prop_name = NULL, indent_level = 0) { + if (is.null(prop_name)) { + prop_name <- "" + } + + out <- c() + + # define helper variables + indent_spaces <- strrep(" ", indent_level) + next_indent_spaces <- strrep(" ", indent_level + 2) + + # add comment if available + if ("description" %in% names(prop)) { + comment <- gsub("\n", paste0("\n", indent_spaces, "# "), stringr::str_trim(prop$description)) + out <- c(out, indent_spaces, "# ", comment, "\n") + } + + # add variable + out <- c(out, indent_spaces, prop_name, ": ") + + if (prop$type == "object" && "properties" %in% names(prop)) { + # Handle object with properties + prop_names <- setdiff(names(prop$properties), "additionalProperties") + sub_props <- unlist(lapply(prop_names, function(sub_prop_name) { + prop_out <- .render_example_process_property( + prop$properties[[sub_prop_name]], + sub_prop_name, + indent_level + 2 + ) + c(prop_out, "\n") + })) + c(out, "\n", sub_props[-length(sub_props)]) + } else if (prop$type == "array") { + if (is.list(prop$items) && "properties" %in% names(prop$items)) { + # Handle array of objects + array_items_yaml <- unlist(lapply(names(prop$items$properties), function(item_prop_name) { + prop_out <- .render_example_process_property( + prop$items$properties[[item_prop_name]], + item_prop_name, + indent_level + 4 + ) + c(prop_out, "\n") + })) + c(out, "\n", next_indent_spaces, "- ", array_items_yaml[-1]) + } else { + # Handle simple array + c(out, "[ ... 
]")
+    }
+  } else {
+    c(out, "...")
+  }
+}
+
+# Function for rendering an example yaml based on a JSON schema
+render_example <- function(json_schema) {
+  if (!"properties" %in% names(json_schema)) {
+    return("")
+  }
+  text <-
+    unlist(lapply(names(json_schema$properties), function(prop_name) {
+      out <- .render_example_process_property(
+        json_schema$properties[[prop_name]],
+        prop_name,
+        0
+      )
+      c(out, "\n")
+    }))
+
+  paste(text, collapse = "")
+}
\ No newline at end of file
diff --git a/src/common/helper_functions/setup_logger.py b/src/common/helper_functions/setup_logger.py
new file mode 100644
index 0000000000..ae71eb9611
--- /dev/null
+++ b/src/common/helper_functions/setup_logger.py
@@ -0,0 +1,30 @@
+def setup_logger():
+    """Configure the root logger to write INFO-level messages to stdout.
+
+    Safe to call more than once: a stdout handler is attached only if the
+    root logger does not already have one, so repeated calls do not produce
+    duplicate log lines.
+
+    Returns:
+        The root ``logging.Logger``.
+    """
+    import logging
+    from sys import stdout
+
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+
+    # Re-running setup must not stack handlers, otherwise every record is
+    # emitted once per call.
+    has_stdout_handler = any(
+        isinstance(handler, logging.StreamHandler)
+        and getattr(handler, "stream", None) is stdout
+        for handler in logger.handlers
+    )
+    if not has_stdout_handler:
+        console_handler = logging.StreamHandler(stdout)
+        log_formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+        console_handler.setFormatter(log_formatter)
+        logger.addHandler(console_handler)
+
+    return logger
\ No newline at end of file
diff --git a/src/common/helper_functions/strip_margin.R b/src/common/helper_functions/strip_margin.R
new file mode 100644
index 0000000000..3830d58d79
--- /dev/null
+++ b/src/common/helper_functions/strip_margin.R
@@ -0,0 +1,3 @@
+strip_margin <- function(text, symbol = "\\|") {
+  gsub(paste0("(^|\n)[ \t]*", symbol), "\\1", text)
+}
\ No newline at end of file
diff --git a/src/common/helper_functions/strip_margin.py b/src/common/helper_functions/strip_margin.py
new file mode 100644
index 0000000000..fbfb39dec9
--- /dev/null
+++ b/src/common/helper_functions/strip_margin.py
@@ -0,0 +1,14 @@
+def strip_margin(text: str, symbol: str = r"\|") -> str:
+    """Strip the leading margin (whitespace plus a marker) from each line.
+
+    Arguments:
+    text -- The text to process (required).
+    symbol -- Regular expression matching the margin marker. Defaults to a
+      literal pipe, mirroring the R helper of the same name.
+
+    Returns the text in which, for every line starting with optional spaces or
+    tabs followed by the marker, that prefix (marker included) is removed.
+    """
+    import re
+    # Raw strings: "\|" in a normal literal is an invalid escape sequence.
+    return re.sub(r"(^|\n)[ \t]*" + symbol, r"\1", text)
\ No newline at end of file
diff --git a/src/common/helper_functions/subset_anndata.py b/src/common/helper_functions/subset_anndata.py
new file mode 100644
index 0000000000..80bd160872
--- /dev/null
+++
b/src/common/helper_functions/subset_anndata.py
@@ -0,0 +1,96 @@
+"""Helper functions related to subsetting AnnData objects based on the file format
+specifications in the .config.vsh.yaml and slot mapping overrides."""
+
+def read_config_slots_info(config_file, slot_mapping = None):
+    """Read the .config.vsh.yaml to find out which output slots need to be copied to which output file.
+
+    Arguments:
+    config_file -- Path to the .config.vsh.yaml file (required).
+    slot_mapping -- Which slots to retain. Must be a dictionary whose keys are the names
+      of the AnnData structs, and values is another dictionary with destination value
+      names as keys and source value names as values. Defaults to no overrides.
+      Example of slot_mapping:
+      ```
+      slot_mapping = {
+        "layers": {
+          "counts": par["layer_counts"],
+        },
+        "obs": {
+          "cell_type": par["obs_cell_type"],
+          "batch": par["obs_batch"],
+        }
+      }
+      ```
+
+    Returns a dictionary mapping each output argument name (leading dashes removed)
+    to a {struct: {destination slot: source slot}} dictionary.
+    """
+    import yaml
+
+    # None instead of a mutable {} default, which would be shared across calls
+    if slot_mapping is None:
+        slot_mapping = {}
+
+    # read output spec from yaml
+    with open(config_file, "r") as config_handle:
+        config = yaml.safe_load(config_handle)
+
+    # arguments may be listed directly and/or inside argument groups
+    functionality = config["functionality"]
+    arguments = list(functionality.get("arguments") or [])
+    for arg_group in functionality.get("argument_groups") or []:
+        arguments.extend(a for a in arg_group.get("arguments") or [] if isinstance(a, dict))
+
+    output_struct_slots = {}
+
+    # fetch info on which slots should be copied to which file
+    for arg in arguments:
+        struct_slots = (arg.get("info") or {}).get("slots")
+        # argument is an output file with a slot specification
+        if arg.get("direction") == "output" and struct_slots:
+            # strip only the leading dashes, e.g. "--output_train" -> "output_train"
+            object_name = arg["name"].lstrip("-")
+
+            out = {}
+            for (struct, slots) in struct_slots.items():
+                out_struct = {}
+                for slot in slots:
+                    # if slot_mapping[struct][slot['name']] exists, use that as the source slot name
+                    # otherwise use slot['name']
+                    source_slot = slot_mapping.get(struct, {}).get(slot["name"], slot["name"])
+                    out_struct[slot["name"]] = source_slot
+                out[struct] = out_struct
+
+            output_struct_slots[object_name] = out
+
+    return output_struct_slots
+
+# create new anndata objects according to api spec
+def subset_anndata(adata, slot_info):
+    """Create new anndata object according to slot info specifications.
+ + Arguments: + adata -- An AnnData object to subset (required) + slot_info -- Which slots to retain, typically one of the items in the output of read_config_slots_info. + Must be a dictionary whose keys are the names of the AnnData structs, and values is another + dictionary with destination value names as keys and source value names as values. + """ + import pandas as pd + import anndata as ad + + structs = ["layers", "obs", "var", "uns", "obsp", "obsm", "varp", "varm"] + kwargs = {} + + for struct in structs: + slot_mapping = slot_info.get(struct, {}) + data = {dest : getattr(adata, struct)[src] for (dest, src) in slot_mapping.items()} + if len(data) > 0: + if struct in ['obs', 'var']: + data = pd.concat(data, axis=1) + kwargs[struct] = data + elif struct in ['obs', 'var']: + # if no columns need to be copied, we still need an 'obs' and a 'var' + # to help determine the shape of the adata + kwargs[struct] = getattr(adata, struct).iloc[:,[]] + + return ad.AnnData(**kwargs) \ No newline at end of file diff --git a/src/common/library.bib b/src/common/library.bib new file mode 100644 index 0000000000..af730fe8cd --- /dev/null +++ b/src/common/library.bib @@ -0,0 +1,2191 @@ +@misc{10x2018pbmc, + title = {1k PBMCs from a Healthy Donor (v3 chemistry)}, + author = {{10x Genomics}}, + year = {2018}, + url = {https://www.10xgenomics.com/resources/datasets/1-k-pbm-cs-from-a-healthy-donor-v-3-chemistry-3-standard-3-0-0} +} + + +@misc{10x2019heart, + title = {Human Heart}, + author = {{10x Genomics}}, + year = {2019}, + url = {https://www.10xgenomics.com/datasets/human-heart-1-standard-1-0-0} +} + + +@misc{10x2019lymph, + title = {Human Lymph Node}, + author = {{10x Genomics}}, + year = {2019}, + url = {https://www.10xgenomics.com/datasets/human-lymph-node-1-standard-1-0-0} +} + + +@misc{10x2019pbmc, + title = {5k Peripheral Blood Mononuclear Cells (PBMCs) from a Healthy Donor with a Panel of TotalSeq-B Antibodies (v3 chemistry)}, + author = {{10x Genomics}}, + year = 
{2019}, + url = {https://www.10xgenomics.com/resources/datasets/5-k-peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-with-cell-surface-proteins-v-3-chemistry-3-1-standard-3-1-0} +} + + +@misc{10x2020breast, + title = {Human Breast Cancer: Whole Transcriptome Analysis}, + author = {{10x Genomics}}, + year = {2020}, + url = {https://www.10xgenomics.com/datasets/human-breast-cancer-whole-transcriptome-analysis-1-standard-1-2-0} +} + + +@misc{10x2020cerebellum, + title = {Human Cerebellum: Whole Transcriptome Analysis}, + author = {{10x Genomics}}, + year = {2020}, + url = {https://www.10xgenomics.com/datasets/human-cerebellum-whole-transcriptome-analysis-1-standard-1-2-0} +} + + +@misc{10x2020kidney, + title = {Mouse Kidney Section (Coronal)}, + author = {{10x Genomics}}, + year = {2020}, + url = {https://www.10xgenomics.com/datasets/mouse-kidney-section-coronal-1-standard-1-1-0} +} + + +@misc{10x2021breast, + title = {Human Breast Cancer: Ductal Carcinoma In Situ, Invasive Carcinoma (FFPE)}, + author = {{10x Genomics}}, + year = {2021}, + url = {https://www.10xgenomics.com/datasets/human-breast-cancer-ductal-carcinoma-in-situ-invasive-carcinoma-ffpe-1-standard-1-3-0} +} + + +@misc{10x2021prostate, + title = {Normal Human Prostate (FFPE)}, + author = {{10x Genomics}}, + year = {2021}, + url = {https://www.10xgenomics.com/datasets/normal-human-prostate-ffpe-1-standard-1-3-0} +} + + +@misc{10x2022brain, + title = {Mouse Brain Coronal Section 1 (FFPE)}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/mouse-brain-coronal-section-1-ffpe-2-standard} +} + + +@misc{10x2022cervical, + title = {Human Cervical Cancer (FFPE)}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/human-cervical-cancer-1-standard} +} + + +@misc{10x2022olfactory, + title = {Adult Mouse Olfactory Bulb}, + author = {{10x Genomics}}, + year = {2022}, + url = 
{https://www.10xgenomics.com/datasets/adult-mouse-olfactory-bulb-1-standard-1} +} + + +@misc{10x2022intestine, + title = {Human Intestine Cancer (FPPE)}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/human-intestine-cancer-1-standard} +} + + +@misc{10x2022melanoma, + title = {Human Melanoma, IF Stained (FFPE)}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/human-melanoma-if-stained-ffpe-2-standard} +} + + +@misc{10x2022prostate, + title = {Human Prostate Cancer, Adjacent Normal Section with IF Staining (FFPE)}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/human-prostate-cancer-adjacent-normal-section-with-if-staining-ffpe-1-standard} +} + + +@misc{10x2023brain, + title = {Human Brain Cancer, 11 mm Capture Area (FFPE)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/human-brain-cancer-11-mm-capture-area-ffpe-2-standard} +} + + +@misc{10x2023colon, + title = {Visium CytAssist Gene Expression Libraries of Post-Xenium Human Colon Cancer (FFPE)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/visium-cytassist-gene-expression-libraries-of-post-xenium-human-colon-cancer-ffpe-using-the-human-whole-transcriptome-probe-set-2-standard} +} + + +@misc{10x2023colorectal, + title = {Human Colorectal Cancer, 11 mm Capture Area (FFPE)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/human-colorectal-cancer-11-mm-capture-area-ffpe-2-standard} +} + + +@misc{10x2023embryo, + title = {Visium CytAssist, Mouse Embryo, 11 mm Capture Area (FFPE)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/visium-cytassist-mouse-embryo-11-mm-capture-area-ffpe-2-standard} +} + + +@misc{10x2023kidney, + title = {Human Kidney, 11 mm Capture Area (FFPE)}, + author = {{10x Genomics}}, + year = 
{2023}, + url = {https://www.10xgenomics.com/datasets/human-kidney-11-mm-capture-area-ffpe-2-standard} +} + + +@misc{10x2023lung, + title = {Human Lung Cancer, 11 mm Capture Area (FFPE)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/human-lung-cancer-11-mm-capture-area-ffpe-2-standard} +} + + +@misc{10x2023mousebrain, + title = {Visium CytAssist Gene Expression Libraries of Post-Xenium Mouse Brain (FF)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/visium-cytassist-gene-expression-libraries-of-post-xenium-mouse-brain-ff-using-the-mouse-whole-transcriptome-probe-set-2-standard} +} + + +@article{agostinis2022newwave, + doi = {10.1093/bioinformatics/btac149}, + url = {https://doi.org/10.1093/bioinformatics/btac149}, + year = {2022}, + month = {Mar.}, + publisher = {Oxford University Press ({OUP})}, + volume = {38}, + number = {9}, + pages = {2648--2650}, + author = {Federico Agostinis and Chiara Romualdi and Gabriele Sales and Davide Risso}, + editor = {Yann Ponty}, + title = {NewWave: a scalable R/Bioconductor package for the dimensionality reduction and batch effect removal of single-cell {RNA}-seq data}, + journal = {Bioinformatics} +} + + +@article{agrawal2021mde, + title = {Minimum-Distortion Embedding}, + author = {Akshay Agrawal and Alnur Ali and Stephen Boyd}, + year = {2021}, + journal = {Foundations and Trends{\textregistered} in Machine Learning}, + publisher = {Now Publishers}, + volume = {14}, + number = {3}, + pages = {211--378}, + doi = {10.1561/2200000090}, + url = {https://doi.org/10.1561/2200000090} +} + + +@article{aliee2021autogenes, + title = {{AutoGeneS}: Automatic gene selection using multi-objective optimization for {RNA}-seq deconvolution}, + author = {Hananeh Aliee and Fabian J. 
Theis}, + year = {2021}, + month = {Jul.}, + journal = {Cell Systems}, + publisher = {Elsevier {BV}}, + volume = {12}, + number = {7}, + pages = {706--715.e4}, + doi = {10.1016/j.cels.2021.05.006}, + url = {https://doi.org/10.1016/j.cels.2021.05.006} +} + + +@inproceedings{amelio2015normalized, + doi = {10.1145/2808797.2809344}, + url = {https://doi.org/10.1145/2808797.2809344}, + year = {2015}, + month = {Aug.}, + publisher = {{ACM}}, + author = {Alessia Amelio and Clara Pizzuti}, + title = {Is Normalized Mutual Information a Fair Measure for Comparing Community Detection Methods?}, + booktitle = {Proceedings of the 2015 {IEEE}/{ACM} International Conference on Advances in Social Networks Analysis and Mining 2015} +} + + +@article{andersson2020single, + title = {Single-cell and spatial transcriptomics enables probabilistic inference of cell type topography}, + author = {Alma Andersson and Joseph Bergenstr{\aa}hle and Michaela Asp and Ludvig Bergenstr{\aa}hle and Aleksandra Jurek and Jos{\'{e}} Fern{\'{a}}ndez Navarro and Joakim Lundeberg}, + year = {2020}, + month = {Oct.}, + journal = {Communications Biology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {3}, + number = {1}, + doi = {10.1038/s42003-020-01247-y}, + url = {https://doi.org/10.1038/s42003-020-01247-y} +} + + +@article{andersson2021sepal, + title={sepal: Identifying transcript profiles with spatial patterns by diffusion-based modeling}, + author={Andersson, Alma and Lundeberg, Joakim}, + journal={Bioinformatics}, + volume={37}, + number={17}, + pages={2644--2650}, + year={2021}, + publisher={Oxford University Press}, + doi={10.1093/bioinformatics/btab164} +} + + +@string{apr = {Apr.}} + + +@string{aug = {Aug.}} + + +@article{batson2019molecular, + title = {Molecular Cross-Validation for Single-Cell RNA-seq}, + author = {Batson, Joshua and Royer, Lo{\"\i}c and Webber, James}, + year = {2019}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = 
{10.1101/786269}, + url = {https://www.biorxiv.org/content/early/2019/09/30/786269}, + elocation-id = {786269}, + eprint = {https://www.biorxiv.org/content/early/2019/09/30/786269.full.pdf} +} + + +@article{biancalani2021deep, + title = {Deep learning and alignment of spatially resolved single-cell transcriptomes with Tangram}, + author = {Tommaso Biancalani and Gabriele Scalia and Lorenzo Buffoni and Raghav Avasthi and Ziqing Lu and Aman Sanger and Neriman Tokcan and Charles R. Vanderburg and {\AA}sa Segerstolpe and Meng Zhang and Inbal Avraham-Davidi and Sanja Vickovic and Mor Nitzan and Sai Ma and Ayshwarya Subramanian and Michal Lipinski and Jason Buenrostro and Nik Bear Brown and Duccio Fanelli and Xiaowei Zhuang and Evan Z. Macosko and Aviv Regev}, + year = {2021}, + month = {Oct.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {18}, + number = {11}, + pages = {1352--1362}, + doi = {10.1038/s41592-021-01264-7}, + url = {https://doi.org/10.1038/s41592-021-01264-7} +} + + +@article{bintayyash2021non, + author = {BinTayyash, Nuha and Georgaka, Sokratia and John, S T and Ahmed, Sumon and Boukouvalas, Alexis and Hensman, James and Rattray, Magnus}, + title = "{Non-parametric modelling of temporal and spatial counts data from RNA-seq experiments}", + journal = {Bioinformatics}, + volume = {37}, + number = {21}, + pages = {3788-3795}, + year = {2021}, + month = {07}, + issn = {1367-4803}, + doi = {10.1093/bioinformatics/btab486}, + url = {https://doi.org/10.1093/bioinformatics/btab486}, + eprint = {https://academic.oup.com/bioinformatics/article-pdf/37/21/3788/50336570/btab486.pdf}, +} + + +@article{bland2000odds, + title = {Statistics Notes: The odds ratio}, + author = {J. M. 
Bland}, + year = {2000}, + month = {May}, + journal = {{BMJ}}, + publisher = {{BMJ}}, + volume = {320}, + number = {7247}, + pages = {1468--1468}, + doi = {10.1136/bmj.320.7247.1468}, + url = {https://doi.org/10.1136/bmj.320.7247.1468} +} + + +@article{breiman2001random, + doi = {10.1023/a:1010933404324}, + url = {https://doi.org/10.1023/a:1010933404324}, + year = {2001}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {45}, + number = {1}, + pages = {5--32}, + author = {Leo Breiman}, + journal = {Machine Learning} +} + + +@article{bttner2018test, + title = {A test metric for assessing single-cell {RNA}-seq batch correction}, + author = {Maren B\"{u}ttner and Zhichao Miao and F. Alexander Wolf and Sarah A. Teichmann and Fabian J. Theis}, + year = {2018}, + month = {Dec.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {16}, + number = {1}, + pages = {43--49}, + doi = {10.1038/s41592-018-0254-1}, + url = {https://doi.org/10.1038/s41592-018-0254-1} +} + + +@article{cabello2020singlecellsignalr, + title = {{SingleCellSignalR}: inference of intercellular networks from single-cell transcriptomics}, + author = {Simon Cabello-Aguilar and M{\'{e}}lissa Alame and Fabien Kon-Sun-Tack and Caroline Fau and Matthieu Lacroix and Jacques Colinge}, + year = {2020}, + month = {Mar.}, + journal = {Nucleic Acids Research}, + publisher = {Oxford University Press ({OUP})}, + volume = {48}, + number = {10}, + pages = {e55--e55}, + doi = {10.1093/nar/gkaa183}, + url = {https://doi.org/10.1093/nar/gkaa183} +} + + +@article{cable2021robust, + title = {Robust decomposition of cell type mixtures in spatial transcriptomics}, + author = {Dylan M. Cable and Evan Murray and Luli S. Zou and Aleksandrina Goeva and Evan Z. Macosko and Fei Chen and Rafael A. 
Irizarry}, + year = {2021}, + month = {Feb.}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {40}, + number = {4}, + pages = {517--526}, + doi = {10.1038/s41587-021-00830-w}, + url = {https://doi.org/10.1038/s41587-021-00830-w} +} + + +@misc{cannoodt2021viashfromscripts, + doi = {10.48550/ARXIV.2110.11494}, + url = {https://arxiv.org/abs/2110.11494}, + author = {Cannoodt, Robrecht and Cannoodt, Hendrik and Van de Kerckhove, Eric and Boschmans, Andy and De Maeyer, Dries and Verbeiren, Toni}, + keywords = {Software Engineering (cs.SE), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {Viash: from scripts to pipelines}, + publisher = {arXiv}, + year = {2021}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International} +} + + +@article{cai2023spanve, + title={Spanve: an Statistical Method to Detect Clustering-friendly Spatially Variable Genes in Large-scale Spatial Transcriptomics Data}, + author={Cai, Guoxin and Chen, Yichang and Chen, Shuqing and Gu, Xun and Zhou, Zhan}, + journal={bioRxiv}, + pages={2023--02}, + year={2023}, + publisher={Cold Spring Harbor Laboratory}, + doi={10.1101/2023.02.08.527623} +} + + +@article{cao2018joint, + title = {Joint profiling of chromatin accessibility and gene expression in thousands of single cells}, + author = {Junyue Cao and Darren A. Cusanovich and Vijay Ramani and Delasa Aghamirzaie and Hannah A. Pliner and Andrew J. Hill and Riza M. Daza and Jose L. McFaline-Figueroa and Jonathan S. Packer and Lena Christiansen and Frank J. Steemers and Andrew C. 
Adey and Cole Trapnell and Jay Shendure}, + year = {2018}, + month = {Sept.}, + journal = {Science}, + publisher = {American Association for the Advancement of Science ({AAAS})}, + volume = {361}, + number = {6409}, + pages = {1380--1385}, + doi = {10.1126/science.aau0730}, + url = {https://doi.org/10.1126/science.aau0730} +} + + +@article{cao2020human, + title = {A human cell atlas of fetal gene expression}, + author = {Junyue Cao and Diana R. O'Day and Hannah A. Pliner and Paul D. Kingsley and Mei Deng and Riza M. Daza and Michael A. Zager and Kimberly A. Aldinger and Ronnie Blecher-Gonen and Fan Zhang and Malte Spielmann and James Palis and Dan Doherty and Frank J. Steemers and Ian A. Glass and Cole Trapnell and Jay Shendure}, + year = {2020}, + month = {Nov.}, + journal = {Science}, + publisher = {American Association for the Advancement of Science ({AAAS})}, + volume = {370}, + number = {6518}, + doi = {10.1126/science.aba7721}, + url = {https://doi.org/10.1126/science.aba7721} +} + + +@article{chai2014root, + doi = {10.5194/gmdd-7-1525-2014}, + url = {https://doi.org/10.5194/gmdd-7-1525-2014}, + year = {2014}, + month = {Feb.}, + publisher = {Copernicus {GmbH}}, + author = {T. Chai and R. R. 
Draxler}, + title = {Root mean square error ({RMSE}) or mean absolute error ({MAE})?} +} + + +@article{chang2022spatial, + title={Spatial omics representation and functional tissue module inference using graph Fourier transform}, + author={Chang, Yuzhou and Liu, Jixin and Ma, Anjun and Jiang, Sizun and Krull, Jordan and Yeo, Yao Yu and Liu, Yang and Rodig, Scott J and Barouch, Dan H and Fan, Rong and others}, + journal={bioRxiv}, + pages={2022--12}, + year={2022}, + publisher={Cold Spring Harbor Laboratory}, + doi={10.1101/2022.12.10.519929} +} + + +@article{chazarragil2021flexible, + doi = {10.1093/nar/gkab004}, + url = {https://doi.org/10.1093/nar/gkab004}, + year = {2021}, + month = {Feb.}, + publisher = {Oxford University Press ({OUP})}, + volume = {49}, + number = {7}, + pages = {e42--e42}, + author = {Ruben Chazarra-Gil and Stijn van~Dongen and Vladimir~Yu Kiselev and Martin Hemberg}, + title = {Flexible comparison of batch correction methods for single-cell {RNA}-seq using {BatchBench}}, + journal = {Nucleic Acids Research} +} + + +@article{chen2009local, + title = {Local Multidimensional Scaling for Nonlinear Dimension Reduction, Graph Drawing, and Proximity Analysis}, + author = {Lisha Chen and Andreas Buja}, + year = {2009}, + month = {Mar.}, + journal = {Journal of the American Statistical Association}, + publisher = {Informa {UK} Limited}, + volume = {104}, + number = {485}, + pages = {209--219}, + doi = {10.1198/jasa.2009.0111}, + url = {https://doi.org/10.1198/jasa.2009.0111} +} + + +@inproceedings{chen2016xgboost, + title = {{XGBoost}}, + author = {Tianqi Chen and Carlos Guestrin}, + year = {2016}, + month = {Aug.}, + booktitle = {Proceedings of the 22nd {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining}, + publisher = {{ACM}}, + doi = {10.1145/2939672.2939785}, + url = {https://doi.org/10.1145/2939672.2939785} +} + + +@article{cichocki2009fast, + title = {Fast Local Algorithms for Large Scale Nonnegative Matrix and
Tensor Factorizations}, + author = {Andrzej Cichocki and Anh-Huy Phan}, + year = {2009}, + journal = {{IEICE} Transactions on Fundamentals of Electronics, Communications and Computer Sciences}, + publisher = {Institute of Electronics, Information and Communications Engineers ({IEICE})}, + volume = {E92-A}, + number = {3}, + pages = {708--721}, + doi = {10.1587/transfun.e92.a.708}, + url = {https://doi.org/10.1587/transfun.e92.a.708} +} + + +@article{coifman2006diffusion, + title = {Diffusion maps}, + author = {Ronald R. Coifman and St{\'{e}}phane Lafon}, + year = {2006}, + month = {Jul.}, + journal = {Applied and Computational Harmonic Analysis}, + publisher = {Elsevier {BV}}, + volume = {21}, + number = {1}, + pages = {5--30}, + doi = {10.1016/j.acha.2006.04.006}, + url = {https://doi.org/10.1016/j.acha.2006.04.006} +} + + +@article{cover1967nearest, + title = {Nearest neighbor pattern classification}, + author = {T. Cover and P. Hart}, + year = {1967}, + month = {Jan}, + journal = {{IEEE} Transactions on Information Theory}, + publisher = {Institute of Electrical and Electronics Engineers ({IEEE})}, + volume = {13}, + number = {1}, + pages = {21--27}, + doi = {10.1109/tit.1967.1053964}, + url = {https://doi.org/10.1109/tit.1967.1053964} +} + + +@inproceedings{davis2006prauc, + title = {The relationship between Precision-Recall and {ROC} curves}, + author = {Jesse Davis and Mark Goadrich}, + year = {2006}, + booktitle = {Proceedings of the 23rd international conference on Machine learning - {ICML} {\textquotesingle}06}, + publisher = {{ACM} Press}, + doi = {10.1145/1143844.1143874}, + url = {https://doi.org/10.1145/1143844.1143874} +} + + +@string{dec = {Dec.}} + +@article{Demetci2020scot, + author = {Pinar Demetci and Rebecca Santorella and Bj{\"o}rn Sandstede and William Stafford Noble and Ritambhara Singh}, + title = {Gromov-Wasserstein optimal transport to align single-cell multi-omics data}, + elocation-id = {2020.04.28.066787}, + year = {2020}, + doi =
{10.1101/2020.04.28.066787}, + publisher = {Cold Spring Harbor Laboratory}, + URL = {https://www.biorxiv.org/content/early/2020/11/11/2020.04.28.066787}, + eprint = {https://www.biorxiv.org/content/early/2020/11/11/2020.04.28.066787.full.pdf}, + journal = {bioRxiv} +} + + +@article{dimitrov2022comparison, + title = {Comparison of methods and resources for cell-cell communication inference from single-cell {RNA}-Seq data}, + author = {Daniel Dimitrov and D{\'{e}}nes T\"{u}rei and Martin Garrido-Rodriguez and Paul L. Burmedi and James S. Nagai and Charlotte Boys and Ricardo O. Ramirez Flores and Hyojin Kim and Bence Szalai and Ivan G. Costa and Alberto Valdeolivas and Aur{\'{e}}lien Dugourd and Julio Saez-Rodriguez}, + year = {2022}, + month = {Jun.}, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {13}, + number = {1}, + doi = {10.1038/s41467-022-30755-0}, + url = {https://doi.org/10.1038/s41467-022-30755-0} +} + + +@article{donoho2017yearsdatascience, + doi = {10.1080/10618600.2017.1384734}, + url = {https://doi.org/10.1080/10618600.2017.1384734}, + year = {2017}, + month = {Oct.}, + publisher = {Informa {UK} Limited}, + volume = {26}, + number = {4}, + pages = {745--766}, + author = {David Donoho}, + title = {50 Years of Data Science}, + journal = {Journal of Computational and Graphical Statistics} +} + + +@article{efremova2020cellphonedb, + title = {{CellPhoneDB}: inferring cell{\textendash}cell communication from combined expression of multi-subunit ligand{\textendash}receptor complexes}, + author = {Mirjana Efremova and Miquel Vento-Tormo and Sarah A. 
Teichmann and Roser Vento-Tormo}, + year = {2020}, + month = {Feb.}, + journal = {Nature Protocols}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {15}, + number = {4}, + pages = {1484--1506}, + doi = {10.1038/s41596-020-0292-x}, + url = {https://doi.org/10.1038/s41596-020-0292-x} +} + + +@article{emmons2016analysis, + title = {Analysis of Network Clustering Algorithms and Cluster Quality Metrics at Scale}, + volume = {11}, + ISSN = {1932-6203}, + url = {http://dx.doi.org/10.1371/journal.pone.0159161}, + doi = {10.1371/journal.pone.0159161}, + number = {7}, + journal = {PLOS ONE}, + publisher = {Public Library of Science (PLoS)}, + author = {Emmons, Scott and Kobourov, Stephen and Gallant, Mike and B\"{o}rner, Katy}, + editor = {Dovrolis, Constantine}, + year = {2016}, + month = jul, + pages = {e0159161} +} + + +@article{eraslan2019single, + title = {Single-cell {RNA}-seq denoising using a deep count autoencoder}, + author = {G\"{o}kcen Eraslan and Lukas M. Simon and Maria Mircea and Nikola S. Mueller and Fabian J. Theis}, + year = {2019}, + month = {Jan}, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {10}, + number = {1}, + doi = {10.1038/s41467-018-07931-2}, + url = {https://doi.org/10.1038/s41467-018-07931-2} +} + + +@article{fang2022conservation, + title = {Conservation and divergence of cortical cell organization in human and mouse revealed by MERFISH}, + volume = {377}, + ISSN = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.abm1741}, + DOI = {10.1126/science.abm1741}, + number = {6601}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Fang, Rongxin and Xia, Chenglong and Close, Jennie L. and Zhang, Meng and He, Jiang and Huang, Zhengkai and Halpern, Aaron R. and Long, Brian and Miller, Jeremy A. and Lein, Ed S. 
and Zhuang, Xiaowei}, + year = {2022}, + month = jul, + pages = {56-62} +} + + +@string{feb = {Feb.}} + + +@article{fix1989discriminatory, + doi = {10.2307/1403797}, + url = {https://doi.org/10.2307/1403797}, + year = {1989}, + month = {Dec.}, + publisher = {{JSTOR}}, + volume = {57}, + number = {3}, + pages = {238}, + author = {Evelyn Fix and J. L. Hodges}, + title = {Discriminatory Analysis. Nonparametric Discrimination: Consistency Properties}, + journal = {International Statistical Review / Revue Internationale de Statistique} +} + + +@article{gower1975generalized, + title = {Generalized procrustes analysis}, + author = {J. C. Gower}, + year = {1975}, + month = {Mar.}, + journal = {Psychometrika}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {40}, + number = {1}, + pages = {33--51}, + doi = {10.1007/bf02291478}, + url = {https://doi.org/10.1007/bf02291478} +} + + +@article{grandini2020metrics, + title = {Metrics for Multi-Class Classification: an Overview}, + author = {Grandini, Margherita and Bagli, Enrico and Visani, Giorgio}, + year = {2020}, + journal = {arXiv}, + publisher = {Cornell University}, + doi = {10.48550/arxiv.2008.05756}, + url = {https://arxiv.org/abs/2008.05756}, + copyright = {arXiv.org perpetual, non-exclusive license}, + keywords = {Machine Learning (stat.ML), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences} +} + + +@article{granja2021archr, + title = {{ArchR} is a scalable software package for integrative single-cell chromatin accessibility analysis}, + author = {Jeffrey M. Granja and M. Ryan Corces and Sarah E. Pierce and S. Tansu Bagdatli and Hani Choudhry and Howard Y. Chang and William J. 
Greenleaf}, + year = {2021}, + month = {Feb.}, + journal = {Nature Genetics}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {53}, + number = {3}, + pages = {403--411}, + doi = {10.1038/s41588-021-00790-6}, + url = {https://doi.org/10.1038/s41588-021-00790-6} +} + + +@article{grn2014validation, + title = {Validation of noise models for single-cell transcriptomics}, + author = {Dominic Gr\"{u}n and Lennart Kester and Alexander van Oudenaarden}, + year = {2014}, + month = {Apr.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {11}, + number = {6}, + pages = {637--640}, + doi = {10.1038/nmeth.2930}, + url = {https://doi.org/10.1038/nmeth.2930} +} + + +@article{haghverdi2018batch, + title = {Batch effects in single-cell {RNA}-sequencing data are corrected by matching mutual nearest neighbors}, + author = {Laleh Haghverdi and Aaron T L Lun and Michael D Morgan and John C Marioni}, + year = {2018}, + month = {Apr.}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {36}, + number = {5}, + pages = {421--427}, + doi = {10.1038/nbt.4091}, + url = {https://doi.org/10.1038/nbt.4091} +} + + +@article{hammarlund2018cengen, + title = {The {CeNGEN} Project: The Complete Gene Expression Map of an Entire Nervous System}, + author = {Marc Hammarlund and Oliver Hobert and David M. Miller and Nenad Sestan}, + year = {2018}, + month = {Aug.}, + journal = {Neuron}, + publisher = {Elsevier {BV}}, + volume = {99}, + number = {3}, + pages = {430--433}, + doi = {10.1016/j.neuron.2018.07.042}, + url = {https://doi.org/10.1016/j.neuron.2018.07.042} +} + + +@article{hansen2012removing, + title = {Adjusting batch effects in microarray expression data using empirical Bayes methods}, + author = {W. 
Evan Johnson and Cheng Li and Ariel Rabinovic}, + year = {2006}, + month = {Apr.}, + journal = {Biostatistics}, + publisher = {Oxford University Press ({OUP})}, + volume = {8}, + number = {1}, + pages = {118--127}, + doi = {10.1093/biostatistics/kxj037}, + url = {https://doi.org/10.1093/biostatistics/kxj037} +} + + +@article{hao2021integrated, + title = {Integrated analysis of multimodal single-cell data}, + author = {Yuhan Hao and Stephanie Hao and Erica Andersen-Nissen and William M. Mauck and Shiwei Zheng and Andrew Butler and Maddie J. Lee and Aaron J. Wilk and Charlotte Darby and Michael Zager and Paul Hoffman and Marlon Stoeckius and Efthymia Papalexi and Eleni P. Mimitou and Jaison Jain and Avi Srivastava and Tim Stuart and Lamar M. Fleming and Bertrand Yeung and Angela J. Rogers and Juliana M. McElrath and Catherine A. Blish and Raphael Gottardo and Peter Smibert and Rahul Satija}, + year = {2021}, + month = {Jun.}, + journal = {Cell}, + publisher = {Elsevier {BV}}, + volume = {184}, + number = {13}, + pages = {3573--3587.e29}, + doi = {10.1016/j.cell.2021.04.048}, + url = {https://doi.org/10.1016/j.cell.2021.04.048} +} + + +@article{hao2021somde, + title={SOMDE: a scalable method for identifying spatially variable genes with self-organizing map}, + author={Hao, Minsheng and Hua, Kui and Zhang, Xuegong}, + journal={Bioinformatics}, + volume={37}, + number={23}, + pages={4392--4398}, + year={2021}, + publisher={Oxford University Press}, + doi={10.1093/bioinformatics/btab471} +} + + +@article{hie2019efficient, + title = {Efficient integration of heterogeneous single-cell transcriptomes using Scanorama}, + author = {Brian Hie and Bryan Bryson and Bonnie Berger}, + year = {2019}, + month = {May}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {37}, + number = {6}, + pages = {685--691}, + doi = {10.1038/s41587-019-0113-3}, + url = {https://doi.org/10.1038/s41587-019-0113-3} +} + + 
+@article{hinton1989connectionist, + title = {Connectionist learning procedures}, + author = {Geoffrey E. Hinton}, + year = {1989}, + month = {Sept.}, + journal = {Artificial Intelligence}, + publisher = {Elsevier {BV}}, + volume = {40}, + number = {1-3}, + pages = {185--234}, + doi = {10.1016/0004-3702(89)90049-0}, + url = {https://doi.org/10.1016/0004-3702(89)90049-0} +} + + +@book{hosmer2013applied, + title = {Applied logistic regression}, + author = {Hosmer Jr, D.W. and Lemeshow, S. and Sturdivant, R.X.}, + year = {2013}, + publisher = {John Wiley \& Sons}, + volume = {398} +} + + +@article{hou2019scmatch, + title = {{scMatch}: a single-cell gene expression profile annotation tool using reference datasets}, + author = {Rui Hou and Elena Denisenko and Alistair R R Forrest}, + year = {2019}, + month = {Apr.}, + journal = {Bioinformatics}, + publisher = {Oxford University Press ({OUP})}, + volume = {35}, + number = {22}, + pages = {4688--4695}, + doi = {10.1093/bioinformatics/btz292}, + url = {https://doi.org/10.1093/bioinformatics/btz292}, + editor = {Janet Kelso} +} + + +@article{hou2020predicting, + title = {Predicting cell-to-cell communication networks using {NATMI}}, + author = {Rui Hou and Elena Denisenko and Huan Ting Ong and Jordan A. Ramilowski and Alistair R. R. Forrest}, + year = {2020}, + month = {Oct.}, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {11}, + number = {1}, + doi = {10.1038/s41467-020-18873-z}, + url = {https://doi.org/10.1038/s41467-020-18873-z} +} + + +@article{hou2020systematic, + title = {A systematic evaluation of single-cell {RNA}-sequencing imputation methods}, + author = {Wenpin Hou and Zhicheng Ji and Hongkai Ji and Stephanie C. 
Hicks}, + year = {2020}, + month = {Aug.}, + journal = {Genome Biology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {21}, + number = {1}, + doi = {10.1186/s13059-020-02132-x}, + url = {https://doi.org/10.1186/s13059-020-02132-x} +} + + +@article{hubert1985comparing, + doi = {10.1007/bf01908075}, + url = {https://doi.org/10.1007/bf01908075}, + year = {1985}, + month = {Dec.}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {2}, + number = {1}, + pages = {193--218}, + author = {Lawrence Hubert and Phipps Arabie}, + title = {Comparing partitions}, + journal = {Journal of Classification} +} + + +@article{hu2021spagcn, + title={SpaGCN: Integrating gene expression, spatial location and histology to identify spatial domains and spatially variable genes by graph convolutional network}, + author={Hu, Jian and Li, Xiangjie and Coleman, Kyle and Schroeder, Amelia and Ma, Nan and Irwin, David J and Lee, Edward B and Shinohara, Russell T and Li, Mingyao}, + journal={Nature methods}, + volume={18}, + number={11}, + pages={1342--1351}, + year={2021}, + publisher={Nature Publishing Group US New York}, + doi={10.1038/s41592-021-01255-8} +} + + +@string{jan = {Jan}} + + +@string{jul = {Jul.}} + + +@string{jun = {Jun.}} + + +@article{kats2021spatialde2, + title={SpatialDE2: fast and localized variance component analysis of spatial transcriptomics}, + author={Kats, Ilia and Vento-Tormo, Roser and Stegle, Oliver}, + journal={Biorxiv}, + pages={2021--10}, + year={2021}, + publisher={Cold Spring Harbor Laboratory}, + doi={10.1101/2021.10.27.466045} +} + + +@article{kendall1938new, + doi = {10.1093/biomet/30.1-2.81}, + url = {https://doi.org/10.1093/biomet/30.1-2.81}, + year = {1938}, + month = {Jun.}, + publisher = {Oxford University Press ({OUP})}, + volume = {30}, + number = {1-2}, + pages = {81--93}, + author = {M. G. 
KENDALL}, + title = {A new measure of rank correlation}, + journal = {Biometrika} +} + + +@article{kiselev2019challenges, + title = {Challenges in unsupervised clustering of single-cell {RNA}-seq data}, + author = {Vladimir Yu Kiselev and Tallulah S. Andrews and Martin Hemberg}, + year = {2019}, + month = {Jan}, + journal = {Nature Reviews Genetics}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {20}, + number = {5}, + pages = {273--282}, + doi = {10.1038/s41576-018-0088-9}, + url = {https://doi.org/10.1038/s41576-018-0088-9} +} + + +@article{kleshchevnikov2022cell2location, + title = {Cell2location maps fine-grained cell types in spatial transcriptomics}, + author = {Vitalii Kleshchevnikov and Artem Shmatko and Emma Dann and Alexander Aivazidis and Hamish W. King and Tong Li and Rasa Elmentaite and Artem Lomakin and Veronika Kedlian and Adam Gayoso and Mika Sarkin Jain and Jun Sung Park and Lauma Ramona and Elizabeth Tuck and Anna Arutyunyan and Roser Vento-Tormo and Moritz Gerstung and Louisa James and Oliver Stegle and Omer Ali Bayraktar}, + year = {2022}, + month = {Jan}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {40}, + number = {5}, + pages = {661--671}, + doi = {10.1038/s41587-021-01139-4}, + url = {https://doi.org/10.1038/s41587-021-01139-4} +} + + +@article{korsunsky2019fast, + title = {Fast, sensitive and accurate integration of single-cell data with Harmony}, + author = {Ilya Korsunsky and Nghia Millard and Jean Fan and Kamil Slowikowski and Fan Zhang and Kevin Wei and Yuriy Baglaenko and Michael Brenner and Po-ru Loh and Soumya Raychaudhuri}, + year = {2019}, + month = {Nov.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {16}, + number = {12}, + pages = {1289--1296}, + doi = {10.1038/s41592-019-0619-0}, + url = {https://doi.org/10.1038/s41592-019-0619-0} +} + + +@article{kraemer2018dimred, + title = {{dimRed} and 
{coRanking} - Unifying Dimensionality Reduction in R}, + author = {Guido Kraemer and Markus Reichstein and Miguel D. Mahecha}, + year = {2018}, + journal = {The R Journal}, + publisher = {The R Foundation}, + volume = {10}, + number = {1}, + pages = {342}, + doi = {10.32614/rj-2018-039}, + url = {https://doi.org/10.32614/rj-2018-039} +} + + +@article{kruskal1964mds, + title = {Multidimensional scaling by optimizing goodness of fit to a nonmetric hypothesis}, + author = {J. B. Kruskal}, + year = {1964}, + month = {Mar.}, + journal = {Psychometrika}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {29}, + number = {1}, + pages = {1--27}, + doi = {10.1007/bf02289565}, + url = {https://doi.org/10.1007/bf02289565} +} + + +@article{kuppe2022spatial, + title={Spatial multi-omic map of human myocardial infarction}, + author={Kuppe, Christoph and Ramirez Flores, Ricardo O and Li, Zhijian and Hayat, Sikander and Levinson, Rebecca T and Liao, Xian and Hannani, Monica T and Tanevski, Jovan and W{\"u}nnemann, Florian and Nagai, James S and others}, + journal={Nature}, + volume={608}, + number={7924}, + pages={766--777}, + year={2022}, + publisher={Nature Publishing Group UK London} +} + + +@article{lance2022multimodal, + title = {Multimodal single cell data integration challenge: results and lessons learned}, + author = {Lance, Christopher and Luecken, Malte D. and Burkhardt, Daniel B.
and Cannoodt, Robrecht and Rautenstrauch, Pia and Laddach, Anna and Ubingazhibov, Aidyn and Cao, Zhi-Jie and Deng, Kaiwen and Khan, Sumeer and Liu, Qiao and Russkikh, Nikolay and Ryazantsev, Gleb and Ohler, Uwe and {NeurIPS 2021 Multimodal data integration competition participants} and Pisco, Angela Oliveira and Bloom, Jonathan and Krishnaswamy, Smita and Theis, Fabian J.}, + year = {2022}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/2022.04.11.487796}, + url = {https://www.biorxiv.org/content/early/2022/04/12/2022.04.11.487796}, + elocation-id = {2022.04.11.487796}, + eprint = {https://www.biorxiv.org/content/early/2022/04/12/2022.04.11.487796.full.pdf} +} + + +@article{lance2024predicting, + title = {Predicting cellular profiles across modalities in longitudinal single-cell data: An Open Problems competition}, + author = {...}, + year = {2024}, + journal = {In preparation}, +} + + +@book{lawson1995solving, + title = {Solving Least Squares Problems}, + author = {Charles L. Lawson and Richard J. Hanson}, + year = {1995}, + month = {Jan}, + publisher = {Society for Industrial and Applied Mathematics}, + doi = {10.1137/1.9781611971217}, + url = {https://doi.org/10.1137/1.9781611971217} +} + + +@article{lee2009quality, + title = {Quality assessment of dimensionality reduction: Rank-based criteria}, + author = {John A.
Lee and Michel Verleysen}, + year = {2009}, + month = {Mar.}, + journal = {Neurocomputing}, + publisher = {Elsevier {BV}}, + volume = {72}, + number = {7-9}, + pages = {1431--1443}, + doi = {10.1016/j.neucom.2008.12.017}, + url = {https://doi.org/10.1016/j.neucom.2008.12.017} +} + + +@article{li2021bayesian, + author = {Li, Qiwei and Zhang, Minzhe and Xie, Yang and Xiao, Guanghua}, + title = "{Bayesian modeling of spatial molecular profiling data via Gaussian process}", + journal = {Bioinformatics}, + volume = {37}, + number = {22}, + pages = {4129-4136}, + year = {2021}, + month = {06}, + abstract = "{The location, timing and abundance of gene expression (both mRNA and proteins) within a tissue define the molecular mechanisms of cell functions. Recent technology breakthroughs in spatial molecular profiling, including imaging-based technologies and sequencing-based technologies, have enabled the comprehensive molecular characterization of single cells while preserving their spatial and morphological contexts. This new bioinformatics scenario calls for effective and robust computational methods to identify genes with spatial patterns. We represent a novel Bayesian hierarchical model to analyze spatial transcriptomics data, with several unique characteristics. It models the zero-inflated and over-dispersed counts by deploying a zero-inflated negative binomial model that greatly increases model stability and robustness. Besides, the Bayesian inference framework allows us to borrow strength in parameter estimation in a de novo fashion. As a result, the proposed model shows competitive performances in accuracy and robustness over existing methods in both simulation studies and two real data applications. The related R/C++ source code is available at https://github.com/Minzhe/BOOST-GP. Supplementary data are available at Bioinformatics online.
}", + issn = {1367-4803}, + doi = {10.1093/bioinformatics/btab455}, + url = {https://doi.org/10.1093/bioinformatics/btab455}, + eprint = {https://academic.oup.com/bioinformatics/article-pdf/37/22/4129/50335106/btab455.pdf}, +} + + +@article{linderman2018zero, + title = {Zero-preserving imputation of scRNA-seq data using low-rank approximation}, + author = {Linderman, George C. and Zhao, Jun and Kluger, Yuval}, + year = {2018}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/397588}, + url = {https://www.biorxiv.org/content/early/2018/08/22/397588}, + elocation-id = {397588}, + eprint = {https://www.biorxiv.org/content/early/2018/08/22/397588.full.pdf} +} + + +@article{liu2020high, + title = {High-Spatial-Resolution Multi-Omics Sequencing via Deterministic Barcoding in Tissue}, + volume = {183}, + ISSN = {0092-8674}, + url = {http://dx.doi.org/10.1016/j.cell.2020.10.026}, + DOI = {10.1016/j.cell.2020.10.026}, + number = {6}, + journal = {Cell}, + publisher = {Elsevier BV}, + author = {Liu, Yang and Yang, Mingyu and Deng, Yanxiang and Su, Graham and Enninful, Archibald and Guo, Cindy C. and Tebaldi, Toma and Zhang, Di and Kim, Dongjoo and Bai, Zhiliang and Norris, Eileen and Pan, Alisia and Li, Jiatong and Xiao, Yang and Halene, Stephanie and Fan, Rong}, + year = {2020}, + month = dec, + pages = {1665--1681.e18} +} + + +@article{lohoff2021integration, + title = {Integration of spatial and single-cell transcriptomic data elucidates mouse organogenesis}, + volume = {40}, + ISSN = {1546-1696}, + url = {http://dx.doi.org/10.1038/s41587-021-01006-2}, + DOI = {10.1038/s41587-021-01006-2}, + number = {1}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media LLC}, + author = {Lohoff, T. and Ghazanfar, S. and Missarova, A. and Koulena, N. and Pierson, N. and Griffiths, J. A. and Bardot, E. S. and Eng, C.-H. L. and Tyser, R. C. V. and Argelaguet, R. and Guibentif, C. and Srinivas, S. and Briscoe, J. 
and Simons, B. D. and Hadjantonakis, A.-K. and G\"{o}ttgens, B. and Reik, W. and Nichols, J. and Cai, L. and Marioni, J. C.}, + year = {2021}, + month = sep, + pages = {74-85} +} + + +@article{lopez2018deep, + title = {Deep generative modeling for single-cell transcriptomics}, + author = {Romain Lopez and Jeffrey Regier and Michael B. Cole and Michael I. Jordan and Nir Yosef}, + year = {2018}, + month = {Nov.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {15}, + number = {12}, + pages = {1053--1058}, + doi = {10.1038/s41592-018-0229-2}, + url = {https://doi.org/10.1038/s41592-018-0229-2} +} + + +@article{lopez2022destvi, + title = {{DestVI} identifies continuums of cell types in spatial transcriptomics data}, + author = {Romain Lopez and Baoguo Li and Hadas Keren-Shaul and Pierre Boyeau and Merav Kedmi and David Pilzer and Adam Jelinski and Ido Yofe and Eyal David and Allon Wagner and Can Ergen and Yoseph Addadi and Ofra Golani and Franca Ronchese and Michael I. Jordan and Ido Amit and Nir Yosef}, + year = {2022}, + month = {Apr.}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {40}, + number = {9}, + pages = {1360--1369}, + doi = {10.1038/s41587-022-01272-8}, + url = {https://doi.org/10.1038/s41587-022-01272-8} +} + + +@article{lotfollahi2020query, + title = {Query to reference single-cell integration with transfer learning}, + author = {Lotfollahi, Mohammad and Naghipourfar, Mohsen and Luecken, Malte D. and Khajavi, Matin and B{\"u}ttner, Maren and Avsec, Ziga and Misharin, Alexander V. 
and Theis, Fabian J.}, + year = {2020}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/2020.07.16.205997}, + url = {https://doi.org/10.1101/2020.07.16.205997}, + elocation-id = {2020.07.16.205997}, + eprint = {https://www.biorxiv.org/content/early/2020/07/16/2020.07.16.205997.full.pdf} +} + + +@article{luecken2022benchmarking, + title = {Benchmarking atlas-level data integration in single-cell genomics}, + author = {Malte D. Luecken and M. B\"{u}ttner and K. Chaichoompu and A. Danese and M. Interlandi and M. F. Mueller and D. C. Strobl and L. Zappia and M. Dugas and M. Colom{\'{e}}-Tatch{\'{e}} and Fabian J. Theis}, + year = {2021}, + month = {Dec.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {19}, + number = {1}, + pages = {41--50}, + doi = {10.1038/s41592-021-01336-8}, + url = {https://doi.org/10.1038/s41592-021-01336-8} +} + + +@article{lueks2011evaluate, + title = {How to Evaluate Dimensionality Reduction? 
- Improving the Co-ranking Matrix}, + author = {Lueks, Wouter and Mokbel, Bassam and Biehl, Michael and Hammer, Barbara}, + year = {2011}, + journal = {arXiv}, + doi = {10.48550/ARXIV.1110.3917}, + url = {https://arxiv.org/abs/1110.3917}, + copyright = {arXiv.org perpetual, non-exclusive license}, + keywords = {Machine Learning (cs.LG), Information Retrieval (cs.IR), FOS: Computer and information sciences, FOS: Computer and information sciences} +} + + +@misc{lun2019fastmnn, + title = {A description of the theory behind the fastMNN algorithm}, + author = {Lun, Aaron}, + year = {2019}, + url = {https://marionilab.github.io/FurtherMNN2018/theory/description.html} +} + + +@string{mar = {Mar.}} + + +@string{may = {May}} + + +@article{mcinnes2018umap, + title = {UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction}, + author = {McInnes, Leland and Healy, John and Melville, James}, + year = {2018}, + journal = {arXiv}, + publisher = {Cornell University}, + doi = {10.48550/arxiv.1802.03426}, + url = {https://arxiv.org/abs/1802.03426}, + copyright = {arXiv.org perpetual, non-exclusive license}, + keywords = {Machine Learning (stat.ML), Computational Geometry (cs.CG), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences} +} + + +@article{mereu2020benchmarking, + doi = {10.1038/s41587-020-0469-4}, + author = {Mereu, Elisabetta and Lafzi, Atefeh and Moutinho, Catia and Ziegenhain, Christoph and McCarthy, Davis J and Alvarez-Varela, Adrian and Batlle, Eduard and Sagar and Gruen, Dominic and Lau, Julia K and others}, + journal = {Nature biotechnology}, + number = {6}, + pages = {747--755}, + publisher = {Nature Publishing Group US New York}, + title = {Benchmarking single-cell {RNA}-sequencing protocols for cell atlas projects}, + volume = {38}, + year = {2020} +} + + +@inbook{miles2005rsquared, + title = {Encyclopedia of Statistics in Behavioral Science}, + author = {Jeremy Miles}, + year = {2005}, + 
month = {Oct.}, + publisher = {John Wiley {\&} Sons, Ltd}, + doi = {10.1002/0470013192.bsa526}, + url = {https://doi.org/10.1002/0470013192.bsa526}, + chapter = {{R-Squared}, Adjusted {R-Squared}} +} + + +@article{moon2019visualizing, + title = {Visualizing structure and transitions in high-dimensional biological data}, + author = {Kevin R. Moon and David van Dijk and Zheng Wang and Scott Gigante and Daniel B. Burkhardt and William S. Chen and Kristina Yim and Antonia van den Elzen and Matthew J. Hirn and Ronald R. Coifman and Natalia B. Ivanova and Guy Wolf and Smita Krishnaswamy}, + year = {2019}, + month = {Dec.}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {37}, + number = {12}, + pages = {1482--1492}, + doi = {10.1038/s41587-019-0336-3}, + url = {https://doi.org/10.1038/s41587-019-0336-3} +} + + +@article{narayan2021assessing, + title = {Assessing single-cell transcriptomic variability through density-preserving data visualization}, + author = {Ashwin Narayan and Bonnie Berger and Hyunghoon Cho}, + year = {2021}, + month = {Jan}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {39}, + number = {6}, + pages = {765--774}, + doi = {10.1038/s41587-020-00801-7}, + url = {https://doi.org/10.1038/s41587-020-00801-7} +} + + +@article{nestorowa2016single, + title = {A single-cell resolution map of mouse hematopoietic stem and progenitor cell differentiation}, + author = {Sonia Nestorowa and Fiona K. Hamey and Blanca Pijuan Sala and Evangelia Diamanti and Mairi Shepherd and Elisa Laurenti and Nicola K. Wilson and David G. 
Kent and Berthold G\"{o}ttgens}, + year = {2016}, + month = {Aug.}, + journal = {Blood}, + publisher = {American Society of Hematology}, + volume = {128}, + number = {8}, + pages = {e20--e31}, + doi = {10.1182/blood-2016-05-716480}, + url = {https://doi.org/10.1182/blood-2016-05-716480} +} + + +@inproceedings{luecken2021neurips, + author = {Luecken, Malte and Burkhardt, Daniel and Cannoodt, Robrecht and Lance, Christopher and Agrawal, Aditi and Aliee, Hananeh and Chen, Ann and Deconinck, Louise and Detweiler, Angela and Granados, Alejandro and Huynh, Shelly and Isacco, Laura and Kim, Yang and Klein, Dominik and DE KUMAR, BONY and Kuppasani, Sunil and Lickert, Heiko and McGeever, Aaron and Melgarejo, Joaquin and Mekonen, Honey and Morri, Maurizio and M\"{u}ller, Michaela and Neff, Norma and Paul, Sheryl and Rieck, Bastian and Schneider, Kaylie and Steelman, Scott and Sterr, Michael and Treacy, Daniel and Tong, Alexander and Villani, Alexandra-Chloe and Wang, Guilin and Yan, Jia and Zhang, Ce and Pisco, Angela and Krishnaswamy, Smita and Theis, Fabian and Bloom, Jonathan M}, + booktitle = {Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks}, + editor = {J. Vanschoren and S. Yeung}, + pages = {}, + publisher = {Curran}, + title = {A sandbox for prediction and integration of DNA, RNA, and proteins in single cells}, + url = {https://datasets-benchmarks-proceedings.neurips.cc/paper_files/paper/2021/file/158f3069a435b314a80bdcb024f8e422-Paper-round2.pdf}, + volume = {1}, + year = {2021} +} + + +@string{nov = {Nov.}} + + +@string{oct = {Oct.}} + + +@article{olsson2016single, + title = {Single-cell analysis of mixed-lineage states leading to a binary cell fate choice}, + author = {Andre Olsson and Meenakshi Venkatasubramanian and Viren K. Chaudhri and Bruce J. Aronow and Nathan Salomonis and Harinder Singh and H. 
Leighton Grimes}, + year = {2016}, + month = {Aug.}, + journal = {Nature}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {537}, + number = {7622}, + pages = {698--702}, + doi = {10.1038/nature19348}, + url = {https://doi.org/10.1038/nature19348} +} + + +@misc{openproblems, + title = {Open Problems}, + author = {{Open Problems for Single Cell Analysis Consortium}}, + year = {2022}, + url = {https://openproblems.bio} +} + + +@article{palla2022squidpy, + title={Squidpy: a scalable framework for spatial omics analysis}, + author={Palla, Giovanni and Spitzer, Hannah and Klein, Michal and Fischer, David and Schaar, Anna Christina and Kuemmerle, Louis Benedikt and Rybakov, Sergei and Ibarra, Ignacio L and Holmberg, Olle and Virshup, Isaac and others}, + journal={Nature methods}, + volume={19}, + number={2}, + pages={171--178}, + year={2022}, + publisher={Nature Publishing Group US New York}, + doi={10.1038/s41592-021-01358-2} +} + + +@article{pearson1895regression, + doi = {10.1098/rspl.1895.0041}, + title = {VII. Note on regression and inheritance in the case of two parents}, + author = {Pearson, Karl}, + journal = {proceedings of the royal society of London}, + volume = {58}, + number = {347-352}, + pages = {240--242}, + year = {1895}, + publisher = {The Royal Society London} +} + + +@article{pearson1901pca, + title = {On lines and planes of closest fit to systems of points in space}, + author = {Karl Pearson}, + year = {1901}, + month = {Nov.}, + journal = {The London, Edinburgh, and Dublin Philosophical Magazine and Journal of Science}, + publisher = {Informa {UK} Limited}, + volume = {2}, + number = {11}, + pages = {559--572}, + doi = {10.1080/14786440109462720}, + url = {https://doi.org/10.1080/14786440109462720} +} + + +@article{pliner2019supervised, + title = {Supervised classification enables rapid annotation of cell atlases}, + author = {Hannah A. 
Pliner and Jay Shendure and Cole Trapnell}, + year = {2019}, + month = {Sept.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {16}, + number = {10}, + pages = {983--986}, + doi = {10.1038/s41592-019-0535-3}, + url = {https://doi.org/10.1038/s41592-019-0535-3} +} + + +@article{polanski2020bbknn, + title = {{BBKNN}: fast batch alignment of single cell transcriptomes}, + author = {Krzysztof Pola{\'{n}}ski and Matthew D Young and Zhichao Miao and Kerstin B Meyer and Sarah A Teichmann and Jong-Eun Park}, + year = {2019}, + month = {Aug.}, + journal = {Bioinformatics}, + publisher = {Oxford University Press ({OUP})}, + doi = {10.1093/bioinformatics/btz625}, + url = {https://doi.org/10.1093/bioinformatics/btz625}, + editor = {Bonnie Berger} +} + + +@article{raredon2022computation, + title = {Computation and visualization of cell{\textendash}cell signaling topologies in single-cell systems data using Connectome}, + author = {Micha Sam Brickman Raredon and Junchen Yang and James Garritano and Meng Wang and Dan Kushnir and Jonas Christian Schupp and Taylor S. Adams and Allison M. Greaney and Katherine L. Leiby and Naftali Kaminski and Yuval Kluger and Andre Levchenko and Laura E. Niklason}, + year = {2022}, + month = {Mar.}, + journal = {Scientific Reports}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {12}, + number = {1}, + doi = {10.1038/s41598-022-07959-x}, + url = {https://doi.org/10.1038/s41598-022-07959-x} +} + + +@article{rodriques2019slide, + title = {Slide-seq: A scalable technology for measuring genome-wide expression at high spatial resolution}, + author = {Samuel G. Rodriques and Robert R. Stickels and Aleksandrina Goeva and Carly A. Martin and Evan Murray and Charles R. Vanderburg and Joshua Welch and Linlin M. Chen and Fei Chen and Evan Z. 
Macosko}, + year = {2019}, + month = {Mar.}, + journal = {Science}, + publisher = {American Association for the Advancement of Science ({AAAS})}, + volume = {363}, + number = {6434}, + pages = {1463--1467}, + doi = {10.1126/science.aaw1219}, + url = {https://doi.org/10.1126/science.aaw1219} +} + + +@article{russell2023slide, + title = {Slide-tags enables single-nucleus barcoding for multimodal spatial genomics}, + volume = {625}, + ISSN = {1476-4687}, + url = {http://dx.doi.org/10.1038/s41586-023-06837-4}, + DOI = {10.1038/s41586-023-06837-4}, + number = {7993}, + journal = {Nature}, + publisher = {Springer Science and Business Media LLC}, + author = {Russell, Andrew J. C. and Weir, Jackson A. and Nadaf, Naeem M. and Shabet, Matthew and Kumar, Vipin and Kambhampati, Sandeep and Raichur, Ruth and Marrero, Giovanni J. and Liu, Sophia and Balderrama, Karol S. and Vanderburg, Charles R. and Shanmugam, Vignesh and Tian, Luyi and Iorgulescu, J. Bryan and Yoon, Charles H. and Wu, Catherine J. and Macosko, Evan Z. and Chen, Fei}, + year = {2023}, + month = dec, + pages = {101--109} +} + + +@InProceedings{santos2009on, + author = {Santos, Jorge M.
and Embrechts, Mark}, + editor = {Alippi, Cesare and Polycarpou, Marios and Panayiotou, Christos and Ellinas, Georgios}, + title = {On the Use of the Adjusted Rand Index as a Metric for Evaluating Supervised Classification}, + booktitle = {Artificial Neural Networks -- ICANN 2009}, + year = {2009}, + publisher = {Springer Berlin Heidelberg}, + address = {Berlin, Heidelberg}, + pages = {175--184}, + isbn = {978-3-642-04277-5}, + doi = {10.1007/978-3-642-04277-5_18}, + url = {https://doi.org/10.1007/978-3-642-04277-5_18} +} + + +@article{sarkar2021separating, + title = {Separating measurement and expression models clarifies confusion in single-cell {RNA} sequencing analysis}, + author = {Abhishek Sarkar and Matthew Stephens}, + year = {2021}, + month = {May}, + journal = {Nature Genetics}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {53}, + number = {6}, + pages = {770--777}, + doi = {10.1038/s41588-021-00873-4}, + url = {https://doi.org/10.1038/s41588-021-00873-4} +} + + +@article{schober2018correlation, + title = {Correlation Coefficients}, + author = {Patrick Schober and Christa Boer and Lothar A. Schwarte}, + year = {2018}, + month = {May}, + journal = {Anesthesia {\&} Analgesia}, + publisher = {Ovid Technologies (Wolters Kluwer Health)}, + volume = {126}, + number = {5}, + pages = {1763--1768}, + doi = {10.1213/ane.0000000000002864}, + url = {https://doi.org/10.1213/ane.0000000000002864} +} + + +@string{sep = {Sept.}} + + +@inproceedings{stanley2020harmonic, + title = {Harmonic Alignment}, + author = {Jay S.
Stanley and Scott Gigante and Guy Wolf and Smita Krishnaswamy}, + year = {2020}, + month = {Jan}, + booktitle = {Proceedings of the 2020 {SIAM} International Conference on Data Mining}, + publisher = {Society for Industrial and Applied Mathematics}, + pages = {316--324}, + doi = {10.1137/1.9781611976236.36}, + url = {https://doi.org/10.1137/1.9781611976236.36} +} + + +@article{stickels2020highly, + title = {Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2}, + volume = {39}, + ISSN = {1546-1696}, + url = {http://dx.doi.org/10.1038/s41587-020-0739-1}, + DOI = {10.1038/s41587-020-0739-1}, + number = {3}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media LLC}, + author = {Stickels, Robert R. and Murray, Evan and Kumar, Pawan and Li, Jilong and Marshall, Jamie L. and Di Bella, Daniela J. and Arlotta, Paola and Macosko, Evan Z. and Chen, Fei}, + year = {2020}, + month = dec, + pages = {313–319} +} + + +@article{stoeckius2017simultaneous, + title = {Simultaneous epitope and transcriptome measurement in single cells}, + author = {Marlon Stoeckius and Christoph Hafemeister and William Stephenson and Brian Houck-Loomis and Pratip K Chattopadhyay and Harold Swerdlow and Rahul Satija and Peter Smibert}, + year = {2017}, + month = {Jul.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {14}, + number = {9}, + pages = {865--868}, + doi = {10.1038/nmeth.4380}, + url = {https://doi.org/10.1038/nmeth.4380} +} + + +@article{stuart2019comprehensive, + title = {Comprehensive Integration of Single-Cell Data}, + author = {Stuart, T. and Butler, A. and Hoffman, P. and Hafemeister, C. and Papalexi, E. and Mauck, W.M. and Hao, Y. and Stoeckius, M. and Smibert, P. 
and Satija, R.}, + year = {2019}, + journal = {Cell}, + volume = {177}, + number = {7}, + pages = {1888--1902.e21}, + doi = {10.1016/j.cell.2019.05.031} +} + + +@article{sun2020statistical, + title={Statistical analysis of spatial expression patterns for spatially resolved transcriptomic studies}, + author={Sun, Shiquan and Zhu, Jiaqiang and Zhou, Xiang}, + journal={Nature methods}, + volume={17}, + number={2}, + pages={193--200}, + year={2020}, + publisher={Nature Publishing Group US New York}, + doi={10.1038/s41592-019-0701-7} +} + + +@article{svensson2018spatialde, + title={SpatialDE: identification of spatially variable genes}, + author={Svensson, Valentine and Teichmann, Sarah A and Stegle, Oliver}, + journal={Nature methods}, + volume={15}, + number={5}, + pages={343--346}, + year={2018}, + publisher={Nature Publishing Group}, + doi={10.1038/nmeth.4636} +} + + +@article{szubert2019structurepreserving, + title = {Structure-preserving visualisation of high dimensional single-cell datasets}, + author = {Benjamin Szubert and Jennifer E. 
Cole and Claudia Monaco and Ignat Drozdov}, + year = {2019}, + month = {Jun.}, + journal = {Scientific Reports}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {9}, + number = {1}, + doi = {10.1038/s41598-019-45301-0}, + url = {https://doi.org/10.1038/s41598-019-45301-0} +} + + +@article{tabula2018single, + title = {Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris}, + author = {{Tabula Muris Consortium}}, + year = {2018}, + month = {Oct.}, + journal = {Nature}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {562}, + number = {7727}, + pages = {367--372}, + doi = {10.1038/s41586-018-0590-4}, + url = {https://doi.org/10.1038/s41586-018-0590-4} +} + + +@article{tabula2020single, + title = {A single-cell transcriptomic atlas characterizes ageing tissues in the mouse}, + author = {{Tabula Muris Consortium}}, + year = {2020}, + month = {Jul.}, + journal = {Nature}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {583}, + number = {7817}, + pages = {590--595}, + doi = {10.1038/s41586-020-2496-1}, + url = {https://doi.org/10.1038/s41586-020-2496-1} +} + + +@article{tasic2016adult, + title = {Adult mouse cortical cell taxonomy revealed by single cell transcriptomics}, + author = {Bosiljka Tasic and Vilas Menon and Thuc Nghi Nguyen and Tae Kyung Kim and Tim Jarsky and Zizhen Yao and Boaz Levi and Lucas T Gray and Staci A Sorensen and Tim Dolbeare and Darren Bertagnolli and Jeff Goldy and Nadiya Shapovalova and Sheana Parry and Changkyu Lee and Kimberly Smith and Amy Bernard and Linda Madisen and Susan M Sunkin and Michael Hawrylycz and Christof Koch and Hongkui Zeng}, + year = {2016}, + month = {Jan}, + journal = {Nature Neuroscience}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {19}, + number = {2}, + pages = {335--346}, + doi = {10.1038/nn.4216}, + url = {https://doi.org/10.1038/nn.4216} +} + + +@article{tian2019benchmarking, + title = {Benchmarking single 
cell {RNA}-sequencing analysis pipelines using mixture control experiments}, + author = {Luyi Tian and Xueyi Dong and Saskia Freytag and Kim-Anh L{\^{e}} Cao and Shian Su and Abolfazl JalalAbadi and Daniela Amann-Zalcenstein and Tom S. Weber and Azadeh Seidi and Jafar S. Jabbari and Shalin H. Naik and Matthew E. Ritchie}, + year = {2019}, + month = {May}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {16}, + number = {6}, + pages = {479--487}, + doi = {10.1038/s41592-019-0425-8}, + url = {https://doi.org/10.1038/s41592-019-0425-8} +} + + +@article{tran2020benchmark, + doi = {10.1186/s13059-019-1850-9}, + url = {https://doi.org/10.1186/s13059-019-1850-9}, + year = {2020}, + month = {Jan}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {21}, + number = {1}, + author = {Hoa Thi Nhu Tran and Kok Siong Ang and Marion Chevrier and Xiaomeng Zhang and Nicole Yee Shin Lee and Michelle Goh and Jinmiao Chen}, + title = {A benchmark of batch-effect correction methods for single-cell {RNA} sequencing data}, + journal = {Genome Biology} +} + + +@article{van2018recovering, + title = {Recovering Gene Interactions from Single-Cell Data Using Data Diffusion}, + author = {David van Dijk and Roshan Sharma and Juozas Nainys and Kristina Yim and Pooja Kathail and Ambrose J. Carr and Cassandra Burdziak and Kevin R. Moon and Christine L. 
Chaffer and Diwakar Pattabiraman and Brian Bierie and Linas Mazutis and Guy Wolf and Smita Krishnaswamy and Dana Pe'er}, + year = {2018}, + month = {Jul.}, + journal = {Cell}, + publisher = {Elsevier {BV}}, + volume = {174}, + number = {3}, + pages = {716--729.e27}, + doi = {10.1016/j.cell.2018.05.061}, + url = {https://doi.org/10.1016/j.cell.2018.05.061} +} + + +@article{vandermaaten2008visualizing, + title = {Visualizing Data using t-SNE}, + author = {{van der} Maaten, Laurens and Hinton, Geoffrey}, + year = {2008}, + journal = {Journal of Machine Learning Research}, + volume = {9}, + number = {86}, + pages = {2579--2605}, + url = {http://jmlr.org/papers/v9/vandermaaten08a.html} +} + + +@inproceedings{venna2001neighborhood, + title = {Neighborhood Preservation in Nonlinear Projection Methods: An Experimental Study}, + author = {Jarkko Venna and Samuel Kaski}, + year = {2001}, + booktitle = {Artificial Neural Networks {\textemdash} {ICANN} 2001}, + publisher = {Springer Berlin Heidelberg}, + pages = {485--491}, + doi = {{10.1007/3-540-44668-0\_68}}, + url = {{https://doi.org/10.1007/3-540-44668-0\_68}} +} + + +@article{venna2006local, + title = {Local multidimensional scaling}, + author = {Jarkko Venna and Samuel Kaski}, + year = {2006}, + month = {Jul.}, + journal = {Neural Networks}, + publisher = {Elsevier {BV}}, + volume = {19}, + number = {6-7}, + pages = {889--899}, + doi = {10.1016/j.neunet.2006.05.014}, + url = {https://doi.org/10.1016/j.neunet.2006.05.014} +} + + +@article{virshup2021anndataannotateddata, + doi = {10.1101/2021.12.16.473007}, + url = {https://doi.org/10.1101/2021.12.16.473007}, + year = {2021}, + month = {Dec.}, + publisher = {Cold Spring Harbor Laboratory}, + author = {Isaac Virshup and Sergei Rybakov and Fabian J. Theis and Philipp Angerer and F. 
Alexander Wolf}, + title = {anndata: Annotated data} +} + + +@article{wagner2018knearest, + title = {K-nearest neighbor smoothing for high-throughput single-cell RNA-Seq data}, + author = {Wagner, Florian and Yan, Yun and Yanai, Itai}, + year = {2018}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/217737}, + url = {https://www.biorxiv.org/content/early/2018/04/09/217737}, + elocation-id = {217737}, + eprint = {https://www.biorxiv.org/content/early/2018/04/09/217737.full.pdf} +} + + +@article{wagner2018single, + title = {Single-cell mapping of gene expression landscapes and lineage in the zebrafish embryo}, + author = {Daniel E. Wagner and Caleb Weinreb and Zach M. Collins and James A. Briggs and Sean G. Megason and Allon M. Klein}, + year = {2018}, + month = {Jun.}, + journal = {Science}, + publisher = {American Association for the Advancement of Science ({AAAS})}, + volume = {360}, + number = {6392}, + pages = {981--987}, + doi = {10.1126/science.aar4362}, + url = {https://doi.org/10.1126/science.aar4362} +} + + +@article{wang2013target, + title = {Target analysis by integration of transcriptome and {ChIP}-seq data with {BETA}}, + author = {Su Wang and Hanfei Sun and Jian Ma and Chongzhi Zang and Chenfei Wang and Juan Wang and Qianzi Tang and Clifford A Meyer and Yong Zhang and X Shirley Liu}, + year = {2013}, + month = {Nov.}, + journal = {Nature Protocols}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {8}, + number = {12}, + pages = {2502--2515}, + doi = {10.1038/nprot.2013.150}, + url = {https://doi.org/10.1038/nprot.2013.150} +} + + +@article{wang2017visualization, + title = {Visualization and analysis of single-cell {RNA}-seq data by kernel-based similarity learning}, + volume = {14}, + copyright = {2017 Springer Nature America, Inc.}, + issn = {1548-7105}, + url = {https://www.nature.com/articles/nmeth.4207}, + doi = {10.1038/nmeth.4207}, + abstract = {The SIMLR software identifies 
similarities between cells across a range of single-cell RNA-seq data, enabling effective dimension reduction, clustering and visualization.}, + language = {en}, + number = {4}, + journal = {Nature Methods}, + author = {Wang, Bo and Zhu, Junjie and Pierson, Emma and Ramazzotti, Daniele and Batzoglou, Serafim}, + month = apr, + year = {2017}, + publisher = {Nature Publishing Group}, + keywords = {Gene expression, Genome informatics, Machine learning, Statistical methods}, + pages = {414--416}, +} + + +@article{wang2018three, + title = {Three-dimensional intact-tissue sequencing of single-cell transcriptional states}, + volume = {361}, + ISSN = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.aat5691}, + DOI = {10.1126/science.aat5691}, + number = {6400}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Wang, Xiao and Allen, William E. and Wright, Matthew A. and Sylwestrak, Emily L. and Samusik, Nikolay and Vesuna, Sam and Evans, Kathryn and Liu, Cindy and Ramakrishnan, Charu and Liu, Jia and Nolan, Garry P. 
and Bava, Felice-Alessio and Deisseroth, Karl}, + year = {2018}, + month = jul +} + + +@article{wang2022high, + title = {High-resolution 3D spatiotemporal transcriptomic maps of developing Drosophila embryos and larvae}, + volume = {57}, + ISSN = {1534-5807}, + url = {http://dx.doi.org/10.1016/j.devcel.2022.04.006}, + DOI = {10.1016/j.devcel.2022.04.006}, + number = {10}, + journal = {Developmental Cell}, + publisher = {Elsevier BV}, + author = {Wang, Mingyue and Hu, Qinan and Lv, Tianhang and Wang, Yuhang and Lan, Qing and Xiang, Rong and Tu, Zhencheng and Wei, Yanrong and Han, Kai and Shi, Chang and Guo, Junfu and Liu, Chao and Yang, Tao and Du, Wensi and An, Yanru and Cheng, Mengnan and Xu, Jiangshan and Lu, Haorong and Li, Wangsheng and Zhang, Shaofang and Chen, Ao and Chen, Wei and Li, Yuxiang and Wang, Xiaoshan and Xu, Xun and Hu, Yuhui and Liu, Longqi}, + year = {2022}, + month = may, + pages = {1271--1283.e4} +} + + +@article{weber2023nnsvg, + title={nnSVG for the scalable identification of spatially variable genes using nearest-neighbor Gaussian processes}, + author={Weber, Lukas M and Saha, Arkajyoti and Datta, Abhirup and Hansen, Kasper D and Hicks, Stephanie C}, + journal={Nature communications}, + volume={14}, + number={1}, + pages={4059}, + year={2023}, + publisher={Nature Publishing Group UK London}, + doi={10.1038/s41467-023-39748-z} +} + + +@article{welch2019single, + title = {Single-Cell Multi-omic Integration Compares and Contrasts Features of Brain Cell Identity}, + author = {Joshua D. Welch and Velina Kozareva and Ashley Ferreira and Charles Vanderburg and Carly Martin and Evan Z. 
Macosko}, + year = {2019}, + month = {Jun.}, + journal = {Cell}, + publisher = {Elsevier {BV}}, + volume = {177}, + number = {7}, + pages = {1873--1887.e17}, + doi = {10.1016/j.cell.2019.05.006}, + url = {https://doi.org/10.1016/j.cell.2019.05.006} +} + + +@article{wilkinson1973symbolic, + doi = {10.2307/2346786}, + url = {https://doi.org/10.2307/2346786}, + year = {1973}, + publisher = {{JSTOR}}, + volume = {22}, + number = {3}, + pages = {392}, + author = {G. N. Wilkinson and C. E. Rogers}, + title = {Symbolic Description of Factorial Models for Analysis of Variance}, + journal = {Applied Statistics} +} + + +@article{wu2021single, + title = {A single-cell and spatially resolved atlas of human breast cancers}, + author = {Sunny Z. Wu and Ghamdan Al-Eryani and Daniel Lee Roden and Simon Junankar and Kate Harvey and Alma Andersson and Aatish Thennavan and Chenfei Wang and James R. Torpy and Nenad Bartonicek and Taopeng Wang and Ludvig Larsson and Dominik Kaczorowski and Neil I. Weisenfeld and Cedric R. Uytingco and Jennifer G. Chew and Zachary W. Bent and Chia-Ling Chan and Vikkitharan Gnanasambandapillai and Charles-Antoine Dutertre and Laurence Gluch and Mun N. Hui and Jane Beith and Andrew Parker and Elizabeth Robbins and Davendra Segara and Caroline Cooper and Cindy Mak and Belinda Chan and Sanjay Warrier and Florent Ginhoux and Ewan Millar and Joseph E. Powell and Stephen R. Williams and X. Shirley Liu and Sandra O'Toole and Elgene Lim and Joakim Lundeberg and Charles M. 
Perou and Alexander Swarbrick}, + year = {2021}, + month = {Sept.}, + journal = {Nature Genetics}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {53}, + number = {9}, + pages = {1334--1347}, + doi = {10.1038/s41588-021-00911-1}, + url = {https://doi.org/10.1038/s41588-021-00911-1} +} + + +@article{xiong2020neuralee, + title = {{NeuralEE}: A {GPU}-Accelerated Elastic Embedding Dimensionality Reduction Method for Visualizing Large-Scale {scRNA}-Seq Data}, + author = {Jiankang Xiong and Fuzhou Gong and Lin Wan and Liang Ma}, + year = {2020}, + month = {Oct.}, + journal = {Frontiers in Genetics}, + publisher = {Frontiers Media {SA}}, + volume = {11}, + doi = {10.3389/fgene.2020.00786}, + url = {https://doi.org/10.3389/fgene.2020.00786} +} + + +@article{xiong2021online, + title = {Online single-cell data integration through projecting heterogeneous datasets into a common cell-embedding space}, + author = {Lei Xiong and Kang Tian and Yuzhe Li and Weixi Ning and Xin Gao and Qiangfeng Cliff Zhang}, + year = {2022}, + month = {Oct.}, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {13}, + number = {1}, + doi = {10.1038/s41467-022-33758-z}, + url = {https://doi.org/10.1038/s41467-022-33758-z} +} + + +@article{xu2021probabilistic, + title = {Probabilistic harmonization and annotation of single-cell transcriptomics data with deep generative models}, + author = {Chenling Xu and Romain Lopez and Edouard Mehlman and Jeffrey Regier and Michael I Jordan and Nir Yosef}, + year = {2021}, + month = {Jan}, + journal = {Molecular Systems Biology}, + publisher = {{Embo}}, + volume = {17}, + number = {1}, + doi = {10.15252/msb.20209620}, + url = {https://doi.org/10.15252/msb.20209620} +} + + +@article{zappia2018exploring, + doi = {10.1371/journal.pcbi.1006245}, + url = {https://doi.org/10.1371/journal.pcbi.1006245}, + year = {2018}, + month = {Jun.}, + publisher = {Public Library of Science ({PLoS})}, + 
volume = {14}, + number = {6}, + pages = {e1006245}, + author = {Luke Zappia and Belinda Phipson and Alicia Oshlack}, + editor = {Dina Schneidman}, + title = {Exploring the single-cell {RNA}-seq analysis landscape with the {scRNA}-tools database}, + journal = {{PLOS} Computational Biology} +} + + +@article{zhang2021pydrmetrics, + title = {{pyDRMetrics} - A Python toolkit for dimensionality reduction quality assessment}, + author = {Yinsheng Zhang and Qian Shang and Guoming Zhang}, + year = {2021}, + month = {Feb.}, + journal = {Heliyon}, + publisher = {Elsevier {BV}}, + volume = {7}, + number = {2}, + pages = {e06199}, + doi = {10.1016/j.heliyon.2021.e06199}, + url = {https://doi.org/10.1016/j.heliyon.2021.e06199} +} + + +@article{zhang2022identification, + title={Identification of spatially variable genes with graph cuts}, + author={Zhang, Ke and Feng, Wanwan and Wang, Peng}, + journal={Nature Communications}, + volume={13}, + number={1}, + pages={5488}, + year={2022}, + publisher={Nature Publishing Group UK London}, + doi={10.1038/s41467-022-33182-3} +} + + +@article{zhu2021spark, + title={SPARK-X: non-parametric modeling enables scalable and robust detection of spatial expression patterns for large spatial transcriptomic studies}, + author={Zhu, Jiaqiang and Sun, Shiquan and Zhou, Xiang}, + journal={Genome biology}, + volume={22}, + number={1}, + pages={184}, + year={2021}, + publisher={Springer}, + doi={10.1186/s13059-021-02404-0} +} + + +@article {hrovatin2023delineating, + author = {Karin Hrovatin and Aim{\'e}e Bastidas-Ponce and Mostafa Bakhti and Luke Zappia and Maren B{\"u}ttner and Ciro Sallino and Michael Sterr and Anika B{\"o}ttcher and Adriana Migliorini and Heiko Lickert and Fabian J. 
Theis}, + title = {Delineating mouse β-cell identity during lifetime and in diabetes with a single cell atlas}, + elocation-id = {2022.12.22.521557}, + year = {2023}, + doi = {10.1101/2022.12.22.521557}, + publisher = {Cold Spring Harbor Laboratory}, + URL = {https://www.biorxiv.org/content/early/2023/04/25/2022.12.22.521557}, + eprint = {https://www.biorxiv.org/content/early/2023/04/25/2022.12.22.521557.full.pdf}, + journal = {bioRxiv} +} + +@article{sikkema2023integrated, + title = {An integrated cell atlas of the lung in health and disease}, + volume = {29}, + ISSN = {1546-170X}, + url = {http://dx.doi.org/10.1038/s41591-023-02327-2}, + DOI = {10.1038/s41591-023-02327-2}, + number = {6}, + journal = {Nature Medicine}, + publisher = {Springer Science and Business Media LLC}, + author = {Sikkema, Lisa and Ramírez-Suástegui, Ciro and Strobl, Daniel C. and Gillett, Tessa E. and Zappia, Luke and Madissoon, Elo and Markov, Nikolay S. and Zaragosi, Laure-Emmanuelle and Ji, Yuge and Ansari, Meshal and Arguel, Marie-Jeanne and Apperloo, Leonie and Banchero, Martin and Bécavin, Christophe and Berg, Marijn and Chichelnitskiy, Evgeny and Chung, Mei-i and Collin, Antoine and Gay, Aurore C. A. and Gote-Schniering, Janine and Hooshiar Kashani, Baharak and Inecik, Kemal and Jain, Manu and Kapellos, Theodore S. and Kole, Tessa M. and Leroy, Sylvie and Mayr, Christoph H. and Oliver, Amanda J. and von Papen, Michael and Peter, Lance and Taylor, Chase J. and Walzthoeni, Thomas and Xu, Chuan and Bui, Linh T. and De Donno, Carlo and Dony, Leander and Faiz, Alen and Guo, Minzhe and Gutierrez, Austin J. and Heumos, Lukas and Huang, Ni and Ibarra, Ignacio L. and Jackson, Nathan D. and Kadur Lakshminarasimha Murthy, Preetish and Lotfollahi, Mohammad and Tabib, Tracy and Talavera-López, Carlos and Travaglini, Kyle J. and Wilbrey-Clark, Anna and Worlock, Kaylee B. and Yoshida, Masahiro and Chen, Yuexin and Hagood, James S. 
and Agami, Ahmed and Horvath, Peter and Lundeberg, Joakim and Marquette, Charles-Hugo and Pryhuber, Gloria and Samakovlis, Chistos and Sun, Xin and Ware, Lorraine B. and Zhang, Kun and van den Berge, Maarten and Bossé, Yohan and Desai, Tushar J. and Eickelberg, Oliver and Kaminski, Naftali and Krasnow, Mark A. and Lafyatis, Robert and Nikolic, Marko Z. and Powell, Joseph E. and Rajagopal, Jayaraj and Rojas, Mauricio and Rozenblatt-Rosen, Orit and Seibold, Max A. and Sheppard, Dean and Shepherd, Douglas P. and Sin, Don D. and Timens, Wim and Tsankov, Alexander M. and Whitsett, Jeffrey and Xu, Yan and Banovich, Nicholas E. and Barbry, Pascal and Duong, Thu Elizabeth and Falk, Christine S. and Meyer, Kerstin B. and Kropski, Jonathan A. and Pe’er, Dana and Schiller, Herbert B. and Tata, Purushothama Rao and Schultze, Joachim L. and Teichmann, Sara A. and Misharin, Alexander V. and Nawijn, Martijn C. and Luecken, Malte D. and Theis, Fabian J.}, + year = {2023}, + month = jun, + pages = {1563–1577} +} + +@article{consortium2022tabula, + title = {The Tabula Sapiens: A multiple-organ, single-cell transcriptomic atlas of humans}, + volume = {376}, + ISSN = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.abl4896}, + DOI = {10.1126/science.abl4896}, + number = {6594}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Jones, Robert C. and Karkanias, Jim and Krasnow, Mark A. and Pisco, Angela Oliveira and Quake, Stephen R. and Salzman, Julia and Yosef, Nir and Bulthaup, Bryan and Brown, Phillip and Harper, William and Hemenez, Marisa and Ponnusamy, Ravikumar and Salehi, Ahmad and Sanagavarapu, Bhavani A. and Spallino, Eileen and Aaron, Ksenia A. and Concepcion, Waldo and Gardner, James M. and Kelly, Burnett and Neidlinger, Nikole and Wang, Zifa and Crasta, Sheela and Kolluru, Saroja and Morri, Maurizio and Pisco, Angela Oliveira and Tan, Serena Y. and Travaglini, Kyle J. 
and Xu, Chenling and Alcántara-Hernández, Marcela and Almanzar, Nicole and Antony, Jane and Beyersdorf, Benjamin and Burhan, Deviana and Calcuttawala, Kruti and Carter, Matthew M. and Chan, Charles K. F. and Chang, Charles A. and Chang, Stephen and Colville, Alex and Crasta, Sheela and Culver, Rebecca N. and Cvijović, Ivana and D’Amato, Gaetano and Ezran, Camille and Galdos, Francisco X. and Gillich, Astrid and Goodyer, William R. and Hang, Yan and Hayashi, Alyssa and Houshdaran, Sahar and Huang, Xianxi and Irwin, Juan C. and Jang, SoRi and Juanico, Julia Vallve and Kershner, Aaron M. and Kim, Soochi and Kiss, Bernhard and Kolluru, Saroja and Kong, William and Kumar, Maya E. and Kuo, Angera H. and Leylek, Rebecca and Li, Baoxiang and Loeb, Gabriel B. and Lu, Wan-Jin and Mantri, Sruthi and Markovic, Maxim and McAlpine, Patrick L. and de Morree, Antoine and Morri, Maurizio and Mrouj, Karim and Mukherjee, Shravani and Muser, Tyler and Neuh\"{o}fer, Patrick and Nguyen, Thi D. and Perez, Kimberly and Phansalkar, Ragini and Pisco, Angela Oliveira and Puluca, Nazan and Qi, Zhen and Rao, Poorvi and Raquer-McKay, Hayley and Schaum, Nicholas and Scott, Bronwyn and Seddighzadeh, Bobak and Segal, Joe and Sen, Sushmita and Sikandar, Shaheen and Spencer, Sean P. and Steffes, Lea C. and Subramaniam, Varun R. and Swarup, Aditi and Swift, Michael and Travaglini, Kyle J. and Van Treuren, Will and Trimm, Emily and Veizades, Stefan and Vijayakumar, Sivakamasundari and Vo, Kim Chi and Vorperian, Sevahn K. and Wang, Wanxin and Weinstein, Hannah N. W. and Winkler, Juliane and Wu, Timothy T. H. and Xie, Jamie and Yung, Andrea R. and Zhang, Yue and Detweiler, Angela M. and Mekonen, Honey and Neff, Norma F. and Sit, Rene V. and Tan, Michelle and Yan, Jia and Bean, Gregory R. and Charu, Vivek and Forgó, Erna and Martin, Brock A. and Ozawa, Michael G. and Silva, Oscar and Tan, Serena Y. and Toland, Angus and Vemuri, Venkata N. P. 
and Afik, Shaked and Awayan, Kyle and Botvinnik, Olga Borisovna and Byrne, Ashley and Chen, Michelle and Dehghannasiri, Roozbeh and Detweiler, Angela M. and Gayoso, Adam and Granados, Alejandro A. and Li, Qiqing and Mahmoudabadi, Gita and McGeever, Aaron and de Morree, Antoine and Olivieri, Julia Eve and Park, Madeline and Pisco, Angela Oliveira and Ravikumar, Neha and Salzman, Julia and Stanley, Geoff and Swift, Michael and Tan, Michelle and Tan, Weilun and Tarashansky, Alexander J. and Vanheusden, Rohan and Vorperian, Sevahn K. and Wang, Peter and Wang, Sheng and Xing, Galen and Xu, Chenling and Yosef, Nir and Alcántara-Hernández, Marcela and Antony, Jane and Chan, Charles K. F. and Chang, Charles A. and Colville, Alex and Crasta, Sheela and Culver, Rebecca and Dethlefsen, Les and Ezran, Camille and Gillich, Astrid and Hang, Yan and Ho, Po-Yi and Irwin, Juan C. and Jang, SoRi and Kershner, Aaron M. and Kong, William and Kumar, Maya E. and Kuo, Angera H. and Leylek, Rebecca and Liu, Shixuan and Loeb, Gabriel B. and Lu, Wan-Jin and Maltzman, Jonathan S. and Metzger, Ross J. and de Morree, Antoine and Neuh\"{o}fer, Patrick and Perez, Kimberly and Phansalkar, Ragini and Qi, Zhen and Rao, Poorvi and Raquer-McKay, Hayley and Sasagawa, Koki and Scott, Bronwyn and Sinha, Rahul and Song, Hanbing and Spencer, Sean P. and Swarup, Aditi and Swift, Michael and Travaglini, Kyle J. and Trimm, Emily and Veizades, Stefan and Vijayakumar, Sivakamasundari and Wang, Bruce and Wang, Wanxin and Winkler, Juliane and Xie, Jamie and Yung, Andrea R. and Artandi, Steven E. and Beachy, Philip A. and Clarke, Michael F. and Giudice, Linda C. and Huang, Franklin W. and Huang, Kerwyn Casey and Idoyaga, Juliana and Kim, Seung K. and Krasnow, Mark and Kuo, Christin S. and Nguyen, Patricia and Quake, Stephen R. and Rando, Thomas A. and Red-Horse, Kristy and Reiter, Jeremy and Relman, David A. and Sonnenburg, Justin L. and Wang, Bruce and Wu, Albert and Wu, Sean M. 
and Wyss-Coray, Tony}, + year = {2022}, + month = may +} + +@article{dominguez2022crosstissue, + title = {Cross-tissue immune cell analysis reveals tissue-specific features in humans}, + volume = {376}, + ISSN = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.abl5197}, + DOI = {10.1126/science.abl5197}, + number = {6594}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Domínguez Conde, C. and Xu, C. and Jarvis, L. B. and Rainbow, D. B. and Wells, S. B. and Gomes, T. and Howlett, S. K. and Suchanek, O. and Polanski, K. and King, H. W. and Mamanova, L. and Huang, N. and Szabo, P. A. and Richardson, L. and Bolt, L. and Fasouli, E. S. and Mahbubani, K. T. and Prete, M. and Tuck, L. and Richoz, N. and Tuong, Z. K. and Campos, L. and Mousa, H. S. and Needham, E. J. and Pritchard, S. and Li, T. and Elmentaite, R. and Park, J. and Rahmani, E. and Chen, D. and Menon, D. K. and Bayraktar, O. A. and James, L. K. and Meyer, K. B. and Yosef, N. and Clatworthy, M. R. and Sims, P. A. and Farber, D. L. and Saeb-Parsy, K. and Jones, J. L. and Teichmann, S. A.}, + year = {2022}, + month = may +} + +@article{eraslan2022singlenucleus, + title = {Single-nucleus cross-tissue molecular reference maps toward understanding disease gene function}, + volume = {376}, + ISSN = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.abl4290}, + DOI = {10.1126/science.abl4290}, + number = {6594}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Eraslan, G\"{o}kcen and Drokhlyansky, Eugene and Anand, Shankara and Fiskin, Evgenij and Subramanian, Ayshwarya and Slyper, Michal and Wang, Jiali and Van Wittenberghe, Nicholas and Rouhana, John M. and Waldman, Julia and Ashenberg, Orr and Lek, Monkol and Dionne, Danielle and Win, Thet Su and Cuoco, Michael S. and Kuksenko, Olena and Tsankov, Alexander M. and Branton, Philip A. and Marshall, Jamie L. 
and Greka, Anna and Getz, Gad and Segrè, Ayellet V. and Aguet, Fran\c{c}ois and Rozenblatt-Rosen, Orit and Ardlie, Kristin G. and Regev, Aviv}, + year = {2022}, + month = may +} + +@article{li2023integrated, + title = {Integrated multi-omics single cell atlas of the human retina}, + url = {http://dx.doi.org/10.1101/2023.11.07.566105}, + DOI = {10.1101/2023.11.07.566105}, + publisher = {Cold Spring Harbor Laboratory}, + author = {Li, Jin and Wang, Jun and Ibarra, Ignacio L and Cheng, Xuesen and Luecken, Malte D and Lu, Jiaxiong and Monavarfeshani, Aboozar and Yan, Wenjun and Zheng, Yiqiao and Zuo, Zhen and Zayas Colborn, Samantha Lynn and Cortez, Berenice Sarahi and Owen, Leah A and Tran, Nicholas M and Shekhar, Karthik and Sanes, Joshua R and Stout, J Timothy and Chen, Shiming and Li, Yumei and DeAngelis, Margaret M and Theis, Fabian J and Chen, Rui}, + year = {2023}, + month = nov +} + +@article{wilson2022multimodal, + title = {Multimodal single cell sequencing implicates chromatin accessibility and genetic background in diabetic kidney disease progression}, + volume = {13}, + ISSN = {2041-1723}, + url = {http://dx.doi.org/10.1038/s41467-022-32972-z}, + DOI = {10.1038/s41467-022-32972-z}, + number = {1}, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media LLC}, + author = {Wilson, Parker C. and Muto, Yoshiharu and Wu, Haojia and Karihaloo, Anil and Waikar, Sushrut S. and Humphreys, Benjamin D.}, + year = {2022}, + month = sep +} + +@article{steuernagel2022hypomap, + title = {HypoMap—a unified single-cell gene expression atlas of the murine hypothalamus}, + volume = {4}, + ISSN = {2522-5812}, + url = {http://dx.doi.org/10.1038/s42255-022-00657-y}, + DOI = {10.1038/s42255-022-00657-y}, + number = {10}, + journal = {Nature Metabolism}, + publisher = {Springer Science and Business Media LLC}, + author = {Steuernagel, Lukas and Lam, Brian Y. H. and Klemm, Paul and Dowsett, Georgina K. C. and Bauder, Corinna A. and Tadross, John A. 
and Hitschfeld, Tamara Sotelo and del Rio Martin, Almudena and Chen, Weiyi and de Solis, Alain J. and Fenselau, Henning and Davidsen, Peter and Cimino, Irene and Kohnke, Sara N. and Rimmington, Debra and Coll, Anthony P. and Beyer, Andreas and Yeo, Giles S. H. and Br\"{u}ning, Jens C.}, + year = {2022}, + month = oct, + pages = {1402–1419} +} + +@article{tian2023singlecell, + title = {Single-cell DNA methylation and 3D genome architecture in the human brain}, + volume = {382}, + ISSN = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.adf5357}, + DOI = {10.1126/science.adf5357}, + number = {6667}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Tian, Wei and Zhou, Jingtian and Bartlett, Anna and Zeng, Qiurui and Liu, Hanqing and Castanon, Rosa G. and Kenworthy, Mia and Altshul, Jordan and Valadon, Cynthia and Aldridge, Andrew and Nery, Joseph R. and Chen, Huaming and Xu, Jiaying and Johnson, Nicholas D. and Lucero, Jacinta and Osteen, Julia K. and Emerson, Nora and Rink, Jon and Lee, Jasper and Li, Yang E. and Siletti, Kimberly and Liem, Michelle and Claffey, Naomi and O’Connor, Carolyn and Yanny, Anna Marie and Nyhus, Julie and Dee, Nick and Casper, Tamara and Shapovalova, Nadiya and Hirschstein, Daniel and Ding, Song-Lin and Hodge, Rebecca and Levi, Boaz P. and Keene, C. Dirk and Linnarsson, Sten and Lein, Ed and Ren, Bing and Behrens, M. 
Margarita and Ecker, Joseph R.}, + year = {2023}, + month = oct +} + + +@article{sonrel2023metaanalysis, + title = {Meta-analysis of (single-cell method) benchmarks reveals the need for extensibility and interoperability}, + volume = {24}, + ISSN = {1474-760X}, + url = {http://dx.doi.org/10.1186/s13059-023-02962-5}, + DOI = {10.1186/s13059-023-02962-5}, + number = {1}, + journal = {Genome Biology}, + publisher = {Springer Science and Business Media LLC}, + author = {Sonrel, Anthony and Luetge, Almut and Soneson, Charlotte and Mallona, Izaskun and Germain, Pierre-Luc and Knyazev, Sergey and Gilis, Jeroen and Gerber, Reto and Seurinck, Ruth and Paul, Dominique and Sonder, Emanuel and Crowell, Helena L. and Fanaswala, Imran and Al-Ajami, Ahmad and Heidari, Elyas and Schmeing, Stephan and Milosavljevic, Stefan and Saeys, Yvan and Mangul, Serghei and Robinson, Mark D.}, + year = {2023}, + month = may +} + + +@article{saelens2019comparison, + title = {A comparison of single-cell trajectory inference methods}, + volume = {37}, + ISSN = {1546-1696}, + url = {http://dx.doi.org/10.1038/s41587-019-0071-9}, + DOI = {10.1038/s41587-019-0071-9}, + number = {5}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media LLC}, + author = {Saelens, Wouter and Cannoodt, Robrecht and Todorov, Helena and Saeys, Yvan}, + year = {2019}, + month = apr, + pages = {547–554} +} + + +@article{huang2018savergene, + title = {SAVER: gene expression recovery for single-cell RNA sequencing}, + volume = {15}, + ISSN = {1548-7105}, + url = {http://dx.doi.org/10.1038/s41592-018-0033-z}, + DOI = {10.1038/s41592-018-0033-z}, + number = {7}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media LLC}, + author = {Huang, Mo and Wang, Jingshu and Torre, Eduardo and Dueck, Hannah and Shaffer, Sydney and Bonasio, Roberto and Murray, John I. 
and Raj, Arjun and Li, Mingyao and Zhang, Nancy R.}, + year = {2018}, + month = jun, + pages = {539–542} +} + + +@article{chari2023speciousart, + title = {The specious art of single-cell genomics}, + volume = {19}, + ISSN = {1553-7358}, + url = {http://dx.doi.org/10.1371/journal.pcbi.1011288}, + DOI = {10.1371/journal.pcbi.1011288}, + number = {8}, + journal = {PLOS Computational Biology}, + publisher = {Public Library of Science (PLoS)}, + author = {Chari, Tara and Pachter, Lior}, + editor = {Papin, Jason A.}, + year = {2023}, + month = aug, + pages = {e1011288} +} + diff --git a/src/common/ontology/check_obsolete_terms/config.vsh.yaml b/src/common/ontology/check_obsolete_terms/config.vsh.yaml new file mode 100644 index 0000000000..fc006f6cf9 --- /dev/null +++ b/src/common/ontology/check_obsolete_terms/config.vsh.yaml @@ -0,0 +1,76 @@ +functionality: + status: disabled + name: check_obsolete_terms + namespace: common/ontology + description: | + Check for obsolete ontology terms in the dataset. + argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + description: "Input h5ad file." + required: true + direction: input + example: dataset.h5ad + - name: "--struct" + type: string + description: "In which struct to look for the term." + required: true + direction: input + example: "obs" + - name: "--input_term" + type: string + description: "In which field to look for the term." + required: true + direction: input + example: "cell_type_ontology_term_id" + - name: Ontology + arguments: + - name: "--ontology" + type: file + description: "Ontology to check." + required: true + direction: input + example: cl.obo + - name: Arguments + arguments: + - name: "--obsolete_as_na" + type: boolean + description: "Whether to replace obsolete terms with NA." + default: true + - name: Outputs + arguments: + - name: "--output" + type: file + description: Output h5ad file. 
+ direction: output + example: output.h5ad + - name: "--output_term" + type: string + description: "In which field to store the updated term." + required: true + example: "cell_type_ontology_term_id" + - name: "--output_name" + type: string + description: "In which field to store the updated term name." + required: true + example: "cell_type" + - name: "--output_obsolete" + type: string + description: "In which field to store whether a term is obsolete." + required: true + example: "cell_type_ontology_obsolete" + resources: + - type: r_script + path: script.R + test_resources: + - type: r_script + path: test.R + - path: /resources_test/common/cellxgene_census +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + packages: [ dplyr, tidyr, tibble, ontologyIndex, processx ] \ No newline at end of file diff --git a/src/common/ontology/check_obsolete_terms/script.R b/src/common/ontology/check_obsolete_terms/script.R new file mode 100644 index 0000000000..bc1ef0ccb7 --- /dev/null +++ b/src/common/ontology/check_obsolete_terms/script.R @@ -0,0 +1,63 @@ +library(dplyr, warn.conflicts = FALSE) +library(tidyr, warn.conflicts = FALSE) +library(tibble, warn.conflicts = FALSE) +library(ontologyIndex, warn.conflicts = FALSE) + +## VIASH START +par <- list( + input = "resources_test/common/cellxgene_census/dataset.h5ad", + ontology = "resources_test/common/cellxgene_census/cl.obo", + input_term = "cell_type_ontology_term_id", + struct = "obs", + output = "output.h5ad", + output_term = "cell_type_ontology_term_id", + output_name = "cell_type", + output_obsolete = "cell_type_ontology_obsolete", + obsolete_as_na = TRUE +) +## VIASH END + +cat("Read ontology\n") +ont <- ontologyIndex::get_ontology( + par$ontology, + extract_tags = "everything" +) +ont_tib <- ont %>% + as.data.frame %>% + select(id, name, obsolete, replaced_by) %>% + as_tibble + +cat("Read anndata\n") +adata <- anndata::read_h5ad(par$input, backed = "r") + +cat("Find terms\n") 
+term_ids <- adata[[par$struct]][[par$input_term]] + +unique_term_ids <- as.character(unique(term_ids)) + +cat("Look for obsolete or replaced terms\n") +ont_map <- ont_tib %>% + slice(match(unique_term_ids, id)) %>% + transmute( + orig_id = id, + id = case_when( + !obsolete ~ id, + replaced_by != "" ~ replaced_by, + rep(par$obsolete_as_na, length(id)) ~ rep(NA_character_, length(id)), + TRUE ~ id + ) + ) %>% + left_join(ont_tib %>% select(id, name, obsolete), by = "id") + +cat("Store new columns in data structure\n") +new_data <- ont_map %>% slice(match(term_ids, orig_id)) +adata[[par$struct]][[par$output_term]] <- new_data$id +adata[[par$struct]][[par$output_name]] <- new_data$name +adata[[par$struct]][[par$output_obsolete]] <- ifelse( + !is.na(new_data$obsolete), + new_data$obsolete, + TRUE +) + +cat("Write to file\n") +anndata::write_h5ad(adata, par$output) diff --git a/src/common/ontology/check_obsolete_terms/test.R b/src/common/ontology/check_obsolete_terms/test.R new file mode 100644 index 0000000000..5e3c582021 --- /dev/null +++ b/src/common/ontology/check_obsolete_terms/test.R @@ -0,0 +1,54 @@ +library(assertthat) + +## VIASH START +meta <- list( + executable = "target/docker/common/ontology/check_obsolete_terms", + resources_dir = "resources_test/common/" +) +## VIASH END + +input_file <- paste0(meta$resources_dir, "/cellxgene_census/dataset.h5ad") +ontology_file <- paste0(meta$resources_dir, "/cellxgene_census/cl.obo") +temp_file <- tempfile(fileext = ".h5ad") +temp2_file <- tempfile(fileext = ".h5ad") + +# add obsolete terms to the dataset +input <- anndata::read_h5ad(input_file) +input$obs$cell_type_ontology_term_id <- as.character(input$obs$cell_type_ontology_term_id) +input$obs$cell_type_ontology_term_id[1:3] <- "CL:0000375" # obsolete, replaced by 'CL:0007010' +input$obs$cell_type_ontology_term_id[4:6] <- "CL:0000399" # obsolete, removed +input$obs$cell_type_ontology_term_id[7:9] <- "CL:0007011" # not obsolete +zzz <- input$write_h5ad(temp_file) + +# 
run component +zzz <- processx::run( + meta$executable, + c( + "--input", temp_file, + "--struct", "obs", + "--input_term", "cell_type_ontology_term_id", + "--ontology", ontology_file, + "--output", temp2_file, + "--output_term", "cell_type_ontology_term_id_new", + "--output_name", "cell_type_new", + "--output_obsolete", "cell_type_obsolete_new" + ), + echo = TRUE +) + +# check output +output <- anndata::read_h5ad(temp2_file) + +print(output$obs[1:10, , drop = FALSE]) + +assert_that( + all(output$obs$cell_type_ontology_term_id_new[1:3] == "CL:0007010"), + all(is.na(output$obs$cell_type_ontology_term_id_new[4:6])), + all(output$obs$cell_type_ontology_term_id_new[7:9] == "CL:0007011"), + all(output$obs$cell_type_new[1:3] == "preosteoblast"), + all(is.na(output$obs$cell_type_new[4:6])), + all(output$obs$cell_type_new[7:9] == "enteric neuron"), + all(!output$obs$cell_type_obsolete_new[1:3]), + all(output$obs$cell_type_obsolete_new[4:6]), + all(!output$obs$cell_type_obsolete_new[7:9]) +) diff --git a/src/common/process_dataset_metadata/run/config.vsh.yaml b/src/common/process_dataset_metadata/run/config.vsh.yaml new file mode 100644 index 0000000000..550b621ef6 --- /dev/null +++ b/src/common/process_dataset_metadata/run/config.vsh.yaml @@ -0,0 +1,29 @@ +functionality: + name: run + namespace: common/process_dataset_metadata + description: >- + This workflow transforms the meta information of the datasets into a format + that can be used by the website. 
+ argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + required: true + direction: input + example: meta.yaml + - name: Outputs + arguments: + - name: "--output" + type: file + required: true + direction: output + default: meta.json + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + dependencies: + - name: common/process_task_results/yaml_to_json +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/common/process_dataset_metadata/run/main.nf b/src/common/process_dataset_metadata/run/main.nf new file mode 100644 index 0000000000..2e453d5d52 --- /dev/null +++ b/src/common/process_dataset_metadata/run/main.nf @@ -0,0 +1,17 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + | yaml_to_json.run( + fromState: ["input"], + toState: ["output"] + ) + + | setState(["output"]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/common/process_dataset_metadata/run/run.sh b/src/common/process_dataset_metadata/run/run.sh new file mode 100644 index 0000000000..27ea225ed3 --- /dev/null +++ b/src/common/process_dataset_metadata/run/run.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# fail on error +set -e + +# ensure we're in the root of the repo +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "$REPO_ROOT" + +DATASET_DIR="s3://openproblems-data/resources/datasets/" + +for LOADER in $(aws s3 ls $DATASET_DIR); do + + if [ "$LOADER" == "PRE" ]; then + continue + fi + + BASE_DIR="${DATASET_DIR%/}/$LOADER" + + for DATASET in $(aws s3 ls $BASE_DIR); do + + if [ "$DATASET" == "PRE" ]; then + continue + fi + + FILE_DIR="${BASE_DIR%/}/${DATASET%/}/log_cp10k/" + FILES=$(aws s3 ls $FILE_DIR) + metafiles=$(echo "$FILES" | grep "meta" | awk '{print $NF}') + # metafiles=$(find $INPUT -type f -name "*meta*") + # echo $metafiles + + for metafile in $metafiles; do + INPUT="${FILE_DIR%/}/$metafile" + OUTPUT_DIR="../website/datasets/$LOADER/${DATASET%/}/data/" + 
OUTPUT_FILE="${metafile%.*}.json" + echo "Processing $LOADER - $DATASET : $INPUT" + + # start the + NXF_VER=23.10.0 nextflow run . \ + -main-script target/nextflow/common/process_dataset_metadata/run/main.nf \ + -profile docker \ + -c src/wf_utils/labels_ci.config \ + --id "extract_metadata" \ + --input "$INPUT" \ + --output "$OUTPUT_FILE" \ + --output_state "state.yaml" \ + --publish_dir "$OUTPUT_DIR" + done + +# cause quarto rerender to index page when in preview mode +# touch ../website/results/$TASK/index.qmd + done +done \ No newline at end of file diff --git a/src/common/process_task_results/api/get_info.yaml b/src/common/process_task_results/api/get_info.yaml new file mode 100644 index 0000000000..117504cc75 --- /dev/null +++ b/src/common/process_task_results/api/get_info.yaml @@ -0,0 +1,23 @@ +functionality: + namespace: common/process_task_results + arguments: + - name: "--input" + type: "file" + example: + description: "A yaml file" + - name: "--task_id" + type: "string" + description: "A task dir" + example: label_projection + - name: "--output" + type: "file" + direction: "output" + default: "output.json" + description: "Output json" + test_resources: + - type: python_script + path: /src/common/comp_tests/check_get_info.py + - path: /src + dest: openproblems-v2/src + - path: /_viash.yaml + dest: openproblems-v2/_viash.yaml \ No newline at end of file diff --git a/src/common/process_task_results/generate_qc/config.vsh.yaml b/src/common/process_task_results/generate_qc/config.vsh.yaml new file mode 100644 index 0000000000..68a5d19682 --- /dev/null +++ b/src/common/process_task_results/generate_qc/config.vsh.yaml @@ -0,0 +1,39 @@ +functionality: + name: "generate_qc" + description: "Generate task QC metrics" + namespace: common/process_task_results + arguments: + - name: "--task_info" + type: "file" + example: task_info.json + description: "Task info file" + - name: "--method_info" + type: "file" + example: method_info.json + description: "Method info 
file" + - name: "--metric_info" + type: "file" + example: metric_info.json + description: "Metric info file" + - name: "--dataset_info" + type: "file" + example: dataset_info.json + description: "Dataset info file" + - name: "--results" + type: "file" + example: results.json + description: "Results file" + - name: "--output" + type: "file" + direction: "output" + default: "output.json" + description: "Output json" + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/generate_qc/script.py b/src/common/process_task_results/generate_qc/script.py new file mode 100644 index 0000000000..f15a877522 --- /dev/null +++ b/src/common/process_task_results/generate_qc/script.py @@ -0,0 +1,294 @@ +import json +import numpy as np + +## VIASH START +## VIASH END + +EXPECTED_TASK_FIELDS = ["task_id", "task_name", "task_summary", "task_description"] +EXPECTED_METHOD_FIELDS = ["task_id", "commit_sha", "method_id", "method_name", "method_summary", "paper_reference", "is_baseline"] +EXPECTED_METRIC_FIELDS = ["task_id", "commit_sha", "metric_id", "metric_name", "metric_summary", "paper_reference", "maximize"] +EXPECTED_DATASET_FIELDS = ["task_id", "dataset_id", "dataset_name", "dataset_summary", "data_reference", "data_url"] + +def dump_json(obj, fp): + """Dump to JSON in a numpy-safe fashion.""" + json.dump( + obj, + fp, + indent=4, + sort_keys=False, + separators=(", ", ": "), + ensure_ascii=False, + ) + +def create_quality_control(task_info, dataset_info, method_info, metric_info, results): + """Quality control to detect anomalies in the results.""" + task_id = task_info["task_id"] + + result_qc = [] + + def add_qc( + category: str, + name: str, + value, + severity_value: float, + code: str, + message: str, + ) -> None: + "Add an entry to the result qc" + if severity_value <= 1: + severity = 0 + elif 
severity_value <= 2: + severity = 1 + elif severity_value <= 3: + severity = 2 + else: + severity = 3 + result_qc.append({ + "task_id": task_id, + "category": category, + "name": name, + "value": value, + "severity": severity, + "severity_value": severity_value, + "code": code, + "message": message + }) + + def percent_missing(list_of_dicts, field): + are_missing = [] + for item in list_of_dicts: + if field == 'paper_reference' and item.get('is_baseline', False): + are_missing.append(0.0) + elif field in item and item[field] is not None: + are_missing.append(0.0) + else: + are_missing.append(1.0) + return np.mean(are_missing) + + # check task_info + for field in EXPECTED_TASK_FIELDS: + pct_missing = percent_missing([task_info], field) + add_qc( + "Task info", + f"Pct '{field}' missing", + pct_missing, + 3.0 if pct_missing > 0 else 0.0, + "percent_missing([task_info], field)", + f"Task metadata field '{field}' should be defined\n" + f" Task id: {task_id}\n" + f" Field: {field}\n" + ) + + # check method_info + for field in EXPECTED_METHOD_FIELDS: + pct_missing = percent_missing(method_info, field) + add_qc( + "Method info", + f"Pct '{field}' missing", + pct_missing, + 3.0 if pct_missing > 0 else 0.0, + "percent_missing(method_info, field)", + f"Method metadata field '{field}' should be defined\n" + f" Task id: {task_id}\n" + f" Field: {field}\n" + ) + + # check metric_info + for field in EXPECTED_METRIC_FIELDS: + pct_missing = percent_missing(metric_info, field) + add_qc( + "Metric info", + f"Pct '{field}' missing", + pct_missing, + 3.0 if pct_missing > 0 else 0.0, + "percent_missing(metric_info, field)", + f"Metric metadata field '{field}' should be defined\n" + f" Task id: {task_id}\n" + f" Field: {field}\n" + ) + + # check dataset_info + for field in EXPECTED_DATASET_FIELDS: + pct_missing = percent_missing(dataset_info, field) + add_qc( + "Dataset info", + f"Pct '{field}' missing", + pct_missing, + 3.0 if pct_missing > 0 else 0.0, + "percent_missing(dataset_info, 
field)", + f"Dataset metadata field '{field}' should be defined\n" + f" Task id: {task_id}\n" + f" Field: {field}\n" + ) + + # turn results into long format for easier processing + results_long = [ + { + "task_id": x["task_id"], + "method_id": x["method_id"], + "dataset_id": x["dataset_id"], + "metric_id": metric["metric_id"], + "metric_value" : x["metric_values"].get(metric["metric_id"]), + "scaled_score" : x["scaled_scores"].get(metric["metric_id"]), + } + for metric in metric_info + for x in results + ] + + # check percentage missing + pct_missing = 1 - len(results_long) / (len(method_info) * len(metric_info) * len(dataset_info)) + add_qc( + "Raw data", + "Number of results", + len(results), + pct_missing / .1, + "len(results) == len(method_info) * len(metric_info) * len(dataset_info)", + f"Number of results should be equal to #methods × #metrics × #datasets.\n" + f" Task id: {task_id}\n" + f" Number of results: {len(results)}\n" + f" Number of methods: {len(method_info)}\n" + f" Number of metrics: {len(metric_info)}\n" + f" Number of datasets: {len(dataset_info)}\n" + ) + + # QC per metric + for metric in metric_info: + metric_id = metric["metric_id"] + values = [ + res + for res in results_long + if res["metric_id"] == metric_id + and res["metric_value"] is not None + and np.isreal(res["metric_value"]) + ] + pct_missing = 1 - len(values) / len(dataset_info) / len(method_info) + + add_qc( + "Raw results", + f"Metric '{metric_id}' %missing", + pct_missing, + pct_missing / .1, + "pct_missing <= .1", + f"Percentage of missing results should be less than 10%.\n" + f" Task id: {task_id}\n" + f" Metric id: {metric_id}\n" + f" Percentage missing: {pct_missing*100:.0f}%\n" + ) + + # QC per method + for method in method_info: + method_id = method["method_id"] + values = [ + res + for res in results_long + if res["method_id"] == method_id + and res["metric_value"] is not None + and np.isreal(res["metric_value"]) + ] + pct_missing = 1 - len(values) / len(dataset_info) / 
len(metric_info) + + add_qc( + "Raw results", + f"Method '{method_id}' %missing", + pct_missing, + pct_missing / .1, + "pct_missing <= .1", + f"Percentage of missing results should be less than 10%.\n" + f" Task id: {task_id}\n" + f" method id: {method_id}\n" + f" Percentage missing: {pct_missing*100:.0f}%\n" + ) + + # QC per dataset + for dataset in dataset_info: + dataset_id = dataset["dataset_id"] + values = [ + res + for res in results_long + if res["dataset_id"] == dataset_id + and res["metric_value"] is not None + and np.isreal(res["metric_value"]) + ] + pct_missing = 1 - len(values) / len(metric_info) / len(method_info) + + add_qc( + "Raw results", + f"Dataset '{dataset_id}' %missing", + pct_missing, + pct_missing / .1, + "pct_missing <= .1", + f"Percentage of missing results should be less than 10%.\n" + f" Task id: {task_id}\n" + f" dataset id: {dataset_id}\n" + f" Percentage missing: {pct_missing*100:.0f}%\n" + ) + + + # QC per metric and method + for metric in metric_info: + for method in method_info: + metric_id = metric["metric_id"] + method_id = method["method_id"] + scores = [ + res["scaled_score"] + for res in results_long + if res["metric_id"] == metric_id + and res["method_id"] == method_id + and res["scaled_score"] is not None + and np.isreal(res["scaled_score"]) + ] + + if len(scores) >= 1: + worst_score = np.min(scores).item() + best_score = np.max(scores).item() + + add_qc( + "Scaling", + f"Worst score {method_id} {metric_id}", + worst_score, + worst_score / -1, + "worst_score >= -1", + f"Method {method_id} performs much worse than baselines.\n" + f" Task id: {task_id}\n" + f" Method id: {method_id}\n" + f" Metric id: {metric_id}\n" + f" Worst score: {worst_score}%\n" + ) + + add_qc( + "Scaling", + f"Best score {method_id} {metric_id}", + best_score, + best_score / 2, + "best_score <= 2", + f"Method {method_id} performs a lot better than baselines.\n" + f" Task id: {task_id}\n" + f" Method id: {method_id}\n" + f" Metric id: {metric_id}\n" + f" 
Best score: {best_score}%\n" + ) + + return result_qc + +def main(par): + # read data from files + with open(par["task_info"], "r", encoding="utf8") as file: + task_info = json.load(file) + with open(par["method_info"], "r", encoding="utf8") as file: + method_info = json.load(file) + with open(par["metric_info"], "r", encoding="utf8") as file: + metric_info = json.load(file) + with open(par["dataset_info"], "r", encoding="utf8") as file: + dataset_info = json.load(file) + with open(par["results"], "r", encoding="utf8") as file: + results = json.load(file) + + # create info objects + quality_control = create_quality_control(task_info, dataset_info, method_info, metric_info, results) + + # write data to files + with open(par["output"], "w", encoding="utf8") as file: + dump_json(quality_control, file) + +if __name__ == "__main__": + main(par) diff --git a/src/common/process_task_results/get_api_info/config.vsh.yaml b/src/common/process_task_results/get_api_info/config.vsh.yaml new file mode 100644 index 0000000000..0e7eb1696e --- /dev/null +++ b/src/common/process_task_results/get_api_info/config.vsh.yaml @@ -0,0 +1,18 @@ +__merge__: ../api/get_info.yaml +functionality: + status: disabled + name: get_api_info + description: "Extract api info" + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ purrr, dplyr, yaml, rlang, processx ] + - type: nextflow + directives: + label: [lowmem, lowtime, lowcpu] + - type: native diff --git a/src/common/process_task_results/get_api_info/script.R b/src/common/process_task_results/get_api_info/script.R new file mode 100644 index 0000000000..1686dee222 --- /dev/null +++ b/src/common/process_task_results/get_api_info/script.R @@ -0,0 +1,79 @@ +library(purrr) +library(dplyr) +library(yaml) +library(rlang) + +## VIASH START +par <- list( + input = ".", + task_id = "label_projection", + output = "output/api.json" +) +## VIASH END + +comp_yamls <- 
list.files(paste(par$input, "src/tasks", par$task_id, "api", sep = "/"), pattern = "comp_", full.names = TRUE) +file_yamls <- list.files(paste(par$input, "src/tasks", par$task_id, "api", sep = "/"), pattern = "file_", full.names = TRUE) + +# list component - file args links +comp_file <- map_df(comp_yamls, function(yaml_file) { + conf <- yaml::read_yaml(yaml_file) + + map_df(conf$functionality$arguments, function(arg) { + tibble( + comp_name = basename(yaml_file) %>% gsub("\\.yaml", "", .), + arg_name = gsub("^-*", "", arg$name), + direction = arg$direction %||% "input", + file_name = basename(arg$`__merge__`) %>% gsub("\\.yaml", "", .) + ) + }) +}) + +# get component info +comp_info <- map_df(comp_yamls, function(yaml_file) { + conf <- yaml::read_yaml(yaml_file) + + tibble( + name = basename(yaml_file) %>% gsub("\\.yaml", "", .), + label = name %>% gsub("comp_", "", .) %>% gsub("_", " ", .) + ) +}) + +# get file info +file_info <- map_df(file_yamls, function(yaml_file) { + arg <- yaml::read_yaml(yaml_file) + + tibble( + name = basename(yaml_file) %>% gsub("\\.yaml", "", .), + description = arg$description, + label = arg$info$label, + example = arg$example, + clean_label = name %>% gsub("file_", "", .) %>% gsub("_", " ", .) + ) +}) + +# get file - slot args +file_slot <- map_df(file_yamls, function(yaml_file) { + arg <- yaml::read_yaml(yaml_file) + + map2_df(names(arg$info$slots), arg$info$slots, function(group_name, slot) { + df <- map_df(slot, as.data.frame) + df$struct <- group_name + df$file_name <- basename(yaml_file) %>% gsub("\\.yaml", "", .) 
+ as_tibble(df) + }) +}) %>% + mutate(multiple = multiple %|% FALSE) + +out <- list( + comp_info = purrr::transpose(comp_info), + file_info = purrr::transpose(file_info), + comp_file_io = purrr::transpose(comp_file), + file_schema = purrr::transpose(file_slot) +) + +jsonlite::write_json( + out, + par$output, + auto_unbox = TRUE, + pretty = TRUE +) diff --git a/src/common/process_task_results/get_dataset_info/config.vsh.yaml b/src/common/process_task_results/get_dataset_info/config.vsh.yaml new file mode 100644 index 0000000000..10247a22ba --- /dev/null +++ b/src/common/process_task_results/get_dataset_info/config.vsh.yaml @@ -0,0 +1,20 @@ +__merge__: ../api/get_info.yaml +functionality: + name: "get_dataset_info" + description: "Extract dataset info and convert to expected format for website results" + resources: + - type: r_script + path: script.R + test_resources: + - type: file + path: /resources_test/common/task_metadata/dataset_info.yaml + dest: test_file.yaml +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ purrr, yaml, rlang, processx ] + - type: nextflow + directives: + label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/get_dataset_info/script.R b/src/common/process_task_results/get_dataset_info/script.R new file mode 100644 index 0000000000..a2c5317c05 --- /dev/null +++ b/src/common/process_task_results/get_dataset_info/script.R @@ -0,0 +1,54 @@ +requireNamespace("jsonlite", quietly = TRUE) +requireNamespace("yaml", quietly = TRUE) +library(purrr, warn.conflicts = FALSE) +library(rlang, warn.conflicts = FALSE) + +## VIASH START +par <- list( + input = "output/label_projection/dataset_uns.yaml", + output = "output/dataset_info.json" +) +## VIASH END + +datasets <- yaml::yaml.load_file(par$input) + +# transform into format expected by website +outputs <- map(datasets, function(dataset) { + # ↑ the 'dataset' object could be used as the new format + + # TODO: it'd be nice if the s3 path 
was also included in the dataset info + + # construct v1 format + out <- list( + "task_id" = par$task_id, + "dataset_id" = dataset$dataset_id, + "dataset_name" = dataset$dataset_name, + "dataset_summary" = dataset$dataset_summary, + "dataset_description" = dataset$dataset_description %||% NA_character_, + "data_reference" = dataset$dataset_reference %||% NA_character_, + "data_url" = dataset$dataset_url %||% NA_character_, + "date_created" = dataset$date_created %||% NA_character_, + "file_size" = dataset$file_size %||% NA_character_ + ) + + if (!is.null(dataset[["common_dataset_id"]])) { + out[["common_dataset_id"]] <- dataset[["common_dataset_id"]] + } + + # show warning when certain data is missing and return null? + for (n in names(out)) { + if (is.null(out[[n]])) { + out_as_str <- jsonlite::toJSON(out, auto_unbox = TRUE, pretty = TRUE) + stop("missing value for value '", n, "' in ", out_as_str) + } + } + + out +}) + +jsonlite::write_json( + outputs, + par$output, + auto_unbox = TRUE, + pretty = TRUE +) diff --git a/src/common/process_task_results/get_method_info/config.vsh.yaml b/src/common/process_task_results/get_method_info/config.vsh.yaml new file mode 100644 index 0000000000..053bbac53c --- /dev/null +++ b/src/common/process_task_results/get_method_info/config.vsh.yaml @@ -0,0 +1,20 @@ +__merge__: ../api/get_info.yaml +functionality: + name: "get_method_info" + description: "Extract method info" + resources: + - type: r_script + path: script.R + test_resources: + - type: file + path: /resources_test/common/task_metadata/method_configs.yaml + dest: test_file.yaml +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ purrr, yaml, rlang, processx ] + - type: nextflow + directives: + label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/get_method_info/script.R b/src/common/process_task_results/get_method_info/script.R new file mode 100644 index 0000000000..a332413b69 --- /dev/null +++ 
b/src/common/process_task_results/get_method_info/script.R @@ -0,0 +1,76 @@ +requireNamespace("jsonlite", quietly = TRUE) +requireNamespace("yaml", quietly = TRUE) +library(purrr, warn.conflicts = FALSE) +library(rlang, warn.conflicts = FALSE) + +## VIASH START +par <- list( + input = "output/temp/method_configs.yaml", + output = "output/test/method_info.json" +) +## VIASH END + +configs <- yaml::yaml.load_file(par$input) + +outputs <- map(configs, function(config) { + if (length(config$functionality$status) > 0 && config$functionality$status == "disabled") { + return(NULL) + } + + # prep for viash 0.9.0 + build_info <- config$build_info %||% config$info + if ("functionality" %in% names(config)) { + config[names(config$functionality)] <- config$functionality + config[["functionality"]] <- NULL + } + + info <- config$info + + # add extra info + info$config_path <- gsub(".*/src/", "src/", build_info$config) + info$task_id <- gsub("/.*", "", config$namespace) + info$id <- config$name + info$namespace <- config$namespace + info$commit_sha <- build_info$git_commit %||% "missing-sha" + info$code_version <- "missing-version" + info$implementation_url <- paste0( + build_info$git_remote, "/blob/", + build_info$git_commit, "/", + info$config_path + ) + + # ↑ this could be used as the new format + + # construct v1 format + out <- list( + task_id = info$task_id, + method_id = info$id, + method_name = info$label, + method_summary = info$summary, + method_description = info$description, + is_baseline = grepl("control", info$type), + paper_reference = info$reference %||% NA_character_, + code_url = info$repository_url %||% NA_character_, + implementation_url = info$implementation_url %||% NA_character_, + code_version = NA_character_, + commit_sha = info$commit_sha + ) + + # show warning when certain data is missing and return null? 
+ for (n in names(out)) { + if (is.null(out[[n]])) { + out_as_str <- jsonlite::toJSON(out, auto_unbox = TRUE, pretty = TRUE) + stop("missing value for value '", n, "' in ", out_as_str) + } + } + + # return output + out +}) + +jsonlite::write_json( + outputs, + par$output, + auto_unbox = TRUE, + pretty = TRUE +) \ No newline at end of file diff --git a/src/common/process_task_results/get_metric_info/config.vsh.yaml b/src/common/process_task_results/get_metric_info/config.vsh.yaml new file mode 100644 index 0000000000..ee5833b5b9 --- /dev/null +++ b/src/common/process_task_results/get_metric_info/config.vsh.yaml @@ -0,0 +1,20 @@ +__merge__: ../api/get_info.yaml +functionality: + name: "get_metric_info" + description: "Extract metric info" + resources: + - type: r_script + path: script.R + test_resources: + - type: file + path: /resources_test/common/task_metadata/metric_configs.yaml + dest: test_file.yaml +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ purrr, yaml, rlang, processx ] + - type: nextflow + directives: + label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/get_metric_info/script.R b/src/common/process_task_results/get_metric_info/script.R new file mode 100644 index 0000000000..5ef8f6b04b --- /dev/null +++ b/src/common/process_task_results/get_metric_info/script.R @@ -0,0 +1,81 @@ +requireNamespace("jsonlite", quietly = TRUE) +requireNamespace("yaml", quietly = TRUE) +library(purrr, warn.conflicts = FALSE) +library(rlang, warn.conflicts = FALSE) + +## VIASH START +par <- list( + input = "output/temp/metric_configs.yaml", + output = "output/metric_info.json" +) +## VIASH END + +configs <- yaml::yaml.load_file(par$input) + +outputs <- map(configs, function(config) { + if (length(config$functionality$status) > 0 && config$functionality$status == "disabled") { + return(NULL) + } + + # prep for viash 0.9.0 + build_info <- config$build_info %||% config$info + if ("functionality" %in% 
names(config)) { + config[names(config$functionality)] <- config$functionality + config[["functionality"]] <- NULL + } + + map( + config$info$metrics, + function(info) { + # add extra info + info$config_path <- gsub(".*/src/", "src/", build_info$config) + info$task_id <- gsub("/.*", "", config$namespace) + info$id <- info$name + info$component_id <- config$name + info$namespace <- config$namespace + info$commit_sha <- build_info$git_commit %||% "missing-sha" + info$code_version <- "missing-version" + info$implementation_url <- paste0( + build_info$git_remote, "/blob/", + build_info$git_commit, "/", + info$config_path + ) + + # ↑ this could be used as the new format + + # construct v1 format + out <- list( + task_id = info$task_id, + metric_id = info$id, + metric_name = info$label, + metric_summary = info$summary, + metric_description = info$description, + paper_reference = info$reference %||% NA_character_, + implementation_url = info$implementation_url %||% NA_character_, + code_version = NA_character_, + commit_sha = info$commit_sha, + maximize = info$maximize + ) + + # show warning when certain data is missing and return null? 
+ for (n in names(out)) { + if (is.null(out[[n]])) { + out_as_str <- jsonlite::toJSON(out, auto_unbox = TRUE, pretty = TRUE) + stop("missing value for value '", n, "' in ", out_as_str) + } + } + + # return output + out + } + ) +}) + +outputs <- unlist(outputs, recursive = FALSE) + +jsonlite::write_json( + outputs, + par$output, + auto_unbox = TRUE, + pretty = TRUE +) \ No newline at end of file diff --git a/src/common/process_task_results/get_results/config.vsh.yaml b/src/common/process_task_results/get_results/config.vsh.yaml new file mode 100644 index 0000000000..cd639fad4d --- /dev/null +++ b/src/common/process_task_results/get_results/config.vsh.yaml @@ -0,0 +1,51 @@ +functionality: + name: "get_results" + description: "Extract execution info" + namespace: common/process_task_results + arguments: + - name: "--task_id" + type: "string" + example: "batch_integration" + description: "Task id" + - name: "--input_scores" + type: "file" + example: score_uns.yaml + description: "Scores file" + - name: "--input_execution" + type: "file" + example: trace.txt + description: "Nextflow log file" + - name: "--input_dataset_info" + type: "file" + example: dataset_info.json + description: "Method info file" + - name: "--input_method_info" + type: "file" + example: method_info.json + description: "Method info file" + - name: "--input_metric_info" + type: "file" + example: metric_info.json + description: "Metric info file" + - name: "--output_results" + type: "file" + direction: "output" + default: "results.json" + description: "Output json" + - name: "--output_metric_execution_info" + type: "file" + direction: "output" + default: "metric_execution_info.json" + description: "Output metric execution info" + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ purrr, yaml, rlang, dplyr, tidyr, readr, lubridate, dynutils, processx ] + - type: nextflow + directives: + label: [lowmem, lowtime, 
lowcpu] diff --git a/src/common/process_task_results/get_results/script.R b/src/common/process_task_results/get_results/script.R new file mode 100644 index 0000000000..822562aa18 --- /dev/null +++ b/src/common/process_task_results/get_results/script.R @@ -0,0 +1,237 @@ +requireNamespace("jsonlite", quietly = TRUE) +requireNamespace("yaml", quietly = TRUE) +requireNamespace("dynutils", quietly = TRUE) +requireNamespace("readr", quietly = TRUE) +requireNamespace("lubridate", quietly = TRUE) +library(dplyr, warn.conflicts = FALSE) +library(tidyr, warn.conflicts = FALSE) +library(purrr, warn.conflicts = FALSE) +library(rlang, warn.conflicts = FALSE) + +## VIASH START +dir <- "work/c1/6660ea0cc6155d7e13fa341d16057b/_viash_par" +par <- list( + task_id = "task_1", + input_scores = paste0(dir, "/input_scores_1/score_uns.yaml"), + input_execution = paste0(dir, "/input_execution_1/trace.txt"), + input_dataset_info = paste0(dir, "/input_dataset_info_1/output.json"), + input_method_info = paste0(dir, "/input_method_info_1/output.json"), + input_metric_info = paste0(dir, "/input_metric_info_1/output.json"), + output_results = "output/results.json", + output_metric_execution_info = "output/metric_execution_info.json" +) +## VIASH END + +# --- helper functions --------------------------------------------------------- +cat("Loading helper functions\n") +parse_exit <- function(x) { + if (is.na(x) || x == "-") { + NA_integer_ + } else { + as.integer(x) + } +} +parse_duration <- function(x) { + if (is.na(x) || x == "-") { + NA_real_ + } else { + as.numeric(lubridate::duration(toupper(x))) + } +} +parse_cpu <- function(x) { + if (is.na(x) || x == "-") { + NA_real_ + } else { + as.numeric(gsub(" *%", "", x)) + } +} +parse_size <- function(x) { + out <- + if (is.na(x) || x == "-") { + NA_integer_ + } else if (grepl("GB", x)) { + as.numeric(gsub(" *GB", "", x)) * 1024 + } else if (grepl("MB", x)) { + as.numeric(gsub(" *MB", "", x)) + } else if (grepl("KB", x)) { + as.numeric(gsub(" *KB", 
"", x)) / 1024 + } else if (grepl("B", x)) { + as.numeric(gsub(" *B", "", x)) / 1024 / 1024 + } else { + NA_integer_ + } + as.integer(ceiling(out)) +} + +# --- read input files --------------------------------------------------------- +cat("Reading input files\n") +# read scores +raw_scores <- + yaml::yaml.load_file(par$input_scores) %>% + map_df(function(x) { + tryCatch({ + as_tibble(as.data.frame( + x[c("dataset_id", "method_id", "metric_ids", "metric_values")] + )) + }, error = function(e) { + message("Encountered error while reading scores: ", e$message) + NULL + }) + }) + +# read metric info +dataset_info <- jsonlite::read_json(par$input_dataset_info, simplifyVector = TRUE) +method_info <- jsonlite::read_json(par$input_method_info, simplifyVector = TRUE) +metric_info <- jsonlite::read_json(par$input_metric_info, simplifyVector = TRUE) + +# --- process scores and execution info ---------------------------------------- +cat("Processing scores and execution info\n") +scale_scores <- function(values, is_control, maximize) { + control_values <- values[is_control & !is.na(values)] + if (length(control_values) < 2) { + return(NA_real_) + } + + min_control_value <- min(control_values) + max_control_value <- max(control_values) + + if (min_control_value == max_control_value) { + return(NA_real_) + } + + scaled <- (values - min_control_value) / (max_control_value - min_control_value) + + if (maximize) { + scaled + } else { + 1 - scaled + } +} +aggregate_scores <- function(scaled_score) { + mean(pmin(1, pmax(0, scaled_score)) %|% 0) +} +scores <- raw_scores %>% + complete( + dataset_id, + method_id, + metric_ids, + fill = list(metric_values = NA_real_) + ) %>% + left_join(method_info %>% select(method_id, is_baseline), by = "method_id") %>% + left_join(metric_info %>% select(metric_ids = metric_id, maximize), by = "metric_ids") %>% + group_by(metric_ids, dataset_id) %>% + mutate(scaled_score = scale_scores(metric_values, is_baseline, maximize[[1]]) %|% 0) %>% + 
group_by(dataset_id, method_id) %>% + summarise( + metric_values = list(as.list(setNames(metric_values, metric_ids))), + scaled_scores = list(as.list(setNames(scaled_score, metric_ids))), + mean_score = aggregate_scores(scaled_score), + .groups = "drop" + ) + +# read nxf log and process the task id +norm_methods <- "/log_cp10k|/log_cpm|/sqrt_cp10k|/sqrt_cpm|/l1_sqrt|/log_scran_pooling" +id_regex <- paste0("^.*:(.*)_process \\(([^\\.]*)(", norm_methods, ")?(.[^\\.]*)?\\.(.*)\\)$") + +trace <- readr::read_tsv(par$input_execution) %>% + mutate( + id = name, + process_id = stringr::str_extract(id, id_regex, 1L), + dataset_id = stringr::str_extract(id, id_regex, 2L), + normalization_id = gsub("^/", "", stringr::str_extract(id, id_regex, 3L)), + grp4 = gsub("^\\.", "", stringr::str_extract(id, id_regex, 4L)), + grp5 = stringr::str_extract(id, id_regex, 5L), + submit = strptime(submit, "%Y-%m-%d %H:%M:%S"), + ) %>% + # detect whether entry is a metric or a method + mutate( + method_id = ifelse(is.na(grp4), grp5, grp4), + metric_id = ifelse(is.na(grp4), grp4, grp5) + ) %>% + select(-grp4, -grp5) %>% + filter(!is.na(method_id)) %>% + # take last entry for each run + arrange(desc(submit)) %>% + group_by(name) %>% + slice(1) %>% + ungroup() + +# parse values +execution_info <- trace %>% + filter(process_id == method_id) %>% # only keep method entries + rowwise() %>% + transmute( + dataset_id, + normalization_id, + method_id, + resources = list(list( + exit_code = parse_exit(exit), + duration_sec = parse_duration(realtime), + cpu_pct = parse_cpu(`%cpu`), + peak_memory_mb = parse_size(peak_vmem), + disk_read_mb = parse_size(rchar), + disk_write_mb = parse_size(wchar) + )) + ) %>% + ungroup() + +# combine scores with execution info +# fill up missing entries with NAs and 0s +metric_ids <- unique(raw_scores$metric_ids) +rep_names <- function(val) { + setNames( + as.list(rep(val, length(metric_ids))), + metric_ids + ) +} +out <- full_join( + scores, + execution_info, + by = 
c("method_id", "dataset_id") +) %>% + rowwise() %>% + mutate( + task_id = par$task_id, + metric_values = list(metric_values %||% rep_names(NA_real_)), + scaled_scores = list(scaled_scores %||% rep_names(0)), + mean_score = mean_score %|% 0, + ) %>% + ungroup() + + +# --- process metric execution info -------------------------------------------- +cat("Processing metric execution info\n") +metric_execution_info <- trace %>% + filter(process_id == metric_id) %>% # only keep metric entries + rowwise() %>% + transmute( + dataset_id, + normalization_id, + method_id, + metric_id, + resources = list(list( + exit_code = parse_exit(exit), + duration_sec = parse_duration(realtime), + cpu_pct = parse_cpu(`%cpu`), + peak_memory_mb = parse_size(peak_vmem), + disk_read_mb = parse_size(rchar), + disk_write_mb = parse_size(wchar) + )) + ) %>% + ungroup() + +# --- write output files ------------------------------------------------------- +cat("Writing output files\n") +# write output files +jsonlite::write_json( + purrr::transpose(out), + par$output_results, + auto_unbox = TRUE, + pretty = TRUE +) +jsonlite::write_json( + purrr::transpose(metric_execution_info), + par$output_metric_execution_info, + auto_unbox = TRUE, + pretty = TRUE +) diff --git a/src/common/process_task_results/get_task_info/config.vsh.yaml b/src/common/process_task_results/get_task_info/config.vsh.yaml new file mode 100644 index 0000000000..2e8fbd2b66 --- /dev/null +++ b/src/common/process_task_results/get_task_info/config.vsh.yaml @@ -0,0 +1,20 @@ +__merge__: ../api/get_info.yaml +functionality: + name: "get_task_info" + description: "Extract task info" + resources: + - type: r_script + path: script.R + test_resources: + - type: file + path: /resources_test/common/task_metadata/task_info.yaml + dest: test_file.yaml +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ purrr, yaml, rlang, processx ] + - type: nextflow + directives: + label: [lowmem, lowtime, lowcpu] diff 
--git a/src/common/process_task_results/get_task_info/script.R b/src/common/process_task_results/get_task_info/script.R new file mode 100644 index 0000000000..cfe529edfc --- /dev/null +++ b/src/common/process_task_results/get_task_info/script.R @@ -0,0 +1,40 @@ +requireNamespace("jsonlite", quietly = TRUE) +requireNamespace("yaml", quietly = TRUE) +library(purrr, warn.conflicts = FALSE) +library(rlang, warn.conflicts = FALSE) + +## VIASH START +par <- list( + input = "output/temp/task_info.yaml", + output = "output/test/task_info.json" +) +## VIASH END + +info <- yaml::yaml.load_file(par$input) +# ↑ this could be used as the new format + +# construct v1 format +out <- list( + task_id = info$name, + commit_sha = NA_character_, + task_name = info$label, + task_summary = info$summary, + task_description = paste0(info$motivation, "\n\n", info$description), + repo = "openproblems-bio/openproblems-v2", + authors = info$authors +) + +# show warning when certain data is missing and return null? +for (n in names(out)) { + if (is.null(out[[n]])) { + out_as_str <- jsonlite::toJSON(out, auto_unbox = TRUE, pretty = TRUE) + stop("missing value for value '", n, "' in ", out_as_str) + } +} + +jsonlite::write_json( + out, + par$output, + auto_unbox = TRUE, + pretty = TRUE +) diff --git a/src/common/process_task_results/run/config.vsh.yaml b/src/common/process_task_results/run/config.vsh.yaml new file mode 100644 index 0000000000..d746a54245 --- /dev/null +++ b/src/common/process_task_results/run/config.vsh.yaml @@ -0,0 +1,91 @@ +functionality: + name: run + namespace: common/process_task_results + description: >- + This workflow transforms the meta information of the results into a format + that can be used by the website. 
+ argument_groups: + - name: Inputs + arguments: + - name: "--input_scores" + type: file + required: true + direction: input + description: A yaml file containing the scores of each of the methods + example: score_uns.yaml + - name: "--input_method_configs" + type: file + required: true + direction: input + example: method_configs.yaml + - name: "--input_metric_configs" + type: file + required: true + direction: input + example: metric_configs.yaml + - name: "--input_dataset_info" + type: file + required: true + direction: input + example: dataset_info.yaml + - name: "--input_execution" + type: file + required: true + direction: input + example: trace.txt + - name: "--input_task_info" + type: file + required: true + direction: input + example: task_info.yaml + - name: Outputs + arguments: + - name: "--output_scores" + type: file + required: true + direction: output + description: A yaml file containing the scores of each of the methods + default: results.json + - name: "--output_method_info" + type: file + required: true + direction: output + default: method_info.json + - name: "--output_metric_info" + type: file + required: true + direction: output + default: metric_info.json + - name: "--output_dataset_info" + type: file + required: true + direction: output + default: dataset_info.json + - name: "--output_task_info" + type: file + required: true + direction: output + default: task_info.json + - name: "--output_qc" + type: file + required: true + direction: output + default: quality_control.json + - name: "--output_metric_execution_info" + type: file + required: true + direction: output + default: metric_execution_info.json + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + dependencies: + - name: common/process_task_results/get_results + - name: common/process_task_results/get_method_info + - name: common/process_task_results/get_metric_info + - name: common/process_task_results/get_dataset_info + - name: 
common/process_task_results/get_task_info + - name: common/process_task_results/generate_qc +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/common/process_task_results/run/main.nf b/src/common/process_task_results/run/main.nf new file mode 100644 index 0000000000..dadbcfa1f6 --- /dev/null +++ b/src/common/process_task_results/run/main.nf @@ -0,0 +1,91 @@ +// workflow auto { +// findStates(params, meta.config) +// | meta.workflow.run( +// auto: [publish: "state"] +// ) +// } + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + | get_task_info.run( + key: "task_info", + fromState: [ + "input": "input_task_info" + ], + toState: ["output_task": "output"] + ) + + // extract task id from task info + | map { id, state -> + def task_id = readJson(state.output_task).task_id + [id, state + ["task_id": task_id]] + } + + | get_method_info.run( + fromState: [ + "input": "input_method_configs", + "task_id" : "task_id" + ], + toState: ["output_method": "output"] + ) + + | get_metric_info.run( + fromState: [ + "input": "input_metric_configs", + "task_id" : "task_id" + ], + toState: ["output_metric": "output"] + ) + + | get_dataset_info.run( + fromState: [ + "task_id" : "task_id", + "input": "input_dataset_info", + ], + toState: ["output_dataset": "output"] + ) + + | get_results.run( + fromState: [ + "task_id": "task_id", + "input_scores": "input_scores", + "input_execution": "input_execution", + "input_dataset_info": "output_dataset", + "input_method_info": "output_method", + "input_metric_info": "output_metric" + ], + toState: [ + "output_results": "output_results", + "output_metric_execution_info": "output_metric_execution_info" + ] + ) + + | generate_qc.run( + fromState: [ + "task_info": "output_task", + "method_info": "output_method", + "metric_info": "output_metric", + "dataset_info": "output_dataset", + "results": "output_results" + ], + toState: ["output_qc": "output"] + ) + + | setState([ + "output_scores": 
"output_results", + "output_method_info": "output_method", + "output_metric_info": "output_metric", + "output_dataset_info": "output_dataset", + "output_task_info": "output_task", + "output_qc": "output_qc", + "output_metric_execution_info": "output_metric_execution_info" + ]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/common/process_task_results/run/run_nf_tower_test.sh b/src/common/process_task_results/run/run_nf_tower_test.sh new file mode 100644 index 0000000000..95fa080f12 --- /dev/null +++ b/src/common/process_task_results/run/run_nf_tower_test.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +DATASETS_DIR="s3://openproblems-data/resources/batch_integration/results/" + +# try running on nf tower +cat > /tmp/params.yaml << 'HERE' +id: batch_integration_transform +input_scores: "$DATASETS_DIR/scores.yaml" +input_dataset_info: "$DATASETS_DIR/dataset_info.yaml" +input_method_configs: "$DATASETS_DIR/method_configs.yaml" +input_metric_configs: "$DATASETS_DIR/metric_configs.yaml" +input_execution: "$DATASETS_DIR/trace.txt" +input_task_info: "$DATASETS_DIR/task_info.yaml" +task_id: "batch_integration" +output_scores: "results.json" +output_method_info: "method_info.json" +output_metric_info: "metric_info.json" +output_dataset_info: "dataset_info.json" +output_task_info: "task_info.json" +publish_dir: $DATASETS_DIR +HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' +} + + +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/common/workflows/transform_meta/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --config /tmp/nextflow.config \ No newline at end of file diff --git a/src/common/process_task_results/run/run_test.sh b/src/common/process_task_results/run/run_test.sh new file mode 100755 index 0000000000..762785b754 --- /dev/null +++ 
b/src/common/process_task_results/run/run_test.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# fail on error +set -e + +# ensure we're in the root of the repo +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "$REPO_ROOT" + +for TASK in "denoising" "dimensionality_reduction" "batch_integration" "label_projection" "match_modalities" "predict_modality"; do +# for TASK in "label_projection"; do + BASE_DIR="s3://openproblems-data/resources/$TASK/results/" + + # find subdir in bucket with latest date + DATE=$(aws s3 ls $BASE_DIR --recursive | awk '{print $4}' | grep 'task_info.yaml' | sort -r | head -n 1 | sed 's#.*/run_\(.*\)/[^/]*$#\1#') + + INPUT_DIR="$BASE_DIR/run_$DATE" + OUTPUT_DIR="../website/results/$TASK/data" + + # # temp sync + # aws s3 sync $INPUT_DIR output/temp + + echo "Processing $TASK - $DATE" + + # start the run + NXF_VER=23.10.0 nextflow run . \ + -main-script target/nextflow/common/process_task_results/run/main.nf \ + -profile docker \ + -resume \ + -c src/wf_utils/labels_ci.config \ + --id "process" \ + --input_scores "$INPUT_DIR/score_uns.yaml" \ + --input_dataset_info "$INPUT_DIR/dataset_uns.yaml" \ + --input_method_configs "$INPUT_DIR/method_configs.yaml" \ + --input_metric_configs "$INPUT_DIR/metric_configs.yaml" \ + --input_execution "$INPUT_DIR/trace.txt" \ + --input_task_info "$INPUT_DIR/task_info.yaml" \ + --output_state "state.yaml" \ + --publish_dir "$OUTPUT_DIR" + + # cause quarto rerender to index page when in preview mode + touch ../website/results/$TASK/index.qmd +done \ No newline at end of file diff --git a/src/common/process_task_results/yaml_to_json/config.vsh.yaml b/src/common/process_task_results/yaml_to_json/config.vsh.yaml new file mode 100644 index 0000000000..7231cdcdbf --- /dev/null +++ b/src/common/process_task_results/yaml_to_json/config.vsh.yaml @@ -0,0 +1,16 @@ +__merge__: ../api/get_info.yaml +functionality: + name: "yaml_to_json" + description: "convert yaml file to json file" + resources: + - type: python_script + path: script.py 
+ test_resources: + - type: file + path: /resources_test/common/task_metadata/dataset_info.yaml + dest: test_file.yaml +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + - type: native diff --git a/src/common/process_task_results/yaml_to_json/script.py b/src/common/process_task_results/yaml_to_json/script.py new file mode 100644 index 0000000000..45f6374515 --- /dev/null +++ b/src/common/process_task_results/yaml_to_json/script.py @@ -0,0 +1,16 @@ +import yaml +import json + +## VIASH START +par = { + "input": ".", + "task_id": "denoising", + "output": "output/task.json", +} +## VIASH END + +with open(par["input"], "r") as f: + yaml_file = yaml.safe_load(f) + +with open(par["output"], "w") as out: + json.dump(yaml_file, out, indent=2) diff --git a/src/common/resources_test_scripts/aws_sync.sh b/src/common/resources_test_scripts/aws_sync.sh new file mode 100644 index 0000000000..0541df125a --- /dev/null +++ b/src/common/resources_test_scripts/aws_sync.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +echo "Run the command in this script manually" +exit 1 + +aws s3 sync "resources_test" "s3://openproblems-data/resources_test" --exclude "*/temp*" --exclude "*/tmp*" --delete --dryrun +aws s3 sync "resources" "s3://openproblems-data/resources" --exclude */temp_* --delete --dryrun diff --git a/src/common/resources_test_scripts/task_metadata.sh b/src/common/resources_test_scripts/task_metadata.sh new file mode 100755 index 0000000000..cd9072f443 --- /dev/null +++ b/src/common/resources_test_scripts/task_metadata.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +# make sure the following command has been executed +# viash ns build -q 'common' + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +DATASETS_DIR="resources_test/batch_integration" +OUTPUT_DIR="resources_test/common/task_metadata" + + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# Create small git sha input file (heredoc below is written to $sha_file, terminated by EOT) +sha_file="$OUTPUT_DIR/input_git_sha.json" + +cat > "$sha_file" << EOT +[ + { + "path": "tasks/denoising/README.md", + "last_modified": "2022-09-20 14:26:51 -0400", + "sha": "3fe9251ba906061b6769eed2ac9da0db5f8e26bb" + }, + { + "path": "tasks/denoising/__init__.py", + "last_modified": "2022-09-30 14:49:17 +0200", + "sha": "c97decf07adb2e3050561d6fa9ae46132be07bef" + }, + { + "path": "tasks/denoising/api.py", + "last_modified": "2022-10-21 13:56:15 -0400", + "sha": "b460ecb183328c857cbbf653488f522a4034a61c" + }, + { + "path": "tasks/denoising/datasets/__init__.py", + "last_modified": "2022-11-23 10:32:02 -0500", + "sha": "725ff0c46140aaa6bbded68646256f64bc63df6d" + }, + { + "path": "tasks/denoising/datasets/pancreas.py", + "last_modified": "2022-12-04 12:06:43 -0500", + "sha": "4bb8a7e04545a06c336d3d9364a1dd84fa2af1a4" + }, + { + "path": "tasks/denoising/datasets/pbmc.py", + "last_modified": "2022-12-04 12:06:43 -0500", + "sha": "4bb8a7e04545a06c336d3d9364a1dd84fa2af1a4" + }, + { + "path": "tasks/denoising/datasets/tabula_muris_senis.py", + "last_modified": "2022-12-04 12:06:43 -0500", + "sha": "4bb8a7e04545a06c336d3d9364a1dd84fa2af1a4" + }, + { + "path": "tasks/denoising/datasets/utils.py", + "last_modified": "2022-11-15 17:19:16 -0500", + "sha": "c2470ce02e6f196267cec1c554ba7ae389c0956a" + }, + { + "path": "tasks/denoising/methods/__init__.py", + "last_modified": "2022-10-21 13:56:15 -0400", + "sha": "b460ecb183328c857cbbf653488f522a4034a61c" + }, + { + "path": "tasks/denoising/methods/alra.R", + "last_modified": "2022-05-16 15:10:42 -0400", + "sha": "ba06cf71b564eb23823a662341055dc5ac2be231" + }, + { + "path": "tasks/denoising/methods/alra.py", + "last_modified": "2022-07-25 12:29:34 -0400", + "sha": "411a416150ecabce25e1f59bde422a029d0a8baa" + }, + { + "path": "tasks/denoising/methods/baseline.py", + "last_modified": "2022-10-21 13:56:15 -0400", + "sha": 
"b460ecb183328c857cbbf653488f522a4034a61c" + }, + { + "path": "tasks/denoising/methods/dca.py", + "last_modified": "2022-12-01 15:38:21 -0500", + "sha": "aa2253779e9aa9cd178f54ac0f3b6ba521ecd59f" + }, + { + "path": "tasks/denoising/methods/knn_smoothing.py", + "last_modified": "2022-11-14 11:54:15 -0500", + "sha": "bbecf4e9ad90007c2711394e7fbd8e49cbd3e4a1" + }, + { + "path": "tasks/denoising/methods/magic.py", + "last_modified": "2022-11-14 11:57:35 -0500", + "sha": "2af9a4918ed3370859f71774558068961f6d22c6" + }, + { + "path": "tasks/denoising/metrics/__init__.py", + "last_modified": "2021-01-19 13:31:20 -0500", + "sha": "8e0600c516c392fa747137415b6a93b8af0f61d8" + }, + { + "path": "tasks/denoising/metrics/mse.py", + "last_modified": "2022-11-15 17:19:16 -0500", + "sha": "c2470ce02e6f196267cec1c554ba7ae389c0956a" + }, + { + "path": "tasks/denoising/metrics/poisson.py", + "last_modified": "2022-12-04 12:06:43 -0500", + "sha": "4bb8a7e04545a06c336d3d9364a1dd84fa2af1a4" + } +] +EOT + +# Create all metadata +export NXF_VER=22.04.5 + +nextflow run . 
\ + -main-script target/nextflow/batch_integration/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -c src/wf_utils/labels_ci.config \ + -with-trace \ + -entry auto \ + --input_states "$DATASETS_DIR/pancreas/state.yaml" \ + --rename_keys 'input_dataset:output_dataset,input_solution:output_solution' \ + --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml", "method_ids": ["bbknn", "mnnpy", "mnnr"]}' \ + --publish_dir "$OUTPUT_DIR" \ + --output_state "state.yaml" + +cp trace.txt "$OUTPUT_DIR/trace.txt" + + +viash run src/common/process_task_results/get_method_info/config.vsh.yaml -- --input "$OUTPUT_DIR/method_configs.yaml" --output "$OUTPUT_DIR/method_info.json" diff --git a/src/common/schemas/api_component.yaml b/src/common/schemas/api_component.yaml new file mode 100644 index 0000000000..b197e2e367 --- /dev/null +++ b/src/common/schemas/api_component.yaml @@ -0,0 +1,67 @@ +title: Component API +description: | + A component type specification file. +type: object +required: [functionality] +properties: + functionality: + type: object + description: Information regarding the functionality of the component. + required: [namespace, info, arguments, test_resources] + additionalProperties: false + properties: + namespace: + "$ref": "defs_common.yaml#/definitions/Namespace" + info: + type: object + description: Metadata of the component. + additionalProperties: false + required: [type, type_info] + properties: + type: + "$ref": "defs_common.yaml#/definitions/ComponentType" + subtype: + "$ref": "defs_common.yaml#/definitions/ComponentSubtype" + type_info: + type: object + description: Metadata related to the component type. 
+ required: [label, summary, description] + properties: + label: + $ref: "defs_common.yaml#/definitions/Label" + summary: + $ref: "defs_common.yaml#/definitions/Summary" + description: + $ref: "defs_common.yaml#/definitions/Description" + arguments: + type: array + description: Component-specific parameters. + items: + anyOf: + - $ref: 'defs_common.yaml#/definitions/ComponentAPIFile' + - $ref: 'defs_viash.yaml#/definitions/BooleanArgument' + - $ref: 'defs_viash.yaml#/definitions/BooleanArgument' + - $ref: 'defs_viash.yaml#/definitions/BooleanTrueArgument' + - $ref: 'defs_viash.yaml#/definitions/BooleanFalseArgument' + - $ref: 'defs_viash.yaml#/definitions/DoubleArgument' + - $ref: 'defs_viash.yaml#/definitions/IntegerArgument' + - $ref: 'defs_viash.yaml#/definitions/LongArgument' + - $ref: 'defs_viash.yaml#/definitions/StringArgument' + resources: + type: array + description: Resources required to run the component. + items: + "$ref": "defs_viash.yaml#/definitions/Resource" + test_resources: + type: array + description: One or more scripts and resources used to test the component. + items: + "$ref": "defs_viash.yaml#/definitions/Resource" + platforms: + type: array + description: A list of platforms which Viash generates target artifacts for. + items: + anyOf: + - "$ref": "defs_common.yaml#/definitions/PlatformDocker" + - "$ref": "defs_common.yaml#/definitions/PlatformNative" + - "$ref": "defs_common.yaml#/definitions/PlatformVdsl3" diff --git a/src/common/schemas/api_file.yaml b/src/common/schemas/api_file.yaml new file mode 100644 index 0000000000..6294439eda --- /dev/null +++ b/src/common/schemas/api_file.yaml @@ -0,0 +1,26 @@ +title: File API +description: A file format specification file. +type: object +additionalProperties: false +required: [type, example, info] +properties: + type: + const: file + example: + description: A file in the `resources_test` folder which is an example of this file format. 
+ type: string + __merge__: + $ref: "defs_common.yaml#/definitions/Merge" + info: + description: 'Structured information. Can be any shape: a string, vector, map or even nested map.' + type: object + required: [label, summary] + properties: + label: + $ref: "defs_common.yaml#/definitions/Label" + summary: + $ref: "defs_common.yaml#/definitions/Summary" + description: + $ref: "defs_common.yaml#/definitions/Description" + slots: + $ref: "defs_common.yaml#/definitions/AnnDataSlots" diff --git a/src/common/schemas/defs_common.yaml b/src/common/schemas/defs_common.yaml new file mode 100644 index 0000000000..8451cf5c52 --- /dev/null +++ b/src/common/schemas/defs_common.yaml @@ -0,0 +1,276 @@ +definitions: + PlatformVdsl3: + title: VDSL3 + description: Next-gen platform for generating NextFlow VDSL3 modules. + properties: + type: + const: nextflow + description: Next-gen platform for generating NextFlow VDSL3 modules. + directives: + $ref: 'defs_viash.yaml#/definitions/NextflowDirectives' + required: [ type ] + additionalProperties: false + PlatformDocker: + title: Docker platform + description: | + Run a Viash component on a Docker backend platform. + By specifying which dependencies your component needs, users are able to build + a docker container from scratch using the setup flag, or pull it from a docker repository. + type: object + properties: + type: + const: docker + description: Run a Viash component on a Docker backend platform. + image: + type: string + description: The base container to start from. You can also add the tag here + if you wish. + run_args: + anyOf: + - type: string + description: Add docker run arguments. + - type: array + items: + type: string + description: Add docker run arguments. + target_image_source: + type: string + description: The source of the target image. This is used for defining labels + in the dockerfile. 
+ setup: + type: array + items: + "$ref": "defs_viash.yaml#/definitions/Requirements" + test_setup: + type: array + items: + "$ref": "defs_viash.yaml#/definitions/Requirements" + required: [type, image] + additionalProperties: false + PlatformNative: + title: Native platform + type: object + properties: + type: + const: native + description: Specifies the type of the platform. Running a Viash component + on a native platform means that the script will be executed in your current + environment. + required: [ type ] + additionalProperties: false + PreferredNormalization: + enum: [l1_sqrt, log_cpm, log_cp10k, log_scran_pooling, sqrt_cpm, sqrt_cp10k, counts] + description: | + Which normalization method a component prefers. + + Each value corresponds to a normalization component in the directory `src/datasets/normalization`. + ComponentSubtype: + type: string + description: | + A component subtype, in case the task has multiple subtypes of methods and metrics. + ComponentType: + type: string + description: | + A component type, for example a method or a metric. + Name: + type: string + description: | + A unique identifier. Can only contain lowercase letters, numbers or underscores. + pattern: "^[a-z_][a-z0-9_]*$" + maxLength: 50 + Namespace: + type: string + description: | + The namespace a component is part of. + pattern: "^[a-z_][a-z0-9_/]*$" + Label: + type: string + description: | + A unique, human-readable, short label. Used for creating summary tables and visualisations. + maxLength: 50 + Image: + type: string + description: | + The name of the image file to use for the component on the website. + Summary: + type: string + description: | + A one sentence summary of purpose and methodology. Used for creating overview tables. + minLength: 15 + maxLength: 180 + Description: + type: string + description: | + A longer description (one or more paragraphs). Used for creating reference documentation and supplementary information. 
+ minLength: 30 + BibtexReference: + type: string + description: | + A bibtex reference key to the paper where the component is described. + DocumentationURL: + type: string + format: uri + pattern: "^https://" + description: The url to the documentation of the used software library. + RepositoryURL: + type: string + format: uri + pattern: "^https://" + description: The url to the repository of the used software library. + MigrationV1: + type: object + required: [path, commit] + properties: + additionalProperties: false + path: + type: string + description: | + If this component was migrated from the OpenProblems v1 repository, this value + represents the location of the Python file relative to the root of the repository. + commit: + type: string + description: | + If this component was migrated from the OpenProblems v1 repository, this value + is the Git commit SHA of the v1 repository corresponding to when this component + was last updated. + note: + type: string + description: | + An optional note on any changes made during the migration. + MethodVariants: + type: object + description: Alternative parameter sets which should be evaluated in the benchmark. + properties: + preferred_normalization: + "$ref": "#/definitions/PreferredNormalization" + CompAPIMerge: + type: string + description: | + The API specifies which type of component this is. + It contains specifications for: + + - The input/output files + - Common parameters + - A unit test + Merge: + type: string + description: | + Another YAML to inherit values from. + ComponentAPIFile: + description: A `file` type argument has a string value that points to a file or folder path. + type: object + properties: + name: + description: "The name of the argument. Can be in the formats `--foo`, `-f` or `foo`. 
The number of dashes determines how values can be passed: \n\n - `--foo` is a long option, which can be passed with `executable_name --foo=value` or `executable_name --foo value`\n - `-f` is a short option, which can be passed with `executable_name -f value`\n - `foo` is an argument, which can be passed with `executable_name value` \n" + type: string + __merge__: + type: string + description: The file format specification file. + direction: + description: Makes this argument an `input` or an `output`, as in does the file/folder needs to be read or written. `input` by default. + $ref: 'defs_viash.yaml#/definitions/Direction' + info: + description: 'Structured information. Can be any shape: a string, vector, map or even nested map.' + type: object + required: + description: Make the value for this argument required. If set to `true`, an error will be produced if no value was provided. `false` by default. + type: boolean + required: [name, __merge__, direction, required] + additionalProperties: false + AnnDataSlots: + properties: + X: + $ref: "#/definitions/AnnDataSlot" + layers: + type: array + items: + $ref: "#/definitions/AnnDataSlot" + var: + type: array + items: + $ref: "#/definitions/AnnDataSlot" + varm: + type: array + items: + $ref: "#/definitions/AnnDataSlot" + varp: + type: array + items: + $ref: "#/definitions/AnnDataSlot" + obs: + type: array + items: + $ref: "#/definitions/AnnDataSlot" + obsm: + type: array + items: + $ref: "#/definitions/AnnDataSlot" + obsp: + type: array + items: + $ref: "#/definitions/AnnDataSlot" + uns: + type: array + items: + oneOf: + - $ref: "#/definitions/AnnDataSlot" + - $ref: "#/definitions/AnnDataSlotObject" + AnnDataSlot: + properties: + type: + enum: [integer, double, string, boolean] + name: + type: string + description: A unique identifier. 
+ pattern: "^[a-zA-Z_][a-zA-Z0-9_]*$" + description: + type: string + required: + type: boolean + required: [type, name, description, required] + AnnDataSlotObject: + properties: + type: + enum: [object] + name: + type: string + description: A unique identifier. + pattern: "^[a-zA-Z_][a-zA-Z0-9_]*$" + description: + type: string + required: + type: boolean + required: [type, name, description, required] + Author: + description: Author metadata. + type: object + additionalProperties: false + properties: + name: + description: Full name of the author, usually in the name of FirstName MiddleName LastName. + type: string + info: + description: Additional information on the author + type: object + additionalProperties: false + properties: + github: + type: string + orcid: + type: string + email: + type: string + twitter: + type: string + linkedin: + type: string + roles: + description: | + Role of the author. Possible values: + + * `"author"`: Authors who have made substantial contributions to the component. + * `"maintainer"`: The maintainer of the component. + * `"contributor"`: Authors who have made smaller contributions (such as code patches etc.). + type: array + items: + enum: [maintainer, author, contributor] \ No newline at end of file diff --git a/src/common/schemas/defs_viash.yaml b/src/common/schemas/defs_viash.yaml new file mode 100644 index 0000000000..fff25ab382 --- /dev/null +++ b/src/common/schemas/defs_viash.yaml @@ -0,0 +1,2252 @@ +$schema: "https://json-schema.org/draft-07/schema#" +title: Viash config schema definitions. +oneOf: + - $ref: "#/definitions/Config" +definitions: + Config: + description: "A Viash Config" + properties: + functionality: + description: "The functionality-part of the config file describes the behaviour\ + \ of the script in terms of arguments and resources.\nBy specifying a few restrictions\ + \ (e.g. 
mandatory arguments) and adding some descriptions, Viash will automatically\ + \ generate a stylish command-line interface for you.\n" + $ref: "#/definitions/Functionality" + platforms: + description: "Definition of the platforms" + type: "array" + items: + $ref: "#/definitions/Platforms" + info: + description: "Definition of meta data" + $ref: "#/definitions/Info" + required: + - "functionality" + additionalProperties: false + NativePlatform: + description: "Running a Viash component on a native platform means that the script\ + \ will be executed in your current environment.\nAny dependencies are assumed\ + \ to have been installed by the user, so the native platform is meant for developers\ + \ (who know what they're doing) or for simple bash scripts (which have no extra\ + \ dependencies).\n" + type: "object" + properties: + id: + description: "As with all platforms, you can give a platform a different name.\ + \ By specifying `id: foo`, you can target this platform (only) by specifying\ + \ `-p foo` in any of the Viash commands." + type: "string" + type: + description: "Running a Viash component on a native platform means that the\ + \ script will be executed in your current environment.\nAny dependencies\ + \ are assumed to have been installed by the user, so the native platform\ + \ is meant for developers (who know what they're doing) or for simple bash\ + \ scripts (which have no extra dependencies).\n" + const: "native" + required: + - "type" + additionalProperties: false + DockerPlatform: + description: "Run a Viash component on a Docker backend platform.\nBy specifying\ + \ which dependencies your component needs, users will be able to build a docker\ + \ container from scratch using the setup flag, or pull it from a docker repository.\n" + type: "object" + properties: + organization: + description: "Name of a container's [organization](https://docs.docker.com/docker-hub/orgs/)." 
+ type: "string" + registry: + description: "The URL to a [custom Docker registry](https://docs.docker.com/registry/)" + type: "string" + image: + description: "The base container to start from. You can also add the tag here\ + \ if you wish." + type: "string" + tag: + description: "Specify a Docker image based on its tag." + type: "string" + target_tag: + description: "The tag the resulting image gets. Advanced usage only." + type: "string" + run_args: + anyOf: + - description: "Add [docker run](https://docs.docker.com/engine/reference/run/)\ + \ arguments." + type: "string" + - description: "Add [docker run](https://docs.docker.com/engine/reference/run/)\ + \ arguments." + type: "array" + items: + type: "string" + namespace_separator: + description: "The separator between the namespace and the name of the component,\ + \ used for determining the image name. Default: `\"/\"`." + type: "string" + resolve_volume: + description: "Enables or disables automatic volume mapping. Enabled when set\ + \ to `Automatic` or disabled when set to `Manual`. Default: `Automatic`." + $ref: "#/definitions/DockerResolveVolume" + port: + anyOf: + - description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." + type: "string" + - description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." 
+ type: "array" + items: + type: "string" + setup: + description: "A list of requirements for installing the following types of\ + \ packages:\n\n - @[apt](apt_req)\n - @[apk](apk_req)\n - @[Docker setup\ + \ instructions](docker_req)\n - @[JavaScript](javascript_req)\n - @[Python](python_req)\n\ + \ - @[R](r_req)\n - @[Ruby](ruby_req)\n - @[yum](yum_req)\n\nThe order in\ + \ which these dependencies are specified determines the order in which they\ + \ will be installed.\n" + type: "array" + items: + $ref: "#/definitions/Requirements" + workdir: + description: "The working directory when starting the container. This doesn't\ + \ change the Dockerfile but gets added as a command-line argument at runtime." + type: "string" + target_image: + description: "If anything is specified in the setup section, running the `---setup`\ + \ will result in an image with the name of `:`. If\ + \ nothing is specified in the `setup` section, simply `image` will be used.\ + \ Advanced usage only." + type: "string" + cmd: + anyOf: + - description: "Set the default command being executed when running the Docker\ + \ container." + type: "string" + - description: "Set the default command being executed when running the Docker\ + \ container." + type: "array" + items: + type: "string" + target_image_source: + description: "The source of the target image. This is used for defining labels\ + \ in the dockerfile." + type: "string" + test_setup: + description: "Additional requirements specific for running unit tests." + type: "array" + items: + $ref: "#/definitions/Requirements" + entrypoint: + anyOf: + - description: "Override the entrypoint of the base container. Default set\ + \ `ENTRYPOINT []`." + type: "string" + - description: "Override the entrypoint of the base container. Default set\ + \ `ENTRYPOINT []`." 
+ type: "array" + items: + type: "string" + id: + description: "As with all platforms, you can give a platform a different name.\ + \ By specifying `id: foo`, you can target this platform (only) by specifying\ + \ `-p foo` in any of the Viash commands." + type: "string" + target_registry: + description: "The URL where the resulting image will be pushed to. Advanced\ + \ usage only." + type: "string" + setup_strategy: + description: "The Docker setup strategy to use when building a container.\n\ + \n| Strategy | Description |\n|-----|----------|\n| `alwaysbuild` / `build`\ + \ / `b` | Always build the image from the dockerfile. This is the default\ + \ setup strategy.\n| `alwayscachedbuild` / `cachedbuild` / `cb` | Always\ + \ build the image from the dockerfile, with caching enabled.\n| `ifneedbebuild`\ + \ | Build the image if it does not exist locally.\n| `ifneedbecachedbuild`\ + \ | Build the image with caching enabled if it does not exist locally, with\ + \ caching enabled.\n| `alwayspull` / `pull` / `p` | Try to pull the container\ + \ from [Docker Hub](https://hub.docker.com) or the @[specified docker registry](docker_registry).\n\ + | `alwayspullelsebuild` / `pullelsebuild` | Try to pull the image from\ + \ a registry and build it if it doesn't exist.\n| `alwayspullelsecachedbuild`\ + \ / `pullelsecachedbuild` | Try to pull the image from a registry and build\ + \ it with caching if it doesn't exist.\n| `ifneedbepull` | If the image\ + \ does not exist locally, pull the image.\n| `ifneedbepullelsebuild` | \ + \ If the image does not exist locally, pull the image. If the image does\ + \ exist, build it.\n| `ifneedbepullelsecachedbuild` | If the image does\ + \ not exist locally, pull the image. 
If the image does exist, build it with\ + \ caching enabled.\n| `push` | Push the container to [Docker Hub](https://hub.docker.com)\ + \ or the @[specified docker registry](docker_registry).\n| `pushifnotpresent`\ + \ | Push the container to [Docker Hub](https://hub.docker.com) or the @[specified\ + \ docker registry](docker_registry) if the @[tag](docker_tag) does not exist\ + \ yet.\n| `donothing` / `meh` | Do not build or pull anything.\n\n" + $ref: "#/definitions/DockerSetupStrategy" + type: + description: "Run a Viash component on a Docker backend platform.\nBy specifying\ + \ which dependencies your component needs, users will be able to build a\ + \ docker container from scratch using the setup flag, or pull it from a\ + \ docker repository.\n" + const: "docker" + target_organization: + description: "The organization set in the resulting image. Advanced usage\ + \ only." + type: "string" + chown: + description: "In Linux, files created by a Docker container will be owned\ + \ by `root`. With `chown: true`, Viash will automatically change the ownership\ + \ of output files (arguments with `type: file` and `direction: output`)\ + \ to the user running the Viash command after execution of the component.\ + \ Default value: `true`." + type: "boolean" + required: + - "image" + - "type" + additionalProperties: false + NextflowVdsl3Platform: + description: "Next-gen platform for generating NextFlow VDSL3 modules." + type: "object" + properties: + auto: + description: "@[Automated processing flags](nextflow_auto) which can be toggled\ + \ on or off:\n\n| Flag | Description | Default |\n|---|---------|----|\n\ + | `simplifyInput` | If `true`, an input tuple only containing only a single\ + \ File (e.g. `[\"foo\", file(\"in.h5ad\")]`) is automatically transformed\ + \ to a map (i.e. `[\"foo\", [ input: file(\"in.h5ad\") ] ]`). | `true` |\n\ + | `simplifyOutput` | If `true`, an output tuple containing a map with a\ + \ File (e.g. 
`[\"foo\", [ output: file(\"out.h5ad\") ] ]`) is automatically\ + \ transformed to a map (i.e. `[\"foo\", file(\"out.h5ad\")]`). | `true`\ + \ |\n| `transcript` | If `true`, the module's transcripts from `work/` are\ + \ automatically published to `params.transcriptDir`. If not defined, `params.publishDir\ + \ + \"/_transcripts\"` will be used. Will throw an error if neither are\ + \ defined. | `false` |\n| `publish` | If `true`, the module's outputs are\ + \ automatically published to `params.publishDir`. Will throw an error if\ + \ `params.publishDir` is not defined. | `false` |\n\n" + $ref: "#/definitions/NextflowAuto" + directives: + description: "@[Directives](nextflow_directives) are optional settings that\ + \ affect the execution of the process. These mostly match up with the Nextflow\ + \ counterparts. \n" + $ref: "#/definitions/NextflowDirectives" + container: + description: "Specifies the Docker platform id to be used to run Nextflow." + type: "string" + debug: + description: "Whether or not to print debug messages." + type: "boolean" + id: + description: "Every platform can be given a specific id that can later be\ + \ referred to explicitly when running or building the Viash component." + type: "string" + type: + description: "Next-gen platform for generating NextFlow VDSL3 modules." + const: "nextflow" + config: + description: "Allows tweaking how the @[Nextflow Config](nextflow_config)\ + \ file is generated." + $ref: "#/definitions/NextflowConfig" + required: + - "type" + additionalProperties: false + Platforms: + anyOf: + - $ref: "#/definitions/NativePlatform" + - $ref: "#/definitions/DockerPlatform" + - $ref: "#/definitions/NextflowVdsl3Platform" + Info: + description: "Meta information fields filled in by Viash during build." + type: "object" + properties: + git_tag: + description: "Git tag." + type: "string" + git_remote: + description: "Git remote name." 
+ type: "string" + viash_version: + description: "The Viash version that was used to build the component." + type: "string" + config: + description: "Path to the config used during build." + type: "string" + output: + description: "Folder path to the build artifacts." + type: "string" + platform: + description: "The platform id used during build." + type: "string" + git_commit: + description: "Git commit hash." + type: "string" + executable: + description: "Output folder with main executable path." + type: "string" + required: + - "config" + additionalProperties: false + Functionality: + description: "The functionality-part of the config file describes the behaviour\ + \ of the script in terms of arguments and resources.\nBy specifying a few restrictions\ + \ (e.g. mandatory arguments) and adding some descriptions, Viash will automatically\ + \ generate a stylish command-line interface for you.\n" + type: "object" + properties: + name: + description: "Name of the component and the filename of the executable when\ + \ built with `viash build`." + type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + version: + description: "Version of the component. This field will be used to version\ + \ the executable and the Docker container." + type: "string" + authors: + description: "A list of @[authors](author). An author must at least have a\ + \ name, but can also have a list of roles, an e-mail address, and a map\ + \ of custom properties.\n\nSuggested values for roles are:\n \n| Role |\ + \ Abbrev. | Description |\n|------|---------|-------------|\n| maintainer\ + \ | mnt | for the maintainer of the code. Ideally, exactly one maintainer\ + \ is specified. |\n| author | aut | for persons who have made substantial\ + \ contributions to the software. 
|\n| contributor | ctb| for persons who\ + \ have made smaller contributions (such as code patches).\n| datacontributor\ + \ | dtc | for persons or organisations that contributed data sets for the\ + \ software\n| copyrightholder | cph | for all copyright holders. This is\ + \ a legal concept so should use the legal name of an institution or corporate\ + \ body.\n| funder | fnd | for persons or organizations that furnished financial\ + \ support for the development of the software\n\nThe [full list of roles](https://www.loc.gov/marc/relators/relaterm.html)\ + \ is extremely comprehensive.\n" + type: "array" + items: + $ref: "#/definitions/Author" + status: + description: "Allows setting a component to active, deprecated or disabled." + $ref: "#/definitions/Status" + requirements: + description: "@[Computational requirements](computational_requirements) related\ + \ to running the component. \n`cpus` specifies the maximum number of (logical)\ + \ cpus a component is allowed to use, whereas\n`memory` specifies the maximum\ + \ amount of memory a component is allowed to allocate. Memory units must\ + \ be\nin B, KB, MB, GB, TB or PB." + $ref: "#/definitions/ComputationalRequirements" + resources: + description: "@[Resources](resources) are files that support the component.\ + \ The first resource should be @[a script](scripting_languages) that will\ + \ be executed when the functionality is run. Additional resources will be\ + \ copied to the same directory.\n\nCommon properties:\n\n * type: `file`\ + \ / `r_script` / `python_script` / `bash_script` / `javascript_script` /\ + \ `scala_script` / `csharp_script`, specifies the type of the resource.\ + \ The first resource cannot be of type `file`. When the type is not specified,\ + \ the default type is simply `file`.\n * dest: filename, the resulting name\ + \ of the resource. From within a script, the file can be accessed at `meta[\"\ + resources_dir\"] + \"/\" + dest`. 
If unspecified, `dest` will be set to\ + \ the basename of the `path` parameter.\n * path: `path/to/file`, the path\ + \ of the input file. Can be a relative or an absolute path, or a URI. Mutually\ + \ exclusive with `text`.\n * text: ...multiline text..., the content of\ + \ the resulting file specified as a string. Mutually exclusive with `path`.\n\ + \ * is_executable: `true` / `false`, whether the resulting resource file\ + \ should be made executable.\n" + type: "array" + items: + $ref: "#/definitions/Resource" + test_resources: + description: "One or more @[scripts](scripting_languages) to be used to test\ + \ the component behaviour when `viash test` is invoked. Additional files\ + \ of type `file` will be made available only during testing. Each test script\ + \ should expect no command-line inputs, be platform-independent, and return\ + \ an exit code >0 when unexpected behaviour occurs during testing. See @[Unit\ + \ Testing](unit_testing) for more info." + type: "array" + items: + $ref: "#/definitions/Resource" + argument_groups: + description: "A grouping of the @[arguments](argument), used to display the\ + \ help message.\n\n - `name: foo`, the name of the argument group. \n -\ + \ `description: Description of foo`, a description of the argument group.\ + \ Multiline descriptions are supported.\n - `arguments: [arg1, arg2, ...]`,\ + \ list of the arguments names.\n\n" + type: "array" + items: + $ref: "#/definitions/ArgumentGroup" + description: + description: "A description of the component. This will be displayed with\ + \ `--help`." + type: "string" + usage: + description: "A description on how to use the component. This will be displayed\ + \ with `--help` under the 'Usage:' section." + type: "string" + namespace: + description: "Namespace this component is a part of. See the @[Namespaces\ + \ guide](namespace) for more information on namespaces." + type: "string" + arguments: + description: "A list of @[arguments](argument) for this component. 
For each\ + \ argument, a type and a name must be specified. Depending on the type of\ + \ argument, different properties can be set. See these reference pages per\ + \ type for more information: \n\n - @[string](arg_string)\n - @[file](arg_file)\n\ + \ - @[integer](arg_integer)\n - @[double](arg_double)\n - @[boolean](arg_boolean)\n\ + \ - @[boolean_true](arg_boolean_true)\n - @[boolean_false](arg_boolean_false)\n" + type: "array" + items: + $ref: "#/definitions/Argument" + required: + - "name" + additionalProperties: false + Author: + description: "Author metadata." + type: "object" + properties: + name: + description: "Full name of the author, usually in the name of FirstName MiddleName\ + \ LastName." + type: "string" + email: + description: "E-mail of the author." + type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + roles: + anyOf: + - description: "Role of the author. Suggested items:\n\n* `\"author\"`: Authors\ + \ who have made substantial contributions to the component.\n* `\"maintainer\"\ + `: The maintainer of the component.\n* `\"contributor\"`: Authors who\ + \ have made smaller contributions (such as code patches etc.).\n" + type: "string" + - description: "Role of the author. Suggested items:\n\n* `\"author\"`: Authors\ + \ who have made substantial contributions to the component.\n* `\"maintainer\"\ + `: The maintainer of the component.\n* `\"contributor\"`: Authors who\ + \ have made smaller contributions (such as code patches etc.).\n" + type: "array" + items: + type: "string" + props: + description: "Author properties. Must be a map of strings." + type: "object" + additionalProperties: + description: "Author properties. Must be a map of strings." + type: "string" + required: + - "name" + additionalProperties: false + ComputationalRequirements: + description: "Computational requirements related to running the component." 
+ type: "object" + properties: + cpus: + description: "The maximum number of (logical) cpus a component is allowed\ + \ to use." + type: "integer" + commands: + description: "A list of commands which should be present on the system for\ + \ the script to function." + type: "array" + items: + type: "string" + memory: + description: "The maximum amount of memory a component is allowed to allocate.\ + \ Unit must be one of B, KB, MB, GB, TB or PB." + type: "string" + required: [] + additionalProperties: false + RubyRequirements: + description: "Specify which Ruby packages should be available in order to run\ + \ the component." + type: "object" + properties: + type: + description: "Specify which Ruby packages should be available in order to\ + \ run the component." + const: "ruby" + packages: + anyOf: + - description: "Specifies which packages to install." + type: "string" + - description: "Specifies which packages to install." + type: "array" + items: + type: "string" + required: + - "type" + additionalProperties: false + YumRequirements: + description: "Specify which yum packages should be available in order to run the\ + \ component." + type: "object" + properties: + type: + description: "Specify which yum packages should be available in order to run\ + \ the component." + const: "yum" + packages: + anyOf: + - description: "Specifies which packages to install." + type: "string" + - description: "Specifies which packages to install." + type: "array" + items: + type: "string" + required: + - "type" + additionalProperties: false + JavascriptRequirements: + description: "Specify which JavaScript packages should be available in order to\ + \ run the component." + type: "object" + properties: + github: + anyOf: + - description: "Specifies which packages to install from GitHub." + type: "string" + - description: "Specifies which packages to install from GitHub." 
+ type: "array" + items: + type: "string" + url: + anyOf: + - description: "Specifies which packages to install using a generic URI." + type: "string" + - description: "Specifies which packages to install using a generic URI." + type: "array" + items: + type: "string" + git: + anyOf: + - description: "Specifies which packages to install using a Git URI." + type: "string" + - description: "Specifies which packages to install using a Git URI." + type: "array" + items: + type: "string" + npm: + anyOf: + - description: "Specifies which packages to install from npm." + type: "string" + - description: "Specifies which packages to install from npm." + type: "array" + items: + type: "string" + type: + description: "Specify which JavaScript packages should be available in order\ + \ to run the component." + const: "javascript" + packages: + anyOf: + - description: "Specifies which packages to install from npm." + type: "string" + - description: "Specifies which packages to install from npm." + type: "array" + items: + type: "string" + required: + - "type" + additionalProperties: false + DockerRequirements: + description: "Specify which Docker commands should be run during setup." + type: "object" + properties: + run: + anyOf: + - description: "Specifies which `RUN` entries to add to the Dockerfile while\ + \ building it." + type: "string" + - description: "Specifies which `RUN` entries to add to the Dockerfile while\ + \ building it." + type: "array" + items: + type: "string" + label: + anyOf: + - description: "Specifies which `LABEL` entries to add to the Dockerfile while\ + \ building it." + type: "string" + - description: "Specifies which `LABEL` entries to add to the Dockerfile while\ + \ building it." + type: "array" + items: + type: "string" + build_args: + anyOf: + - description: "Specifies which `ARG` entries to add to the Dockerfile while\ + \ building it." 
+ type: "string" + - description: "Specifies which `ARG` entries to add to the Dockerfile while\ + \ building it." + type: "array" + items: + type: "string" + type: + description: "Specify which Docker commands should be run during setup." + const: "docker" + add: + anyOf: + - description: "Specifies which `ADD` entries to add to the Dockerfile while\ + \ building it." + type: "string" + - description: "Specifies which `ADD` entries to add to the Dockerfile while\ + \ building it." + type: "array" + items: + type: "string" + env: + anyOf: + - description: "Specifies which `ENV` entries to add to the Dockerfile while\ + \ building it. Unlike `ARG`, `ENV` entries are also accessible from inside\ + \ the container." + type: "string" + - description: "Specifies which `ENV` entries to add to the Dockerfile while\ + \ building it. Unlike `ARG`, `ENV` entries are also accessible from inside\ + \ the container." + type: "array" + items: + type: "string" + copy: + anyOf: + - description: "Specifies which `COPY` entries to add to the Dockerfile while\ + \ building it." + type: "string" + - description: "Specifies which `COPY` entries to add to the Dockerfile while\ + \ building it." + type: "array" + items: + type: "string" + required: + - "type" + additionalProperties: false + RRequirements: + description: "Specify which R packages should be available in order to run the\ + \ component." + type: "object" + properties: + bioc: + anyOf: + - description: "Specifies which packages to install from BioConductor." + type: "string" + - description: "Specifies which packages to install from BioConductor." + type: "array" + items: + type: "string" + github: + anyOf: + - description: "Specifies which packages to install from GitHub." + type: "string" + - description: "Specifies which packages to install from GitHub." + type: "array" + items: + type: "string" + gitlab: + anyOf: + - description: "Specifies which packages to install from GitLab." 
+ type: "string" + - description: "Specifies which packages to install from GitLab." + type: "array" + items: + type: "string" + url: + anyOf: + - description: "Specifies which packages to install using a generic URI." + type: "string" + - description: "Specifies which packages to install using a generic URI." + type: "array" + items: + type: "string" + bioc_force_install: + description: "Forces packages specified in `bioc` to be reinstalled, even\ + \ if they are already present in the container. Default: false." + type: "boolean" + git: + anyOf: + - description: "Specifies which packages to install using a Git URI." + type: "string" + - description: "Specifies which packages to install using a Git URI." + type: "array" + items: + type: "string" + cran: + anyOf: + - description: "Specifies which packages to install from CRAN." + type: "string" + - description: "Specifies which packages to install from CRAN." + type: "array" + items: + type: "string" + bitbucket: + anyOf: + - description: "Specifies which packages to install from Bitbucket." + type: "string" + - description: "Specifies which packages to install from Bitbucket." + type: "array" + items: + type: "string" + svn: + anyOf: + - description: "Specifies which packages to install using an SVN URI." + type: "string" + - description: "Specifies which packages to install using an SVN URI." + type: "array" + items: + type: "string" + packages: + anyOf: + - description: "Specifies which packages to install from CRAN." + type: "string" + - description: "Specifies which packages to install from CRAN." + type: "array" + items: + type: "string" + script: + anyOf: + - description: "Specifies a code block to run as part of the build." + type: "string" + - description: "Specifies a code block to run as part of the build." + type: "array" + items: + type: "string" + type: + description: "Specify which R packages should be available in order to run\ + \ the component." 
+ const: "r" + required: + - "type" + additionalProperties: false + ApkRequirements: + description: "Specify which apk packages should be available in order to run the\ + \ component." + type: "object" + properties: + type: + description: "Specify which apk packages should be available in order to run\ + \ the component." + const: "apk" + packages: + anyOf: + - description: "Specifies which packages to install." + type: "string" + - description: "Specifies which packages to install." + type: "array" + items: + type: "string" + required: + - "type" + additionalProperties: false + PythonRequirements: + description: "Specify which Python packages should be available in order to run\ + \ the component." + type: "object" + properties: + github: + anyOf: + - description: "Specifies which packages to install from GitHub." + type: "string" + - description: "Specifies which packages to install from GitHub." + type: "array" + items: + type: "string" + gitlab: + anyOf: + - description: "Specifies which packages to install from GitLab." + type: "string" + - description: "Specifies which packages to install from GitLab." + type: "array" + items: + type: "string" + pip: + anyOf: + - description: "Specifies which packages to install from pip." + type: "string" + - description: "Specifies which packages to install from pip." + type: "array" + items: + type: "string" + pypi: + anyOf: + - description: "Specifies which packages to install from PyPI using pip." + type: "string" + - description: "Specifies which packages to install from PyPI using pip." + type: "array" + items: + type: "string" + git: + anyOf: + - description: "Specifies which packages to install using a Git URI." + type: "string" + - description: "Specifies which packages to install using a Git URI." + type: "array" + items: + type: "string" + upgrade: + description: "Sets the `--upgrade` flag when set to true. Default: true." 
+ type: "boolean" + packages: + anyOf: + - description: "Specifies which packages to install from pip." + type: "string" + - description: "Specifies which packages to install from pip." + type: "array" + items: + type: "string" + url: + anyOf: + - description: "Specifies which packages to install using a generic URI." + type: "string" + - description: "Specifies which packages to install using a generic URI." + type: "array" + items: + type: "string" + svn: + anyOf: + - description: "Specifies which packages to install using an SVN URI." + type: "string" + - description: "Specifies which packages to install using an SVN URI." + type: "array" + items: + type: "string" + bazaar: + anyOf: + - description: "Specifies which packages to install using a Bazaar URI." + type: "string" + - description: "Specifies which packages to install using a Bazaar URI." + type: "array" + items: + type: "string" + script: + anyOf: + - description: "Specifies a code block to run as part of the build." + type: "string" + - description: "Specifies a code block to run as part of the build." + type: "array" + items: + type: "string" + type: + description: "Specify which Python packages should be available in order to\ + \ run the component." + const: "python" + mercurial: + anyOf: + - description: "Specifies which packages to install using a Mercurial URI." + type: "string" + - description: "Specifies which packages to install using a Mercurial URI." + type: "array" + items: + type: "string" + user: + description: "Sets the `--user` flag when set to true. Default: false." + type: "boolean" + required: + - "type" + additionalProperties: false + AptRequirements: + description: "Specify which apt packages should be available in order to run the\ + \ component." + type: "object" + properties: + interactive: + description: "If `false`, the Debian frontend is set to non-interactive (recommended).\ + \ Default: false." 
+ type: "boolean" + type: + description: "Specify which apt packages should be available in order to run\ + \ the component." + const: "apt" + packages: + anyOf: + - description: "Specifies which packages to install." + type: "string" + - description: "Specifies which packages to install." + type: "array" + items: + type: "string" + required: + - "type" + additionalProperties: false + Requirements: + anyOf: + - $ref: "#/definitions/RubyRequirements" + - $ref: "#/definitions/YumRequirements" + - $ref: "#/definitions/JavascriptRequirements" + - $ref: "#/definitions/DockerRequirements" + - $ref: "#/definitions/RRequirements" + - $ref: "#/definitions/ApkRequirements" + - $ref: "#/definitions/PythonRequirements" + - $ref: "#/definitions/AptRequirements" + StringArgument: + description: "A `string` type argument has a value made up of an ordered sequence\ + \ of characters, like \"Hello\" or \"I'm a string\"." + type: "object" + properties: + alternatives: + anyOf: + - description: "List of alternative format variations for this argument." + type: "string" + - description: "List of alternative format variations for this argument." + type: "array" + items: + type: "string" + name: + description: "The name of the argument. Can be in the formats `--foo`, `-f`\ + \ or `foo`. The number of dashes determines how values can be passed: \n\ + \n - `--foo` is a long option, which can be passed with `executable_name\ + \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ + \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ + \ which can be passed with `executable_name value` \n" + type: "string" + choices: + description: "Limit the amount of valid values for this argument to those\ + \ set in this list. When set and a value not present in the list is provided,\ + \ an error will be produced." + type: "array" + items: + type: "string" + info: + description: "Structured information. 
Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + default: + anyOf: + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "string" + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "array" + items: + type: "string" + example: + anyOf: + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "string" + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "array" + items: + type: "string" + description: + description: "A description of the argument. This will be displayed with `--help`." + type: "string" + multiple_sep: + description: "The delimiter character for providing [`multiple`](#multiple)\ + \ values. `:` by default." + type: "string" + multiple: + description: "Treat the argument value as an array. Arrays can be passed using\ + \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ + \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ + \ property. `false` by default." + type: "boolean" + type: + description: "A `string` type argument has a value made up of an ordered sequence\ + \ of characters, like \"Hello\" or \"I'm a string\"." + const: "string" + required: + description: "Make the value for this argument required. If set to `true`,\ + \ an error will be produced if no value was provided. `false` by default." + type: "boolean" + required: + - "name" + - "type" + additionalProperties: false + BooleanArgument: + description: "A `boolean` type argument has two possible values: `true` or `false`." 
+ type: "object" + properties: + alternatives: + anyOf: + - description: "List of alternative format variations for this argument." + type: "string" + - description: "List of alternative format variations for this argument." + type: "array" + items: + type: "string" + name: + description: "The name of the argument. Can be in the formats `--trim`, `-t`\ + \ or `trim`. The number of dashes determines how values can be passed: \ + \ \n\n - `--trim` is a long option, which can be passed with `executable_name\ + \ --trim`\n - `-t` is a short option, which can be passed with `executable_name\ + \ -t`\n - `trim` is an argument, which can be passed with `executable_name\ + \ trim` \n" + type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + default: + anyOf: + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "boolean" + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "array" + items: + type: "boolean" + example: + anyOf: + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "boolean" + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "array" + items: + type: "boolean" + description: + description: "A description of the argument. This will be displayed with `--help`." + type: "string" + multiple_sep: + description: "The delimiter character for providing [`multiple`](#multiple)\ + \ values. `:` by default." + type: "string" + multiple: + description: "Treat the argument value as an array. 
Arrays can be passed using\ + \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ + \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ + \ property. `false` by default." + type: "boolean" + type: + description: "A `boolean` type argument has two possible values: `true` or\ + \ `false`." + const: "boolean" + required: + description: "Make the value for this argument required. If set to `true`,\ + \ an error will be produced if no value was provided. `false` by default." + type: "boolean" + required: + - "name" + - "type" + additionalProperties: false + BooleanTrueArgument: + description: "An argument of the `boolean_true` type acts like a `boolean` flag\ + \ with a default value of `false`. When called as an argument it sets the `boolean`\ + \ to `true`." + type: "object" + properties: + alternatives: + anyOf: + - description: "List of alternative format variations for this argument." + type: "string" + - description: "List of alternative format variations for this argument." + type: "array" + items: + type: "string" + name: + description: "The name of the argument. Can be in the formats `--silent`,\ + \ `-s` or `silent`. The number of dashes determines how values can be passed:\ + \ \n\n - `--silent` is a long option, which can be passed with `executable_name\ + \ --silent`\n - `-s` is a short option, which can be passed with `executable_name\ + \ -s`\n - `silent` is an argument, which can be passed with `executable_name\ + \ silent` \n" + type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + description: + description: "A description of the argument. This will be displayed with `--help`." + type: "string" + type: + description: "An argument of the `boolean_true` type acts like a `boolean`\ + \ flag with a default value of `false`. When called as an argument it sets\ + \ the `boolean` to `true`." 
+ const: "boolean_true" + required: + - "name" + - "type" + additionalProperties: false + IntegerArgument: + description: "An `integer` type argument has a numeric value without decimal points." + type: "object" + properties: + alternatives: + anyOf: + - description: "List of alternative format variations for this argument." + type: "string" + - description: "List of alternative format variations for this argument." + type: "array" + items: + type: "string" + name: + description: "The name of the argument. Can be in the formats `--foo`, `-f`\ + \ or `foo`. The number of dashes determines how values can be passed: \n\ + \n - `--foo` is a long option, which can be passed with `executable_name\ + \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ + \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ + \ which can be passed with `executable_name value` \n" + type: "string" + choices: + description: "Limit the amount of valid values for this argument to those\ + \ set in this list. When set and a value not present in the list is provided,\ + \ an error will be produced." + type: "array" + items: + type: "integer" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + max: + description: "Maximum allowed value for this argument. If set and the provided\ + \ value is higher than the maximum, an error will be produced. Can be combined\ + \ with [`min`](#min) to clamp values." + type: "integer" + default: + anyOf: + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "integer" + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "array" + items: + type: "integer" + example: + anyOf: + - description: "An example value for this argument. 
If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "integer" + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "array" + items: + type: "integer" + description: + description: "A description of the argument. This will be displayed with `--help`." + type: "string" + multiple_sep: + description: "The delimiter character for providing [`multiple`](#multiple)\ + \ values. `:` by default." + type: "string" + min: + description: "Minimum allowed value for this argument. If set and the provided\ + \ value is lower than the minimum, an error will be produced. Can be combined\ + \ with [`max`](#max) to clamp values." + type: "integer" + multiple: + description: "Treat the argument value as an array. Arrays can be passed using\ + \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ + \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ + \ property. `false` by default." + type: "boolean" + type: + description: "An `integer` type argument has a numeric value without decimal\ + \ points." + const: "integer" + required: + description: "Make the value for this argument required. If set to `true`,\ + \ an error will be produced if no value was provided. `false` by default." + type: "boolean" + required: + - "name" + - "type" + additionalProperties: false + LongArgument: + description: "A `long` type argument has a numeric value without decimal points." + type: "object" + properties: + alternatives: + anyOf: + - description: "List of alternative format variations for this argument." + type: "string" + - description: "List of alternative format variations for this argument." + type: "array" + items: + type: "string" + name: + description: "The name of the argument. Can be in the formats `--foo`, `-f`\ + \ or `foo`. 
The number of dashes determines how values can be passed: \n\ + \n - `--foo` is a long option, which can be passed with `executable_name\ + \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ + \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ + \ which can be passed with `executable_name value` \n" + type: "string" + choices: + description: "Limit the amount of valid values for this argument to those\ + \ set in this list. When set and a value not present in the list is provided,\ + \ an error will be produced." + type: "array" + items: + type: "integer" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + max: + description: "Maximum allowed value for this argument. If set and the provided\ + \ value is higher than the maximum, an error will be produced. Can be combined\ + \ with [`min`](#min) to clamp values." + type: "integer" + default: + anyOf: + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "integer" + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "array" + items: + type: "integer" + example: + anyOf: + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "integer" + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "array" + items: + type: "integer" + description: + description: "A description of the argument. This will be displayed with `--help`." + type: "string" + multiple_sep: + description: "The delimiter character for providing [`multiple`](#multiple)\ + \ values. `:` by default." 
+ type: "string" + min: + description: "Minimum allowed value for this argument. If set and the provided\ + \ value is lower than the minimum, an error will be produced. Can be combined\ + \ with [`max`](#max) to clamp values." + type: "integer" + multiple: + description: "Treat the argument value as an array. Arrays can be passed using\ + \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ + \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ + \ property. `false` by default." + type: "boolean" + type: + description: "A `long` type argument has a numeric value without decimal\ + \ points." + const: "long" + required: + description: "Make the value for this argument required. If set to `true`,\ + \ an error will be produced if no value was provided. `false` by default." + type: "boolean" + required: + - "name" + - "type" + additionalProperties: false + BooleanFalseArgument: + description: "An argument of the `boolean_false` type acts like an inverted `boolean`\ + \ flag with a default value of `true`. When called as an argument it sets the\ + \ `boolean` to `false`." + type: "object" + properties: + alternatives: + anyOf: + - description: "List of alternative format variations for this argument." + type: "string" + - description: "List of alternative format variations for this argument." + type: "array" + items: + type: "string" + name: + description: "The name of the argument. Can be in the formats `--no-log`,\ + \ `-n` or `no-log`. The number of dashes determines how values can be passed:\ + \ \n\n - `--no-log` is a long option, which can be passed with `executable_name\ + \ --no-log`\n - `-n` is a short option, which can be passed with `executable_name\ + \ -n`\n - `no-log` is an argument, which can be passed with `executable_name\ + \ no-log` \n" + type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." 
+ type: "object" + description: + description: "A description of the argument. This will be displayed with `--help`." + type: "string" + type: + description: "An argument of the `boolean_false` type acts like an inverted\ + \ `boolean` flag with a default value of `true`. When called as an argument\ + \ it sets the `boolean` to `false`." + const: "boolean_false" + required: + - "name" + - "type" + additionalProperties: false + DoubleArgument: + description: "A `double` type argument has a numeric value with decimal points" + type: "object" + properties: + alternatives: + anyOf: + - description: "List of alternative format variations for this argument." + type: "string" + - description: "List of alternative format variations for this argument." + type: "array" + items: + type: "string" + name: + description: "The name of the argument. Can be in the formats `--foo`, `-f`\ + \ or `foo`. The number of dashes determines how values can be passed: \n\ + \n - `--foo` is a long option, which can be passed with `executable_name\ + \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ + \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ + \ which can be passed with `executable_name value` \n" + type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + max: + description: "Maximum allowed value for this argument. If set and the provided\ + \ value is higher than the maximum, an error will be produced. Can be combined\ + \ with [`min`](#min) to clamp values." + type: "number" + default: + anyOf: + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "number" + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." 
+ type: "array" + items: + type: "number" + example: + anyOf: + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "number" + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "array" + items: + type: "number" + description: + description: "A description of the argument. This will be displayed with `--help`." + type: "string" + multiple_sep: + description: "The delimiter character for providing [`multiple`](#multiple)\ + \ values. `:` by default." + type: "string" + min: + description: "Minimum allowed value for this argument. If set and the provided\ + \ value is lower than the minimum, an error will be produced. Can be combined\ + \ with [`max`](#max) to clamp values." + type: "number" + multiple: + description: "Treat the argument value as an array. Arrays can be passed using\ + \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ + \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ + \ property. `false` by default." + type: "boolean" + type: + description: "A `double` type argument has a numeric value with decimal points" + const: "double" + required: + description: "Make the value for this argument required. If set to `true`,\ + \ an error will be produced if no value was provided. `false` by default." + type: "boolean" + required: + - "name" + - "type" + additionalProperties: false + FileArgument: + description: "A `file` type argument has a string value that points to a file\ + \ or folder path." + type: "object" + properties: + alternatives: + anyOf: + - description: "List of alternative format variations for this argument." + type: "string" + - description: "List of alternative format variations for this argument." 
+ type: "array" + items: + type: "string" + name: + description: "The name of the argument. Can be in the formats `--foo`, `-f`\ + \ or `foo`. The number of dashes determines how values can be passed: \n\ + \n - `--foo` is a long option, which can be passed with `executable_name\ + \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ + \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ + \ which can be passed with `executable_name value` \n" + type: "string" + create_parent: + description: "If the output filename is a path and it does not exist, create\ + \ it before executing the script (only for `direction: output`)." + type: "boolean" + direction: + description: "Makes this argument an `input` or an `output`, as in whether the\ + \ file/folder needs to be read or written. `input` by default." + $ref: "#/definitions/Direction" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + must_exist: + description: "Checks whether the file or folder exists. For input files, this\ + \ check will happen before the execution of the script, while for output\ + \ files the check will happen afterwards." + type: "boolean" + default: + anyOf: + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "string" + - description: "The default value when no argument value is provided. This\ + \ will not work if the [`required`](#required) property is enabled." + type: "array" + items: + type: "string" + example: + anyOf: + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." + type: "string" + - description: "An example value for this argument. If no [`default`](#default)\ + \ property was specified, this will be used for that purpose." 
+ type: "array" + items: + type: "string" + description: + description: "A description of the argument. This will be displayed with `--help`." + type: "string" + multiple_sep: + description: "The delimiter character for providing [`multiple`](#multiple)\ + \ values. `:` by default." + type: "string" + multiple: + description: "Treat the argument value as an array. Arrays can be passed using\ + \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ + \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ + \ property. `false` by default." + type: "boolean" + type: + description: "A `file` type argument has a string value that points to a file\ + \ or folder path." + const: "file" + required: + description: "Make the value for this argument required. If set to `true`,\ + \ an error will be produced if no value was provided. `false` by default." + type: "boolean" + required: + - "name" + - "type" + additionalProperties: false + Argument: + anyOf: + - $ref: "#/definitions/StringArgument" + - $ref: "#/definitions/BooleanArgument" + - $ref: "#/definitions/BooleanTrueArgument" + - $ref: "#/definitions/IntegerArgument" + - $ref: "#/definitions/LongArgument" + - $ref: "#/definitions/BooleanFalseArgument" + - $ref: "#/definitions/DoubleArgument" + - $ref: "#/definitions/FileArgument" + ArgumentGroup: + type: "object" + properties: + name: + description: "The name of the argument group." + type: "string" + description: + description: "A description of the argument group. Multiline descriptions\ + \ are supported." + type: "string" + arguments: + description: "List of the arguments names." 
+ type: "array" + items: + $ref: "#/definitions/Argument" + required: + - "name" + - "arguments" + additionalProperties: false + JavaScriptScript: + description: "An executable JavaScript script.\nWhen defined in functionality.resources,\ + \ only the first entry will be executed when running the built component or\ + \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ + \ entries will be executed during `viash test`." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "An executable JavaScript script.\nWhen defined in functionality.resources,\ + \ only the first entry will be executed when running the built component\ + \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ + \ all entries will be executed during `viash test`." + const: "javascript_script" + dest: + description: "Resulting filename of the resource. From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." + type: "string" + required: + - "type" + additionalProperties: false + CSharpScript: + description: "An executable C# script.\nWhen defined in functionality.resources,\ + \ only the first entry will be executed when running the built component or\ + \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ + \ entries will be executed during `viash test`." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. 
Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "An executable C# script.\nWhen defined in functionality.resources,\ + \ only the first entry will be executed when running the built component\ + \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ + \ all entries will be executed during `viash test`." + const: "csharp_script" + dest: + description: "Resulting filename of the resource. From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." + type: "string" + required: + - "type" + additionalProperties: false + Executable: + description: "An executable file." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "An executable file." + const: "executable" + dest: + description: "Resulting filename of the resource. From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." 
+ type: "string" + required: + - "type" + additionalProperties: false + ScalaScript: + description: "An executable Scala script.\nWhen defined in functionality.resources,\ + \ only the first entry will be executed when running the built component or\ + \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ + \ entries will be executed during `viash test`." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "An executable Scala script.\nWhen defined in functionality.resources,\ + \ only the first entry will be executed when running the built component\ + \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ + \ all entries will be executed during `viash test`." + const: "scala_script" + dest: + description: "Resulting filename of the resource. From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." + type: "string" + required: + - "type" + additionalProperties: false + NextflowScript: + description: "A Nextflow script. Work in progress; added mainly for annotation\ + \ at the moment." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + entrypoint: + description: "The name of the workflow to be executed." 
+ type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "A Nextflow script. Work in progress; added mainly for annotation\ + \ at the moment." + const: "nextflow_script" + dest: + description: "Resulting filename of the resource. From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." + type: "string" + required: + - "type" + additionalProperties: false + PlainFile: + description: "A plain file. This can only be used as a supporting resource for\ + \ the main script or unit tests." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "A plain file. This can only be used as a supporting resource\ + \ for the main script or unit tests." + const: "file" + dest: + description: "Resulting filename of the resource. From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." + type: "string" + required: + - "path" + additionalProperties: false + BashScript: + description: "An executable Bash script.\nWhen defined in functionality.resources,\ + \ only the first entry will be executed when running the built component or\ + \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ + \ entries will be executed during `viash test`." + type: "object" + properties: + path: + description: "The path of the input file. 
Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "An executable Bash script.\nWhen defined in functionality.resources,\ + \ only the first entry will be executed when running the built component\ + \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ + \ all entries will be executed during `viash test`." + const: "bash_script" + dest: + description: "Resulting filename of the resource. From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." + type: "string" + required: + - "type" + additionalProperties: false + PythonScript: + description: "An executable Python script.\nWhen defined in functionality.resources,\ + \ only the first entry will be executed when running the built component or\ + \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ + \ entries will be executed during `viash test`." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." 
+ type: "boolean" + type: + description: "An executable Python script.\nWhen defined in functionality.resources,\ + \ only the first entry will be executed when running the built component\ + \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ + \ all entries will be executed during `viash test`." + const: "python_script" + dest: + description: "Resulting filename of the resource. From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." + type: "string" + required: + - "type" + additionalProperties: false + RScript: + description: "An executable R script.\nWhen defined in functionality.resources,\ + \ only the first entry will be executed when running the built component or\ + \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ + \ entries will be executed during `viash test`." + type: "object" + properties: + path: + description: "The path of the input file. Can be a relative or an absolute\ + \ path, or a URI. Mutually exclusive with `text`." + type: "string" + text: + description: "The content of the resulting file specified as a string. Mutually\ + \ exclusive with `path`." + type: "string" + is_executable: + description: "Whether the resulting resource file should be made executable." + type: "boolean" + type: + description: "An executable R script.\nWhen defined in functionality.resources,\ + \ only the first entry will be executed when running the built component\ + \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ + \ all entries will be executed during `viash test`." + const: "r_script" + dest: + description: "Resulting filename of the resource. From within a script, the\ + \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ + \ `dest` will be set to the basename of the `path` parameter." 
+ type: "string" + required: + - "type" + additionalProperties: false + Resource: + anyOf: + - $ref: "#/definitions/JavaScriptScript" + - $ref: "#/definitions/CSharpScript" + - $ref: "#/definitions/Executable" + - $ref: "#/definitions/ScalaScript" + - $ref: "#/definitions/NextflowScript" + - $ref: "#/definitions/PlainFile" + - $ref: "#/definitions/BashScript" + - $ref: "#/definitions/PythonScript" + - $ref: "#/definitions/RScript" + NextflowDirectives: + description: "Directives are optional settings that affect the execution of the\ + \ process.\n" + type: "object" + properties: + beforeScript: + description: "The `beforeScript` directive allows you to execute a custom\ + \ (Bash) snippet before the main process script is run. This may be useful\ + \ to initialise the underlying cluster environment or for other custom initialisation.\n\ + \nSee [`beforeScript`](https://www.nextflow.io/docs/latest/process.html#beforeScript).\n" + type: "string" + module: + anyOf: + - description: "Environment Modules is a package manager that allows you to\ + \ dynamically configure your execution environment and easily switch between\ + \ multiple versions of the same software tool.\n\nIf it is available in\ + \ your system you can use it with Nextflow in order to configure the processes\ + \ execution environment in your pipeline.\n\nIn a process definition you\ + \ can use the `module` directive to load a specific module version to\ + \ be used in the process execution environment.\n\nSee [`module`](https://www.nextflow.io/docs/latest/process.html#module).\n" + type: "string" + - description: "Environment Modules is a package manager that allows you to\ + \ dynamically configure your execution environment and easily switch between\ + \ multiple versions of the same software tool.\n\nIf it is available in\ + \ your system you can use it with Nextflow in order to configure the processes\ + \ execution environment in your pipeline.\n\nIn a process definition you\ + \ can use the 
`module` directive to load a specific module version to\ + \ be used in the process execution environment.\n\nSee [`module`](https://www.nextflow.io/docs/latest/process.html#module).\n" + type: "array" + items: + type: "string" + queue: + anyOf: + - description: "The `queue` directive allows you to set the queue where jobs\ + \ are scheduled when using a grid based executor in your pipeline.\n\n\ + See [`queue`](https://www.nextflow.io/docs/latest/process.html#queue).\n" + type: "string" + - description: "The `queue` directive allows you to set the queue where jobs\ + \ are scheduled when using a grid based executor in your pipeline.\n\n\ + See [`queue`](https://www.nextflow.io/docs/latest/process.html#queue).\n" + type: "array" + items: + type: "string" + label: + anyOf: + - description: "The `label` directive allows the annotation of processes with\ + \ mnemonic identifier of your choice.\n\nSee [`label`](https://www.nextflow.io/docs/latest/process.html#label).\n" + type: "string" + - description: "The `label` directive allows the annotation of processes with\ + \ mnemonic identifier of your choice.\n\nSee [`label`](https://www.nextflow.io/docs/latest/process.html#label).\n" + type: "array" + items: + type: "string" + container: + anyOf: + - description: "The `container` directive allows you to execute the process\ + \ script in a Docker container.\n\nIt requires the Docker daemon to be\ + \ running in machine where the pipeline is executed, i.e. the local machine\ + \ when using the local executor or the cluster nodes when the pipeline\ + \ is deployed through a grid executor.\n\nViash implements allows either\ + \ a string value or a map. In case a map is used, the allowed keys are:\ + \ `registry`, `image`, and `tag`. 
The `image` value must be specified.\n\ + \nSee [`container`](https://www.nextflow.io/docs/latest/process.html#container).\n" + type: "object" + additionalProperties: + description: "The `container` directive allows you to execute the process\ + \ script in a Docker container.\n\nIt requires the Docker daemon to\ + \ be running in machine where the pipeline is executed, i.e. the local\ + \ machine when using the local executor or the cluster nodes when the\ + \ pipeline is deployed through a grid executor.\n\nViash implements\ + \ allows either a string value or a map. In case a map is used, the\ + \ allowed keys are: `registry`, `image`, and `tag`. The `image` value\ + \ must be specified.\n\nSee [`container`](https://www.nextflow.io/docs/latest/process.html#container).\n" + type: "string" + - description: "The `container` directive allows you to execute the process\ + \ script in a Docker container.\n\nIt requires the Docker daemon to be\ + \ running in machine where the pipeline is executed, i.e. the local machine\ + \ when using the local executor or the cluster nodes when the pipeline\ + \ is deployed through a grid executor.\n\nViash implements allows either\ + \ a string value or a map. In case a map is used, the allowed keys are:\ + \ `registry`, `image`, and `tag`. The `image` value must be specified.\n\ + \nSee [`container`](https://www.nextflow.io/docs/latest/process.html#container).\n" + type: "string" + publishDir: + anyOf: + - anyOf: + - description: "The `publishDir` directive allows you to publish the process\ + \ output files to a specified folder.\n\nViash implements this directive\ + \ as a plain string or a map. The allowed keywords for the map are:\ + \ `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. 
The `path`\ + \ key and value are required.\nThe allowed values for `mode` are: `symlink`,\ + \ `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n" + type: "string" + - description: "The `publishDir` directive allows you to publish the process\ + \ output files to a specified folder.\n\nViash implements this directive\ + \ as a plain string or a map. The allowed keywords for the map are:\ + \ `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The `path`\ + \ key and value are required.\nThe allowed values for `mode` are: `symlink`,\ + \ `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n" + type: "object" + additionalProperties: + description: "The `publishDir` directive allows you to publish the process\ + \ output files to a specified folder.\n\nViash implements this directive\ + \ as a plain string or a map. The allowed keywords for the map are:\ + \ `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The\ + \ `path` key and value are required.\nThe allowed values for `mode`\ + \ are: `symlink`, `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\ + \nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n" + type: "string" + - description: "The `publishDir` directive allows you to publish the process\ + \ output files to a specified folder.\n\nViash implements this directive\ + \ as a plain string or a map. The allowed keywords for the map are: `path`,\ + \ `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. 
The `path` key\ + \ and value are required.\nThe allowed values for `mode` are: `symlink`,\ + \ `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n" + type: "array" + items: + anyOf: + - description: "The `publishDir` directive allows you to publish the process\ + \ output files to a specified folder.\n\nViash implements this directive\ + \ as a plain string or a map. The allowed keywords for the map are:\ + \ `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The\ + \ `path` key and value are required.\nThe allowed values for `mode`\ + \ are: `symlink`, `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\ + \nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n" + type: "string" + - description: "The `publishDir` directive allows you to publish the process\ + \ output files to a specified folder.\n\nViash implements this directive\ + \ as a plain string or a map. The allowed keywords for the map are:\ + \ `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The\ + \ `path` key and value are required.\nThe allowed values for `mode`\ + \ are: `symlink`, `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\ + \nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n" + type: "object" + additionalProperties: + description: "The `publishDir` directive allows you to publish the\ + \ process output files to a specified folder.\n\nViash implements\ + \ this directive as a plain string or a map. The allowed keywords\ + \ for the map are: `path`, `mode`, `overwrite`, `pattern`, `saveAs`,\ + \ `enabled`. 
The `path` key and value are required.\nThe allowed\ + \ values for `mode` are: `symlink`, `rellink`, `link`, `copy`, `copyNoFollow`,\ + \ `move`.\n\nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n" + type: "string" + maxForks: + anyOf: + - description: "The `maxForks` directive allows you to define the maximum\ + \ number of process instances that can be executed in parallel. By default\ + \ this value is equal to the number of CPU cores available minus 1.\n\ + \nIf you want to execute a process in a sequential manner, set this directive\ + \ to one.\n\nSee [`maxForks`](https://www.nextflow.io/docs/latest/process.html#maxforks).\n" + type: "string" + - description: "The `maxForks` directive allows you to define the maximum\ + \ number of process instances that can be executed in parallel. By default\ + \ this value is equal to the number of CPU cores available minus 1.\n\ + \nIf you want to execute a process in a sequential manner, set this directive\ + \ to one.\n\nSee [`maxForks`](https://www.nextflow.io/docs/latest/process.html#maxforks).\n" + type: "integer" + maxErrors: + anyOf: + - description: "The `maxErrors` directive allows you to specify the maximum\ + \ number of times a process can fail when using the `retry` error strategy.\ + \ By default this directive is disabled.\n\nSee [`maxErrors`](https://www.nextflow.io/docs/latest/process.html#maxerrors).\n" + type: "string" + - description: "The `maxErrors` directive allows you to specify the maximum\ + \ number of times a process can fail when using the `retry` error strategy.\ + \ By default this directive is disabled.\n\nSee [`maxErrors`](https://www.nextflow.io/docs/latest/process.html#maxerrors).\n" + type: "integer" + cpus: + anyOf: + - description: "The `cpus` directive allows you to define the number of (logical)\ + \ CPU required by the process' task.\n\nSee [`cpus`](https://www.nextflow.io/docs/latest/process.html#cpus).\n" + type: "integer" + - description: 
"The `cpus` directive allows you to define the number of (logical)\ + \ CPU required by the process' task.\n\nSee [`cpus`](https://www.nextflow.io/docs/latest/process.html#cpus).\n" + type: "string" + accelerator: + description: "The `accelerator` directive allows you to specify the hardware\ + \ accelerator requirement for the task execution e.g. GPU processor.\n\n\ + Viash implements this directive as a map with accepted keywords: `type`,\ + \ `limit`, `request`, and `runtime`.\n\nSee [`accelerator`](https://www.nextflow.io/docs/latest/process.html#accelerator).\n" + type: "object" + additionalProperties: + description: "The `accelerator` directive allows you to specify the hardware\ + \ accelerator requirement for the task execution e.g. GPU processor.\n\ + \nViash implements this directive as a map with accepted keywords: `type`,\ + \ `limit`, `request`, and `runtime`.\n\nSee [`accelerator`](https://www.nextflow.io/docs/latest/process.html#accelerator).\n" + type: "string" + time: + description: "The `time` directive allows you to define how long a process\ + \ is allowed to run.\n\nSee [`time`](https://www.nextflow.io/docs/latest/process.html#time).\n" + type: "string" + afterScript: + description: "The `afterScript` directive allows you to execute a custom (Bash)\ + \ snippet immediately after the main process has run. This may be useful\ + \ to clean up your staging area.\n\nSee [`afterScript`](https://www.nextflow.io/docs/latest/process.html#afterscript).\n" + type: "string" + executor: + description: "The `executor` defines the underlying system where processes\ + \ are executed. By default a process uses the executor defined globally\ + \ in the nextflow.config file.\n\nThe `executor` directive allows you to\ + \ configure what executor has to be used by the process, overriding the\ + \ default configuration. 
The following values can be used:\n\n| Name | Executor\ + \ |\n|------|----------|\n| awsbatch | The process is executed using the\ + \ AWS Batch service. | \n| azurebatch | The process is executed using the\ + \ Azure Batch service. | \n| condor | The process is executed using the\ + \ HTCondor job scheduler. | \n| google-lifesciences | The process is executed\ + \ using the Google Genomics Pipelines service. | \n| ignite | The process\ + \ is executed using the Apache Ignite cluster. | \n| k8s | The process is\ + \ executed using the Kubernetes cluster. | \n| local | The process is executed\ + \ in the computer where Nextflow is launched. | \n| lsf | The process is\ + \ executed using the Platform LSF job scheduler. | \n| moab | The process\ + \ is executed using the Moab job scheduler. | \n| nqsii | The process is\ + \ executed using the NQSII job scheduler. | \n| oge | Alias for the sge\ + \ executor. | \n| pbs | The process is executed using the PBS/Torque job\ + \ scheduler. | \n| pbspro | The process is executed using the PBS Pro job\ + \ scheduler. | \n| sge | The process is executed using the Sun Grid Engine\ + \ / Open Grid Engine. | \n| slurm | The process is executed using the SLURM\ + \ job scheduler. | \n| tes | The process is executed using the GA4GH TES\ + \ service. | \n| uge | Alias for the sge executor. |\n\nSee [`executor`](https://www.nextflow.io/docs/latest/process.html#executor).\n" + type: "string" + containerOptions: + anyOf: + - description: "The `containerOptions` directive allows you to specify any\ + \ container execution option supported by the underlying container engine\ + \ (ie. Docker, Singularity, etc). This can be useful to provide container\ + \ settings only for a specific process e.g. 
mount a custom path.\n\nSee\ + \ [`containerOptions`](https://www.nextflow.io/docs/latest/process.html#containeroptions).\n" + type: "string" + - description: "The `containerOptions` directive allows you to specify any\ + \ container execution option supported by the underlying container engine\ + \ (ie. Docker, Singularity, etc). This can be useful to provide container\ + \ settings only for a specific process e.g. mount a custom path.\n\nSee\ + \ [`containerOptions`](https://www.nextflow.io/docs/latest/process.html#containeroptions).\n" + type: "array" + items: + type: "string" + disk: + description: "The `disk` directive allows you to define how much local disk\ + \ storage the process is allowed to use.\n\nSee [`disk`](https://www.nextflow.io/docs/latest/process.html#disk).\n" + type: "string" + tag: + description: "The `tag` directive allows you to associate each process execution\ + \ with a custom label, so that it will be easier to identify them in the\ + \ log file or in the trace execution report.\n\nSee [`tag`](https://www.nextflow.io/docs/latest/process.html#tag).\n" + type: "string" + conda: + anyOf: + - description: "The `conda` directive allows for the definition of the process\ + \ dependencies using the Conda package manager.\n\nNextflow automatically\ + \ sets up an environment for the given package names listed by in the\ + \ `conda` directive.\n\nSee [`conda`](https://www.nextflow.io/docs/latest/process.html#conda).\n" + type: "string" + - description: "The `conda` directive allows for the definition of the process\ + \ dependencies using the Conda package manager.\n\nNextflow automatically\ + \ sets up an environment for the given package names listed by in the\ + \ `conda` directive.\n\nSee [`conda`](https://www.nextflow.io/docs/latest/process.html#conda).\n" + type: "array" + items: + type: "string" + machineType: + description: " The `machineType` can be used to specify a predefined Google\ + \ Compute Platform machine type when running 
using the Google Life Sciences\ + \ executor.\n\nSee [`machineType`](https://www.nextflow.io/docs/latest/process.html#machinetype).\n" + type: "string" + stageInMode: + description: "The `stageInMode` directive defines how input files are staged-in\ + \ to the process work directory. The following values are allowed:\n\n|\ + \ Value | Description |\n|-------|-------------| \n| copy | Input files\ + \ are staged in the process work directory by creating a copy. | \n| link\ + \ | Input files are staged in the process work directory by creating an\ + \ (hard) link for each of them. | \n| symlink | Input files are staged in\ + \ the process work directory by creating a symbolic link with an absolute\ + \ path for each of them (default). | \n| rellink | Input files are staged\ + \ in the process work directory by creating a symbolic link with a relative\ + \ path for each of them. | \n\nSee [`stageInMode`](https://www.nextflow.io/docs/latest/process.html#stageinmode).\n" + type: "string" + cache: + anyOf: + - description: "The `cache` directive allows you to store the process results\ + \ to a local cache. When the cache is enabled and the pipeline is launched\ + \ with the resume option, any following attempt to execute the process,\ + \ along with the same inputs, will cause the process execution to be skipped,\ + \ producing the stored data as the actual results.\n\nThe caching feature\ + \ generates a unique key by indexing the process script and inputs. This\ + \ key is used to identify univocally the outputs produced by the process\ + \ execution.\n\nThe `cache` is enabled by default, you can disable it\ + \ for a specific process by setting the cache directive to `false`.\n\n\ + Accepted values are: `true`, `false`, `\"deep\"`, and `\"lenient\"`.\n\ + \nSee [`cache`](https://www.nextflow.io/docs/latest/process.html#cache).\n" + type: "boolean" + - description: "The `cache` directive allows you to store the process results\ + \ to a local cache. 
When the cache is enabled and the pipeline is launched\ + \ with the resume option, any following attempt to execute the process,\ + \ along with the same inputs, will cause the process execution to be skipped,\ + \ producing the stored data as the actual results.\n\nThe caching feature\ + \ generates a unique key by indexing the process script and inputs. This\ + \ key is used to identify univocally the outputs produced by the process\ + \ execution.\n\nThe `cache` is enabled by default, you can disable it\ + \ for a specific process by setting the cache directive to `false`.\n\n\ + Accepted values are: `true`, `false`, `\"deep\"`, and `\"lenient\"`.\n\ + \nSee [`cache`](https://www.nextflow.io/docs/latest/process.html#cache).\n" + type: "string" + pod: + anyOf: + - description: "The `pod` directive allows the definition of pods specific\ + \ settings, such as environment variables, secrets and config maps when\ + \ using the Kubernetes executor.\n\nSee [`pod`](https://www.nextflow.io/docs/latest/process.html#pod).\n" + type: "object" + additionalProperties: + description: "The `pod` directive allows the definition of pods specific\ + \ settings, such as environment variables, secrets and config maps when\ + \ using the Kubernetes executor.\n\nSee [`pod`](https://www.nextflow.io/docs/latest/process.html#pod).\n" + type: "string" + - description: "The `pod` directive allows the definition of pods specific\ + \ settings, such as environment variables, secrets and config maps when\ + \ using the Kubernetes executor.\n\nSee [`pod`](https://www.nextflow.io/docs/latest/process.html#pod).\n" + type: "array" + items: + type: "object" + additionalProperties: + type: "string" + penv: + description: "The `penv` directive allows you to define the parallel environment\ + \ to be used when submitting a parallel task to the SGE resource manager.\n\ + \nSee [`penv`](https://www.nextflow.io/docs/latest/process.html#penv).\n" + type: "string" + scratch: + anyOf: + - description: 
"The `scratch` directive allows you to execute the process\ + \ in a temporary folder that is local to the execution node.\n\nSee [`scratch`](https://www.nextflow.io/docs/latest/process.html#scratch).\n" + type: "boolean" + - description: "The `scratch` directive allows you to execute the process\ + \ in a temporary folder that is local to the execution node.\n\nSee [`scratch`](https://www.nextflow.io/docs/latest/process.html#scratch).\n" + type: "string" + storeDir: + description: "The `storeDir` directive allows you to define a directory that\ + \ is used as a permanent cache for your process results.\n\nSee [`storeDir`](https://www.nextflow.io/docs/latest/process.html#storeDir).\n" + type: "string" + maxRetries: + anyOf: + - description: "The `maxRetries` directive allows you to define the maximum\ + \ number of times a process instance can be re-submitted in case of failure.\ + \ This value is applied only when using the retry error strategy. By default\ + \ only one retry is allowed.\n\nSee [`maxRetries`](https://www.nextflow.io/docs/latest/process.html#maxretries).\n" + type: "string" + - description: "The `maxRetries` directive allows you to define the maximum\ + \ number of times a process instance can be re-submitted in case of failure.\ + \ This value is applied only when using the retry error strategy. By default\ + \ only one retry is allowed.\n\nSee [`maxRetries`](https://www.nextflow.io/docs/latest/process.html#maxretries).\n" + type: "integer" + echo: + anyOf: + - description: "By default the stdout produced by the commands executed in\ + \ all processes is ignored. By setting the `echo` directive to true, you\ + \ can forward the process stdout to the current top running process stdout\ + \ file, showing it in the shell terminal.\n \nSee [`echo`](https://www.nextflow.io/docs/latest/process.html#echo).\n" + type: "boolean" + - description: "By default the stdout produced by the commands executed in\ + \ all processes is ignored. 
By setting the `echo` directive to true, you\ + \ can forward the process stdout to the current top running process stdout\ + \ file, showing it in the shell terminal.\n \nSee [`echo`](https://www.nextflow.io/docs/latest/process.html#echo).\n" + type: "string" + errorStrategy: + description: "The `errorStrategy` directive allows you to define how an error\ + \ condition is managed by the process. By default when an error status is\ + \ returned by the executed script, the process stops immediately. This in\ + \ turn forces the entire pipeline to terminate.\n\nTable of available error\ + \ strategies:\n| Name | Executor |\n|------|----------|\n| `terminate` |\ + \ Terminates the execution as soon as an error condition is reported. Pending\ + \ jobs are killed (default) |\n| `finish` | Initiates an orderly pipeline\ + \ shutdown when an error condition is raised, waiting the completion of\ + \ any submitted job. |\n| `ignore` | Ignores processes execution errors.\ + \ |\n| `retry` | Re-submit for execution a process returning an error condition.\ + \ |\n\nSee [`errorStrategy`](https://www.nextflow.io/docs/latest/process.html#errorstrategy).\n" + type: "string" + memory: + description: "The `memory` directive allows you to define how much memory\ + \ the process is allowed to use.\n\nSee [`memory`](https://www.nextflow.io/docs/latest/process.html#memory).\n" + type: "string" + stageOutMode: + description: "The `stageOutMode` directive defines how output files are staged-out\ + \ from the scratch directory to the process work directory. The following\ + \ values are allowed:\n\n| Value | Description |\n|-------|-------------|\ + \ \n| copy | Output files are copied from the scratch directory to the work\ + \ directory. | \n| move | Output files are moved from the scratch directory\ + \ to the work directory. | \n| rsync | Output files are copied from the\ + \ scratch directory to the work directory by using the rsync utility. 
|\n\ + \nSee [`stageOutMode`](https://www.nextflow.io/docs/latest/process.html#stageoutmode).\n" + type: "string" + required: [] + additionalProperties: false + NextflowAuto: + description: "Automated processing flags which can be toggled on or off." + type: "object" + properties: + simplifyInput: + description: "If `true`, an input tuple only containing only a single File\ + \ (e.g. `[\"foo\", file(\"in.h5ad\")]`) is automatically transformed to\ + \ a map (i.e. `[\"foo\", [ input: file(\"in.h5ad\") ] ]`).\n\nDefault: `true`.\n" + type: "boolean" + simplifyOutput: + description: "If `true`, an output tuple containing a map with a File (e.g.\ + \ `[\"foo\", [ output: file(\"out.h5ad\") ] ]`) is automatically transformed\ + \ to a map (i.e. `[\"foo\", file(\"out.h5ad\")]`).\n\nDefault: `true`.\n" + type: "boolean" + publish: + description: "If `true`, the module's outputs are automatically published\ + \ to `params.publishDir`.\nWill throw an error if `params.publishDir` is\ + \ not defined.\n\nDefault: `false`.\n" + type: "boolean" + transcript: + description: "If `true`, the module's transcripts from `work/` are automatically\ + \ published to `params.transcriptDir`.\nIf not defined, `params.publishDir\ + \ + \"/_transcripts\"` will be used.\nWill throw an error if neither are\ + \ defined.\n\nDefault: `false`.\n" + type: "boolean" + required: [] + additionalProperties: false + NextflowConfig: + description: "Allows tweaking how the Nextflow Config file is generated." + type: "object" + properties: + labels: + description: "A series of default labels to specify memory and cpu constraints.\n\ + \nThe default memory labels are defined as \"mem1gb\", \"mem2gb\", \"mem4gb\"\ + , ... upto \"mem512tb\" and follows powers of 2.\nThe default cpu labels\ + \ are defined as \"cpu1\", \"cpu2\", \"cpu5\", \"cpu10\", ... 
upto \"cpu1000\"\ + \ and follows a semi logarithmic scale (1, 2, 5 per decade).\n\nConceptually\ + \ it is possible for a Viash Config to overwrite the full labels parameter,\ + \ however likely it is more efficient to add additional labels\nin the Viash\ + \ Project with a config mod.\n" + type: "object" + additionalProperties: + description: "A series of default labels to specify memory and cpu constraints.\n\ + \nThe default memory labels are defined as \"mem1gb\", \"mem2gb\", \"\ + mem4gb\", ... upto \"mem512tb\" and follows powers of 2.\nThe default\ + \ cpu labels are defined as \"cpu1\", \"cpu2\", \"cpu5\", \"cpu10\", ...\ + \ upto \"cpu1000\" and follows a semi logarithmic scale (1, 2, 5 per decade).\n\ + \nConceptually it is possible for a Viash Config to overwrite the full\ + \ labels parameter, however likely it is more efficient to add additional\ + \ labels\nin the Viash Project with a config mod.\n" + type: "string" + script: + anyOf: + - description: "Includes a single string or list of strings into the nextflow.config\ + \ file.\nThis can be used to add custom profiles or include an additional\ + \ config file.\n" + type: "string" + - description: "Includes a single string or list of strings into the nextflow.config\ + \ file.\nThis can be used to add custom profiles or include an additional\ + \ config file.\n" + type: "array" + items: + type: "string" + required: [] + additionalProperties: false + DockerSetupStrategy: + $comment: "TODO add descriptions to different strategies" + enum: + - "cb" + - "ifneedbepullelsecachedbuild" + - "donothing" + - "gentlepush" + - "alwayspullelsebuild" + - "build" + - "alwayspull" + - "alwaysbuild" + - "ifneedbebuild" + - "pullelsebuild" + - "p" + - "alwayspullelsecachedbuild" + - "pull" + - "maybepush" + - "ifneedbepullelsebuild" + - "cachedbuild" + - "pullelsecachedbuild" + - "push" + - "forcepush" + - "alwayspush" + - "b" + - "pushifnotpresent" + - "alwayscachedbuild" + - "meh" + - "ifneedbepull" + - 
"ifneedbecachedbuild" + description: "The Docker setup strategy to use when building a container." + Direction: + enum: + - "input" + - "output" + description: "Makes this argument an `input` or an `output`, as in does the file/folder\ + \ needs to be read or written. `input` by default." + Status: + enum: + - "enabled" + - "disabled" + - "deprecated" + description: "Allows setting a component to active, deprecated or disabled." + DockerResolveVolume: + $comment: "TODO make fully case insensitive" + enum: + - "manual" + - "automatic" + - "auto" + - "Manual" + - "Automatic" + - "Auto" + description: "Enables or disables automatic volume mapping. Enabled when set to\ + \ `Automatic` or disabled when set to `Manual`. Default: `Automatic`" diff --git a/src/common/schemas/task_control_method.yaml b/src/common/schemas/task_control_method.yaml new file mode 100644 index 0000000000..f4e760db50 --- /dev/null +++ b/src/common/schemas/task_control_method.yaml @@ -0,0 +1,70 @@ +title: Control Method +description: | + A control method is used to test the relative performance of all other methods, + and also as a quality control for the pipeline as a whole. A control method can + either be a positive control or a negative control. The positive control and + negative control methods set a maximum and minimum threshold for performance, + so any new method should perform better than the negative control methods and + worse than the positive control method. +type: object +required: [__merge__, functionality, platforms] +properties: + __merge__: + "$ref": "defs_common.yaml#/definitions/CompAPIMerge" + functionality: + type: object + description: Information regarding the functionality of the component. + required: [name, info, resources] + additionalProperties: false + properties: + name: + "$ref": "defs_common.yaml#/definitions/Name" + status: + "$ref": "defs_viash.yaml#/definitions/Status" + info: + type: object + description: Metadata of the component. 
+ additionalProperties: false + required: [label, summary, description, preferred_normalization] + properties: + label: + "$ref": "defs_common.yaml#/definitions/Label" + summary: + "$ref": "defs_common.yaml#/definitions/Summary" + description: + "$ref": "defs_common.yaml#/definitions/Description" + preferred_normalization: + "$ref": "defs_common.yaml#/definitions/PreferredNormalization" + reference: + "$ref": "defs_common.yaml#/definitions/BibtexReference" + documentation_url: + "$ref": "defs_common.yaml#/definitions/DocumentationURL" + repository_url: + "$ref": "defs_common.yaml#/definitions/RepositoryURL" + v1: + "$ref": "defs_common.yaml#/definitions/MigrationV1" + variants: + "$ref": "defs_common.yaml#/definitions/MethodVariants" + arguments: + type: array + description: Component-specific parameters. + items: + "$ref": "defs_viash.yaml#/definitions/Argument" + resources: + type: array + description: Resources required to run the component. + items: + "$ref": "defs_viash.yaml#/definitions/Resource" + test_resources: + type: array + description: One or more scripts and resources used to test the component. + items: + "$ref": "defs_viash.yaml#/definitions/Resource" + platforms: + type: array + description: A list of platforms which Viash generates target artifacts for. + items: + anyOf: + - "$ref": "defs_common.yaml#/definitions/PlatformDocker" + - "$ref": "defs_common.yaml#/definitions/PlatformNative" + - "$ref": "defs_common.yaml#/definitions/PlatformVdsl3" diff --git a/src/common/schemas/task_info.yaml b/src/common/schemas/task_info.yaml new file mode 100644 index 0000000000..143a5e3f93 --- /dev/null +++ b/src/common/schemas/task_info.yaml @@ -0,0 +1,24 @@ +title: Task info +description: A task information specification file. 
+type: object +additionalProperties: false +required: [name, label, summary, motivation, description] +properties: + name: + $ref: "defs_common.yaml#/definitions/Name" + label: + $ref: "defs_common.yaml#/definitions/Label" + summary: + $ref: "defs_common.yaml#/definitions/Summary" + image: + $ref: "defs_common.yaml#/definitions/Image" + motivation: + $ref: "defs_common.yaml#/definitions/Description" + description: + $ref: "defs_common.yaml#/definitions/Description" + v1: + $ref: "defs_common.yaml#/definitions/MigrationV1" + authors: + type: array + items: + $ref: "defs_common.yaml#/definitions/Author" diff --git a/src/common/schemas/task_method.yaml b/src/common/schemas/task_method.yaml new file mode 100644 index 0000000000..c74d8b762c --- /dev/null +++ b/src/common/schemas/task_method.yaml @@ -0,0 +1,67 @@ +title: Method +description: | + A method is a specific technique used to solve the task problem and is + compared to the control methods and other methods to determine the best + approach for the task depending on the type of dataset. +type: object +required: [__merge__, functionality, platforms] +properties: + __merge__: + "$ref": "defs_common.yaml#/definitions/CompAPIMerge" + functionality: + type: object + description: Information regarding the functionality of the component. + required: [name, info, resources] + additionalProperties: false + properties: + name: + "$ref": "defs_common.yaml#/definitions/Name" + status: + "$ref": "defs_viash.yaml#/definitions/Status" + info: + type: object + description: Metadata of the component. 
+ additionalProperties: false + required: [label, summary, description, preferred_normalization, reference, documentation_url, repository_url] + properties: + label: + "$ref": "defs_common.yaml#/definitions/Label" + summary: + "$ref": "defs_common.yaml#/definitions/Summary" + description: + "$ref": "defs_common.yaml#/definitions/Description" + preferred_normalization: + "$ref": "defs_common.yaml#/definitions/PreferredNormalization" + reference: + "$ref": "defs_common.yaml#/definitions/BibtexReference" + documentation_url: + "$ref": "defs_common.yaml#/definitions/DocumentationURL" + repository_url: + "$ref": "defs_common.yaml#/definitions/RepositoryURL" + v1: + "$ref": "defs_common.yaml#/definitions/MigrationV1" + variants: + "$ref": "defs_common.yaml#/definitions/MethodVariants" + arguments: + type: array + description: Component-specific parameters. + items: + "$ref": "defs_viash.yaml#/definitions/Argument" + resources: + type: array + description: Resources required to run the component. + items: + "$ref": "defs_viash.yaml#/definitions/Resource" + test_resources: + type: array + description: One or more scripts and resources used to test the component. + items: + "$ref": "defs_viash.yaml#/definitions/Resource" + platforms: + type: array + description: A list of platforms which Viash generates target artifacts for. + items: + anyOf: + - "$ref": "defs_common.yaml#/definitions/PlatformDocker" + - "$ref": "defs_common.yaml#/definitions/PlatformNative" + - "$ref": "defs_common.yaml#/definitions/PlatformVdsl3" diff --git a/src/common/schemas/task_metric.yaml b/src/common/schemas/task_metric.yaml new file mode 100644 index 0000000000..198646fc48 --- /dev/null +++ b/src/common/schemas/task_metric.yaml @@ -0,0 +1,88 @@ +title: Metric +description: | + A metric is a quantitative measure used to evaluate the performance of the + different methods in solving the specific task problem. 
+type: object +required: [__merge__, functionality, platforms] +properties: + __merge__: + "$ref": "defs_common.yaml#/definitions/CompAPIMerge" + functionality: + type: object + description: Information regarding the functionality of the component. + required: [name, info, resources] + additionalProperties: false + properties: + name: + "$ref": "defs_common.yaml#/definitions/Name" + status: + "$ref": "defs_viash.yaml#/definitions/Status" + info: + type: object + description: Metadata of the component. + additionalProperties: false + required: [metrics] + properties: + metrics: + type: array + minItems: 1 + items: + type: object + description: Metadata of each metric. + additionalProperties: false + required: [label, summary, description, reference, min, max, maximize] + properties: + name: + "$ref": "defs_common.yaml#/definitions/Name" + label: + "$ref": "defs_common.yaml#/definitions/Label" + summary: + "$ref": "defs_common.yaml#/definitions/Summary" + description: + "$ref": "defs_common.yaml#/definitions/Description" + reference: + "$ref": "defs_common.yaml#/definitions/BibtexReference" + documentation_url: + "$ref": "defs_common.yaml#/definitions/DocumentationURL" + repository_url: + "$ref": "defs_common.yaml#/definitions/RepositoryURL" + variants: + "$ref": "defs_common.yaml#/definitions/MethodVariants" + min: + description: The lowest possible value of the metric. + oneOf: + - type: number + - const: "-.inf" + max: + description: The highest possible value of the metric. + oneOf: + - type: number + - const: "+.inf" + maximize: + type: boolean + description: Whether a higher metric value is better. + v1: + "$ref": "defs_common.yaml#/definitions/MigrationV1" + arguments: + type: array + description: Component-specific parameters. + items: + "$ref": "defs_viash.yaml#/definitions/Argument" + resources: + type: array + description: Resources required to run the component. 
+ items: + "$ref": "defs_viash.yaml#/definitions/Resource" + test_resources: + type: array + description: One or more scripts and resources used to test the component. + items: + "$ref": "defs_viash.yaml#/definitions/Resource" + platforms: + type: array + description: A list of platforms which Viash generates target artifacts for. + items: + anyOf: + - "$ref": "defs_common.yaml#/definitions/PlatformDocker" + - "$ref": "defs_common.yaml#/definitions/PlatformNative" + - "$ref": "defs_common.yaml#/definitions/PlatformVdsl3" diff --git a/src/common/sync_test_resources/config.vsh.yaml b/src/common/sync_test_resources/config.vsh.yaml new file mode 100644 index 0000000000..f443d634e8 --- /dev/null +++ b/src/common/sync_test_resources/config.vsh.yaml @@ -0,0 +1,44 @@ +functionality: + name: "sync_test_resources" + namespace: "common" + version: "dev" + description: Synchronise the test resources from s3 to resources_test + usage: | + sync_test_resources + sync_test_resources --input s3://openproblems-data/resources_test --output resources_test + arguments: + - name: "--input" + alternatives: ["-i"] + type: string + description: "Path to the S3 bucket to sync from." + default: "s3://openproblems-data/resources_test" + - name: "--output" + alternatives: ["-o"] + type: file + default: resources_test + direction: output + description: "Path to the test resource directory." + - name: "--quiet" + type: boolean_true + description: "Does not display the operations performed from the specified command." + - name: "--dryrun" + type: boolean_true + description: "Displays the operations that would be performed using the specified command without actually running them." + - name: "--delete" + type: boolean_true + description: "Files that exist in the destination but not in the source are deleted during sync." + - name: "--exclude" + type: "string" + multiple: true + description: Exclude all files or objects from the command that matches the specified pattern. 
+ resources: + - type: bash_script + path: script.sh + test_resources: + - type: bash_script + path: run_test.sh +platforms: + - type: docker + image: "amazon/aws-cli:2.7.12" + - type: native + - type: nextflow diff --git a/src/common/sync_test_resources/run_test.sh b/src/common/sync_test_resources/run_test.sh new file mode 100755 index 0000000000..67f2504531 --- /dev/null +++ b/src/common/sync_test_resources/run_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +echo ">> Run aws s3 sync" +./$meta_functionality_name \ + --input s3://openproblems-data/resources_test/common/pancreas \ + --output foo \ + --quiet + +echo ">> Check whether the right files were copied" +[ ! -f foo/dataset.h5ad ] && echo csv should have been copied && exit 1 + +echo ">> Test succeeded!" \ No newline at end of file diff --git a/src/common/sync_test_resources/script.sh b/src/common/sync_test_resources/script.sh new file mode 100644 index 0000000000..c97b9fcdfd --- /dev/null +++ b/src/common/sync_test_resources/script.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +## VIASH START +par_input='s3://openproblems-data/resources_test' +par_output='resources_test' +## VIASH END + +extra_params=( ) + +if [ "$par_quiet" == "true" ]; then + extra_params+=( "--quiet" ) +fi +if [ "$par_dryrun" == "true" ]; then + extra_params+=( "--dryrun" ) +fi +if [ "$par_delete" == "true" ]; then + extra_params+=( "--delete" ) +fi + +if [ ! -z ${par_exclude+x} ]; then + IFS=":" + for var in $par_exclude; do + unset IFS + extra_params+=( "--exclude" "$var" ) + done +fi + + +# Disable the use of the Amazon EC2 instance metadata service (IMDS). 
+# see https://florian.ec/blog/github-actions-awscli-errors/ +# or https://github.com/aws/aws-cli/issues/5234#issuecomment-705831465 +export AWS_EC2_METADATA_DISABLED=true + +aws s3 sync "$par_input" "$par_output" --no-sign-request "${extra_params[@]}" diff --git a/src/datasets/README.md b/src/datasets/README.md new file mode 100644 index 0000000000..a27e061326 --- /dev/null +++ b/src/datasets/README.md @@ -0,0 +1,219 @@ + +- Common datasets + - Pipeline + topology + - File format API + - Dataset+Pca+Hvg + - Normalized Dataset + - Dataset+Pca + - Raw Dataset + - Component API + - Dataset Loader + - Normalization + - Processor Hvg + - Processor Pca + +# Common datasets + +## Pipeline topology + +``` mermaid +%%| column: screen-inset-shaded +flowchart LR + file_dataset(Dataset+Pca+Hvg) + file_normalized(Normalized Dataset) + file_pca(Dataset+Pca) + file_raw(Raw Dataset) + comp_dataset_loader[/Dataset Loader/] + comp_normalization[/Normalization/] + comp_processor_hvg[/Processor Hvg/] + comp_processor_pca[/Processor Pca/] + file_raw---comp_normalization + file_pca---comp_processor_hvg + file_normalized---comp_processor_pca + comp_dataset_loader-->file_raw + comp_normalization-->file_normalized + comp_processor_hvg-->file_dataset + comp_processor_pca-->file_pca +``` + +## File format API + +### `Dataset+Pca+Hvg` + +A normalised data with a PCA embedding and HVG selection + +Used in: + +- [processor hvg](#processor%20hvg): output (as output) + +Slots: + +| struct | name | type | description | +|:-------|:-----------------|:--------|:------------------------------------------------------------------------| +| layers | counts | integer | Raw counts | +| layers | normalized | double | Normalised expression values | +| obs | celltype | string | Cell type information | +| obs | batch | string | Batch information | +| obs | tissue | string | Tissue information | +| obs | size_factors | double | The size factors created by the normalisation method, if any. 
| +| var | hvg | boolean | Whether or not the feature is considered to be a ‘highly variable gene’ | +| var | hvg_score | integer | A ranking of the features by hvg. | +| obsm | X_pca | double | The resulting PCA embedding. | +| varm | pca_loadings | double | The PCA loadings matrix. | +| uns | dataset_id | string | A unique identifier for the dataset | +| uns | normalization_id | string | Which normalization was used | +| uns | pca_variance | double | The PCA variance objects. | + +Example: + + AnnData object + obs: 'celltype', 'batch', 'tissue', 'size_factors' + var: 'hvg', 'hvg_score' + uns: 'dataset_id', 'normalization_id', 'pca_variance' + obsm: 'X_pca' + varm: 'pca_loadings' + layers: 'counts', 'normalized' + +### `Normalized Dataset` + +A normalized dataset + +Used in: + +- [normalization](#normalization): output (as output) +- [processor pca](#processor%20pca): input (as input) + +Slots: + +| struct | name | type | description | +|:-------|:-----------------|:--------|:--------------------------------------------------------------| +| layers | counts | integer | Raw counts | +| layers | normalized | double | Normalised expression values | +| obs | celltype | string | Cell type information | +| obs | batch | string | Batch information | +| obs | tissue | string | Tissue information | +| obs | size_factors | double | The size factors created by the normalisation method, if any. 
| +| uns | dataset_id | string | A unique identifier for the dataset | +| uns | normalization_id | string | Which normalization was used | + +Example: + + AnnData object + obs: 'celltype', 'batch', 'tissue', 'size_factors' + uns: 'dataset_id', 'normalization_id' + layers: 'counts', 'normalized' + +### `Dataset+Pca` + +A normalised data with a PCA embedding + +Used in: + +- [processor hvg](#processor%20hvg): input (as input) +- [processor pca](#processor%20pca): output (as output) + +Slots: + +| struct | name | type | description | +|:-------|:-----------------|:--------|:--------------------------------------------------------------| +| layers | counts | integer | Raw counts | +| layers | normalized | double | Normalised expression values | +| obs | celltype | string | Cell type information | +| obs | batch | string | Batch information | +| obs | tissue | string | Tissue information | +| obs | size_factors | double | The size factors created by the normalisation method, if any. | +| obsm | X_pca | double | The resulting PCA embedding. | +| varm | pca_loadings | double | The PCA loadings matrix. | +| uns | dataset_id | string | A unique identifier for the dataset | +| uns | normalization_id | string | Which normalization was used | +| uns | pca_variance | double | The PCA variance objects. | + +Example: + + AnnData object + obs: 'celltype', 'batch', 'tissue', 'size_factors' + uns: 'dataset_id', 'normalization_id', 'pca_variance' + obsm: 'X_pca' + varm: 'pca_loadings' + layers: 'counts', 'normalized' + +### `Raw Dataset` + +An unprocessed dataset as output by a dataset loader. 
+ +Used in: + +- [dataset loader](#dataset%20loader): output (as output) +- [normalization](#normalization): input (as input) + +Slots: + +| struct | name | type | description | +|:-------|:-----------|:--------|:------------------------------------| +| layers | counts | integer | Raw counts | +| obs | celltype | string | Cell type information | +| obs | batch | string | Batch information | +| obs | tissue | string | Tissue information | +| uns | dataset_id | string | A unique identifier for the dataset | + +Example: + + AnnData object + obs: 'celltype', 'batch', 'tissue' + uns: 'dataset_id' + layers: 'counts' + +## Component API + +### `Dataset Loader` + +Arguments: + +| Name | Type | Direction | Description | +|:-----------|:------------------------------|:----------|:------------------------------------------------------| +| `--output` | [Raw Dataset](#Raw%20dataset) | output | An unprocessed dataset as output by a dataset loader. | + +### `Normalization` + +Arguments: + +| Name | Type | Direction | Description | +|:---------------------|:--------------------------------------------|:----------|:-------------------------------------------------------------| +| `--input` | [Raw Dataset](#Raw%20dataset) | input | An unprocessed dataset as output by a dataset loader. | +| `--output` | [Normalized Dataset](#Normalized%20dataset) | output | A normalized dataset | +| `--layer_output` | `string` | input | The name of the layer in which to store the normalized data. | +| `--obs_size_factors` | `string` | input | In which .obs slot to store the size factors (if any). | + +### `Processor Hvg` + +Arguments: + +| Name | Type | Direction | Description | +|:------------------|:------------------------------------|:----------|:---------------------------------------------------------------------------| +| `--input` | [Dataset+Pca](#Dataset+PCA) | input | A normalised data with a PCA embedding | +| `--layer_input` | `string` | input | Which layer to use as input for the PCA. 
| +| `--output` | [Dataset+Pca+Hvg](#Dataset+PCA+HVG) | output | A normalised data with a PCA embedding and HVG selection | +| `--var_hvg` | `string` | input | In which .var slot to store whether a feature is considered to be hvg. | +| `--var_hvg_score` | `string` | input | In which .var slot to store whether a ranking of the features by variance. | +| `--num_features` | `integer` | input | The number of HVG to select | + +### `Processor Pca` + +Arguments: + +| Name | Type | Direction | Description | +|:-------------------|:--------------------------------------------|:----------|:---------------------------------------------------------------------------------------------------------------------| +| `--input` | [Normalized Dataset](#Normalized%20dataset) | input | A normalized dataset | +| `--layer_input` | `string` | input | Which layer to use as input for the PCA. | +| `--output` | [Dataset+Pca](#Dataset+PCA) | output | A normalised data with a PCA embedding | +| `--obsm_embedding` | `string` | input | In which .obsm slot to store the resulting embedding. | +| `--varm_loadings` | `string` | input | In which .varm slot to store the resulting loadings matrix. | +| `--uns_variance` | `string` | input | In which .uns slot to store the resulting variance objects. | +| `--num_components` | `integer` | input | Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation. | diff --git a/src/datasets/README.qmd b/src/datasets/README.qmd new file mode 100644 index 0000000000..c20045fadc --- /dev/null +++ b/src/datasets/README.qmd @@ -0,0 +1,203 @@ +--- +format: gfm +toc: true +--- + +```{r setup, include=FALSE} +library(tidyverse) +library(rlang) + +strip_margin <- function(text, symbol = "\\|") { + str_replace_all(text, paste0("(\n?)[ \t]*", symbol), "\\1") +} + +dir <- "src/datasets" +dir <- "." 
+``` + +# Common datasets + + + +## Pipeline topology + +```{r data, include=FALSE} +comp_yamls <- list.files(paste0(dir, "/api"), pattern = "comp_", full.names = TRUE) +file_yamls <- list.files(paste0(dir, "/api"), pattern = "file_", full.names = TRUE) + +comp_file <- map_df(comp_yamls, function(yaml_file) { + conf <- yaml::read_yaml(yaml_file) + + map_df(conf$functionality$arguments, function(arg) { + df <- tibble( + comp_name = basename(yaml_file) %>% gsub("\\.yaml", "", .), + type = arg$type, + arg_name = str_replace_all(arg$name, "^-*", ""), + direction = arg$direction %||% "input", + description = arg$description + ) + if ("__merge__" %in% names(arg)) { + df$file_name <- basename(arg$`__merge__`) %>% gsub("\\.yaml", "", .) + } + df + }) +}) + +comp_info <- map_df(comp_yamls, function(yaml_file) { + conf <- yaml::read_yaml(yaml_file) + + tibble( + name = basename(yaml_file) %>% gsub("\\.yaml", "", .), + label = name %>% gsub("comp_", "", .) %>% gsub("_", " ", .) + ) +}) + +file_info <- map_df(file_yamls, function(yaml_file) { + arg <- yaml::read_yaml(yaml_file) + + tibble( + name = basename(yaml_file) %>% gsub("\\.yaml", "", .), + description = arg$description, + example = arg$example, + label = arg$info$label %||% (name %>% gsub("file_", "", .) %>% gsub("_", " ", .)) + ) +}) + +file_slot <- map_df(file_yamls, function(yaml_file) { + arg <- yaml::read_yaml(yaml_file) + + map2_df(names(arg$info$slots), arg$info$slots, function(group_name, slot) { + df <- map_df(slot, as.data.frame) + df$struct <- group_name + df$file_name <- basename(yaml_file) %>% gsub("\\.yaml", "", .) 
+ df$multiple <- df$multiple %||% FALSE %|% FALSE + as_tibble(df) + }) +}) +``` + +```{r flow, echo=FALSE,warning=FALSE,error=FALSE} +nodes <- bind_rows( + file_info %>% + transmute(id = name, label = str_to_title(label), is_comp = FALSE), + comp_info %>% + transmute(id = name, label = str_to_title(label), is_comp = TRUE) +) %>% + mutate(str = paste0( + " ", + id, + ifelse(is_comp, "[/", "("), + label, + ifelse(is_comp, "/]", ")") + )) +edges <- bind_rows( + comp_file %>% + filter(direction == "input", !is.na(file_name)) %>% + transmute( + from = file_name, + to = comp_name, + arrow = "---" + ), + comp_file %>% + filter(direction == "output", !is.na(file_name)) %>% + transmute( + from = comp_name, + to = file_name, + arrow = "-->" + ) +) %>% + mutate(str = paste0(" ", from, arrow, to)) + +# note: use ```{mermaid} instead of ```mermaid when rendering to html +out_str <- strip_margin(glue::glue(" + §```mermaid + §%%| column: screen-inset-shaded + §flowchart LR + §{paste(nodes$str, collapse = '\n')} + §{paste(edges$str, collapse = '\n')} + §``` + §"), symbol = "§") +knitr::asis_output(out_str) +``` + +## File format API + +```{r file_api, echo=FALSE,warning=FALSE,error=FALSE,output="asis"} +for (file_name in file_info$name) { + arg_info <- file_info %>% filter(name == file_name) + sub_out <- file_slot %>% + filter(file_name == !!file_name) %>% + select(struct, name, type, description) + + used_in <- comp_file %>% + filter(file_name == !!file_name) %>% + left_join(comp_info %>% select(comp_name = name, comp_label = label), by = "comp_name") %>% + mutate(str = paste0("* [", comp_label, "](#", comp_label, "): ", arg_name, " (as ", direction, ")")) %>% + pull(str) + + example <- sub_out %>% + group_by(struct) %>% + summarise( + str = paste0(unique(struct), ": ", paste0("'", name, "'", collapse = ", ")) + ) %>% + arrange(match(struct, c("obs", "var", "uns", "obsm", "obsp", "varm", "varp", "layers"))) + + example_str <- c(" AnnData object", paste0(" ", example$str)) + + 
out_str <- strip_margin(glue::glue(" + §### `{str_to_title(arg_info$label)}` + § + §{arg_info$description} + § + §Used in: + § + §{paste(used_in, collapse = '\n')} + § + §Slots: + § + §{paste(knitr::kable(sub_out, format = 'pipe'), collapse = '\n')} + § + §Example: + § + §{paste(example_str, collapse = '\n')} + § + §"), symbol = "§") + cat(out_str) +} +``` + + + +## Component API + +```{r comp_api, echo=FALSE,warning=FALSE,error=FALSE,output="asis"} +# todo: add description +# todo: add required info fields +for (comp_name in comp_info$name) { + comp <- comp_info %>% filter(name == comp_name) + sub_out <- comp_file %>% + filter(comp_name == !!comp_name) %>% + left_join(file_info %>% select(file_name = name, file_desc = description, file_label = label), by = "file_name") %>% + transmute( + Name = paste0("`--", arg_name, "`"), + Type = ifelse( + is.na(file_label), + paste0("`", type, "`"), + paste0("[", str_to_title(file_label), "](#", file_label, ")") + ), + Direction = direction, + Description = description %|% file_desc + ) + + out_str <- strip_margin(glue::glue(" + §### `{str_to_title(comp$label)}` + § + §{ifelse(\"description\" %in% names(comp), comp$description, \"\")} + § + §Arguments: + § + §{paste(knitr::kable(sub_out, format = 'pipe'), collapse = '\n')} + §"), symbol = "§") + cat(out_str) +} +``` \ No newline at end of file diff --git a/src/datasets/api/README.md b/src/datasets/api/README.md new file mode 100644 index 0000000000..7c3b9c8d87 --- /dev/null +++ b/src/datasets/api/README.md @@ -0,0 +1,8 @@ +# Component and file format specifications + +This folder contains specifications for file formats and component +interfaces. + +These are not only used for documentation (i.e. to document the file +format of inputs and outputs of a component), but also for unit testing +and validation of output files. 
diff --git a/src/datasets/api/README.qmd b/src/datasets/api/README.qmd new file mode 100644 index 0000000000..d31a99367e --- /dev/null +++ b/src/datasets/api/README.qmd @@ -0,0 +1,8 @@ +--- +title: Component and file format specifications +format: gfm +--- + +This folder contains specifications for file formats and component interfaces. + +These are not only used for documentation (i.e. to document the file format of inputs and outputs of a component), but also for unit testing and validation of output files. \ No newline at end of file diff --git a/src/datasets/api/comp_dataset_loader.yaml b/src/datasets/api/comp_dataset_loader.yaml new file mode 100644 index 0000000000..75909b106a --- /dev/null +++ b/src/datasets/api/comp_dataset_loader.yaml @@ -0,0 +1,16 @@ +functionality: + namespace: "datasets/loaders" + info: + type: dataset_loader + type_info: + label: Dataset loader + summary: A component which generates a "Common dataset". + description: | + A dataset loader will typically have an identifier (e.g. a GEO identifier) + or URL as input argument and additional arguments to define where the script needs to download a dataset from and how to process it. + arguments: + - name: "--output" + __merge__: file_raw.yaml + direction: "output" + required: true + test_resources: [] \ No newline at end of file diff --git a/src/datasets/api/comp_normalization.yaml b/src/datasets/api/comp_normalization.yaml new file mode 100644 index 0000000000..6f2c1ffa64 --- /dev/null +++ b/src/datasets/api/comp_normalization.yaml @@ -0,0 +1,36 @@ +functionality: + namespace: "datasets/normalization" + info: + type: dataset_normalization + type_info: + label: Dataset normalization + summary: | + A normalization method which processes the raw counts into a normalized dataset. + description: + A component for normalizing the raw counts as output by dataset loaders into a normalized dataset. 
+ arguments: + - name: "--input" + __merge__: file_raw.yaml + direction: input + required: true + - name: "--output" + __merge__: file_normalized.yaml + direction: output + required: true + - name: "--normalization_id" + type: string + description: "The normalization id to store in the dataset metadata. If not specified, the functionality name will be used." + required: false + - name: "--layer_output" + type: string + default: "normalized" + description: The name of the layer in which to store the normalized data. + - name: "--obs_size_factors" + type: string + default: "size_factors" + description: In which .obs slot to store the size factors (if any). + test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py diff --git a/src/datasets/api/comp_processor_hvg.yaml b/src/datasets/api/comp_processor_hvg.yaml new file mode 100644 index 0000000000..2e24033aac --- /dev/null +++ b/src/datasets/api/comp_processor_hvg.yaml @@ -0,0 +1,40 @@ +functionality: + namespace: "datasets/processors" + info: + type: dataset_processor + type_info: + label: HVG + summary: | + Computes the highly variable genes scores. + description: | + The resulting AnnData will contain both a boolean 'hvg' column in 'var', as well as a numerical 'hvg_score' in 'var'. + arguments: + - name: "--input" + __merge__: file_normalized.yaml + required: true + direction: input + - name: "--input_layer" + type: string + default: "normalized" + description: Which layer to use as input. + - name: "--output" + direction: output + __merge__: file_hvg.yaml + required: true + - name: "--var_hvg" + type: string + default: "hvg" + description: "In which .var slot to store whether a feature is considered to be hvg." + - name: "--var_hvg_score" + type: string + default: "hvg_score" + description: "In which .var slot to store the gene variance score (normalized dispersion)." 
+ - name: "--num_features" + type: integer + default: 1000 + description: "The number of HVG to select" + test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py diff --git a/src/datasets/api/comp_processor_knn.yaml b/src/datasets/api/comp_processor_knn.yaml new file mode 100644 index 0000000000..b0e16f8fc4 --- /dev/null +++ b/src/datasets/api/comp_processor_knn.yaml @@ -0,0 +1,39 @@ +functionality: + namespace: "datasets/processors" + info: + type: dataset_processor + type_info: + label: KNN + summary: | + Computes the k-nearest-neighbours for each cell. + description: | + The resulting AnnData will contain both the knn distances and the knn connectivities in 'obsp'. + arguments: + - name: "--input" + __merge__: file_pca.yaml + required: true + direction: input + - name: "--input_layer" + type: string + default: "normalized" + description: Which layer to use as input. + - name: "--output" + direction: output + __merge__: file_knn.yaml + required: true + - name: "--key_added" + type: string + default: "knn" + description: | + The neighbors data is added to `.uns[key_added]`, + distances are stored in `.obsp[key_added+'_distances']` and + connectivities in `.obsp[key_added+'_connectivities']`. + - name: "--num_neighbors" + type: integer + default: 15 + description: "The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation." 
+ test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py diff --git a/src/datasets/api/comp_processor_pca.yaml b/src/datasets/api/comp_processor_pca.yaml new file mode 100644 index 0000000000..a7ca82bc07 --- /dev/null +++ b/src/datasets/api/comp_processor_pca.yaml @@ -0,0 +1,49 @@ +functionality: + namespace: "datasets/processors" + info: + type: dataset_processor + type_info: + label: PCA + summary: | + Computes a PCA embedding of the normalized data. + description: + The resulting AnnData will contain an embedding in obsm, as well as optional loadings in 'varm'. + arguments: + - name: "--input" + __merge__: file_hvg.yaml + required: true + direction: input + - name: "--input_layer" + type: string + default: "normalized" + description: Which layer to use as input. + - name: "--input_var_features" + type: string + description: Column name in .var matrix that will be used to select which genes to run the PCA on. + default: hvg + - name: "--output" + direction: output + __merge__: file_pca.yaml + required: true + - name: "--obsm_embedding" + type: string + default: "X_pca" + description: "In which .obsm slot to store the resulting embedding." + - name: "--varm_loadings" + type: string + default: "pca_loadings" + description: "In which .varm slot to store the resulting loadings matrix." + - name: "--uns_variance" + type: string + default: "pca_variance" + description: "In which .uns slot to store the resulting variance objects." + - name: "--num_components" + type: integer + example: 25 + description: Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation. 
+ test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + diff --git a/src/datasets/api/comp_processor_subset.yaml b/src/datasets/api/comp_processor_subset.yaml new file mode 100644 index 0000000000..bad64a6762 --- /dev/null +++ b/src/datasets/api/comp_processor_subset.yaml @@ -0,0 +1,31 @@ +functionality: + namespace: "datasets/processors" + info: + type: dataset_processor + type_info: + label: Subset + summary: Sample cells and genes randomly. + description: This component subsets the layers, obs and var to create smaller test datasets. + arguments: + - name: "--input" + __merge__: file_common_dataset.yaml + required: true + direction: input + - name: "--input_mod2" + __merge__: file_common_dataset.yaml + direction: input + required: false + - name: "--output" + __merge__: file_common_dataset.yaml + direction: output + required: true + - name: "--output_mod2" + __merge__: file_common_dataset.yaml + direction: output + required: false + test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + diff --git a/src/datasets/api/comp_processor_svd.yaml b/src/datasets/api/comp_processor_svd.yaml new file mode 100644 index 0000000000..91413c2624 --- /dev/null +++ b/src/datasets/api/comp_processor_svd.yaml @@ -0,0 +1,45 @@ +functionality: + namespace: "datasets/processors" + info: + type: dataset_processor + type_info: + label: SVD + summary: | + Computes a SVD PCA embedding of the normalized data. + description: + The resulting AnnData will contain an embedding in obsm. 
+ arguments: + - name: "--input" + __merge__: file_normalized.yaml + required: true + direction: input + - name: "--input_mod2" + __merge__: file_normalized.yaml + required: false + direction: input + - name: "--input_layer" + type: string + default: "normalized" + description: Which layer to use as input. + - name: "--output" + direction: output + __merge__: file_svd.yaml + required: true + - name: "--output_mod2" + direction: output + __merge__: file_svd.yaml + required: false + - name: "--obsm_embedding" + type: string + default: "X_svd" + description: "In which .obsm slot to store the resulting embedding." + - name: "--num_components" + type: integer + default: 100 + description: Number of principal components to compute. Defaults to 100, or 1 - minimum dimension size of selected representation. + test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + diff --git a/src/datasets/api/file_common_dataset.yaml b/src/datasets/api/file_common_dataset.yaml new file mode 100644 index 0000000000..ed7836bf5c --- /dev/null +++ b/src/datasets/api/file_common_dataset.yaml @@ -0,0 +1,9 @@ +__merge__: file_knn.yaml +type: file +example: "resources_test/common/pancreas/dataset.h5ad" +info: + label: "Common dataset" + summary: A dataset processed by the common dataset processing pipeline. + description: | + This dataset contains both raw counts and normalized data matrices, + as well as a PCA embedding, HVG selection and a kNN graph. diff --git a/src/datasets/api/file_hvg.yaml b/src/datasets/api/file_hvg.yaml new file mode 100644 index 0000000000..697be29e32 --- /dev/null +++ b/src/datasets/api/file_hvg.yaml @@ -0,0 +1,16 @@ +__merge__: file_normalized.yaml +type: file +example: "resources_test/common/pancreas/hvg.h5ad" +info: + label: "Dataset+HVG" + summary: "A normalised dataset with an HVG selection."
+ slots: + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A score for the feature indicating how highly variable it is. + required: true diff --git a/src/datasets/api/file_knn.yaml b/src/datasets/api/file_knn.yaml new file mode 100644 index 0000000000..de7d2b8df5 --- /dev/null +++ b/src/datasets/api/file_knn.yaml @@ -0,0 +1,21 @@ +__merge__: file_pca.yaml +type: file +example: "resources_test/common/pancreas/knn.h5ad" +info: + label: "Dataset+HVG+PCA+kNN" + summary: "A normalised dataset with a PCA embedding, HVG selection and a kNN graph" + slots: + obsp: + - type: double + name: knn_distances + description: K nearest neighbors distance matrix. + required: true + - type: double + name: knn_connectivities + description: K nearest neighbors connectivities matrix. + required: true + uns: + - type: object + name: knn + description: Supplementary K nearest neighbors data. + required: true diff --git a/src/datasets/api/file_multimodal_dataset.yaml b/src/datasets/api/file_multimodal_dataset.yaml new file mode 100644 index 0000000000..daac29d77b --- /dev/null +++ b/src/datasets/api/file_multimodal_dataset.yaml @@ -0,0 +1,243 @@ +type: file +example: "resources_test/common/pancreas/dataset.h5ad" +info: + label: "Common dataset" + summary: A dataset processed by the common dataset processing pipeline. + description: | + This dataset contains both raw counts and normalized data matrices, + as well as a SVD embedding and a HVG selection. + + The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
+ slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + + - type: double + name: normalized + description: Normalised expression values + required: true + obs: + - type: string + name: dataset_id + description: Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. + required: false + + - type: string + name: assay + description: Type of assay used to generate the cell data, indicating the methodology or technique employed. + required: false + + - type: string + name: assay_ontology_term_id + description: Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. + required: false + + - type: string + name: cell_type + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: false + + - type: string + name: cell_type_ontology_term_id + description: Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. + required: false + + - type: string + name: development_stage + description: Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. + required: false + + - type: string + name: development_stage_ontology_term_id + description: | + Ontology term identifier for the developmental stage, providing a standardized reference to the organism's developmental phase. + + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. + If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. + Otherwise, the Uberon (`UBERON:`) ontology is used. 
+ required: false + + - type: string + name: disease + description: Information on any disease or pathological condition associated with the cell or donor. + required: false + + - type: string + name: disease_ontology_term_id + description: | + Ontology term identifier for the disease, enabling standardized disease classification and referencing. + + Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). + required: false + + - type: string + name: donor_id + description: Identifier for the donor from whom the cell sample is obtained. + required: false + + - type: boolean + name: is_primary_data + description: Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. + required: false + + - type: string + name: organism + description: Organism from which the cell sample is obtained. + required: false + + - type: string + name: organism_ontology_term_id + description: | + Ontology term identifier for the organism, providing a standardized reference for the organism. + + Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. + required: false + + - type: string + name: self_reported_ethnicity + description: Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. + required: false + + - type: string + name: self_reported_ethnicity_ontology_term_id + description: | + Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. + + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. + required: false + + - type: string + name: sex + description: Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. 
+ required: false + + - type: string + name: sex_ontology_term_id + description: Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. + required: false + + - type: string + name: suspension_type + description: Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. + required: false + + - type: string + name: tissue + description: Specific tissue from which the cells were derived, key for context and specificity in cell studies. + required: false + + - type: string + name: tissue_ontology_term_id + description: | + Ontology term identifier for the tissue, providing a standardized reference for the tissue type. + + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. + required: false + + - type: string + name: tissue_general + description: General category or classification of the tissue, useful for broader grouping and comparison of cell data. + required: false + + - type: string + name: tissue_general_ontology_term_id + description: | + Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. + + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. + required: false + + - type: string + name: batch + description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. 
+ required: false + + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. + required: false + + - type: double + name: size_factors + description: The size factors created by the normalisation method, if any. + required: false + var: + - type: string + name: feature_id + description: Unique identifier for the feature, usually a ENSEMBL gene id. + # TODO: make this required once openproblems_v1 dataloader supports it + required: false + + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + # TODO: make this required once the dataloader supports it + required: true + + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. + required: false + + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + + obsm: + - type: double + name: X_svd + description: The resulting SVD embedding. + required: true + uns: + - type: string + name: dataset_id + description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. + required: true + + - name: dataset_name + type: string + description: A human-readable name for the dataset. + required: true + + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + multiple: true + + - name: dataset_summary + type: string + description: Short description of the dataset. 
+ required: true + + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + multiple: true + + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/datasets/api/file_normalized.yaml b/src/datasets/api/file_normalized.yaml new file mode 100644 index 0000000000..ea6f14e9fb --- /dev/null +++ b/src/datasets/api/file_normalized.yaml @@ -0,0 +1,22 @@ +__merge__: file_raw.yaml +type: file +example: "resources_test/common/pancreas/normalized.h5ad" +info: + label: "Normalized dataset" + summary: "A normalized dataset" + slots: + layers: + - type: double + name: normalized + description: Normalised expression values + required: true + obs: + - type: double + name: size_factors + description: The size factors created by the normalisation method, if any. + required: false + uns: + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/datasets/api/file_pca.yaml b/src/datasets/api/file_pca.yaml new file mode 100644 index 0000000000..daa26618e1 --- /dev/null +++ b/src/datasets/api/file_pca.yaml @@ -0,0 +1,22 @@ +__merge__: file_hvg.yaml +type: file +example: "resources_test/common/pancreas/pca.h5ad" +info: + label: "Dataset+HVG+PCA" + summary: "A normalised dataset with a PCA embedding" + slots: + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + varm: + - type: double + name: pca_loadings + description: The PCA loadings matrix. + required: true + uns: + - type: double + name: pca_variance + description: The PCA variance objects. 
+ required: true diff --git a/src/datasets/api/file_raw.yaml b/src/datasets/api/file_raw.yaml new file mode 100644 index 0000000000..7ffab3b43e --- /dev/null +++ b/src/datasets/api/file_raw.yaml @@ -0,0 +1,205 @@ +type: file +example: "resources_test/common/pancreas/raw.h5ad" +info: + label: "Raw dataset" + summary: An unprocessed dataset as output by a dataset loader. + description: | + This dataset contains raw counts and metadata as output by a dataset loader. + + The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md). + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + obs: + - type: string + name: dataset_id + description: Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. + required: false + + - type: string + name: assay + description: Type of assay used to generate the cell data, indicating the methodology or technique employed. + required: false + + - type: string + name: assay_ontology_term_id + description: Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. + required: false + + - type: string + name: cell_type + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: false + + - type: string + name: cell_type_ontology_term_id + description: Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. + required: false + + - type: string + name: development_stage + description: Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. 
+ required: false + + - type: string + name: development_stage_ontology_term_id + description: | + Ontology term identifier for the developmental stage, providing a standardized reference to the organism's developmental phase. + + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. + If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. + Otherwise, the Uberon (`UBERON:`) ontology is used. + required: false + + - type: string + name: disease + description: Information on any disease or pathological condition associated with the cell or donor. + required: false + + - type: string + name: disease_ontology_term_id + description: | + Ontology term identifier for the disease, enabling standardized disease classification and referencing. + + Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). + required: false + + - type: string + name: donor_id + description: Identifier for the donor from whom the cell sample is obtained. + required: false + + - type: boolean + name: is_primary_data + description: Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. + required: false + + - type: string + name: organism + description: Organism from which the cell sample is obtained. + required: false + + - type: string + name: organism_ontology_term_id + description: | + Ontology term identifier for the organism, providing a standardized reference for the organism. + + Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. 
+ required: false + + - type: string + name: self_reported_ethnicity + description: Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. + required: false + + - type: string + name: self_reported_ethnicity_ontology_term_id + description: | + Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. + + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. + required: false + + - type: string + name: sex + description: Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. + required: false + + - type: string + name: sex_ontology_term_id + description: Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. + required: false + + - type: string + name: suspension_type + description: Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. + required: false + + - type: string + name: tissue + description: Specific tissue from which the cells were derived, key for context and specificity in cell studies. + required: false + + - type: string + name: tissue_ontology_term_id + description: | + Ontology term identifier for the tissue, providing a standardized reference for the tissue type. + + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. 
+ required: false + + - type: string + name: tissue_general + description: General category or classification of the tissue, useful for broader grouping and comparison of cell data. + required: false + + - type: string + name: tissue_general_ontology_term_id + description: | + Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. + + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. + required: false + + - type: string + name: batch + description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. + required: false + + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. + required: false + var: + - type: string + name: feature_id + description: Unique identifier for the feature, usually a ENSEMBL gene id. + # TODO: make this required once openproblems_v1 dataloader supports it + required: false + + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + # TODO: make this required once the dataloader supports it + required: true + + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. + required: false + uns: + - type: string + name: dataset_id + description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. + required: true + - name: dataset_name + type: string + description: A human-readable name for the dataset. 
+ required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + multiple: true + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + multiple: true diff --git a/src/datasets/api/file_svd.yaml b/src/datasets/api/file_svd.yaml new file mode 100644 index 0000000000..2a727369e3 --- /dev/null +++ b/src/datasets/api/file_svd.yaml @@ -0,0 +1,12 @@ +__merge__: file_normalized.yaml +type: file +example: "resources_test/common/pancreas/svd.h5ad" +info: + label: "Dataset+SVD" + summary: "A normalised dataset with a SVD embedding" + slots: + obsm: + - type: double + name: X_svd + description: The resulting SVD embedding. + required: true \ No newline at end of file diff --git a/src/datasets/loaders/cellxgene_census/config.vsh.yaml b/src/datasets/loaders/cellxgene_census/config.vsh.yaml new file mode 100644 index 0000000000..667e1c6a6b --- /dev/null +++ b/src/datasets/loaders/cellxgene_census/config.vsh.yaml @@ -0,0 +1,167 @@ +functionality: + name: cellxgene_census + namespace: datasets/loaders + description: | + Query cells from a CellxGene Census or custom TileDBSoma object. + Aside from fetching the cells' RNA counts (`.X`), cell metadata + (`.obs`) and gene metadata (`.var`), this component also fetches + the dataset metadata and joins it into the cell metadata. + argument_groups: + - name: Input database + description: "Open CellxGene Census by version or URI." + arguments: + - name: "--input_uri" + type: string + description: "If specified, a URI containing the Census SOMA objects. 
If specified, will take precedence over the `--census_version` argument." + required: false + example: "s3://bucket/path" + - name: "--census_version" + description: "Which release of CellxGene census to use. Possible values are \"latest\", \"stable\", or the date of one of the releases (e.g. \"2023-07-25\"). For more information, check the documentation on [Census data releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html)." + type: string + example: "stable" + required: false + - name: Cell query + description: Arguments related to the query. + arguments: + - name: "--species" + type: string + description: The organism to query, usually one of `Homo sapiens` or `Mus musculus`. + required: true + example: "homo_sapiens" + - name: "--obs_value_filter" + type: string + description: "Filter for selecting the `obs` metadata (i.e. cells). Value is a filter query written in the SOMA `value_filter` syntax." + required: true + example: "is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell'" + - name: Filter cells by grouping + description: + arguments: + - name: "--cell_filter_grouping" + type: string + description: | + A subset of 'obs' columns by which to group the cells for filtering. + Only groups surpassing or equal to the `--cell_filter_minimum_count` + threshold will be retained. Take care not to introduce a selection + bias against cells with more fine-grained ontology annotations. + required: false + example: ["dataset_id", "tissue", "assay", "disease", "cell_type"] + multiple: true + - name: "--cell_filter_minimum_count" + type: integer + description: | + A minimum number of cells per group to retain. If `--cell_filter_grouping` + is defined, this parameter should also be provided and vice versa. + required: false + example: 100 + - name: Count filtering + description: Arguments related to filtering cells and genes by counts. 
+ arguments: + - name: "--cell_filter_min_genes" + type: integer + description: Remove cells with less than this number of genes. + required: false + default: 50 + - name: "--cell_filter_min_counts" + type: integer + description: Remove cells with less than this number of counts. + required: false + default: 0 + - name: "--gene_filter_min_cells" + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + default: 5 + - name: "--gene_filter_min_counts" + type: integer + description: Remove genes with less than this number of counts. + required: false + default: 0 + - name: Cell metadata + description: Cell metadata arguments + arguments: + - name: "--obs_batch" + type: string + description: | + Location of where to find the observation batch IDs. + + * If not specified, the `.obs["batch"]` field will not be included. + * If one or more values are specified, the `.obs["batch"]` field will be + set to the concatenated values of the specified fields, separated by + the `obs_batch_separator`. + required: false + multiple: true + multiple_sep: "," + example: ["batch"] + - name: "--obs_batch_separator" + type: string + description: Separator to use when concatenating the values of the `--obs_batch` fields. + required: false + default: "+" + - name: Dataset metadata + description: Information about the dataset that will be stored in the `.uns` slot. + arguments: + - name: "--dataset_id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. 
+ required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: true + - name: Outputs + description: Output arguments. + arguments: + - name: "--output" + type: file + description: Output h5ad file. + direction: output + required: true + example: output.h5ad + - name: "--output_compression" + type: string + choices: ["gzip", "lzf"] + required: false + example: "gzip" + resources: + - type: python_script + path: script.py + - path: /src/common/helper_functions/setup_logger.py + test_resources: + - type: python_script + path: test.py +platforms: + - type: docker + #image: openproblems/base_python:1.0.0 + image: python:3.11 + setup: + - type: python + packages: + - cellxgene-census + - scanpy + test_setup: + - type: python + packages: + - viashpy + - type: nextflow + directives: + label: [highmem, midcpu] \ No newline at end of file diff --git a/src/datasets/loaders/cellxgene_census/script.py b/src/datasets/loaders/cellxgene_census/script.py new file mode 100644 index 0000000000..49c44b6b32 --- /dev/null +++ b/src/datasets/loaders/cellxgene_census/script.py @@ -0,0 +1,190 @@ +import sys +import cellxgene_census +import scanpy as sc +import tiledbsoma as soma + +## VIASH START +par = { + "input_uri": None, + "census_version": "stable", + "species": "mus_musculus", + "obs_value_filter": "dataset_id == '49e4ffcc-5444-406d-bdee-577127404ba8'", + "cell_filter_grouping": None, + "cell_filter_minimum_count": None, + "obs_batch": [ "donor_id" ], + "obs_batch_separator": "+", + "dataset_name": "pretty name", + "dataset_url": "url", + "dataset_reference": "ref", + "dataset_summary": "summ", + "dataset_description": "desc", + "dataset_organism": "mus_musculus", + "output": "output.h5ad", + "output_compression": "gzip", +} +meta = {"resources_dir": "src/common/helper_functions"} +## VIASH END + 
+sys.path.append(meta["resources_dir"]) + +from setup_logger import setup_logger +logger = setup_logger() + +def connect_census(uri, census_version): + """ + Connect to CellxGene Census or user-provided TileDBSoma object + """ + ver = census_version or "stable" + logger.info("Connecting to CellxGene Census at %s", f"'{uri}'" if uri else f"version '{ver}'") + return cellxgene_census.open_soma(uri=uri, census_version=ver) + +def get_anndata(census_connection, par): + logger.info("Getting gene expression data based on `%s` query.", par["obs_value_filter"]) + # workaround for https://github.com/chanzuckerberg/cellxgene-census/issues/891 + return cellxgene_census.get_anndata( + census=census_connection, + obs_value_filter=par["obs_value_filter"], + organism=par["species"] + ) + + # exp = census_connection["census_data"][par["species"]] + # query = exp.axis_query( + # "RNA", + # obs_query=soma.AxisQuery(value_filter=par["obs_value_filter"]), + # var_query=soma.AxisQuery(), + # ) + + # n_obs = query.n_obs + # n_vars = query.n_vars + # logger.info(f"Query yields {n_obs} cells and {n_vars} genes.") + + # logger.info("Fetching obs.") + # obs = query.obs().concat().to_pandas() + + # logger.info("Fetching var.") + # var = query.var().concat().to_pandas() + + # logger.info("Fetching X.") + # X = query.X("raw") + # Xcoo = X.coos().concat() + # Xcoos = Xcoo.to_scipy().tocsr() + # Xcoos_subset = Xcoos[obs["soma_joinid"]] + + # logger.info("Creating AnnData object.") + # return sc.AnnData( + # layers={"counts": Xcoos_subset}, + # obs=obs, + # var=var + # ) + +def filter_min_cells_per_group(adata, par): + n_cells_before, _ = adata.shape + cell_count = adata.obs \ + .groupby(par["cell_filter_grouping"])["soma_joinid"] \ + .transform("count") \ + + adata = adata[cell_count >= par["cell_filter_minimum_count"]] + n_cells_after, _ = adata.shape + logger.info( + "Removed %s cells based on %s cell_filter_minimum_count of %s cell_filter_grouping." 
+ % ((n_cells_before - n_cells_after), par["cell_filter_minimum_count"], par["cell_filter_grouping"]) + ) + return adata + +def filter_by_counts(adata, par): + logger.info("Remove cells with few counts and genes with few counts.") + n_cells_before, n_genes_before = adata.shape + # remove cells with few counts and genes with few counts + scanpy_proc = { + par["cell_filter_min_counts"]: (sc.pp.filter_cells, "min_counts"), + par["cell_filter_min_genes"]: (sc.pp.filter_cells, "min_genes"), + par["gene_filter_min_counts"]: (sc.pp.filter_genes, "min_counts"), + par["gene_filter_min_cells"]: (sc.pp.filter_genes, "min_cells"), + } + for threshold, (func, arg) in scanpy_proc.items(): + if threshold: + func(adata, **{arg: threshold}) + n_cells_after, n_genes_after = adata.shape + logger.info("Removed %s cells and %s genes.", (n_cells_before - n_cells_after), (n_genes_before - n_genes_after)) + +def move_x_to_layers(adata): + logger.info("Move .X to .layers['counts']") + adata.layers["counts"] = adata.X + adata.X = None + +def add_batch_to_obs(adata, par): + logger.info("Add batch to the AnnData object.") + if par["obs_batch"]: + # fetch batch columns from obs + cols = [adata.obs[key] for key in par["obs_batch"]] + + # join cols + obs_batch = [par["obs_batch_separator"].join(row) for row in zip(*cols)] + + # store in adata + adata.obs["batch"] = obs_batch + +def add_metadata_to_uns(adata, par): + logger.info("Add metadata to the AnnData object.") + for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]: + adata.uns[key] = par[key] + +def print_unique(adata, column): + formatted = "', '".join(adata.obs[column].unique()) + logger.info(f"Unique {column}: ['{formatted}']") + +def print_summary(adata): + logger.info(f"Resulting dataset: {adata}") + + logger.info("Summary of dataset:") + obs_fields = ["assay", "assay_ontology_term_id", "cell_type", "cell_type_ontology_term_id", "dataset_id", 
"development_stage", "development_stage_ontology_term_id", "disease", "disease_ontology_term_id", "tissue", "tissue_ontology_term_id", "tissue_general", "tissue_general_ontology_term_id"] + for field in obs_fields: + print_unique(adata, field) +def write_anndata(adata, par): + logger.info("Writing AnnData object to '%s'", par["output"]) + + adata.write_h5ad(par["output"], compression=par["output_compression"]) + +def main(par, meta): + # check arguments + if (par["cell_filter_grouping"] is None) != (par["cell_filter_minimum_count"] is None): + raise NotImplementedError( + "You need to specify either both or none of the following parameters: cell_filter_grouping, cell_filter_minimum_count" + ) + + with connect_census(uri=par["input_uri"], census_version=par["census_version"]) as conn: + adata = get_anndata(conn, par) + + print(f"AnnData: {adata}", flush=True) + + if par["cell_filter_grouping"] is not None: + adata = filter_min_cells_per_group(adata, par) + + # remove cells with few counts and genes with few counts + filter_by_counts(adata, par) + + # logger.log(f"Filtered AnnData: {adata}") + print(f"Filtered AnnData: {adata}", flush=True) + + # use feature_id as var_names + adata.var_names = adata.var["feature_id"] + + # not needed as long as we have our own implementation of `get_anndata` + # move .X to .layers["counts"] + move_x_to_layers(adata) + + # add batch to obs + add_batch_to_obs(adata, par) + + # add metadata to uns + add_metadata_to_uns(adata, par) + + # print summary + print_summary(adata) + + # write output to file + write_anndata(adata, par) + + +if __name__ == "__main__": + main(par, meta) diff --git a/src/datasets/loaders/cellxgene_census/test.py b/src/datasets/loaders/cellxgene_census/test.py new file mode 100644 index 0000000000..dba41bcc47 --- /dev/null +++ b/src/datasets/loaders/cellxgene_census/test.py @@ -0,0 +1,61 @@ +import sys +import os +import pytest +import anndata as ad +import numpy as np + +## VIASH START +meta = { + 'resources_dir': 
'./resources_test/', + 'executable': './target/docker/query/cellxgene_census', + 'config': '/home/di/code/openpipeline/src/query/cellxgene_census/config.vsh.yaml' +} +## VIASH END + +def test_cellxgene_extract_metadata_expression(run_component, tmp_path): + output_file = tmp_path / "output.h5ad" + + run_component([ + "--species", "homo_sapiens", + "--obs_value_filter", "is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell'", + "--output", output_file, + "--obs_batch", "sex,sex", + "--dataset_id", "test_dataset_id", + "--dataset_name", "test_dataset_name", + "--dataset_url", "https://test_dataset_url.com", + "--dataset_reference", "test_dataset_reference", + "--dataset_summary", "test_dataset_summary", + "--dataset_description", "test_dataset_description", + "--dataset_organism", "test_homo_sapiens", + ]) + + # check whether file exists + assert os.path.exists(output_file), "Output file does not exist" + + adata = ad.read_h5ad(output_file) + + # check obs + assert not adata.obs.empty, ".obs should not be empty" + assert "is_primary_data" in adata.obs.columns + assert np.all(adata.obs["is_primary_data"] == True) + assert "cell_type_ontology_term_id" in adata.obs.columns + assert "disease" in adata.obs.columns + assert adata.n_obs > 10 + assert np.all([x in ["male+male", "female+female"] for x in adata.obs["batch"]]) + + # check var + assert "soma_joinid" in adata.var.columns + assert "feature_id" in adata.var.columns + + # check uns + assert adata.uns["dataset_id"] == "test_dataset_id", "Incorrect .uns['dataset_id']" + assert adata.uns["dataset_name"] == "test_dataset_name", "Incorrect .uns['dataset_name']" + assert adata.uns["dataset_url"] == "https://test_dataset_url.com", "Incorrect .uns['dataset_url']" + assert adata.uns["dataset_reference"] == "test_dataset_reference", "Incorrect .uns['dataset_reference']" + assert adata.uns["dataset_summary"] == "test_dataset_summary", "Incorrect 
.uns['dataset_summary']" + assert adata.uns["dataset_description"] == "test_dataset_description", "Incorrect .uns['dataset_description']" + assert adata.uns["dataset_organism"] == "test_homo_sapiens", "Incorrect .uns['dataset_organism']" + + +if __name__ == '__main__': + sys.exit(pytest.main([__file__])) diff --git a/src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml b/src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml new file mode 100644 index 0000000000..7ee4166d9d --- /dev/null +++ b/src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml @@ -0,0 +1,130 @@ +functionality: + name: cellxgene_census_from_source_h5ad + namespace: datasets/loaders + description: | + Query cells from a CellxGene Census or custom TileDBSoma object. + Aside from fetching the cells' RNA counts (`.X`), cell metadata + (`.obs`) and gene metadata (`.var`), this component also fetches + the dataset metadata and joins it into the cell metadata. + argument_groups: + - name: Input + description: Input arguments + arguments: + - name: "--input_id" + type: string + description: | + The dataset ID of the CellxGene Census dataset to query. + required: true + example: "a93eab58-3d82-4b61-8a2f-d7666dcdb7c4" + - name: Count filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: "--cell_filter_min_genes" + type: integer + description: Remove cells with less than this number of genes. + required: false + default: 50 + - name: "--cell_filter_min_counts" + type: integer + description: Remove cells with less than this number of counts. + required: false + default: 0 + - name: "--gene_filter_min_cells" + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + default: 5 + - name: "--gene_filter_min_counts" + type: integer + description: Remove genes with less than this number of counts. 
+ required: false + default: 0 + - name: Cell metadata + description: Cell metadata arguments + arguments: + - name: "--obs_batch" + type: string + description: | + Location of where to find the observation batch IDs. + + * If not specified, the `.obs["batch"]` field will not be included. + * If one or more values are specified, the `.obs["batch"]` field will be + set to the concatenated values of the specified fields, separated by + the `obs_batch_separator`. + required: false + multiple: true + multiple_sep: "," + example: ["batch"] + - name: "--obs_batch_separator" + type: string + description: Separator to use when concatenating the values of the `--obs_batch` fields. + required: false + default: "+" + - name: Dataset metadata + description: Information about the dataset that will be stored in the `.uns` slot. + arguments: + - name: "--dataset_id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: true + - name: Outputs + description: Output arguments. + arguments: + - name: "--output" + type: file + description: Output h5ad file. 
+ direction: output + required: true + example: output.h5ad + - name: "--output_compression" + type: string + choices: ["gzip", "lzf"] + required: false + example: "gzip" + resources: + - type: python_script + path: script.py + - path: /src/common/helper_functions/setup_logger.py + test_resources: + - type: python_script + path: test.py +platforms: + - type: docker + #image: openproblems/base_python:1.0.0 + image: python:3.11 + setup: + - type: python + packages: + - cellxgene-census + - scanpy + test_setup: + - type: python + packages: + - viashpy + - type: nextflow + directives: + label: [highmem, midcpu] \ No newline at end of file diff --git a/src/datasets/loaders/cellxgene_census_from_source_h5ad/script.py b/src/datasets/loaders/cellxgene_census_from_source_h5ad/script.py new file mode 100644 index 0000000000..900232e6a4 --- /dev/null +++ b/src/datasets/loaders/cellxgene_census_from_source_h5ad/script.py @@ -0,0 +1,131 @@ +import sys +import cellxgene_census +import scanpy as sc +import tempfile + +## VIASH START +par = { + "input_id": "0895c838-e550-48a3-a777-dbcd35d30272", + "obs_batch": [ "donor_id" ], + "obs_batch_separator": "+", + "dataset_name": "pretty name", + "dataset_url": "url", + "dataset_reference": "ref", + "dataset_summary": "summ", + "dataset_description": "desc", + "dataset_organism": "mus_musculus", + "output": "output.h5ad", + "output_compression": "gzip", +} +meta = {"resources_dir": "src/common/helper_functions"} +## VIASH END + +sys.path.append(meta["resources_dir"]) + +from setup_logger import setup_logger +logger = setup_logger() + +def get_anndata(par): + with tempfile.TemporaryDirectory() as tmp: + path = tmp + "/source.h5ad" + logger.info("Downloading source h5ad for dataset '%s' to '%s'.", par["input_id"], path) + cellxgene_census.download_source_h5ad(par["input_id"], path) + return sc.read_h5ad(path) + +def filter_by_counts(adata, par): + logger.info("Remove cells with few counts and genes with few counts.") + t0 = adata.shape + # 
remove cells with few counts and genes with few counts + if par["cell_filter_min_counts"]: + sc.pp.filter_cells(adata, min_counts=par["cell_filter_min_counts"]) + if par["cell_filter_min_genes"]: + sc.pp.filter_cells(adata, min_genes=par["cell_filter_min_genes"]) + if par["gene_filter_min_counts"]: + sc.pp.filter_genes(adata, min_counts=par["gene_filter_min_counts"]) + if par["gene_filter_min_cells"]: + sc.pp.filter_genes(adata, min_cells=par["gene_filter_min_cells"]) + t1 = adata.shape + logger.info("Removed %s cells and %s genes.", (t0[0] - t1[0]), (t0[1] - t1[1])) + +def move_x_to_layers(adata): + logger.info("Move .X to .layers['counts']") + adata.layers["counts"] = adata.X + adata.X = None + +def add_batch_to_obs(adata, par): + logger.info("Add batch to the AnnData object.") + if par["obs_batch"]: + # fetch batch columns from obs + cols = [adata.obs[key] for key in par["obs_batch"]] + + # join cols + obs_batch = [par["obs_batch_separator"].join(row) for row in zip(*cols)] + + # store in adata + adata.obs["batch"] = obs_batch + +def add_metadata_to_uns(adata, par): + logger.info("Add metadata to the AnnData object.") + for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]: + adata.uns[key] = par[key] + +def print_unique(adata, column): + if column not in adata.obs.columns: + logger.info(f"Column {column} not found in obs") + return + formatted = "', '".join(adata.obs[column].unique()) + logger.info(f"Unique {column}: ['{formatted}']") + +def print_summary(adata): + logger.info(f"Resulting dataset: {adata}") + + logger.info("Summary of dataset:") + print_unique(adata, "assay") + print_unique(adata, "assay_ontology_term_id") + print_unique(adata, "cell_type") + print_unique(adata, "cell_type_ontology_term_id") + print_unique(adata, "dataset_id") + print_unique(adata, "development_stage") + print_unique(adata, "development_stage_ontology_term_id") + print_unique(adata, "disease") 
+ print_unique(adata, "disease_ontology_term_id") + print_unique(adata, "tissue") + print_unique(adata, "tissue_ontology_term_id") + print_unique(adata, "tissue_general") + print_unique(adata, "tissue_general_ontology_term_id") + +def write_anndata(adata, par): + logger.info("Writing AnnData object to '%s'", par["output"]) + + adata.write_h5ad(par["output"], compression=par["output_compression"]) + +def main(par, meta): + adata = get_anndata(par) + + logger.info("AnnData: %s", str(adata)) + + # remove cells with few counts and genes with few counts + filter_by_counts(adata, par) + + # this is not needed in source h5ads + # # use feature_id as var_names + # adata.var_names = adata.var["feature_id"] + + # move .X to .layers["counts"] + move_x_to_layers(adata) + + # add batch to obs + add_batch_to_obs(adata, par) + + # add metadata to uns + add_metadata_to_uns(adata, par) + + # print summary + print_summary(adata) + + # write output to file + write_anndata(adata, par) + + +if __name__ == "__main__": + main(par, meta) diff --git a/src/datasets/loaders/cellxgene_census_from_source_h5ad/test.py b/src/datasets/loaders/cellxgene_census_from_source_h5ad/test.py new file mode 100644 index 0000000000..098e8017a9 --- /dev/null +++ b/src/datasets/loaders/cellxgene_census_from_source_h5ad/test.py @@ -0,0 +1,58 @@ +import sys +import os +import pytest +import anndata as ad +import numpy as np + +## VIASH START +meta = { + 'resources_dir': './resources_test/', + 'executable': './target/docker/datasets/loaders/cellxgene_census_from_source_h5ad/cellxgene_census_from_source_h5ad', + 'config': 'src/query/cellxgene_census/config.vsh.yaml' +} +## VIASH END + +def test_cellxgene_extract_metadata_expression(run_component, tmp_path): + output_file = tmp_path / "output.h5ad" + + run_component([ + "--input_id", "0895c838-e550-48a3-a777-dbcd35d30272", + "--output", output_file, + "--obs_batch", "donor_id", + "--dataset_id", "test_dataset_id", + "--dataset_name", "test_dataset_name", + 
"--dataset_url", "https://test_dataset_url.com", + "--dataset_reference", "test_dataset_reference", + "--dataset_summary", "test_dataset_summary", + "--dataset_description", "test_dataset_description", + "--dataset_organism", "test_homo_sapiens", + ]) + + # check whether file exists + assert os.path.exists(output_file), "Output file does not exist" + + adata = ad.read_h5ad(output_file) + + # check obs + assert not adata.obs.empty, ".obs should not be empty" + assert "is_primary_data" in adata.obs.columns + assert "cell_type_ontology_term_id" in adata.obs.columns + assert "disease" in adata.obs.columns + assert adata.n_obs > 10 + assert np.all([x in ["C41", "C58", "C70", "C72"] for x in adata.obs["batch"]]) + + # check var + assert "feature_name" in adata.var.columns + + # check uns + assert adata.uns["dataset_id"] == "test_dataset_id", "Incorrect .uns['dataset_id']" + assert adata.uns["dataset_name"] == "test_dataset_name", "Incorrect .uns['dataset_name']" + assert adata.uns["dataset_url"] == "https://test_dataset_url.com", "Incorrect .uns['dataset_url']" + assert adata.uns["dataset_reference"] == "test_dataset_reference", "Incorrect .uns['dataset_reference']" + assert adata.uns["dataset_summary"] == "test_dataset_summary", "Incorrect .uns['dataset_summary']" + assert adata.uns["dataset_description"] == "test_dataset_description", "Incorrect .uns['dataset_description']" + assert adata.uns["dataset_organism"] == "test_homo_sapiens", "Incorrect .uns['dataset_organism']" + + +if __name__ == '__main__': + sys.exit(pytest.main([__file__])) diff --git a/src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml b/src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml new file mode 100644 index 0000000000..96dad30e76 --- /dev/null +++ b/src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml @@ -0,0 +1,74 @@ +functionality: + name: "openproblems_neurips2021_bmmc" + namespace: "datasets/loaders" + description: "Fetch a dataset from the 
OpenProblems NeurIPS2021 competition" + argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + description: Processed h5ad file published at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE194122. + required: true + example: GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad + - name: "--mod1" + type: string + description: Name of the first modality. + required: true + example: GEX + - name: "--mod2" + type: string + description: Name of the second modality. + required: true + example: ADT + - name: Metadata + arguments: + - name: "--dataset_id" + type: string + description: "A unique identifier for the dataset" + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. 
+ required: false + - name: Outputs + arguments: + - name: "--output_mod1" + __merge__: ../../api/file_raw.yaml + direction: "output" + - name: "--output_mod2" + __merge__: ../../api/file_raw.yaml + direction: "output" + resources: + - type: python_script + path: script.py + test_resources: + - type: python_script + path: test.py + # - type: file + # path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [highmem, midcpu, midtime] \ No newline at end of file diff --git a/src/datasets/loaders/openproblems_neurips2021_bmmc/script.py b/src/datasets/loaders/openproblems_neurips2021_bmmc/script.py new file mode 100644 index 0000000000..de62f039f6 --- /dev/null +++ b/src/datasets/loaders/openproblems_neurips2021_bmmc/script.py @@ -0,0 +1,126 @@ +import anndata as ad +import pandas as pd +import numpy as np +from scipy import sparse + +## VIASH START +par = { + "input": "GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad", + "mod1": "GEX", + "mod2": "ATAC", + "dataset_id": "openproblems/neurips2021_bmmc", + "dataset_name": "BMMC (CITE-seq)", + "dataset_url": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE194122", + "dataset_reference": "Neurips", + "dataset_summary": "value", + "dataset_description": "value", + "dataset_organism": "homo_sapiens", + "output_mod1": "output/mod1.h5ad", + "output_mod2": "output/mod2.h5ad" +} +meta = { + "functionality_name": "openproblems_neurips2021_bmmc", + "resources_dir": "/tmp/viash_inject_openproblems_neurips2021_bmmc14365472827677740971", +} +## VIASH END + +def remove_mod_col(df, mod): + df.drop(list(df.filter(like=mod)), axis=1, inplace=True) + +def remove_mod_prefix(df, mod): + suffix = f"{mod}_" + df.columns = df.columns.str.removeprefix(suffix) + +def convert_matrix(adata): + for key in adata: + if isinstance(adata[key], sparse.csr_matrix): + adata[key] = 
sparse.csc_matrix(adata[key]) + + +print("load dataset file", flush=True) +adata = ad.read_h5ad(par["input"]) + +# Convert to sparse csc_matrix +convert_matrix(adata.layers) +convert_matrix(adata.obsm) + +# Add is_train to obs if it is missing +if "is_train" not in adata.obs.columns: + batch_info = adata.obs["batch"] + batch_categories = batch_info.dtype.categories + # From https://github.com/openproblems-bio/neurips2021_multimodal_viash/blob/75281c039ab98b459edbf52058a18597e710ed4d/src/common/datasets/process_inhouse_datasets/script.R#L14-L17 + train = ["s1d1", "s1d2", "s2d1", "s2d4", "s3d1", "s3d6", "s3d7"] + adata.obs["is_train"] = [ "train" if x in train else "test" for x in batch_info ] + +# Construct Modality datasets +print("Construct Mod datasets", flush=True) +mask_mod1 = adata.var['feature_types'] == par["mod1"] +mask_mod2 = adata.var['feature_types'] == par["mod2"] + +adata_mod1 = adata[:, mask_mod1] +adata_mod2 = adata[:, mask_mod2] + +# Remove other modality data from obs and var +mod1_var = pd.DataFrame(adata_mod1.var) +remove_mod_col(mod1_var, par["mod2"]) +remove_mod_prefix(mod1_var, par["mod1"]) +mod1_var.index.name = "feature_name" +mod1_var.reset_index("feature_name", inplace=True) +mod1_var["feature_id"] = np.where(mod1_var.gene_id.isna(), mod1_var.feature_name, mod1_var.gene_id.astype(str)) +mod1_var.drop("gene_id", axis=1, inplace=True) +mod1_var.set_index("feature_id", drop=False, inplace=True) + +mod1_obs = pd.DataFrame(adata_mod1.obs) +remove_mod_col(mod1_obs, par["mod2"]) +remove_mod_prefix(mod1_obs, par["mod1"]) + +adata_mod1.var = mod1_var +adata_mod1.obs = mod1_obs + +adata_mod1.uns = { key.replace(f"{par['mod1']}_", ""): value for key, value in adata.uns.items() if not key.startswith(par['mod2'])} +del adata_mod1.obsm +del adata_mod1.X + +mod2_var = pd.DataFrame(adata_mod2.var) +remove_mod_col(mod2_var, par["mod1"]) +remove_mod_prefix(mod2_var, par["mod2"]) +mod2_var.index.name = "feature_name" +mod2_var.reset_index("feature_name", 
inplace=True) +mod2_var["feature_id"] = np.where(mod2_var.gene_id.isna(), mod2_var.feature_name, mod2_var.gene_id.astype(str)) +mod2_var.drop("gene_id", axis=1, inplace=True) +mod2_var.set_index("feature_id", drop=False, inplace=True) + +mod2_obs = pd.DataFrame(adata_mod2.obs) +remove_mod_col(mod2_obs, par["mod1"]) +remove_mod_prefix(mod2_obs, par["mod2"]) + +adata_mod2.var = mod2_var +adata_mod2.obs = mod2_obs + +adata_mod2.uns = { key.replace(f"{par['mod2']}_", ""): value for key, value in adata.uns.items() if not key.startswith(par['mod1'])} +if par["mod2"] == "ATAC": + adata_mod2.obsm = { key.replace(f"{par['mod2']}_", ""): value for key, value in adata_mod2.obsm.items() if key.startswith(par['mod2'])} +else: + del adata_mod2.obsm + + +del adata_mod2.X + +print("Add metadata to uns", flush=True) +metadata_fields = [ + "dataset_id", "dataset_name", "dataset_url", "dataset_reference", + "dataset_summary", "dataset_description", "dataset_organism" +] +for key in metadata_fields: + if key in par: + print(f" Setting .uns['{key}']", flush=True) + adata_mod1.uns[key] = par[key] + adata_mod2.uns[key] = par[key] + +print("Writing adata to file", flush=True) +adata_mod1.write_h5ad(par["output_mod1"], compression="gzip") +adata_mod2.write_h5ad(par["output_mod2"], compression="gzip") + + + + diff --git a/src/datasets/loaders/openproblems_neurips2021_bmmc/test.py b/src/datasets/loaders/openproblems_neurips2021_bmmc/test.py new file mode 100644 index 0000000000..b194a52fe4 --- /dev/null +++ b/src/datasets/loaders/openproblems_neurips2021_bmmc/test.py @@ -0,0 +1,93 @@ +from os import path +import subprocess +import anndata as ad + +input = "neurips2021_bmmc_cite.h5ad" +mod1 = "GEX" +mod2 = "ADT" + +output_mod1_file = "output_mod1.h5ad" +output_mod2_file = "output_mod2.h5ad" + +input_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE194nnn/GSE194122/suppl/GSE194122%5Fopenproblems%5Fneurips2021%5Fcite%5FBMMC%5Fprocessed%2Eh5ad%2Egz" + +# download input +print(">> Downloading 
input", flush=True) +out = subprocess.run( + [ + "wget", + "-O", input + ".gz", + input_url, + ], + stderr=subprocess.STDOUT +) +# unzip input +print(">> Unzipping input", flush=True) +out = subprocess.run( + [ + "gunzip", + input + ".gz", + ], + stderr=subprocess.STDOUT +) + +print(">> Running script", flush=True) +out = subprocess.run( + [ + meta["executable"], + "--input", input, + "--mod1", mod1, + "--mod2", mod2, + "--output_mod1", output_mod1_file, + "--output_mod2", output_mod2_file, + "--dataset_id", "openproblems/neurips2021_bmmc", + "--dataset_name", "BMMC (Multiome)", + "--dataset_url", "http://foo.org", + "--dataset_reference", "foo2000bar", + "--dataset_summary", "A short summary.", + "--dataset_description", "A couple of paragraphs worth of text.", + "--dataset_organism", "homo_sapiens", + ], + stderr=subprocess.STDOUT +) + +if out.stdout: + print(out.stdout, flush=True) + +if out.returncode: + print(f"script: '{out.args}' exited with an error.", flush=True) + exit(out.returncode) + +print(">> Checking whether files exist", flush=True) +assert path.exists(output_mod1_file), "Output mod1 file does not exist" +assert path.exists(output_mod2_file), "Output mod2 file does not exist" + +print(">> Read output anndata", flush=True) +output_mod1 = ad.read_h5ad(output_mod1_file) +output_mod2 = ad.read_h5ad(output_mod2_file) + +print(f"output_mod1: {output_mod1}", flush=True) +print(f"output_mod2: {output_mod2}", flush=True) + +print(">> Check that output mod1 fits expected API", flush=True) +assert output_mod1.X is None, ".X is not None/empty in mod 1 output" +assert "counts" in output_mod1.layers, "'counts' not found in mod 1 output layers" +assert "cell_type" in output_mod1.obs.columns, "cell_type column not found in mod 1 output obs" +assert "batch" in output_mod1.obs.columns, "batch column not found in mod 1 output obs" +assert output_mod1.uns["dataset_name"] == "BMMC (Multiome)", "Expected: Pancreas as value for dataset_name in mod 1 output uns" +assert 
output_mod1.uns["dataset_url"] == "http://foo.org", "Expected: http://foo.org as value for dataset_url in mod 1 output uns" +assert output_mod1.uns["dataset_reference"] == "foo2000bar", "Expected: foo2000bar as value for dataset_reference in mod 1 output uns" +assert output_mod1.uns["dataset_summary"] == "A short summary.", "Expected: A short summary. as value for dataset_summary in mod 1 output uns" +assert output_mod1.uns["dataset_description"] == "A couple of paragraphs worth of text.", "Expected: A couple of paragraphs worth of text. as value for dataset_description in mod 1 output uns" + + +print(">> Check that output mod2 fits expected API", flush=True) +assert output_mod2.X is None, ".X is not None/empty in mod 2 output" +assert "counts" in output_mod2.layers, "'counts' not found in mod 2 output layers" +assert "cell_type" in output_mod2.obs.columns, "cell_type column not found in mod 2 output obs" +assert "batch" in output_mod2.obs.columns, "batch column not found in mod 2 output obs" +assert output_mod2.uns["dataset_name"] == "BMMC (Multiome)", "Expected: Pancreas as value for dataset_name in mod 2 output uns" +assert output_mod2.uns["dataset_url"] == "http://foo.org", "Expected: http://foo.org as value for dataset_url in mod 2 output uns" +assert output_mod2.uns["dataset_reference"] == "foo2000bar", "Expected: foo2000bar as value for dataset_reference in mod 2 output uns" +assert output_mod2.uns["dataset_summary"] == "A short summary.", "Expected: A short summary. as value for dataset_summary in mod 2 output uns" +assert output_mod2.uns["dataset_description"] == "A couple of paragraphs worth of text.", "Expected: A couple of paragraphs worth of text. 
as value for dataset_description in mod 2 output uns" \ No newline at end of file diff --git a/src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml b/src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml new file mode 100644 index 0000000000..b2141482f1 --- /dev/null +++ b/src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml @@ -0,0 +1,80 @@ +functionality: + name: "openproblems_neurips2022_pbmc" + namespace: "datasets/loaders" + description: "Fetch a dataset from the OpenProblems NeurIPS2022 competition" + argument_groups: + - name: Inputs + arguments: + - name: "--input_mod1" + type: file + description: "Processed RNA h5ad file" + required: true + example: cite_rna_merged.h5ad + - name: "--input_mod2" + type: file + description: "Processed ADT or ATAC h5ad file" + required: true + example: cite_prot_merged.h5ad + - name: "--mod1" + type: string + description: Name of the first modality. + required: true + example: GEX + - name: "--mod2" + type: string + description: Name of the second modality. + required: true + example: ADT + - name: Metadata + arguments: + - name: "--dataset_id" + type: string + description: "A unique identifier for the dataset" + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. 
+ required: false + - name: Outputs + arguments: + - name: "--output_mod1" + __merge__: ../../api/file_raw.yaml + direction: "output" + - name: "--output_mod2" + __merge__: ../../api/file_raw.yaml + direction: "output" + resources: + - type: python_script + path: script.py + # skip unit test until data is public + # test_resources: + # - type: python_script + # path: test.py + # - type: file + # path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [ highmem, midcpu, midtime] \ No newline at end of file diff --git a/src/datasets/loaders/openproblems_neurips2022_pbmc/script.py b/src/datasets/loaders/openproblems_neurips2022_pbmc/script.py new file mode 100644 index 0000000000..d0dd855b55 --- /dev/null +++ b/src/datasets/loaders/openproblems_neurips2022_pbmc/script.py @@ -0,0 +1,94 @@ +import anndata as ad +from scipy import sparse + +## VIASH START +par = { + "input_mod1": "cite_rna_merged.h5ad", + "input_mod2": "cite_prot_merged.h5ad", + "mod1": "GEX", + "mod2": "ADT", + "dataset_id": "openproblems/neurips2022_pbmc", + "dataset_name": "Kaggle22 PBMC (CITE-seq)", + "dataset_url": "https://www.kaggle.com/competitions/open-problems-multimodal/data", + "dataset_reference": "Neurips22", + "dataset_summary": "Neurips22 competition dataset", + "dataset_description": "The dataset for this competition comprises single-cell multiomics data collected from mobilized peripheral CD34+ hematopoietic stem and progenitor cells (HSPCs) isolated from four healthy human donors.", + "dataset_organism": "homo_sapiens", + "output_mod1": "output/mod1.h5ad", + "output_mod2": "output/mod2.h5ad" +} +meta = { + "functionality_name": "openproblems_neurips2022_pbmc", +} +## VIASH END + + +def convert_matrix(adata): + for key in adata: + if isinstance(adata[key], sparse.csr_matrix): + adata[key] = sparse.csc_matrix(adata[key]) + + +print("load dataset modality 
1 file", flush=True) +adata_mod1 = ad.read_h5ad(par["input_mod1"]) + +print("load dataset modality 2 file", flush=True) +adata_mod2 = ad.read_h5ad(par["input_mod2"]) + +# Convert to sparse csc_matrix +convert_matrix(adata_mod1.layers) +convert_matrix(adata_mod1.obsm) +convert_matrix(adata_mod2.layers) +convert_matrix(adata_mod2.obsm) + + +# Add is_train to obs (modality 1) +if "is_train" not in adata_mod1.obs.columns: + split_info = adata_mod1.obs["kaggle_dataset"] + train_sets = ["train", "test_public"] + adata_mod1.obs["is_train"] = [ "train" if x in train_sets else "test" for x in split_info ] + +# Add is_train to obs if it is missing (modality 2) +if "is_train" not in adata_mod2.obs.columns: + split_info = adata_mod2.obs["kaggle_dataset"] + train_sets = ["train", "test_public"] + adata_mod2.obs["is_train"] = [ "train" if x in train_sets else "test" for x in split_info ] + + +# split up index in modality 1 into feature ID and feature name +adata_mod1.var['feature_id'] = [str(s).split('_')[0] for s in adata_mod1.var.index.tolist()] +# TODO: index does not always contain an underscore. 
+if "_" in adata_mod1.var.index[0]: + adata_mod1.var['feature_name'] = [str(s).split('_')[1] for s in adata_mod1.var.index.tolist()] +adata_mod1.var.set_index('feature_id',drop=False, inplace=True) + +# set feature_name (proteins have only partial ensemble IDs)) +adata_mod2.var['feature_id'] = adata_mod2.var.index.tolist() # feature id needs to be filled in +adata_mod2.var['feature_name'] = adata_mod2.var.index.tolist() +adata_mod2.var.set_index('feature_name',drop=False, inplace=True) + + +# remove adata.X +del adata_mod1.X +del adata_mod2.X + + +print("Add metadata to uns", flush=True) +metadata_fields = [ + "dataset_id", "dataset_name", "dataset_url", "dataset_reference", + "dataset_summary", "dataset_description", "dataset_organism" +] +for key in metadata_fields: + if key in par: + print(f" Setting .uns['{key}']", flush=True) + adata_mod1.uns[key] = par[key] + adata_mod2.uns[key] = par[key] + + +print("Writing adata to file", flush=True) +adata_mod1.write_h5ad(par["output_mod1"], compression="gzip") +adata_mod2.write_h5ad(par["output_mod2"], compression="gzip") + + + + diff --git a/src/datasets/loaders/openproblems_neurips2022_pbmc/test.py b/src/datasets/loaders/openproblems_neurips2022_pbmc/test.py new file mode 100644 index 0000000000..3bb5c677eb --- /dev/null +++ b/src/datasets/loaders/openproblems_neurips2022_pbmc/test.py @@ -0,0 +1,100 @@ +from os import path +import subprocess +import anndata as ad + +# TODO: update once data is public + +input_mod1 = "cite_rna_merged.h5ad" #change data set path after loading manually? +input_mod2 = "cite_prot_merged.h5ad" #change data set path after loading manually? 
+mod1 = "GEX" +mod2 = "ADT" + +output_mod1_file = "output_mod1.h5ad" +output_mod2_file = "output_mod2.h5ad" + +input_url_mod1 = "s3://openproblems-nextflow/datasets_private/neurips2022/cite_rna_merged.h5ad" +input_url_mod2 = "s3://openproblems-nextflow/datasets_private/neurips2022/cite_prot_merged.h5ad" + +# download input +# print(">> Downloading input modality 1", flush=True) +# out = subprocess.run( +# [ +# "aws s3 cp", +# "-O", input_mod1, +# input_url_mod1, +# ], +# stderr=subprocess.STDOUT +# ) + +# print(">> Downloading input modality 2", flush=True) +# out = subprocess.run( +# [ +# "aws s3 cp", +# "-O", input_mod2, +# input_url_mod2, +# ], +# stderr=subprocess.STDOUT +# ) + + +print(">> Running script", flush=True) +out = subprocess.run( + [ + meta["executable"], + "--input_mod1", input_mod1, + "--input_mod2", input_mod2, + "--mod1", mod1, + "--mod2", mod2, + "--output_mod1", output_mod1_file, + "--output_mod2", output_mod2_file, + "--dataset_id", "openproblems/neurips2022_pbmc", # id of this loader's dataset (was copied from the neurips2021 loader) + "--dataset_name", "Kaggle22 PBMC (CITE-seq)", + "--dataset_url", "https://www.kaggle.com/competitions/open-problems-multimodal/data", + "--dataset_reference", "Neurips22", + "--dataset_summary", "Neurips22 competition dataset", + "--dataset_description", "The dataset for this competition comprises single-cell multiomics data collected from mobilized peripheral CD34+ hematopoietic stem and progenitor cells (HSPCs) isolated from four healthy human donors.", + "--dataset_organism", "homo_sapiens", + ], + stderr=subprocess.STDOUT +) + +if out.stdout: + print(out.stdout, flush=True) + +if out.returncode: + print(f"script: '{out.args}' exited with an error.", flush=True) + exit(out.returncode) + +print(">> Checking whether files exist", flush=True) +assert path.exists(output_mod1_file), "Output mod1 file does not exist" +assert path.exists(output_mod2_file), "Output mod2 file does not exist" + +print(">> Read output anndata", flush=True) +output_mod1 = ad.read_h5ad(output_mod1_file)
+output_mod2 = ad.read_h5ad(output_mod2_file) + +print(f"output_mod1: {output_mod1}", flush=True) +print(f"output_mod2: {output_mod2}", flush=True) + +print(">> Check that output mod1 fits expected API", flush=True) +assert output_mod1.X is None, ".X is not None/empty in mod 1 output" +assert "counts" in output_mod1.layers, "'counts' not found in mod 1 output layers" +assert "cell_type" in output_mod1.obs.columns, "cell_type column not found in mod 1 output obs" +assert "batch" in output_mod1.obs.columns, "batch column not found in mod 1 output obs" +assert output_mod1.uns["dataset_name"] == "Kaggle22 PBMC (CITE-seq)", "Expected: Kaggle22 PBMC (CITE-seq) as value for dataset_name in mod 1 output uns" +assert output_mod1.uns["dataset_url"] == "https://www.kaggle.com/competitions/open-problems-multimodal/data", "Expected: https://www.kaggle.com/competitions/open-problems-multimodal/data as value for dataset_url in mod 1 output uns" +assert output_mod1.uns["dataset_reference"] == "Neurips22", "Expected: Neurips22 as value for dataset_reference in mod 1 output uns" +assert output_mod1.uns["dataset_summary"] == "Neurips22 competition dataset", "Expected: Neurips22 competition dataset as value for dataset_summary in mod 1 output uns" +assert output_mod1.uns["dataset_description"] == "The dataset for this competition comprises single-cell multiomics data collected from mobilized peripheral CD34+ hematopoietic stem and progenitor cells (HSPCs) isolated from four healthy human donors.", "Expected: The dataset for this competition comprises single-cell multiomics data collected from mobilized peripheral CD34+ hematopoietic stem and progenitor cells (HSPCs) isolated from four healthy human donors. 
as value for dataset_description in mod 1 output uns" + + +print(">> Check that output mod2 fits expected API", flush=True) +assert output_mod2.X is None, ".X is not None/empty in mod 2 output" +assert "counts" in output_mod2.layers, "'counts' not found in mod 2 output layers" +assert "cell_type" in output_mod2.obs.columns, "cell_type column not found in mod 2 output obs" +assert "batch" in output_mod2.obs.columns, "batch column not found in mod 2 output obs" +assert output_mod2.uns["dataset_name"] == "Kaggle22 PBMC (CITE-seq)", "Expected: Kaggle22 PBMC (CITE-seq) as value for dataset_name in mod 2 output uns" +assert output_mod2.uns["dataset_url"] == "https://www.kaggle.com/competitions/open-problems-multimodal/data", "Expected: https://www.kaggle.com/competitions/open-problems-multimodal/data as value for dataset_url in mod 2 output uns" +assert output_mod2.uns["dataset_reference"] == "Neurips22", "Expected: Neurips22 as value for dataset_reference in mod 2 output uns" +assert output_mod2.uns["dataset_summary"] == "Neurips22 competition dataset", "Expected: Neurips22 competition dataset as value for dataset_summary in mod 2 output uns" +assert output_mod2.uns["dataset_description"] == "The dataset for this competition comprises single-cell multiomics data collected from mobilized peripheral CD34+ hematopoietic stem and progenitor cells (HSPCs) isolated from four healthy human donors.", "Expected: The dataset for this competition comprises single-cell multiomics data collected from mobilized peripheral CD34+ hematopoietic stem and progenitor cells (HSPCs) isolated from four healthy human donors. 
as value for dataset_description in mod 2 output uns" \ No newline at end of file diff --git a/src/datasets/loaders/openproblems_v1/config.vsh.yaml b/src/datasets/loaders/openproblems_v1/config.vsh.yaml new file mode 100644 index 0000000000..a07a60d0ac --- /dev/null +++ b/src/datasets/loaders/openproblems_v1/config.vsh.yaml @@ -0,0 +1,86 @@ +__merge__: ../../api/comp_dataset_loader.yaml +functionality: + name: "openproblems_v1" + description: "Fetch a dataset from OpenProblems v1" + argument_groups: + - name: Inputs + arguments: + - name: "--input_id" + type: "string" + description: "The ID of the dataset in OpenProblems v1" + required: true + - name: "--obs_cell_type" + type: "string" + description: "Location of where to find the observation cell types." + - name: "--obs_batch" + type: "string" + description: "Location of where to find the observation batch IDs." + - name: "--obs_tissue" + type: "string" + description: "Location of where to find the observation tissue information." + - name: "--layer_counts" + type: "string" + description: "In which layer to find the counts matrix. Leave undefined to use `.X`." + example: counts + - name: "--sparse" + type: boolean + default: true + description: Convert layers to a sparse CSR format. + - name: "--var_feature_id" + type: "string" + description: "Location of where to find the feature IDs. Can be set to index if the feature IDs are the index." + example: gene_ids + - name: "--var_feature_name" + type: "string" + description: "Location of where to find the feature names. Can be set to index if the feature names are the index." + default: index + - name: Metadata + arguments: + - name: "--dataset_id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. 
+ required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + resources: + - type: python_script + path: script.py + test_resources: + - type: python_script + path: test.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: apt + packages: git + - type: docker + run: | + git clone https://github.com/openproblems-bio/openproblems.git /opt/openproblems && \ + pip install --no-cache-dir -r /opt/openproblems/docker/openproblems/requirements.txt && \ + pip install --no-cache-dir --editable /opt/openproblems + - type: nextflow + directives: + label: [highmem, midcpu , midtime] diff --git a/src/datasets/loaders/openproblems_v1/script.py b/src/datasets/loaders/openproblems_v1/script.py new file mode 100644 index 0000000000..2cdae43a74 --- /dev/null +++ b/src/datasets/loaders/openproblems_v1/script.py @@ -0,0 +1,128 @@ +from typing import Any, Callable, Dict, Tuple +import openproblems as op +import scanpy as sc +import scipy + +## VIASH START +par = { + "input_id": "pancreas", + "dataset_id": "pancreas", + "obs_cell_type": "cell_type", + "obs_batch": "tech", + "obs_tissue": "tissue", + "layer_counts": "counts", + "output": "test_data.h5ad", +} +meta = { + "resources_dir": "src/datasets/loaders/openproblems_v1/" +} +## VIASH END + +# make dataset lookup table +# If need be, this could be stored in a separate yaml file +dataset_funs: Dict[str, Tuple[Callable, Dict[str, Any]]] = { + "allen_brain_atlas": (op.data.allen_brain_atlas.load_mouse_brain_atlas, {}), + "cengen": (op.data.cengen.load_cengen, {}), + 
"immune_cells": (op.data.immune_cells.load_immune, {}), + "mouse_blood_olsson_labelled": (op.data.mouse_blood_olsson_labelled.load_olsson_2016_mouse_blood, {}), + "mouse_hspc_nestorowa2016": (op.data.mouse_hspc_nestorowa2016.load_mouse_hspc_nestorowa2016, {}), + "pancreas": (op.data.pancreas.load_pancreas, {}), + # "tabula_muris_senis": op.data.tabula_muris_senis.load_tabula_muris_senis, + "tabula_muris_senis_droplet_lung": ( + op.data.tabula_muris_senis.load_tabula_muris_senis, + {"organ_list": ["lung"], "method_list": ["droplet"]} + ), + "tenx_1k_pbmc": (op.data.tenx.load_tenx_1k_pbmc, {}), + "tenx_5k_pbmc": (op.data.tenx.load_tenx_5k_pbmc, {}), + "tnbc_wu2021": (op.data.tnbc_wu2021.load_tnbc_data, {}), + "zebrafish": (op.data.zebrafish.load_zebrafish, {}) +} + +# fetch dataset +dataset_fun, kwargs = dataset_funs[par["input_id"]] + +print("Fetch dataset", flush=True) +adata = dataset_fun(**kwargs) + +# override values one by one because adata.uns and +# metadata are two different classes. 
+for key, value in dataset_fun.metadata.items(): + print(f"Setting .uns['{key}']", flush=True) + adata.uns[key] = value + +print("Setting .obs['cell_type']", flush=True) +if par["obs_cell_type"]: + if par["obs_cell_type"] in adata.obs: + adata.obs["cell_type"] = adata.obs[par["obs_cell_type"]] + else: + print(f"Warning: key '{par['obs_cell_type']}' could not be found in adata.obs.", flush=True) + +print("Setting .obs['batch']", flush=True) +if par["obs_batch"]: + if par["obs_batch"] in adata.obs: + adata.obs["batch"] = adata.obs[par["obs_batch"]] + else: + print(f"Warning: key '{par['obs_batch']}' could not be found in adata.obs.", flush=True) + +print("Setting .obs['tissue']", flush=True) +if par["obs_tissue"]: + if par["obs_tissue"] in adata.obs: + adata.obs["tissue"] = adata.obs[par["obs_tissue"]] + else: + print(f"Warning: key '{par['obs_tissue']}' could not be found in adata.obs.", flush=True) + +if par["layer_counts"] and par["layer_counts"] in adata.layers: + print(f"Temporarily moving .layers['{par['layer_counts']}'] to .X", flush=True) + adata.X = adata.layers[par["layer_counts"]] + del adata.layers[par["layer_counts"]] + +if par["sparse"] and not scipy.sparse.issparse(adata.X): + print("Make counts sparse", flush=True) + adata.X = scipy.sparse.csr_matrix(adata.X) + +print("Removing empty genes", flush=True) +sc.pp.filter_genes(adata, min_cells=1) + +print("Removing empty cells", flush=True) +sc.pp.filter_cells(adata, min_counts=2) + +print("Moving .X to .layers['counts']", flush=True) +adata.layers["counts"] = adata.X +del adata.X + +print("Add metadata to uns", flush=True) +metadata_fields = [ + "dataset_id", "dataset_name", "dataset_url", "dataset_reference", + "dataset_summary", "dataset_description", "dataset_organism" +] +uns_metadata = { + id: par[id] + for id in metadata_fields + if id in par +} +adata.uns.update(uns_metadata) + +print("Setting .var['feature_name']", flush=True) + +if par["var_feature_name"] == "index": + adata.var["feature_name"] 
= adata.var.index +else: + if par["var_feature_name"] in adata.var: + adata.var["feature_name"] = adata.var[par["var_feature_name"]] # copy the column named by --var_feature_name + del adata.var[par["var_feature_name"]] # drop the source column after copying + else: + print(f"Warning: key '{par['var_feature_name']}' could not be found in adata.var.", flush=True) + +print("Setting .var['feature_id']", flush=True) + +if par["var_feature_id"] == "index": + adata.var["feature_id"] = adata.var.index +else: + if par["var_feature_id"] in adata.var: + adata.var["feature_id"] = adata.var[par["var_feature_id"]] # copy the column named by --var_feature_id + del adata.var[par["var_feature_id"]] # drop the source column after copying + else: + print(f"Warning: key '{par['var_feature_id']}' could not be found in adata.var.", flush=True) + +print("Writing adata to file", flush=True) +adata.write_h5ad(par["output"], compression="gzip") diff --git a/src/datasets/loaders/openproblems_v1/test.py b/src/datasets/loaders/openproblems_v1/test.py new file mode 100644 index 0000000000..f1b0389837 --- /dev/null +++ b/src/datasets/loaders/openproblems_v1/test.py @@ -0,0 +1,55 @@ +from os import path +import subprocess +import anndata as ad + +input_id = "pancreas" +dataset_id = "openproblems_v1/" + input_id +output = "dataset.h5ad" +obs_cell_type = "celltype" +obs_batch = "tech" + +print(">> Running script", flush=True) +out = subprocess.run( + [ + meta["executable"], + "--input_id", input_id, + "--dataset_id", dataset_id, + "--obs_cell_type", obs_cell_type, + "--obs_batch", obs_batch, + "--layer_counts", "counts", + "--output", output, + "--dataset_name", "Pancreas", + "--dataset_url", "http://foo.org", + "--dataset_reference", "foo2000bar", + "--dataset_summary", "A short summary.", + "--dataset_description", "A couple of paragraphs worth of text.", + "--dataset_organism", "homo_sapiens", + ], + stderr=subprocess.STDOUT +) + +if out.stdout: + print(out.stdout) + +if out.returncode: + print(f"script: '{out.args}' exited with an error.") + exit(out.returncode) + +print(">> Checking whether file exists", flush=True) +assert path.exists(output), "Output does not exist" +
+print(">> Read output anndata", flush=True) +adata = ad.read_h5ad(output) + +print(adata) + +print(">> Check that output fits expected API", flush=True) +assert adata.X is None, "adata.X should be None/empty" +assert "counts" in adata.layers, "Counts layer not found in output layers" +assert adata.uns["dataset_id"] == dataset_id, f"Expected {dataset_id} as value" +if obs_cell_type: + assert "cell_type" in adata.obs.columns, "'cell_type' column not found in obs of anndata output" +if obs_batch: + assert "batch" in adata.obs.columns, "'batch' column not found in obs of anndata output" + +print(">> All tests passed successfully", flush=True) diff --git a/src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml b/src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml new file mode 100644 index 0000000000..812e52be62 --- /dev/null +++ b/src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml @@ -0,0 +1,94 @@ +functionality: + name: "openproblems_v1_multimodal" + namespace: "datasets/loaders" + description: "Fetch a dataset from OpenProblems v1" + argument_groups: + - name: Inputs + arguments: + - name: "--input_id" + type: "string" + description: "The ID of the dataset in OpenProblems v1" + required: true + - name: "--obs_cell_type" + type: "string" + description: "Location of where to find the observation cell types." + - name: "--obs_batch" + type: "string" + description: "Location of where to find the observation batch IDs." + - name: "--obs_tissue" + type: "string" + description: "Location of where to find the observation tissue information." + - name: "--layer_counts" + type: "string" + description: "In which layer to find the counts matrix. Leave undefined to use `.X`." + example: counts + - name: "--sparse" + type: boolean + default: true + description: Convert layers to a sparse CSR format. + - name: "--var_feature_id" + type: "string" + description: "Location of where to find the feature IDs. 
Can be set to index if the feature IDs are the index." + example: gene_ids + - name: "--var_feature_name" + type: "string" + description: "Location of where to find the feature names. Can be set to index if the feature names are the index." + default: index + - name: Metadata + arguments: + - name: "--dataset_id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. 
+ required: false + - name: Outputs + arguments: + - name: "--output_mod1" + __merge__: ../../api/file_raw.yaml + direction: "output" + - name: "--output_mod2" + __merge__: ../../api/file_raw.yaml + direction: "output" + resources: + - type: python_script + path: script.py + test_resources: + - type: python_script + path: test.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: apt + packages: git + - type: docker + run: | + git clone https://github.com/openproblems-bio/openproblems.git /opt/openproblems && \ + pip install --no-cache-dir -r /opt/openproblems/docker/openproblems/requirements.txt && \ + pip install --no-cache-dir --editable /opt/openproblems + - type: nextflow + directives: + label: [highmem, midcpu , midtime] diff --git a/src/datasets/loaders/openproblems_v1_multimodal/script.py b/src/datasets/loaders/openproblems_v1_multimodal/script.py new file mode 100644 index 0000000000..f70e92d048 --- /dev/null +++ b/src/datasets/loaders/openproblems_v1_multimodal/script.py @@ -0,0 +1,169 @@ +from typing import Any, Callable, Dict, Tuple +import openproblems as op +import scanpy as sc +import scipy +import pandas as pd + +## VIASH START +par = { + "dataset_id": "scicar_mouse_kidney", + "obs_tissue": "source", + "obs_cell_type": "cell_type", + "layer_counts": "counts", + "output": "test_data.h5ad", + "dataset_name": "name", + "dataset_url": "https://some.url", + "dataset_reference": "reference", + "dataset_summary": "summary", + "dataset_description": "description", + "dataset_organism": "[homo_sapiens, mus_musculus]", +} +meta = { + "resources_dir": "src/datasets/loaders/openproblems_v1/" +} +## VIASH END + + +# make dataset lookup table +# If need be, this could be stored in a separate yaml file +dataset_funs: Dict[str, Tuple[Callable, Dict[str, Any]]] = { + "citeseq_cbmc": (op.data.multimodal.citeseq.load_citeseq_cbmc, {}), + "scicar_cell_lines": (op.data.multimodal.scicar.load_scicar_cell_lines, {}), + 
"scicar_mouse_kidney": (op.data.multimodal.scicar.load_scicar_mouse_kidney, {}), +} + +# fetch dataset +dataset_fun, kwargs = dataset_funs[par["input_id"]] + +print("Fetch dataset", flush=True) +adata = dataset_fun(**kwargs) + +print(f"source adata: {adata}", flush=True) + +# construct modality2 dataset +mod2_var_data = { + key.replace("mode2_var_", ""): adata.uns[key] + for key in adata.uns.keys() + if key.startswith("mode2_var_") +} +mod2_var = pd.DataFrame( + mod2_var_data, + index=adata.uns["mode2_var"] +) +mod2_obs = adata.obs.loc[adata.uns["mode2_obs"]] +mod2 = sc.AnnData( + obs=mod2_obs, + var=mod2_var, + layers={ "counts": adata.obsm["mode2"] } +) + +# construct modality1 dataset +mod1 = adata.copy() +mod1.uns = { key: value for key, value in mod1.uns.items() if not key.startswith("mode2_")} +mod1.obsm = { key: value for key, value in mod1.obsm.items() if not key.startswith("mode2_")} +mod1.obsp = { key: value for key, value in mod1.obsp.items() if not key.startswith("mode2_")} +mod1.varm = { key: value for key, value in mod1.varm.items() if not key.startswith("mode2_")} +mod1.varp = { key: value for key, value in mod1.varp.items() if not key.startswith("mode2_")} + +# override values one by one because adata.uns and +# metadata are two different classes. 
+for key, value in dataset_fun.metadata.items(): + print(f"Setting .uns['{key}']", flush=True) + mod1.uns[key] = value + mod2.uns[key] = value + +print("Setting .obs['cell_type']", flush=True) +if par["obs_cell_type"]: + if par["obs_cell_type"] in mod1.obs: + mod1.obs["cell_type"] = mod1.obs[par["obs_cell_type"]] + mod2.obs["cell_type"] = mod2.obs[par["obs_cell_type"]] + else: + print(f"Warning: key '{par['obs_cell_type']}' could not be found in adata.obs.", flush=True) + +print("Setting .obs['batch']", flush=True) +if par["obs_batch"]: + if par["obs_batch"] in mod1.obs: + mod1.obs["batch"] = mod1.obs[par["obs_batch"]] + mod2.obs["batch"] = mod2.obs[par["obs_batch"]] + else: + print(f"Warning: key '{par['obs_batch']}' could not be found in adata.obs.", flush=True) + +print("Setting .obs['tissue']", flush=True) +if par["obs_tissue"]: + if par["obs_tissue"] in mod1.obs: + mod1.obs["tissue"] = mod1.obs[par["obs_tissue"]] + mod2.obs["tissue"] = mod2.obs[par["obs_tissue"]] + else: + print(f"Warning: key '{par['obs_tissue']}' could not be found in adata.obs.", flush=True) + +if par["layer_counts"] and par["layer_counts"] in mod1.layers: + print(f"Temporarily moving mod1.layers['{par['layer_counts']}']", flush=True) + mod1_X = mod1.layers[par["layer_counts"]] + del mod1.layers[par["layer_counts"]] +else: + print("Temporarily moving mod1.X", flush=True) + mod1_X = mod1.X + del mod1.X + +if par["sparse"] and not scipy.sparse.issparse(mod1_X): + print("Make mod1 counts sparse", flush=True) + mod1_X = scipy.sparse.csr_matrix(mod1_X) + +if par["sparse"] and not scipy.sparse.issparse(mod2.layers["counts"]): + print("Make mod2 counts sparse", flush=True) + mod2.layers["counts"] = scipy.sparse.csr_matrix(mod2.layers["counts"]) + +print("Moving .X to .layers['counts']", flush=True) +mod1.layers["counts"] = mod1_X + +# just in case +del mod1.X +del mod2.X + +print("Setting .var['feature_name']", flush=True) +if par["var_feature_name"] == "index": + mod1.var["feature_name"] = 
mod1.var.index + mod2.var["feature_name"] = mod2.var.index +else: + if par["var_feature_name"] in mod1.var: + mod1.var["feature_name"] = mod1.var[par["var_feature_name"]] # copy the column named by --var_feature_name + del mod1.var[par["var_feature_name"]] + else: + print(f"Warning: key '{par['var_feature_name']}' could not be found in adata_mod1.var.", flush=True) + if par["var_feature_name"] in mod2.var: + mod2.var["feature_name"] = mod2.var[par["var_feature_name"]] # same lookup for modality 2 + del mod2.var[par["var_feature_name"]] + else: + print(f"Warning: key '{par['var_feature_name']}' could not be found in adata_mod2.var.", flush=True) + +print("Setting .var['feature_id']", flush=True) +if par["var_feature_id"] == "index": + mod1.var["feature_id"] = mod1.var.index + mod2.var["feature_id"] = mod2.var.index +else: + if par["var_feature_id"] in mod1.var: + mod1.var["feature_id"] = mod1.var[par["var_feature_id"]] # copy the column named by --var_feature_id + del mod1.var[par["var_feature_id"]] + else: + print(f"Warning: key '{par['var_feature_id']}' could not be found in adata_mod1.var.", flush=True) + if par["var_feature_id"] in mod2.var: + mod2.var["feature_id"] = mod2.var[par["var_feature_id"]] # same lookup for modality 2 + del mod2.var[par["var_feature_id"]] + else: + print(f"Warning: key '{par['var_feature_id']}' could not be found in adata_mod2.var.", flush=True) + + +print("Add metadata to uns", flush=True) +metadata_fields = [ + "dataset_id", "dataset_name", "dataset_url", "dataset_reference", + "dataset_summary", "dataset_description", "dataset_organism" +] +for key in metadata_fields: + if key in par: + print(f" Setting .uns['{key}']", flush=True) + mod1.uns[key] = par[key] + mod2.uns[key] = par[key] + +print("Writing adata to file", flush=True) +mod1.write_h5ad(par["output_mod1"], compression="gzip") +mod2.write_h5ad(par["output_mod2"], compression="gzip") diff --git a/src/datasets/loaders/openproblems_v1_multimodal/test.py b/src/datasets/loaders/openproblems_v1_multimodal/test.py new file mode 100644 index 0000000000..d6ead5c88d --- /dev/null +++ b/src/datasets/loaders/openproblems_v1_multimodal/test.py @@ -0,0 +1,85 @@
+from os import path +import subprocess +import anndata as ad + +input_id = "scicar_mouse_kidney" +dataset_id = "openproblems_v1_multimodal/" + input_id +obs_cell_type = "cell_name" +obs_batch = "replicate" +obs_tissue = None + +output_mod1_file = "output_mod1.h5ad" +output_mod2_file = "output_mod2.h5ad" + +print(">> Running script", flush=True) +out = subprocess.run( + [ + meta["executable"], + "--input_id", input_id, + "--dataset_id", dataset_id, + "--obs_cell_type", obs_cell_type, + "--obs_batch", obs_batch, + "--layer_counts", "counts", + "--output_mod1", output_mod1_file, + "--output_mod2", output_mod2_file, + "--dataset_name", "Pancreas", + "--dataset_url", "http://foo.org", + "--dataset_reference", "foo2000bar", + "--dataset_summary", "A short summary.", + "--dataset_description", "A couple of paragraphs worth of text.", + "--dataset_organism", "homo_sapiens", + ], + stderr=subprocess.STDOUT +) + +if out.stdout: + print(out.stdout, flush=True) + +if out.returncode: + print(f"script: '{out.args}' exited with an error.", flush=True) + exit(out.returncode) + +print(">> Checking whether files exist", flush=True) +assert path.exists(output_mod1_file), "Output mod1 file does not exist" +assert path.exists(output_mod2_file), "Output mod2 file does not exist" + +print(">> Read output anndata", flush=True) +output_mod1 = ad.read_h5ad(output_mod1_file) +output_mod2 = ad.read_h5ad(output_mod2_file) + +print(f"output_mod1: {output_mod1}", flush=True) +print(f"output_mod2: {output_mod2}", flush=True) + +print(">> Check that output mod1 fits expected API", flush=True) +assert output_mod1.X is None, ".X is not None/empty in mod 1 output" +assert "counts" in output_mod1.layers, "'counts' not found in mod 1 output layers" +if obs_cell_type: + assert "cell_type" in output_mod1.obs.columns, "cell_type column not found in mod 1 output obs" +if obs_batch: + assert "batch" in output_mod1.obs.columns, "batch column not found in mod 1 output obs" +if obs_tissue: + assert "tissue" 
in output_mod1.obs.columns, "tissue column not found in mod 1 output obs" +assert output_mod1.uns["dataset_id"] == dataset_id, f"Expected: {dataset_id} as value for dataset_id in mod 1 output uns" +assert output_mod1.uns["dataset_name"] == "Pancreas", "Expected: Pancreas as value for dataset_name in mod 1 output uns" +assert output_mod1.uns["dataset_url"] == "http://foo.org", "Expected: http://foo.org as value for dataset_url in mod 1 output uns" +assert output_mod1.uns["dataset_reference"] == "foo2000bar", "Expected: foo2000bar as value for dataset_reference in mod 1 output uns" +assert output_mod1.uns["dataset_summary"] == "A short summary.", "Expected: A short summary. as value for dataset_summary in mod 1 output uns" +assert output_mod1.uns["dataset_description"] == "A couple of paragraphs worth of text.", "Expected: A couple of paragraphs worth of text. as value for dataset_description in mod 1 output uns" + +print(">> Check that output mod2 fits expected API", flush=True) +assert output_mod2.X is None, ".X is not None/empty in mod 2 output" +assert "counts" in output_mod2.layers, "'counts' not found in mod 2 output layers" +if obs_cell_type: + assert "cell_type" in output_mod2.obs.columns, "cell_type column not found in mod 2 output obs" +if obs_batch: + assert "batch" in output_mod2.obs.columns, "batch column not found in mod 2 output obs" +if obs_tissue: + assert "tissue" in output_mod2.obs.columns, "tissue column not found in mod 2 output obs" +assert output_mod2.uns["dataset_id"] == dataset_id, f"Expected: {dataset_id} as value for dataset_id in mod 2 output uns" +assert output_mod2.uns["dataset_name"] == "Pancreas", "Expected: Pancreas as value for dataset_name in mod 2 output uns" +assert output_mod2.uns["dataset_url"] == "http://foo.org", "Expected: http://foo.org as value for dataset_url in mod 2 output uns" +assert output_mod2.uns["dataset_reference"] == "foo2000bar", "Expected: foo2000bar as value for dataset_reference in mod 2 output uns" +assert 
output_mod2.uns["dataset_summary"] == "A short summary.", "Expected: A short summary. as value for dataset_summary in mod 2 output uns" +assert output_mod2.uns["dataset_description"] == "A couple of paragraphs worth of text.", "Expected: A couple of paragraphs worth of text. as value for dataset_description in mod 2 output uns" + +print(">> All tests passed successfully", flush=True) diff --git a/src/datasets/loaders/tenx_visium/config.vsh.yaml b/src/datasets/loaders/tenx_visium/config.vsh.yaml new file mode 100644 index 0000000000..ba28b32b89 --- /dev/null +++ b/src/datasets/loaders/tenx_visium/config.vsh.yaml @@ -0,0 +1,96 @@ +functionality: + name: tenx_visium + namespace: datasets/loaders + description: | + Download a SpaceRanger h5 gene expression file and spatial imaging data from the 10x genomics website (or someplace else). + + argument_groups: + - name: Inputs + arguments: + - name: "--input_expression" + type: string + description: URL to the feature / barcode matrix HDF5 of the 10x dataset. + required: true + - name: "--input_spatial" + type: string + description: URL to the Spatial imaging data of the 10x dataset. + required: true + - name: Outputs + arguments: + - name: "--dataset" + type: file + direction: output + description: Output h5ad file + required: true + example: dataset.h5ad + - name: Metadata + arguments: + - name: "--dataset_id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. 
+ required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: "--spot_filter_min_genes" + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: "--spot_filter_min_counts" + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: "--gene_filter_min_spots" + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + example: 50 + - name: "--gene_filter_min_counts" + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: "--remove_mitochondrial" + type: boolean + description: Remove mitochondrial genes? 
+ required: false + + resources: + - type: python_script + path: script.py + test_resources: + - type: python_script + path: test.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: + - squidpy + - type: nextflow diff --git a/src/datasets/loaders/tenx_visium/script.py b/src/datasets/loaders/tenx_visium/script.py new file mode 100644 index 0000000000..7de04e6b5e --- /dev/null +++ b/src/datasets/loaders/tenx_visium/script.py @@ -0,0 +1,82 @@ +import subprocess +import squidpy as sq +import tempfile +import scanpy as sc + +## VIASH START +par = { + "input_expression": "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Mouse_Brain_Rep1/CytAssist_FFPE_Mouse_Brain_Rep1_filtered_feature_bc_matrix.h5", + "input_spatial": "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Mouse_Brain_Rep1/CytAssist_FFPE_Mouse_Brain_Rep1_spatial.tar.gz", + "dataset_id": "tenx_visium/mouse_brain_coronal_section1_visium", + "dataset_name": "Mouse Brain Coronal Section 1 (FFPE)", + "dataset_url": "https://www.10xgenomics.com/datasets/mouse-brain-coronal-section-1-ffpe-2-standard", + "dataset_summary": "Gene expression library of Mouse Brain (CytAssist FFPE) using the Mouse Whole Transcriptome Probe Set", + "dataset_organism": "Mus musculus", + "dataset": "dataset.h5ad", + "spot_filter_min_genes": 200, + "gene_filter_min_spots": 50, + "remove_mitochondrial": True +} +meta = { + "functionality_name": "tenx_visium" +} +## VIASH END + +print("Downloading data", flush=True) +with tempfile.TemporaryDirectory() as tempdir:  # the downloads are written into this temporary directory + input_exp = "feature_bc_matrix.h5" + input_sp = "image_data.tar.gz" + epx_data = subprocess.run(["wget", "-O", f"{tempdir}/{input_exp}", par['input_expression']], stderr=subprocess.STDOUT, check=True)  # check=True: raise CalledProcessError on a failed download instead of continuing with a missing or partial file + sp_data = subprocess.run(["wget", "-O", f"{tempdir}/{input_sp}", par['input_spatial']], stderr=subprocess.STDOUT, check=True)  # same fail-fast behaviour for the spatial archive + extract_spatial = subprocess.run(["tar", "-xzf",
f"{tempdir}/{input_sp}", "-C", tempdir], stderr=subprocess.STDOUT) + + # Read visium data and create anndata object + adata = sq.read.visium(path=tempdir, counts_file=input_exp) + +# Make variable names unique +adata.var_names_make_unique() + +sc.pp.calculate_qc_metrics(adata, inplace=True) + +print("Filtering spots or genes") +t0 = adata.shape +# remove cells with few counts +if par["spot_filter_min_counts"]: + sc.pp.filter_cells(adata, min_counts=par["spot_filter_min_counts"], inplace=True) +# remove cells with few genes +if par["spot_filter_min_genes"]: + sc.pp.filter_cells(adata, min_genes=par["spot_filter_min_genes"], inplace=True) +# remove genes that have few counts +if par["gene_filter_min_counts"]: + sc.pp.filter_genes(adata, min_counts=par["gene_filter_min_counts"], inplace=True) +# remove genes that are found in few cells +if par["gene_filter_min_spots"]: + sc.pp.filter_genes(adata, min_cells=par["gene_filter_min_spots"], inplace=True) +t1 = adata.shape +print(f"Removed {t0[0] - t1[0]} cells and {(t0[1] - t1[1])} genes.") + +if par["remove_mitochondrial"]: + print("Removing mitochondrial genes") + non_mito_genes_list = [name for name in adata.var_names if not (name.startswith('MT-') or name.startswith('mt-'))] + adata = adata[:, non_mito_genes_list] + +# Rename .var columns +adata.var['feature_name'] = adata.var_names +adata.var.set_index(adata.var['gene_ids'], inplace=True) +adata.var.rename(columns={"gene_ids": "feature_id"}, inplace=True) + +# Move counts to .layers +print("Add metadata to uns", flush=True) +adata.layers["counts"] = adata.X +adata.X = None + +# Add metadata +print("Add metadata to uns", flush=True) +metadata_fields = ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"] +for key in metadata_fields: + if key in par: + print(f"Setting .uns['{key}']", flush=True) + adata.uns[key] = par[key] + +print("Writing adata to file", flush=True) 
+adata.write_h5ad(par["dataset"], compression="gzip") \ No newline at end of file diff --git a/src/datasets/loaders/tenx_visium/test.py b/src/datasets/loaders/tenx_visium/test.py new file mode 100644 index 0000000000..a559ae1d3d --- /dev/null +++ b/src/datasets/loaders/tenx_visium/test.py @@ -0,0 +1,57 @@ +import os +import subprocess +import anndata as ad + +input_expression ="https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Mouse_Brain_Rep1/CytAssist_FFPE_Mouse_Brain_Rep1_filtered_feature_bc_matrix.h5" +input_spatial = "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Mouse_Brain_Rep1/CytAssist_FFPE_Mouse_Brain_Rep1_spatial.tar.gz" +dataset_id = "10x_visium/mouse_brain_coronal_section1" +dataset_name = "Mouse Brain Coronal Section 1 (FFPE)" +dataset_url = "https://www.10xgenomics.com/datasets/mouse-brain-coronal-section-1-ffpe-2-standard" +dataset_summary = "Gene expression library of Mouse Brain (CytAssist FFPE) using the Mouse Whole Transcriptome Probe Set" +dataset_description = "CytAssist_FFPE_Mouse_Brain_Rep1 - Gene expression library of Mouse Brain (CytAssist FFPE) using the Mouse Whole Transcriptome Probe Set" +dataset_organism = "Mus musculus" +dataset = "dataset.h5ad" + +print(">> Running script", flush=True) +out = subprocess.run( + [ + meta['executable'], + "--input_expression", input_expression, + "--input_spatial", input_spatial, + "--dataset_id", dataset_id, + "--dataset_name", dataset_name, + "--dataset_url", dataset_url, + "--dataset_summary", dataset_summary, + "--dataset_description", dataset_description, + "--dataset_organism", dataset_organism, + "--dataset", dataset + ], + stderr=subprocess.STDOUT +) + +if out.stdout: + print(out.stdout, flush=True) + +if out.returncode: + print(f"script: '{out.args}' exited with an error.", flush=True) + exit(out.returncode) + +print(">> Checking whether output file exists", flush=True) +assert os.path.exists(dataset), "Output does not exist" + +print(">> Read output 
anndata", flush=True) +adata = ad.read_h5ad(dataset) + +print(adata) + +print(">> Check that output fits expected API", flush=True) +assert adata.X is None, "adata.X should be None/empty" +assert "counts" in adata.layers, "Counts layer not found in .layers" +assert adata.uns["dataset_id"] == dataset_id, f"Expected {dataset_id} as value" +assert adata.uns["dataset_name"] == dataset_name, f"Expected {dataset_name} as value" +assert adata.uns["dataset_url"] == dataset_url, f"Expected {dataset_url} as value" +assert adata.uns["dataset_summary"] == dataset_summary, f"Expected {dataset_summary} as value" +assert adata.uns["dataset_organism"] == dataset_organism, f"Expected {dataset_organism} as value" +assert 'spatial' in adata.obsm, "Spatial spot coordinates not found in .obsm" + +print(">> All tests passed successfully", flush=True) diff --git a/src/datasets/loaders/zenodo_spatial/config.vsh.yaml b/src/datasets/loaders/zenodo_spatial/config.vsh.yaml new file mode 100644 index 0000000000..e4204802e1 --- /dev/null +++ b/src/datasets/loaders/zenodo_spatial/config.vsh.yaml @@ -0,0 +1,87 @@ +functionality: + name: zenodo_spatial + namespace: datasets/loaders + description: | + Download an Anndata file containing DBiT seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo. + argument_groups: + - name: Inputs + arguments: + - name: "--input_data" + type: string + description: URL to the Anndata file. + required: true + - name: Outputs + arguments: + - name: "--dataset" + type: file + direction: output + description: Output h5ad file + required: true + example: dataset.h5ad + - name: Metadata + arguments: + - name: "--dataset_id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. 
+ required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: "--spot_filter_min_genes" + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: "--spot_filter_min_counts" + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: "--gene_filter_min_spots" + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + example: 50 + - name: "--gene_filter_min_counts" + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: "--remove_mitochondrial" + type: boolean + description: Remove mitochondrial genes? 
+ required: false + + resources: + - type: python_script + path: script.py + test_resources: + - type: python_script + path: test.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 + - type: nextflow diff --git a/src/datasets/loaders/zenodo_spatial/script.py b/src/datasets/loaders/zenodo_spatial/script.py new file mode 100644 index 0000000000..83aeb86056 --- /dev/null +++ b/src/datasets/loaders/zenodo_spatial/script.py @@ -0,0 +1,85 @@ +import subprocess +import tempfile +import scanpy as sc + +# VIASH START +par = { + "input_data": "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1",  # NOTE(review): this component's test.py uses record 12784832; confirm which record id is current + "dataset_id": "zenodo_spatial/mouse_olfactory_bulb_puck_slideseqv2", + "dataset_name": "Mouse Olfactory Bulb Puck", + "dataset_url": "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary", + "dataset_summary": "Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2", + "dataset_organism": "Mus musculus", + "dataset": "dataset.h5ad", + "spot_filter_min_genes": 10, + "gene_filter_min_spots": 500, + "remove_mitochondrial": True +} +meta = { + "functionality_name": "zenodo_spatial" +} +# VIASH END + +print("Downloading data", flush=True) +with tempfile.TemporaryDirectory() as tempdir:  # the download is written into this temporary directory + input_data = "input_data.h5ad" + epx_data = subprocess.run(["wget", "-O", f"{tempdir}/{input_data}", par['input_data']], stderr=subprocess.STDOUT, check=True)  # check=True: raise CalledProcessError on a failed download instead of reading a missing or partial file + adata = sc.read_h5ad(filename=f"{tempdir}/{input_data}") + +# Make variable names unique +adata.var_names_make_unique() + +sc.pp.calculate_qc_metrics(adata, inplace=True, percent_top=None) + +print("Filtering spots or genes") +t0 = adata.shape +# remove cells with few counts +if par.get("spot_filter_min_counts"):  # .get(): the filter arguments are optional and may be absent from par + sc.pp.filter_cells( + adata,
min_counts=par["spot_filter_min_counts"], inplace=True) + +# remove cells with few genes +if par.get("spot_filter_min_genes"):  # .get(): the filter arguments are optional and may be absent from par + sc.pp.filter_cells( + adata, min_genes=par["spot_filter_min_genes"], inplace=True) + +# remove genes that have few counts +if par.get("gene_filter_min_counts"): + sc.pp.filter_genes( + adata, min_counts=par["gene_filter_min_counts"], inplace=True) + +# remove genes that are found in few cells +if par.get("gene_filter_min_spots"): + sc.pp.filter_genes( + adata, min_cells=par["gene_filter_min_spots"], inplace=True) + +t1 = adata.shape +print(f"Removed {t0[0] - t1[0]} cells and {(t0[1] - t1[1])} genes.") + +if par.get("remove_mitochondrial"): + print("Removing mitochondrial genes") + non_mito_genes_list = [name for name in adata.var_names + if not name.startswith(('MT-', 'mt-'))] + adata = adata[:, non_mito_genes_list].copy()  # copy so the .var edits below act on a real object, not a view + +# Rename .var columns +adata.var['feature_name'] = adata.var_names +if 'gene_ids' in adata.var:  # re-index by gene id only when the input provides that column + adata.var.set_index(adata.var['gene_ids'], inplace=True) + adata.var.rename(columns={"gene_ids": "feature_id"}, inplace=True) + +# Move counts to .layers +print("Move counts to .layers", flush=True) +adata.layers["counts"] = adata.X +adata.X = None + +# Add metadata +print("Add metadata to uns", flush=True) +metadata_fields = ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"] +for key in metadata_fields: + if key in par:  # only copy the metadata fields that were supplied + print(f"Setting .uns['{key}']", flush=True) + adata.uns[key] = par[key] + +print("Writing adata to file", flush=True) +adata.write_h5ad(par["dataset"], compression="gzip") diff --git a/src/datasets/loaders/zenodo_spatial/test.py b/src/datasets/loaders/zenodo_spatial/test.py new file mode 100644 index 0000000000..07dcd953a8 --- /dev/null +++ b/src/datasets/loaders/zenodo_spatial/test.py @@ -0,0 +1,55 @@ +import os +import subprocess +import anndata as ad + +input_data
="https://zenodo.org/records/12784832/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1" +dataset_id = "zenodo_spatial/mouse_olfactory_bulb_puck" +dataset_name = "mouse_olfactory_bulb_puck" +dataset_url = "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" +dataset_summary = "Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2" +dataset_description = "Gene expression library of mouse olfactory bulk puck profiled using Slide-seq V2" +dataset_organism = "Mus musculus" +dataset = "dataset.h5ad" + +print(">> Running script", flush=True) +out = subprocess.run( + [ + meta['executable'], + "--input_data", input_data, + "--dataset_id", dataset_id, + "--dataset_name", dataset_name, + "--dataset_url", dataset_url, + "--dataset_summary", dataset_summary, + "--dataset_description", dataset_description, + "--dataset_organism", dataset_organism, + "--dataset", dataset + ], + stderr=subprocess.STDOUT +) + +if out.stdout: + print(out.stdout, flush=True) + +if out.returncode: + print(f"script: '{out.args}' exited with an error.", flush=True) + exit(out.returncode) + +print(">> Checking whether output file exists", flush=True) +assert os.path.exists(dataset), "Output does not exist" + +print(">> Read output anndata", flush=True) +adata = ad.read_h5ad(dataset) + +print(adata) + +print(">> Check that output fits expected API", flush=True) +assert adata.X is None, "adata.X should be None/empty" +assert "counts" in adata.layers, "Counts layer not found in .layers" +assert adata.uns["dataset_id"] == dataset_id, f"Expected {dataset_id} as value" +assert adata.uns["dataset_name"] == dataset_name, f"Expected {dataset_name} as value" +assert adata.uns["dataset_url"] == dataset_url, f"Expected {dataset_url} as value" +assert adata.uns["dataset_summary"] == dataset_summary, 
f"Expected {dataset_summary} as value" +assert adata.uns["dataset_organism"] == dataset_organism, f"Expected {dataset_organism} as value" +assert 'spatial' in adata.obsm, "Spatial spot coordinates not found in .obsm" + +print(">> All tests passed successfully", flush=True) diff --git a/src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml b/src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml new file mode 100644 index 0000000000..8a6dfb189e --- /dev/null +++ b/src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml @@ -0,0 +1,88 @@ +functionality: + name: zenodo_spatial_slidetags + namespace: datasets/loaders + description: | + Download a compressed file containing gene expression matrix and spatial locations from zenodo. + + argument_groups: + - name: Inputs + arguments: + - name: "--input_data" + type: string + description: URL to the file. + required: true + - name: Outputs + arguments: + - name: "--dataset" + type: file + direction: output + description: Output h5ad file + required: true + example: dataset.h5ad + - name: Metadata + arguments: + - name: "--dataset_id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. 
+ arguments: + - name: "--spot_filter_min_genes" + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: "--spot_filter_min_counts" + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: "--gene_filter_min_spots" + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + example: 50 + - name: "--gene_filter_min_counts" + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: "--remove_mitochondrial" + type: boolean + description: Remove mitochondrial genes? + required: false + + resources: + - type: python_script + path: script.py + test_resources: + - type: python_script + path: test.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 + - type: nextflow diff --git a/src/datasets/loaders/zenodo_spatial_slidetags/script.py b/src/datasets/loaders/zenodo_spatial_slidetags/script.py new file mode 100644 index 0000000000..5a8cf212fa --- /dev/null +++ b/src/datasets/loaders/zenodo_spatial_slidetags/script.py @@ -0,0 +1,103 @@ +import subprocess +import pandas as pd +import tempfile +import scanpy as sc + +# VIASH START +par = { + "input_data": "https://zenodo.org/records/12785822/files/slidetag_human_cortex.tar.gz?download=1", + "dataset_id": "zenodo_spatial_slidetags/human_cortex_slidetags", + "dataset_name": "slidetag_human_cortex", + "dataset_url": "https://www.nature.com/articles/s41586-023-06837-4", + "dataset_summary": "Slide-tags enables single-nucleus barcoding for multimodal spatial genomics", + "dataset_organism": "Homo sapiens", + "dataset": "dataset.h5ad", + "spot_filter_min_genes": 200, + "gene_filter_min_spots": 50, + "remove_mitochondrial": True +} +meta = { + "functionality_name": "zenodo_spatial_slidetags" +} +# VIASH END + +print(f"Downloading data", flush=True) +with 
tempfile.TemporaryDirectory() as tempdir: + input_data = "input_data.tar.gz" + dataset_name = par['dataset_name'] + epx_data = subprocess.run( + ["wget", "-O", f"{tempdir}/{input_data}", par['input_data']], stderr=subprocess.STDOUT, check=True)  # check=True: raise CalledProcessError on a failed download + extract_spatial = subprocess.run( + ["tar", "-xzf", f"{tempdir}/{input_data}", "-C", tempdir, "--strip-components=1"], stderr=subprocess.STDOUT, check=True)  # check=True: stop here if the archive cannot be extracted + + # Read gene expression and create anndata object + adata = sc.read_10x_mtx(path=tempdir) + + # Read spatial locations + df = pd.read_csv(f"{tempdir}/spatial.csv", skiprows=1) + df = df.set_index('TYPE') + df.columns = ['spatial1', 'spatial2', 'cell_type'] + + # add spatial locations to anndata object + sel_cells = list(adata.obs_names[adata.obs_names.isin(df.index)])  # keep cells present in both tables, in adata order, so the output row order is reproducible (a set intersection has no stable order) + + df = df.loc[sel_cells, ] + adata = adata[sel_cells, ].copy()  # copy so the .obs/.obsm assignments below act on a real object, not a view + + adata.obs = df + adata.obsm['spatial'] = df[['spatial2', 'spatial1']].values  # coordinate columns are stored in swapped order (spatial2 first) + +# Make variable names unique +adata.var_names_make_unique() + +sc.pp.calculate_qc_metrics(adata, inplace=True) + +print("Filtering spots or genes") +t0 = adata.shape +# remove cells with few counts +if par.get("spot_filter_min_counts"):  # .get(): the filter arguments are optional and may be absent from par + sc.pp.filter_cells( + adata, min_counts=par["spot_filter_min_counts"], inplace=True) +# remove cells with few genes +if par.get("spot_filter_min_genes"): + sc.pp.filter_cells( + adata, min_genes=par["spot_filter_min_genes"], inplace=True) +# remove genes that have few counts +if par.get("gene_filter_min_counts"): + sc.pp.filter_genes( + adata, min_counts=par["gene_filter_min_counts"], inplace=True) +# remove genes that are found in few cells +if par.get("gene_filter_min_spots"): + sc.pp.filter_genes( + adata, min_cells=par["gene_filter_min_spots"], inplace=True) +t1 = adata.shape +print(f"Removed {t0[0] - t1[0]} cells and {(t0[1] - t1[1])} genes.") + +if par.get("remove_mitochondrial"): + print("Removing mitochondrial genes") + non_mito_genes_list = [name for name in adata.var_names + if not name.startswith(('MT-', 'mt-'))] + adata = adata[:, non_mito_genes_list].copy()  # copy so later .var edits act on a real object, not a view
+ + +# Rename .var columns +adata.var['feature_name'] = adata.var_names  # keep the original names as a column before re-indexing +adata.var.set_index(adata.var['gene_ids'], inplace=True) +adata.var.rename(columns={"gene_ids": "feature_id"}, inplace=True) + +# Move counts to .layers +print("Move counts to .layers", flush=True) +adata.layers["counts"] = adata.X +adata.X = None + +# Add metadata +print("Add metadata to uns", flush=True) +metadata_fields = ["dataset_id", "dataset_name", "dataset_url", + "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"] +for key in metadata_fields: + if key in par:  # only copy the metadata fields that were supplied + print(f"Setting .uns['{key}']", flush=True) + adata.uns[key] = par[key] + +print("Writing adata to file", flush=True) +adata.write_h5ad(par["dataset"], compression="gzip") diff --git a/src/datasets/loaders/zenodo_spatial_slidetags/test.py b/src/datasets/loaders/zenodo_spatial_slidetags/test.py new file mode 100644 index 0000000000..9f859ebea6 --- /dev/null +++ b/src/datasets/loaders/zenodo_spatial_slidetags/test.py @@ -0,0 +1,55 @@ +import os +import subprocess +import anndata as ad + +input_data = "https://zenodo.org/records/12785822/files/slidetag_human_cortex.tar.gz?download=1" +dataset_id = "zenodo_spatial_slidetags/human_cortex" +dataset_name = "slidetag_human_cortex" +dataset_url = "https://www.nature.com/articles/s41586-023-06837-4" +dataset_summary = "Slide-tags enables single-nucleus barcoding for multimodal spatial genomics" +dataset_description = "A 100 mm2 region of the human prefrontal cortex from a neurotypical donor aged 78 years was profiled by Slide-tags" +dataset_organism = "Homo sapiens" +dataset = "dataset.h5ad" + +print(">> Running script", flush=True) +out = subprocess.run( + [ + meta['executable'], + "--input_data", input_data, + "--dataset_id", dataset_id, + "--dataset_name", dataset_name, + "--dataset_url", dataset_url, + "--dataset_summary", dataset_summary, + "--dataset_description", dataset_description, + "--dataset_organism", dataset_organism, + "--dataset", dataset + ], +
stderr=subprocess.STDOUT +) + +if out.stdout: + print(out.stdout, flush=True) + +if out.returncode: + print(f"script: '{out.args}' exited with an error.", flush=True) + exit(out.returncode) + +print(">> Checking whether output file exists", flush=True) +assert os.path.exists(dataset), "Output does not exist" + +print(">> Read output anndata", flush=True) +adata = ad.read_h5ad(dataset) + +print(adata) + +print(">> Check that output fits expected API", flush=True) +assert adata.X is None, "adata.X should be None/empty" +assert "counts" in adata.layers, "Counts layer not found in .layers" +assert adata.uns["dataset_id"] == dataset_id, f"Expected {dataset_id} as value" +assert adata.uns["dataset_name"] == dataset_name, f"Expected {dataset_name} as value" +assert adata.uns["dataset_url"] == dataset_url, f"Expected {dataset_url} as value" +assert adata.uns["dataset_summary"] == dataset_summary, f"Expected {dataset_summary} as value" +assert adata.uns["dataset_organism"] == dataset_organism, f"Expected {dataset_organism} as value" +assert 'spatial' in adata.obsm, "Spatial spot coordinates not found in .obsm" + +print(">> All tests passed successfully", flush=True) diff --git a/src/datasets/normalization/atac_tfidf/config.vsh.yaml b/src/datasets/normalization/atac_tfidf/config.vsh.yaml new file mode 100644 index 0000000000..5a8f56306a --- /dev/null +++ b/src/datasets/normalization/atac_tfidf/config.vsh.yaml @@ -0,0 +1,22 @@ +__merge__: ../../api/comp_normalization.yaml +functionality: + name: "atac_tfidf" + description: | + Transform peak counts with TF-IDF (Term Frequency - Inverse Document Frequency). + + TF: peak counts are normalised by total number of counts per cell DF: total number of counts for each peak IDF: number of cells divided by DF + + By default, log(TF) * log(IDF) is returned. 
+ resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: + - muon + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/atac_tfidf/script.py b/src/datasets/normalization/atac_tfidf/script.py new file mode 100644 index 0000000000..ecb772bd64 --- /dev/null +++ b/src/datasets/normalization/atac_tfidf/script.py @@ -0,0 +1,26 @@ +import anndata as ad +from muon import atac as ac + +## VIASH START +par = { + 'input': "resources_test/common/openproblems_neurips2021/bmmc_cite/dataset_mod2.h5ad", + 'output': "output_norm.h5ad" +} +meta = { + 'functionality_name': "tfidf" +} +## VIASH END + +print("Load data", flush=True) +adata = ad.read_h5ad(par['input']) + +print("Normalize data", flush=True) +input_adata = ad.AnnData(X=adata.layers["counts"]) +normalized_counts = ac.pp.tfidf(input_adata, inplace=False) + +print("Store output in adata", flush=True) +adata.layers[par["layer_output"]] = normalized_counts +adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] + +print("Write data", flush=True) +adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/normalization/l1_sqrt/config.vsh.yaml b/src/datasets/normalization/l1_sqrt/config.vsh.yaml new file mode 100644 index 0000000000..212eadc968 --- /dev/null +++ b/src/datasets/normalization/l1_sqrt/config.vsh.yaml @@ -0,0 +1,27 @@ +__merge__: ../../api/comp_normalization.yaml +functionality: + name: "l1_sqrt" + description: | + Scaled L1 sqrt normalization. + + This normalization method causes all cells to have the same sum of values. + + Steps: + + * Compute the square root of the counts. + * Apply L1 normalization (rescaled such that the sum of the values of each cell sum to 1). + * Multiply by the median UMI count per cell, causing all cells to have the sum of values. 
+ resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: + - scprep + - numpy<2 + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/l1_sqrt/script.py b/src/datasets/normalization/l1_sqrt/script.py new file mode 100644 index 0000000000..76c69cf897 --- /dev/null +++ b/src/datasets/normalization/l1_sqrt/script.py @@ -0,0 +1,29 @@ +import anndata as ad +import scprep +import numpy as np + +## VIASH START +par = { + 'input': "output_train.h5ad", + 'output': "output_norm.h5ad" +} +meta = { + 'functionality_name': "l1_sqrt" +} +## VIASH END + +print("Load data", flush=True) +adata = ad.read_h5ad(par['input']) + +print("Normalize data", flush=True) +# libsize and sqrt L1 norm +sqrt_data = scprep.utils.matrix_transform(adata.layers['counts'], np.sqrt) +l1_sqrt, libsize = scprep.normalize.library_size_normalize(sqrt_data, rescale=1, return_library_size=True) +l1_sqrt = l1_sqrt.tocsr() + +print("Store output in adata", flush=True) +adata.layers[par["layer_output"]] = l1_sqrt +adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] + +print("Write data", flush=True) +adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/normalization/log_cp/config.vsh.yaml b/src/datasets/normalization/log_cp/config.vsh.yaml new file mode 100644 index 0000000000..89b2a283f9 --- /dev/null +++ b/src/datasets/normalization/log_cp/config.vsh.yaml @@ -0,0 +1,18 @@ +__merge__: ../../api/comp_normalization.yaml +functionality: + name: "log_cp" + description: "Normalize data using Log CP" + resources: + - type: python_script + path: script.py + arguments: + - name: "--n_cp" + type: integer + default: 1e4 + description: "Number of counts per cell. When set to -1, will use None." 
+platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/log_cp/script.py b/src/datasets/normalization/log_cp/script.py new file mode 100644 index 0000000000..39ddf61636 --- /dev/null +++ b/src/datasets/normalization/log_cp/script.py @@ -0,0 +1,42 @@ +import scanpy as sc + +## VIASH START +par = { + 'input': "resources_test/common/pancreas/dataset.h5ad", + 'output': "output.h5ad", + 'layer_output': "log_cp10k", + 'obs_size_factors': "log_cp10k_size_factors", + 'n_cp': 1e6, +} +meta = { + "functionality_name": "normalize_log_cp10k" +} +## VIASH END + +print(">> Load data", flush=True) +adata = sc.read_h5ad(par['input']) + +print(">> Normalize data", flush=True) +if par["n_cp"] == -1: + norm = sc.pp.normalize_total( + adata, + target_sum=None, + layer="counts", + inplace=False + ) +else: + norm = sc.pp.normalize_total( + adata, + target_sum=par["n_cp"], + layer="counts", + inplace=False + ) +lognorm = sc.pp.log1p(norm["X"]) + +print(">> Store output in adata", flush=True) +adata.layers[par["layer_output"]] = lognorm +adata.obs[par["obs_size_factors"]] = norm["norm_factor"] +adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name'] + +print(">> Write data", flush=True) +adata.write_h5ad(par['output'], compression="gzip") diff --git a/src/datasets/normalization/log_scran_pooling/config.vsh.yaml b/src/datasets/normalization/log_scran_pooling/config.vsh.yaml new file mode 100644 index 0000000000..4cbf81ff5a --- /dev/null +++ b/src/datasets/normalization/log_scran_pooling/config.vsh.yaml @@ -0,0 +1,18 @@ +__merge__: ../../api/comp_normalization.yaml +functionality: + name: "log_scran_pooling" + description: "Normalize data using scran pooling" + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ Matrix, rlang, scran, BiocParallel ] + - 
type: python + pip: scanpy + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/log_scran_pooling/script.R b/src/datasets/normalization/log_scran_pooling/script.R new file mode 100644 index 0000000000..be51e21f38 --- /dev/null +++ b/src/datasets/normalization/log_scran_pooling/script.R @@ -0,0 +1,38 @@ +cat(">> Loading dependencies\n") +library(anndata, warn.conflicts = FALSE) +requireNamespace("scran", quietly = TRUE) +requireNamespace("BiocParallel", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE) + +## VIASH START +par <- list( + input = "resources_test/label_projection/pancreas/datas.h5ad",  # NOTE(review): "datas.h5ad" looks like a typo for "dataset.h5ad"; confirm + output = "output.scran.h5ad", + layer_output = "log_scran_pooling", + obs_size_factors = "size_factors_log_scran_pooling" +) +## VIASH END + +cat(">> Load data\n") +adata <- anndata::read_h5ad(par$input) +counts <- as(t(adata$layers[["counts"]]), "CsparseMatrix")  # transposed counts layer, converted to a CsparseMatrix for scran + +cat(">> Normalizing data\n") +size_factors <- scran::calculateSumFactors( + counts, + min.mean = 0.1, + BPPARAM = BiocParallel::MulticoreParam() +) +lognorm <- log1p(sweep(adata$layers[["counts"]], 1, size_factors, "*"))  # NOTE(review): this multiplies each row of the counts layer by its size factor; size-factor normalization usually divides, so confirm that "*" is intended + +cat(">> Storing in anndata\n") +adata$obs[[par$obs_size_factors]] <- size_factors +adata$layers[[par$layer_output]] <- lognorm +norm_id <- par[["normalization_id"]] +if (is.null(norm_id)) {  # fall back to the component name when no normalization_id is given + norm_id <- meta[["functionality_name"]] +} +adata$uns[["normalization_id"]] <- norm_id + +cat(">> Writing to file\n") +zzz <- adata$write_h5ad(par$output, compression = "gzip")  # the return value is captured in zzz and not used afterwards diff --git a/src/datasets/normalization/prot_clr/config.vsh.yaml b/src/datasets/normalization/prot_clr/config.vsh.yaml new file mode 100644 index 0000000000..8f6bbe269f --- /dev/null +++ b/src/datasets/normalization/prot_clr/config.vsh.yaml @@ -0,0 +1,26 @@ +__merge__: ../../api/comp_normalization.yaml +functionality: + name: "prot_clr" + description: | + Perform center log ratio (CLR) normalization on input CITE-seq data (Stoeckius et al. 2017).
+
+    The CLR transformation is defined as:
+
+    $$
+    x_{\text{clr}} = \log\left(\frac{x}{g(x)}\right)
+    $$
+
+    where $g(x)$ is the geometric mean of the row $x$.
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    setup:
+      - type: python
+        packages:
+          - muon
+  - type: nextflow
+    directives:
+      label: [midtime, midmem, midcpu]
diff --git a/src/datasets/normalization/prot_clr/script.py b/src/datasets/normalization/prot_clr/script.py
new file mode 100644
index 0000000000..3f0a2fb3fd
--- /dev/null
+++ b/src/datasets/normalization/prot_clr/script.py
@@ -0,0 +1,35 @@
+# Centered log ratio (CLR) normalization of the `counts` layer via muon.
+import anndata as ad
+from muon import prot as pt
+
+## VIASH START
+# Placeholder values for running the script directly (presumably replaced by viash at build time).
+par = {
+    'input': "resources_test/common/openproblems_neurips2021/bmmc_cite/dataset_mod2.h5ad",
+    'output': "output_norm.h5ad",
+    'layer_output': "clr",
+    'normalization_id': None,
+}
+meta = {
+    'functionality_name': "clr"
+}
+## VIASH END
+
+print("Load data", flush=True)
+adata = ad.read_h5ad(par['input'])
+
+print("Normalize data", flush=True)
+# CLR runs on a bare AnnData that holds only the counts as .X.
+input_adata = ad.AnnData(X=adata.layers["counts"])
+normalized_counts = pt.pp.clr(input_adata, inplace=False)
+# Compare against None explicitly: the truth value of an AnnData object is not
+# a reliable "nothing was returned" signal.
+if normalized_counts is None:
+    raise RuntimeError("CLR failed to return the requested output layer")
+
+print("Store output in adata", flush=True)
+adata.layers[par["layer_output"]] = normalized_counts.X
+adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name']
+
+print("Write data", flush=True)
+adata.write_h5ad(par['output'], compression="gzip")
diff --git a/src/datasets/normalization/sqrt_cp/config.vsh.yaml b/src/datasets/normalization/sqrt_cp/config.vsh.yaml
new file mode 100644
index 0000000000..4d95636f4c
--- /dev/null
+++ b/src/datasets/normalization/sqrt_cp/config.vsh.yaml
@@ -0,0 +1,18 @@
+__merge__: ../../api/comp_normalization.yaml
+functionality:
+  name: "sqrt_cp"
+  description: "Normalize data using the square root of library-size-normalized counts"
+  resources:
+    - type: python_script
+      path: script.py
+  arguments:
+    - name: "--n_cp"
+      type: integer
+      default:
10000
+      description: "Number of counts per cell"
+platforms:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+  - type: nextflow
+    directives:
+      label: [midtime, midmem, midcpu]
diff --git a/src/datasets/normalization/sqrt_cp/script.py b/src/datasets/normalization/sqrt_cp/script.py
new file mode 100644
index 0000000000..84afdaa19d
--- /dev/null
+++ b/src/datasets/normalization/sqrt_cp/script.py
@@ -0,0 +1,39 @@
+# Square-root transform of library-size-normalized counts from the `counts` layer.
+import scanpy as sc
+import numpy as np
+
+## VIASH START
+# Placeholder values for running the script directly (presumably replaced by viash at build time).
+par = {
+    'input': "resources_test/common/pancreas/dataset.h5ad",
+    'output': "output.h5ad",
+    'layer_output': "sqrt_cpm",
+    'obs_size_factors': "size_factors_sqrt_cpm",
+    'n_cp': 1e6,
+    'normalization_id': None,
+}
+meta = {
+    "functionality_name": "normalize_sqrt_cpm"
+}
+## VIASH END
+
+print(">> Load data", flush=True)
+adata = sc.read_h5ad(par['input'])
+
+print(">> Normalize data", flush=True)
+norm = sc.pp.normalize_total(
+    adata,
+    target_sum=par['n_cp'],
+    layer="counts",
+    inplace=False
+)
+sqrtnorm = np.sqrt(norm['X'])
+
+print(">> Store output in adata", flush=True)
+adata.layers[par["layer_output"]] = sqrtnorm
+adata.obs[par["obs_size_factors"]] = norm["norm_factor"]
+# Fall back to the component name when no explicit id was supplied.
+adata.uns["normalization_id"] = par["normalization_id"] or meta['functionality_name']
+
+print(">> Write data", flush=True)
+adata.write_h5ad(par['output'], compression="gzip")
diff --git a/src/datasets/processors/hvg/config.vsh.yaml b/src/datasets/processors/hvg/config.vsh.yaml
new file mode 100644
index 0000000000..aed18c6d38
--- /dev/null
+++ b/src/datasets/processors/hvg/config.vsh.yaml
@@ -0,0 +1,13 @@
+__merge__: ../../api/comp_processor_hvg.yaml
+functionality:
+  name: "hvg"
+  description: "Compute HVG"
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+  - type: nextflow
+    directives:
+      label: [midtime, highmem, midcpu]
diff --git a/src/datasets/processors/hvg/script.py b/src/datasets/processors/hvg/script.py
new file mode 100644
index 0000000000..60af4317bb
---
/dev/null
+++ b/src/datasets/processors/hvg/script.py
@@ -0,0 +1,39 @@
+# Flag highly variable genes and store their normalized dispersion as a score.
+import scanpy as sc
+
+### VIASH START
+par = {
+    'input': 'work/ca/0751ff85df6f9478cb7bda5a705cad/zebrafish.sqrt_cpm.pca.output.h5ad',
+    'input_layer': 'normalized',
+    'output': 'dataset.h5ad',
+    'var_hvg': 'hvg',
+    'var_hvg_score': 'hvg_score',
+    'num_features': 100
+}
+### VIASH END
+
+print(">> Load data", flush=True)
+adata = sc.read_h5ad(par['input'])
+
+print(">> Look for layer", flush=True)
+# Fail early with a clear message when the requested layer is absent.
+if par['input_layer'] and par['input_layer'] not in adata.layers:
+    raise KeyError(f"Layer '{par['input_layer']}' not found in the input .layers")
+
+print(">> Run HVG", flush=True)
+# input_layer is handed to scanpy unchanged, including when it is empty.
+out = sc.pp.highly_variable_genes(
+    adata,
+    layer=par["input_layer"],
+    n_top_genes=par["num_features"],
+    flavor='cell_ranger',
+    inplace=False
+)
+
+print(">> Storing output", flush=True)
+adata.var[par["var_hvg"]] = out['highly_variable'].values
+adata.var[par["var_hvg_score"]] = out['dispersions_norm'].values
+
+print(">> Writing data", flush=True)
+adata.write_h5ad(par['output'])
+
diff --git a/src/datasets/processors/knn/config.vsh.yaml b/src/datasets/processors/knn/config.vsh.yaml
new file mode 100644
index 0000000000..9908fe9086
--- /dev/null
+++ b/src/datasets/processors/knn/config.vsh.yaml
@@ -0,0 +1,13 @@
+__merge__: ../../api/comp_processor_knn.yaml
+functionality:
+  name: "knn"
+  description: "Compute KNN"
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+  - type: nextflow
+    directives:
+      label: [midtime, highmem, midcpu]
diff --git a/src/datasets/processors/knn/script.py b/src/datasets/processors/knn/script.py
new file mode 100644
index 0000000000..ae364f6ba3
--- /dev/null
+++ b/src/datasets/processors/knn/script.py
@@ -0,0 +1,27 @@
+# Compute a kNN graph on the embedding stored in .obsm['X_pca'].
+import scanpy as sc
+
+### VIASH START
+par = {
+    'input': 'work/ca/0751ff85df6f9478cb7bda5a705cad/zebrafish.sqrt_cpm.pca.output.h5ad',
+    'input_layer': 'normalized',
+    'output': 'dataset.h5ad',
+    'key_added': 'knn',
+    'num_neighbors': 15
+}
+### VIASH END
+
+print(">> Load data", flush=True)
+adata = sc.read(par['input'])
+
+print(">> Run kNN", flush=True)
+sc.pp.neighbors(
+    adata,
+    use_rep='X_pca',
+    key_added=par['key_added'],
+    n_neighbors=par['num_neighbors']
+)
+
+print(">> Writing data", flush=True)
+adata.write_h5ad(par['output'])
+
diff --git a/src/datasets/processors/pca/config.vsh.yaml b/src/datasets/processors/pca/config.vsh.yaml
new file mode 100644
index 0000000000..7f0213b922
--- /dev/null
+++ b/src/datasets/processors/pca/config.vsh.yaml
@@ -0,0 +1,17 @@
+__merge__: ../../api/comp_processor_pca.yaml
+functionality:
+  name: "pca"
+  description: "Compute PCA"
+  resources:
+    - type: python_script
+      path: script.py
+  # test_resources:
+  #   - type: python_script
+  #     path: test_script.py
+  #   - path: "../../../resources_test/common/pancreas"
+platforms:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+  - type: nextflow
+    directives:
+      label: [midtime, highmem, midcpu]
diff --git a/src/datasets/processors/pca/script.py b/src/datasets/processors/pca/script.py
new file mode 100644
index 0000000000..d56d376259
--- /dev/null
+++ b/src/datasets/processors/pca/script.py
@@ -0,0 +1,42 @@
+# Compute a PCA embedding, its loadings and the explained variance for one layer.
+import scanpy as sc
+
+### VIASH START
+par = {
+    'input': 'resources_test/common/pancreas/dataset.h5ad',
+    'input_layer': 'log_cp10k',
+    'output': 'dataset.h5ad',
+    'obsm_embedding': 'X_pca',
+    'varm_loadings': 'pca_loadings',
+    'uns_variance': 'pca_variance',
+    'num_components': 25
+}
+### VIASH END
+
+print(">> Load data", flush=True)
+adata = sc.read(par['input'])
+
+print(">> Look for layer", flush=True)
+layer = adata.X if not par['input_layer'] else adata.layers[par['input_layer']]
+
+print(">> Run PCA", flush=True)
+# For array input with return_info=True scanpy returns
+# (X_pca, components, variance_ratio, variance): the ratio comes first.
+X_pca, loadings, variance_ratio, variance = sc.tl.pca(
+    layer,
+    n_comps=par["num_components"],
+    return_info=True
+)
+
+print(">> Storing output", flush=True)
+adata.obsm[par["obsm_embedding"]] = X_pca
+# Transposed before storing, presumably so that rows line up with the variables in .varm.
+adata.varm[par["varm_loadings"]] = loadings.T
+adata.uns[par["uns_variance"]] = {
+    "variance": variance,
+    "variance_ratio": variance_ratio
+}
+
+print(">> Writing data", flush=True)
+adata.write_h5ad(par['output'])
+
diff --git a/src/datasets/processors/subsample/config.vsh.yaml b/src/datasets/processors/subsample/config.vsh.yaml
new file mode 100644
index 0000000000..4e52e93db5
--- /dev/null
+++ b/src/datasets/processors/subsample/config.vsh.yaml
@@ -0,0 +1,51 @@
+__merge__: ../../api/comp_processor_subset.yaml
+functionality:
+  name: "subsample"
+  description: "Subsample an h5ad file"
+  arguments:
+    - name: "--n_obs"
+      type: integer
+      description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed.
+      default: 500
+    - name: "--n_vars"
+      type: integer
+      description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed.
+      default: 500
+    - name: "--keep_features"
+      type: string
+      multiple: true
+      description: A list of genes to keep.
+    - name: "--keep_cell_type_categories"
+      type: "string"
+      multiple: true
+      description: "Cell type indexes to be selected"
+      required: false
+    - name: "--keep_batch_categories"
+      type: "string"
+      multiple: true
+      description: "Categories indexes to be selected"
+      required: false
+    - name: "--even"
+      type: "boolean_true"
+      description: Subsample evenly from different batches
+    - name: "--seed"
+      type: "integer"
+      description: "A seed for the subsampling."
+      example: 123
+  resources:
+    - type: python_script
+      path: script.py
+  test_resources:
+    - type: python_script
+      path: test_script.py
+    - path: /resources_test/common/pancreas
+platforms:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    test_setup:
+      - type: python
+        packages:
+          - viashpy
+  - type: nextflow
+    directives:
+      label: [midtime, highmem, midcpu]
diff --git a/src/datasets/processors/subsample/script.py b/src/datasets/processors/subsample/script.py
new file mode 100644
index 0000000000..c2347349c0
--- /dev/null
+++ b/src/datasets/processors/subsample/script.py
@@ -0,0 +1,151 @@
+# Subsample observations and features of one h5ad dataset, or of two paired ones.
+import scanpy as sc
+import random
+import numpy as np
+
+### VIASH START
+par = {
+    "input": "resources_test/common/scicar_cell_lines/temp_mod1_full.h5ad",
+    "input_mod2": "resources_test/common/scicar_cell_lines/temp_mod2_full.h5ad",
+    "n_obs": 600,
+    "n_vars": 1500,
+    "keep_cell_type_categories": None,
+    "keep_batch_categories": None,
+    "keep_features": None,
+    "even": False,
+    "output": "subsample_mod1.h5ad",
+    "output_mod2": "subsample_mod2.h5ad",
+    "seed": 123
+}
+### VIASH END
+
+
+def _row_sums(adata):
+    """Return the per-observation sum of .X as a flat array."""
+    # np.asarray(...).ravel() copes with both np.matrix and ndarray results.
+    return np.asarray(adata.X.sum(axis=1)).ravel()
+
+
+def _remove_empty(adata, adata_mod2=None):
+    """Drop near-empty observations and empty features.
+
+    With a second modality, an observation is kept only when its total count
+    exceeds 1 in both modalities, so both objects keep the same rows.
+    Returns the filtered (adata, adata_mod2) pair; adata_mod2 is None when
+    no second modality was given.
+    """
+    if adata_mod2 is None:
+        # todo: this should not remove features in keep_features!
+        print(">> Remove empty observations and features", flush=True)
+        sc.pp.filter_genes(adata, min_cells=1)
+        sc.pp.filter_cells(adata, min_counts=2)
+        return adata, None
+
+    keep_cells = np.minimum(_row_sums(adata), _row_sums(adata_mod2)) > 1
+    adata = adata[keep_cells, :].copy()
+    adata_mod2 = adata_mod2[keep_cells, :].copy()
+    sc.pp.filter_genes(adata, min_cells=1)
+    sc.pp.filter_genes(adata_mod2, min_cells=1)
+    return adata, adata_mod2
+
+
+has_mod2 = par.get("input_mod2") is not None
+
+# A seed of 0 is valid, so compare against None instead of relying on truthiness.
+if par.get("seed") is not None:
+    print(f">> Setting seed to {par['seed']}", flush=True)
+    random.seed(par["seed"])
+    # All sampling below goes through np.random, which keeps its own state.
+    np.random.seed(par["seed"])
+
+print(">> Load data", flush=True)
+adata_input = sc.read_h5ad(par["input"])
+adata_mod2 = sc.read_h5ad(par["input_mod2"]) if has_mod2 else None
+
+# copy counts to .X because otherwise filter_genes and filter_cells won't work
+adata_input.X = adata_input.layers["counts"]
+if has_mod2:
+    adata_mod2.X = adata_mod2.layers["counts"]
+
+print(">> Subsampling the observations", flush=True)
+obs_filt = np.ones(dtype=np.bool_, shape=adata_input.n_obs)
+
+# subset by cell_type
+if par.get("keep_cell_type_categories"):
+    print(f">> Selecting cell_type_categories {par['keep_cell_type_categories']}")
+    obs_filt = obs_filt & adata_input.obs["cell_type"].isin(par["keep_cell_type_categories"])
+
+# subset by batch
+if par.get("keep_batch_categories"):
+    print(f">> Selecting batch_categories {par['keep_batch_categories']}")
+    obs_filt = obs_filt & adata_input.obs["batch"].isin(par["keep_batch_categories"])
+
+print(">> Determining output shape", flush=True)
+choice_ix = np.where(obs_filt)[0]
+# Sampling without replacement cannot return more observations than pass the
+# category filters, so clamp the request instead of letting np.random raise.
+n_obs = min(par["n_obs"], len(choice_ix))
+if has_mod2:
+    n_obs = min(n_obs, adata_mod2.shape[0])
+
+# subsample evenly across batches or not
+if par.get("even"):
+    choice_batch = adata_input[choice_ix].obs["batch"]
+    names, counts = np.unique(choice_batch, return_counts=True)
+    # Every batch gets the same total probability, split evenly over its cells.
+    probs = dict(zip(names, 1 / counts / len(names)))
+    choice_probs = [probs[batch] for batch in choice_batch]
+    obs_index = np.random.choice(choice_ix, size=n_obs, replace=False, p=choice_probs)
+else:
+    obs_index = np.random.choice(choice_ix, n_obs, replace=False)
+
+# subsample obs
+adata_output = adata_input[obs_index].copy()
+adata_output_mod2 = adata_mod2[obs_index].copy() if has_mod2 else None
+
+# filter cells and genes
+adata_output, adata_output_mod2 = _remove_empty(adata_output, adata_output_mod2)
+
+print(">> Subsampling the features", flush=True)
+# Clamp to what is left after filtering; the request may exceed it.
+n_vars = min(par["n_vars"], adata_output.shape[1])
+if has_mod2:
+    n_vars = min(n_vars, adata_output_mod2.shape[1])
+
+if par.get("keep_features"):
+    initial_filt = adata_output.var_names.isin(par["keep_features"])
+    initial_idx, *_ = initial_filt.nonzero()
+    remaining_idx, *_ = (~initial_filt).nonzero()
+    # Top up the requested features with random ones; never a negative amount.
+    n_rest = min(max(n_vars - len(initial_idx), 0), len(remaining_idx))
+    rest_idx = remaining_idx[np.random.choice(len(remaining_idx), n_rest, replace=False)]
+    var_ix = np.concatenate([initial_idx, rest_idx])
+else:
+    var_ix = np.random.choice(adata_output.shape[1], n_vars, replace=False)
+
+# subsample vars
+adata_output = adata_output[:, var_ix].copy()
+if has_mod2:
+    # The second modality is sampled independently of keep_features, so
+    # var_ix_mod2 is defined on every path.
+    var_ix_mod2 = np.random.choice(adata_output_mod2.shape[1], n_vars, replace=False)
+    adata_output_mod2 = adata_output_mod2[:, var_ix_mod2].copy()
+
+# filter cells and genes
+adata_output, adata_output_mod2 = _remove_empty(adata_output, adata_output_mod2)
+
+print(">> Update dataset_id", flush=True)
+adata_output.uns["dataset_id"] = adata_output.uns["dataset_id"] + "_subsample"
+if has_mod2:
+    adata_output_mod2.uns["dataset_id"] = adata_output_mod2.uns["dataset_id"] + "_subsample"
+
+# remove previously copied .X
+del adata_output.X
+if has_mod2:
+    del adata_output_mod2.X
+
+print(">> Writing data", flush=True)
+adata_output.write_h5ad(par["output"])
+# Only write the second modality when it was actually loaded.
+if has_mod2 and par.get("output_mod2") is not None:
+    adata_output_mod2.write_h5ad(par["output_mod2"])
diff --git a/src/datasets/processors/subsample/test_script.py b/src/datasets/processors/subsample/test_script.py
new file mode 100644
index 0000000000..80dde5d383
--- /dev/null
+++ b/src/datasets/processors/subsample/test_script.py
@@ -0,0 +1,64 @@
+import sys
+import os
+import pytest
+import anndata as ad
+
+## VIASH START
+meta = {
+    "resources_dir": "resources_test/common"
+}
+## VIASH END
+
+input_path = f"{meta['resources_dir']}/pancreas/dataset.h5ad"
+input = ad.read_h5ad(input_path)
+
+def test_even_sampling(run_component):
+    output_path = "output.h5ad"
+    run_component([
+        "--input", input_path,
+        "--output", output_path,
+        "--even",
+        "--seed", "123",
+        "--n_obs", "100",
+        "--n_vars", "120"
+    ])
+
+    # Checking whether file exists
+    assert os.path.exists(output_path), "Output file not found"
+
+    # Check that test output fits expected API
+    output = ad.read_h5ad(output_path)
+
+    assert output.n_obs <= 100, "n_obs should be <= 100"
+    assert output.n_vars <= 120, "n_vars should be <= 120"
+
+
+def test_keep_functionality(run_component):
+    output_path = "output.h5ad"
+
+    # keep_features = list(input.var_names[:10])
+    # use genes with high enough expression
+    keep_features = ["ANP32E", "CBX5", "HMGB2"]
+
+    run_component([
+        "--input", input_path,
+        "--keep_cell_type_categories",
"acinar:beta",
+        "--keep_batch_categories", "celseq:inDrop4:smarter",
+        "--keep_features", ":".join(keep_features),
+        "--output", output_path,
+        "--seed", "123"
+    ])
+
+    # Checking whether file exists
+    assert os.path.exists(output_path), "Output file not found"
+
+    # Check that test output fits expected API
+    output = ad.read_h5ad(output_path)
+
+    assert output.n_obs <= 500, "n_obs should be <= 500"
+    assert output.n_vars <= 500, "n_vars should be <= 500"
+    for feat in keep_features:
+        assert feat in output.var_names, f"{feat} should be in output.var_names"
+
+if __name__ == '__main__':
+    sys.exit(pytest.main([__file__, "--capture=no"], plugins=["viashpy"]))
diff --git a/src/datasets/processors/svd/config.vsh.yaml b/src/datasets/processors/svd/config.vsh.yaml
new file mode 100644
index 0000000000..bbad17f58c
--- /dev/null
+++ b/src/datasets/processors/svd/config.vsh.yaml
@@ -0,0 +1,16 @@
+__merge__: ../../api/comp_processor_svd.yaml
+functionality:
+  name: "svd"
+  description: "Compute SVD pca reduction"
+  resources:
+    - type: python_script
+      path: script.py
+platforms:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    setup:
+      - type: python
+        pypi: [scikit-learn]
+  - type: nextflow
+    directives:
+      label: [midtime, highmem, midcpu]
diff --git a/src/datasets/processors/svd/script.py b/src/datasets/processors/svd/script.py
new file mode 100644
index 0000000000..8c94be407a
--- /dev/null
+++ b/src/datasets/processors/svd/script.py
@@ -0,0 +1,47 @@
+# Compute a truncated SVD embedding for one modality, or for two paired ones.
+import anndata as ad
+import sklearn.decomposition
+
+
+## VIASH START
+par = {
+    "input": "resources_test/common/scicar_cell_lines/normalized_mod1.h5ad",
+    "input_mod2": "resources_test/common/scicar_cell_lines/normalized_mod2.h5ad",
+    "output": "output.h5ad",
+    "output_mod2": "output_mod2.h5ad",
+    "input_layer": "normalized",
+    "obsm_embedding": "X_svd",
+    "num_components": 100,
+}
+## VIASH END
+
+has_mod2 = par["input_mod2"] is not None
+
+print(">> Load data", flush=True)
+adata = ad.read_h5ad(par["input"])
+if has_mod2:
+    adata2 = ad.read_h5ad(par["input_mod2"])
+
+print(">> check parameters", flush=True)
+# Cap the number of components at min(shape) - 1 of every input matrix; both
+# modalities share one n_svd, so their embeddings get the same width.
+min_list = [par["num_components"], min(adata.layers[par["input_layer"]].shape) - 1]
+if has_mod2:
+    min_list.append(min(adata2.layers[par["input_layer"]].shape) - 1)
+n_svd = min(min_list)
+
+print(">> Run SVD", flush=True)
+svd1 = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.layers[par["input_layer"]])
+if has_mod2:
+    svd2 = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata2.layers[par["input_layer"]])
+
+print(">> Storing output", flush=True)
+adata.obsm[par["obsm_embedding"]] = svd1
+if has_mod2:
+    adata2.obsm[par["obsm_embedding"]] = svd2
+
+print(">> Writing data", flush=True)
+adata.write_h5ad(par["output"])
+if has_mod2:
+    adata2.write_h5ad(par["output_mod2"])
+
diff --git a/src/datasets/resource_scripts/cellxgene_census.sh b/src/datasets/resource_scripts/cellxgene_census.sh
new file mode 100755
index 0000000000..f0d93c9210
--- /dev/null
+++ b/src/datasets/resource_scripts/cellxgene_census.sh
@@ -0,0 +1,153 @@
+#!/bin/bash
+
+# template for adding new datasets
+# - id: cellxgene_census/
+#   species:
+#   census_version: "2023-07-25"
+#   obs_value_filter: "dataset_id == ''"
+#   obs_batch:
+#   dataset_name:
+#   dataset_summary:
+#   dataset_description:
+#   dataset_url:
+#   dataset_reference:
+#   dataset_organism:
+
+# not sure which dataset ids to use
+# - id: cellxgene_census/human_brain_atlas
+#   species: homo_sapiens
+#   census_version: "2023-07-25"
+#   obs_value_filter: "dataset_id == ''" # <--- ?
+#   obs_batch: donor_id
+#   dataset_name: Human Brain Atlas
+#   dataset_summary: Single-Cell DNA Methylation and 3D Genome Human Brain Atlas
+#   dataset_description: Delineating the gene regulatory programs underlying complex cell types is fundamental for understanding brain functions in health and disease.
Here, we comprehensively examine human brain cell epigenomes by probing DNA methylation and chromatin conformation at single-cell resolution in over 500,000 cells from 46 brain regions. We identified 188 cell types and characterized their molecular signatures. Integrative analyses revealed concordant changes in DNA methylation, chromatin accessibility, chromatin organization, and gene expression across cell types, cortical areas, and basal ganglia structures. With these resources, we developed scMCodes that reliably predict brain cell types using their methylation status at select genomic sites. This multimodal epigenomic brain cell atlas provides new insights into the complexity of cell type-specific gene regulation in the adult human brain. +# dataset_url: https://cellxgene.cziscience.com/collections/fdebfda9-bb9a-4b4b-97e5-651097ea07b0 +# dataset_reference: tian2023singlecell +# dataset_organism: homo_sapiens + +cat > "/tmp/params.yaml" << 'HERE' +param_list: + - id: cellxgene_census/mouse_pancreas_atlas + species: mus_musculus + census_version: "2023-07-25" + obs_value_filter: "dataset_id == '49e4ffcc-5444-406d-bdee-577127404ba8'" + obs_batch: donor_id + dataset_name: Mouse Pancreatic Islet Atlas + dataset_summary: Mouse pancreatic islet scRNA-seq atlas across sexes, ages, and stress conditions including diabetes + dataset_description: To better understand pancreatic β-cell heterogeneity we generated a mouse pancreatic islet atlas capturing a wide range of biological conditions. The atlas contains scRNA-seq datasets of over 300,000 mouse pancreatic islet cells, of which more than 100,000 are β-cells, from nine datasets with 56 samples, including two previously unpublished datasets. The samples vary in sex, age (ranging from embryonic to aged), chemical stress, and disease status (including T1D NOD model development and two T2D models, mSTZ and db/db) together with different diabetes treatments. 
Additional information about data fields is available in anndata uns field 'field_descriptions' and on https://github.com/theislab/mm_pancreas_atlas_rep/blob/main/resources/cellxgene.md. + dataset_url: https://cellxgene.cziscience.com/collections/296237e2-393d-4e31-b590-b03f74ac5070 + dataset_reference: hrovatin2023delineating + dataset_organism: mus_musculus + - id: cellxgene_census/hcla + species: homo_sapiens + census_version: "2023-07-25" + obs_value_filter: "dataset_id == '066943a2-fdac-4b29-b348-40cede398e4e'" + obs_batch: donor_id + dataset_name: Human Lung Cell Atlas + dataset_summary: An integrated cell atlas of the human lung in health and disease (core) + dataset_description: The integrated Human Lung Cell Atlas (HLCA) represents the first large-scale, integrated single-cell reference atlas of the human lung. It consists of over 2 million cells from the respiratory tract of 486 individuals, and includes 49 different datasets. It is split into the HLCA core, and the extended or full HLCA. The HLCA core includes data of healthy lung tissue from 107 individuals, and includes manual cell type annotations based on consensus across 6 independent experts, as well as demographic, biological and technical metadata. + dataset_url: https://cellxgene.cziscience.com/collections/6f6d381a-7701-4781-935c-db10d30de293 + dataset_reference: sikkema2023integrated + dataset_organism: homo_sapiens + - id: cellxgene_census/tabula_sapiens + species: homo_sapiens + census_version: "2023-07-25" + obs_value_filter: "dataset_id == '53d208b0-2cfd-4366-9866-c3c6114081bc'" + obs_batch: [donor_id, assay] + dataset_name: Tabula Sapiens + dataset_summary: A multiple-organ, single-cell transcriptomic atlas of humans + dataset_description: Tabula Sapiens is a benchmark, first-draft human cell atlas of nearly 500,000 cells from 24 organs of 15 normal human subjects. This work is the product of the Tabula Sapiens Consortium. 
Taking the organs from the same individual controls for genetic background, age, environment, and epigenetic effects and allows detailed analysis and comparison of cell types that are shared between tissues. Our work creates a detailed portrait of cell types as well as their distribution and variation in gene expression across tissues and within the endothelial, epithelial, stromal and immune compartments. + dataset_url: https://cellxgene.cziscience.com/collections/e5f58829-1a66-40b5-a624-9046778e74f5 + dataset_reference: consortium2022tabula + dataset_organism: homo_sapiens + - id: cellxgene_census/immune_cell_atlas + species: homo_sapiens + census_version: "2023-07-25" + obs_value_filter: "dataset_id == '1b9d8702-5af8-4142-85ed-020eb06ec4f6'" + obs_batch: donor_id + dataset_name: Immune Cell Atlas + dataset_summary: Cross-tissue immune cell analysis reveals tissue-specific features in humans + dataset_description: Despite their crucial role in health and disease, our knowledge of immune cells within human tissues remains limited. We surveyed the immune compartment of 16 tissues from 12 adult donors by single-cell RNA sequencing and VDJ sequencing generating a dataset of ~360,000 cells. To systematically resolve immune cell heterogeneity across tissues, we developed CellTypist, a machine learning tool for rapid and precise cell type annotation. Using this approach, combined with detailed curation, we determined the tissue distribution of finely phenotyped immune cell types, revealing hitherto unappreciated tissue-specific features and clonal architecture of T and B cells. Our multitissue approach lays the foundation for identifying highly resolved immune cell types by leveraging a common reference dataset, tissue-integrated expression analysis, and antigen receptor sequencing. 
+ dataset_url: https://cellxgene.cziscience.com/collections/62ef75e4-cbea-454e-a0ce-998ec40223d3 + dataset_reference: dominguez2022crosstissue + dataset_organism: homo_sapiens + - id: cellxgene_census/gtex_v9 + species: homo_sapiens + census_version: "2023-07-25" + obs_value_filter: "dataset_id == '4ed927e9-c099-49af-b8ce-a2652d069333'" + obs_batch: donor_id + dataset_name: GTEX v9 + dataset_summary: Single-nucleus cross-tissue molecular reference maps to decipher disease gene function + dataset_description: Understanding the function of genes and their regulation in tissue homeostasis and disease requires knowing the cellular context in which genes are expressed in tissues across the body. Single cell genomics allows the generation of detailed cellular atlases in human tissues, but most efforts are focused on single tissue types. Here, we establish a framework for profiling multiple tissues across the human body at single-cell resolution using single nucleus RNA-Seq (snRNA-seq), and apply it to 8 diverse, archived, frozen tissue types (three donors per tissue). We apply four snRNA-seq methods to each of 25 samples from 16 donors, generating a cross-tissue atlas of 209,126 nuclei profiles, and benchmark them vs. scRNA-seq of comparable fresh tissues. We use a conditional variational autoencoder (cVAE) to integrate an atlas across tissues, donors, and laboratory methods. We highlight shared and tissue-specific features of tissue-resident immune cells, identifying tissue-restricted and non-restricted resident myeloid populations. These include a cross-tissue conserved dichotomy between LYVE1- and HLA class II-expressing macrophages, and the broad presence of LAM-like macrophages across healthy tissues that is also observed in disease. For rare, monogenic muscle diseases, we identify cell types that likely underlie the neuromuscular, metabolic, and immune components of these diseases, and biological processes involved in their pathology. 
For common complex diseases and traits analyzed by GWAS, we identify the cell types and gene modules that potentially underlie disease mechanisms. The experimental and analytical frameworks we describe will enable the generation of large-scale studies of how cellular and molecular processes vary across individuals and populations. + dataset_url: https://cellxgene.cziscience.com/collections/a3ffde6c-7ad2-498a-903c-d58e732f7470 + dataset_reference: eraslan2022singlenucleus + dataset_organism: homo_sapiens + - id: cellxgene_census/human_retina_cell_atlas + species: homo_sapiens + census_version: "2023-07-25" + obs_value_filter: "dataset_id == 'd6505c89-c43d-4c28-8c4f-7351a5fd5528'" + obs_batch: donor_id + dataset_name: Human Retina Cell Atlas + dataset_summary: Single cell atlas of the human retina + dataset_description: As the light sensing part of the visual system, the human retina is composed of five classes of neuron, including photoreceptors, horizontal cells, amacrine, bipolar, and retinal ganglion cells. Each class of neuron can be further classified into subgroups with the abundance varying three orders of magnitude. Therefore, to capture all cell types in the retina and generate a complete single cell reference atlas, it is essential to scale up from currently published single cell profiling studies to improve the sensitivity. In addition, to gain a better understanding of gene regulation at single cell level, it is important to include sufficient scATAC-seq data in the reference. To fill the gap, we performed snRNA-seq and snATAC-seq for the retina from healthy donors. To further increase the size of the dataset, we then collected and incorporated publicly available datasets. All data underwent a unified preprocessing pipeline and data integration. Multiple integration methods were benchmarked by scIB, and scVI was chosen. 
To harness the power of multiomics, snATAC-seq datasets were also preprocessed, and scGlue was used to generate co-embeddings between snRNA-seq and snATAC-seq cells. To facilitate the public use of references, we employ CELLxGENE and UCSC Cell Browser for visualization. By combining previously published and newly generated datasets, a single cell atlas of the human retina that is composed of 2.5 million single cells from 48 donors has been generated. As a result, over 90 distinct cell types are identified based on the transcriptomics profile with the rarest cell type accounting for about 0.01% of the cell population. In addition, open chromatin profiling has been generated for over 400K nuclei via single nuclei ATAC-seq, allowing systematic characterization of cis-regulatory elements for individual cell type. Integrative analysis reveals intriguing differences in the transcriptome, chromatin landscape, and gene regulatory network among cell class, subgroup, and type. In addition, changes in cell proportion, gene expression and chromatin openness have been observed between different gender and over age. Accessible through interactive browsers, this study represents the most comprehensive reference cell atlas of the human retina to date. As part of the human cell atlas project, this resource lays the foundation for further research in understanding retina biology and diseases. 
+ dataset_url: https://cellxgene.cziscience.com/collections/4c6eaf5c-6d57-4c76-b1e9-60df8c655f1e + dataset_reference: li2023integrated + dataset_organism: homo_sapiens + - id: cellxgene_census/dkd + species: homo_sapiens + census_version: "2023-07-25" + obs_value_filter: "dataset_id in ['ad0bf220-dd49-4b71-bb5c-576fee675d2b', 'e067e5ca-e53e-485f-aa8e-efd5435229c8']" + obs_batch: donor_id + dataset_name: Diabetic Kidney Disease + dataset_summary: Multimodal single cell sequencing implicates chromatin accessibility and genetic background in diabetic kidney disease progression + dataset_description: Multimodal single cell sequencing is a powerful tool for interrogating cell-specific changes in transcription and chromatin accessibility. We performed single nucleus RNA (snRNA-seq) and assay for transposase accessible chromatin sequencing (snATAC-seq) on human kidney cortex from donors with and without diabetic kidney disease (DKD) to identify altered signaling pathways and transcription factors associated with DKD. Both snRNA-seq and snATAC-seq had an increased proportion of VCAM1+ injured proximal tubule cells (PT_VCAM1) in DKD samples. PT_VCAM1 has a pro-inflammatory expression signature and transcription factor motif enrichment implicated NFkB signaling. We used stratified linkage disequilibrium score regression to partition heritability of kidney-function-related traits using publicly-available GWAS summary statistics. Cell-specific PT_VCAM1 peaks were enriched for heritability of chronic kidney disease (CKD), suggesting that genetic background may regulate chromatin accessibility and DKD progression. snATAC-seq found cell-specific differentially accessible regions (DAR) throughout the nephron that change accessibility in DKD and these regions were enriched for glucocorticoid receptor (GR) motifs. 
Changes in chromatin accessibility were associated with decreased expression of insulin receptor, increased gluconeogenesis, and decreased expression of the GR cytosolic chaperone, FKBP5, in the diabetic proximal tubule. Cleavage under targets and release using nuclease (CUT&RUN) profiling of GR binding in bulk kidney cortex and an in vitro model of the proximal tubule (RPTEC) showed that DAR co-localize with GR binding sites. CRISPRi silencing of GR response elements (GRE) in the FKBP5 gene body reduced FKBP5 expression in RPTEC, suggesting that reduced FKBP5 chromatin accessibility in DKD may alter cellular response to GR. We developed an open-source tool for single cell allele specific analysis (SALSA) to model the effect of genetic background on gene expression. Heterozygous germline single nucleotide variants (SNV) in proximal tubule ATAC peaks were associated with allele-specific chromatin accessibility and differential expression of target genes within cis-coaccessibility networks. Partitioned heritability of proximal tubule ATAC peaks with a predicted allele-specific effect was enriched for eGFR, suggesting that genetic background may modify DKD progression in a cell-specific manner. + dataset_url: https://cellxgene.cziscience.com/collections/b3e2c6e3-9b05-4da9-8f42-da38a664b45b + dataset_reference: wilson2022multimodal + dataset_organism: homo_sapiens + - id: cellxgene_census/hypomap + species: mus_musculus + census_version: "2023-07-25" + obs_value_filter: "dataset_id == 'dbb4e1ed-d820-4e83-981f-88ef7eb55a35'" + obs_batch: donor_id + dataset_name: HypoMap + dataset_summary: A unified single cell gene expression atlas of the murine hypothalamus + dataset_description: The hypothalamus plays a key role in coordinating fundamental body functions. Despite recent progress in single-cell technologies, a unified catalogue and molecular characterization of the heterogeneous cell types and, specifically, neuronal subtypes in this brain region are still lacking. 
Here we present an integrated reference atlas “HypoMap” of the murine hypothalamus consisting of 384,925 cells, with the ability to incorporate new additional experiments. We validate HypoMap by comparing data collected from SmartSeq2 and bulk RNA sequencing of selected neuronal cell types with different degrees of cellular heterogeneity. + dataset_url: https://cellxgene.cziscience.com/collections/d86517f0-fa7e-4266-b82e-a521350d6d36 + dataset_reference: steuernagel2022hypomap + dataset_organism: mus_musculus + +normalization_methods: [log_cp10k, sqrt_cp10k, l1_sqrt] +output_dataset: '$id/dataset.h5ad' +output_meta: '$id/dataset_metadata.yaml' +output_state: '$id/state.yaml' +output_raw: force_null +output_normalized: force_null +output_pca: force_null +output_hvg: force_null +output_knn: force_null +publish_dir: s3://openproblems-data/resources/datasets +HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' + withLabel: highmem { + memory = '350GB' + } + withName: '.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/datasets/workflows/process_cellxgene_census/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file "/tmp/params.yaml" \ + --config /tmp/nextflow.config \ + --labels cellxgene_census,dataset_loader diff --git a/src/datasets/resource_scripts/dataset_info.sh b/src/datasets/resource_scripts/dataset_info.sh new file mode 100755 index 0000000000..6ec2de9963 --- /dev/null +++ b/src/datasets/resource_scripts/dataset_info.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +DATASETS_DIR="s3://openproblems-data/resources/datasets" + +cat > "/tmp/params.yaml" << HERE +param_list: + - id: openproblems_v1 + input_states: "$DATASETS_DIR/openproblems_v1/**/log_cp10k/state.yaml" + rename_keys: 'input:output_dataset' + - id: openproblems_v1_multimodal + 
input_states: "$DATASETS_DIR/openproblems_v1_multimodal/**/log_cp10k/state.yaml" + rename_keys: 'input:output_mod1' + - id: cellxgene_census + input_states: "$DATASETS_DIR/cellxgene_census/**/log_cp10k/state.yaml" + rename_keys: 'input:output_dataset' +settings: '{"output": "dataset_info.yaml"}' +output_state: state.yaml +publish_dir: "$DATASETS_DIR" +HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' + withLabel: highmem { + memory = '350GB' + } + withName: '.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --entry-name auto \ + --pull-latest \ + --main-script target/nextflow/datasets/workflows/extract_dataset_info/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file "/tmp/params.yaml" \ + --config /tmp/nextflow.config + + +# # run locally after the above has finished +# nextflow run . \ +# -main-script target/nextflow/common/process_task_results/get_dataset_info/main.nf \ +# -profile docker \ +# -resume \ +# --input "$DATASETS_DIR/dataset_info.yaml" \ +# --task_id "common" \ +# --output "dataset_info.json" \ +# --output_state state.yaml \ +# --publish_dir "../website/documentation/reference/datasets/data/" \ No newline at end of file diff --git a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh new file mode 100755 index 0000000000..8fd7e3a72d --- /dev/null +++ b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +params_file="/tmp/datasets_openproblems_neurips2021_params.yaml" + +cat > "$params_file" << 'HERE' +param_list: + - id: openproblems_neurips2021/bmmc_cite + # input: "/tmp/neurips2021_bmmc_cite.h5ad" + input: 
"https://ftp.ncbi.nlm.nih.gov/geo/series/GSE194nnn/GSE194122/suppl/GSE194122%5Fopenproblems%5Fneurips2021%5Fcite%5FBMMC%5Fprocessed%2Eh5ad%2Egz" + mod1: GEX + mod2: ADT + dataset_name: NeurIPS2021 CITE-Seq + dataset_organism: homo_sapiens + dataset_summary: Single-cell CITE-Seq (GEX+ADT) data collected from bone marrow mononuclear cells of 12 healthy human donors. + dataset_description: "Single-cell CITE-Seq data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X 3 prime Single-Cell Gene Expression kit with Feature Barcoding in combination with the BioLegend TotalSeq B Universal Human Panel v1.0. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2021. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site." + + - id: openproblems_neurips2021/bmmc_multiome + # input: "/tmp/neurips2021_bmmc_multiome.h5ad" + input: "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE194nnn/GSE194122/suppl/GSE194122%5Fopenproblems%5Fneurips2021%5Fmultiome%5FBMMC%5Fprocessed%2Eh5ad%2Egz" + mod1: GEX + mod2: ATAC + dataset_name: NeurIPS2021 Multiome + dataset_organism: homo_sapiens + dataset_summary: Single-cell Multiome (GEX+ATAC) data collected from bone marrow mononuclear cells of 12 healthy human donors. + dataset_description: "Single-cell Multiome data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X Multiome Gene Expression and Chromatin Accessibility kit. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2021. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. 
The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site." + +dataset_url: "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE194122" +dataset_reference: luecken2021neurips +normalization_methods: [log_cp10k, sqrt_cp10k, l1_sqrt] +output_mod1: '$id/dataset_mod1.h5ad' +output_mod2: '$id/dataset_mod2.h5ad' +output_meta_mod1: '$id/dataset_metadata_mod1.yaml' +output_meta_mod2: '$id/dataset_metadata_mod2.yaml' +output_state: '$id/state.yaml' +publish_dir: s3://openproblems-data/resources/datasets +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file "$params_file" \ + --config src/wf_utils/labels_tw.config \ + --labels neurips2021,dataset_loader \ diff --git a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal_test.sh b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal_test.sh new file mode 100755 index 0000000000..be8444371b --- /dev/null +++ b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal_test.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +params_file="/tmp/datasets_openproblems_neurips2021_params.yaml" + +cat > "$params_file" << 'HERE' +param_list: + - id: openproblems_neurips2021/bmmc_cite + # input: "/tmp/neurips2021_bmmc_cite.h5ad" + input: "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE194nnn/GSE194122/suppl/GSE194122%5Fopenproblems%5Fneurips2021%5Fcite%5FBMMC%5Fprocessed%2Eh5ad%2Egz" + mod1: GEX + mod2: ADT + dataset_name: OpenProblems NeurIPS2021 CITE-Seq + dataset_organism: homo_sapiens + dataset_summary: Single-cell CITE-Seq (GEX+ADT) data collected from bone marrow mononuclear cells of 12 healthy human donors. 
+ dataset_description: "Single-cell CITE-Seq data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X 3 prime Single-Cell Gene Expression kit with Feature Barcoding in combination with the BioLegend TotalSeq B Universal Human Panel v1.0. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2021. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site." + + - id: openproblems_neurips2021/bmmc_multiome + # input: "/tmp/neurips2021_bmmc_multiome.h5ad" + input: "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE194nnn/GSE194122/suppl/GSE194122%5Fopenproblems%5Fneurips2021%5Fmultiome%5FBMMC%5Fprocessed%2Eh5ad%2Egz" + mod1: GEX + mod2: ATAC + dataset_name: OpenProblems NeurIPS2021 Multiome + dataset_organism: homo_sapiens + dataset_summary: Single-cell Multiome (GEX+ATAC) data collected from bone marrow mononuclear cells of 12 healthy human donors. + dataset_description: "Single-cell Multiome data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X Multiome Gene Expression and Chromatin Accessibility kit. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2021. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site." 
+ +dataset_url: "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE194122" +dataset_reference: luecken2021neurips +normalization_methods: [log_cp10k, sqrt_cp10k, l1_sqrt] +output_mod1: '$id/dataset_mod1.h5ad' +output_mod2: '$id/dataset_mod2.h5ad' +output_meta_mod1: '$id/dataset_metadata_mod1.yaml' +output_meta_mod2: '$id/dataset_metadata_mod2.yaml' +output_state: '$id/state.yaml' +publish_dir: resources/datasets/openproblems_neurips2021 +HERE + +export NXF_VER=23.10.1 +nextflow run . \ + -main-script target/nextflow/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf \ + -profile docker \ + -resume \ + -params-file "$params_file" diff --git a/src/datasets/resource_scripts/openproblems_neurips2022_pbmc.sh b/src/datasets/resource_scripts/openproblems_neurips2022_pbmc.sh new file mode 100755 index 0000000000..56b61ca104 --- /dev/null +++ b/src/datasets/resource_scripts/openproblems_neurips2022_pbmc.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +set -e + +params_file="/tmp/datasets_openproblems_neurips2022_params.yaml" + +cat > "$params_file" << 'HERE' +param_list: + - id: openproblems_neurips2022/pbmc_cite + input_mod1: s3://openproblems-nextflow/datasets_private/neurips2022/cite_rna_merged.h5ad + input_mod2: s3://openproblems-nextflow/datasets_private/neurips2022/cite_prot_merged.h5ad + mod1: GEX + mod2: ADT + dataset_name: OpenProblems NeurIPS2022 CITE-Seq + dataset_organism: homo_sapiens + dataset_summary: Single-cell CITE-Seq (GEX+ADT) data collected from bone marrow mononuclear cells of 12 healthy human donors. + dataset_description: "Single-cell CITE-Seq data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X 3 prime Single-Cell Gene Expression kit with Feature Barcoding in combination with the BioLegend TotalSeq B Universal Human Panel v1.0. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2022. Samples were prepared using a standard protocol at four sites. 
The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site." + + - id: openproblems_neurips2022/pbmc_multiome + input_mod1: s3://openproblems-nextflow/datasets_private/neurips2022/multiome_rna_merged.h5ad + input_mod2: s3://openproblems-nextflow/datasets_private/neurips2022/multiome_atac_merged.h5ad + mod1: GEX + mod2: ATAC + dataset_name: OpenProblems NeurIPS2022 Multiome + dataset_organism: homo_sapiens + dataset_summary: Single-cell Multiome (GEX+ATAC) data collected from bone marrow mononuclear cells of 12 healthy human donors. + dataset_description: "Single-cell Multiome data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X Multiome Gene Expression and Chromatin Accessibility kit. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2022. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site." 
+ +dataset_url: "https://www.kaggle.com/competitions/open-problems-multimodal/data" +dataset_reference: lance2024predicting +normalization_methods: [log_cp10k, sqrt_cp10k, l1_sqrt] +output_mod1: '$id/dataset_mod1.h5ad' +output_mod2: '$id/dataset_mod2.h5ad' +output_meta_mod1: '$id/dataset_metadata_mod1.yaml' +output_meta_mod2: '$id/dataset_metadata_mod2.yaml' +output_state: '$id/state.yaml' +publish_dir: s3://openproblems-data/resources/datasets +HERE + +cat > /tmp/nextflow.config << HERE +process { + withName:'.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf \ + --workspace 53907369739130 \ + --compute-env 1pK56PjjzeraOOC2LDZvN2 \ + --params-file "$params_file" \ + --config /tmp/nextflow.config \ + --labels openproblems_neurips2022_pbmc,dataset_loader \ diff --git a/src/datasets/resource_scripts/openproblems_v1.sh b/src/datasets/resource_scripts/openproblems_v1.sh new file mode 100755 index 0000000000..1a01e2120e --- /dev/null +++ b/src/datasets/resource_scripts/openproblems_v1.sh @@ -0,0 +1,182 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +params_file="/tmp/datasets_openproblems_v1_params.yaml" + +cat > "$params_file" << 'HERE' +param_list: + - id: openproblems_v1/allen_brain_atlas + obs_cell_type: label + layer_counts: counts + input_id: allen_brain_atlas + dataset_name: Mouse Brain Atlas + dataset_url: http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE71585 + dataset_reference: tasic2016adult + dataset_summary: Adult mouse primary visual cortex + dataset_description: A murine brain atlas with adjacent cell types as assumed benchmark truth, inferred from deconvolution proportion 
correlations using matching 10x Visium slides (see Dimitrov et al., 2022). + dataset_organism: mus_musculus + var_feature_name: index + + - id: openproblems_v1/cengen + obs_cell_type: cell_type + obs_batch: experiment_code + obs_tissue: tissue + layer_counts: counts + input_id: cengen + dataset_name: CeNGEN + dataset_url: https://www.cengen.org + dataset_reference: hammarlund2018cengen + dataset_summary: Complete Gene Expression Map of an Entire Nervous System + dataset_description: 100k FACS-isolated C. elegans neurons from 17 experiments sequenced on 10x Genomics. + dataset_organism: caenorhabditis_elegans + var_feature_name: index + + - id: openproblems_v1/immune_cells + obs_cell_type: final_annotation + obs_batch: batch + obs_tissue: tissue + layer_counts: counts + input_id: immune_cells + dataset_name: Human immune + dataset_url: https://theislab.github.io/scib-reproducibility/dataset_immune_cell_hum.html + dataset_reference: luecken2022benchmarking + dataset_summary: Human immune cells dataset from the scIB benchmarks + dataset_description: Human immune cells from peripheral blood and bone marrow taken from 5 datasets comprising 10 batches across technologies (10X, Smart-seq2). + dataset_organism: homo_sapiens + var_feature_name: index + + - id: openproblems_v1/mouse_blood_olsson_labelled + obs_cell_type: celltype + layer_counts: counts + input_id: mouse_blood_olsson_labelled + dataset_name: Mouse myeloid + dataset_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE70245 + dataset_reference: olsson2016single + dataset_summary: Myeloid lineage differentiation from mouse blood + dataset_description: 660 FACS-isolated myeloid cells from 9 experiments sequenced using C1 Fluidigm and SMARTseq in 2016 by Olsson et al. 
+ dataset_organism: mus_musculus + var_feature_name: index + + - id: openproblems_v1/mouse_hspc_nestorowa2016 + obs_cell_type: cell_type_label + layer_counts: counts + input_id: mouse_hspc_nestorowa2016 + dataset_name: Mouse HSPC + dataset_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE81682 + dataset_reference: nestorowa2016single + dataset_summary: Haematopoietic stem and progenitor cells from mouse bone marrow + dataset_description: 1656 hematopoietic stem and progenitor cells from mouse bone marrow. Sequenced by Smart-seq2. + dataset_organism: mus_musculus + var_feature_name: name + var_feature_id: converted_alias + + + - id: openproblems_v1/pancreas + obs_cell_type: celltype + obs_batch: tech + layer_counts: counts + input_id: pancreas + dataset_name: Human pancreas + dataset_url: https://theislab.github.io/scib-reproducibility/dataset_pancreas.html + dataset_reference: luecken2022benchmarking + dataset_summary: Human pancreas cells dataset from the scIB benchmarks + dataset_description: Human pancreatic islet scRNA-seq data from 6 datasets across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, and SMARTER-seq). + dataset_organism: homo_sapiens + var_feature_name: index + + # disabled as this is not working in openproblemsv1 + # - id: openproblems_v1/tabula_muris_senis_droplet_lung + # obs_cell_type: cell_type + # obs_batch: donor_id + # layer_counts: counts + # input_id: tabula_muris_senis_droplet_lung + # dataset_name: Tabula Muris Senis Lung + # dataset_url: https://tabula-muris-senis.ds.czbiohub.org + # dataset_reference: tabula2020single + # dataset_summary: Aging mouse lung cells from Tabula Muris Senis + # dataset_description: All lung cells from 10x profiles in Tabula Muris Senis, a 500k cell-atlas from 18 organs and tissues across the mouse lifespan. 
+ # dataset_organism: mus_musculus + + - id: openproblems_v1/tenx_1k_pbmc + layer_counts: counts + input_id: tenx_1k_pbmc + dataset_name: 1k PBMCs + dataset_url: https://www.10xgenomics.com/resources/datasets/1-k-pbm-cs-from-a-healthy-donor-v-3-chemistry-3-standard-3-0-0 + dataset_reference: 10x2018pbmc + dataset_summary: 1k peripheral blood mononuclear cells from a healthy donor + dataset_description: 1k Peripheral Blood Mononuclear Cells (PBMCs) from a healthy donor. Sequenced on 10X v3 chemistry in November 2018 by 10X Genomics. + dataset_organism: homo_sapiens + var_feature_name: index + + - id: openproblems_v1/tenx_5k_pbmc + layer_counts: counts + input_id: tenx_5k_pbmc + dataset_name: 5k PBMCs + dataset_url: https://www.10xgenomics.com/resources/datasets/5-k-peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-with-cell-surface-proteins-v-3-chemistry-3-1-standard-3-1-0 + dataset_reference: 10x2019pbmc + dataset_summary: 5k peripheral blood mononuclear cells from a healthy donor + dataset_description: 5k Peripheral Blood Mononuclear Cells (PBMCs) from a healthy donor. Sequenced on 10X v3 chemistry in July 2019 by 10X Genomics. + dataset_organism: homo_sapiens + var_feature_name: index + var_feature_id: gene_ids + + + - id: openproblems_v1/tnbc_wu2021 + obs_cell_type: celltype_minor + layer_counts: counts + input_id: tnbc_wu2021 + dataset_name: Triple-Negative Breast Cancer + dataset_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE118389 + dataset_reference: wu2021single + dataset_summary: 1535 cells from six fresh triple-negative breast cancer tumors. + dataset_description: 1535 cells from six TNBC donors by (Wu et al., 2021). This dataset includes cytokine activities, inferred using a multivariate linear model with cytokine-focused signatures, as assumed true cell-cell communication (Dimitrov et al., 2022). 
+ dataset_organism: homo_sapiens + var_feature_name: index + + - id: openproblems_v1/zebrafish + obs_cell_type: cell_type + obs_batch: lab + layer_counts: counts + input_id: zebrafish + dataset_name: Zebrafish embryonic cells + dataset_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE112294 + dataset_reference: wagner2018single + dataset_summary: Single-cell mRNA sequencing of zebrafish embryonic cells. + dataset_description: 90k cells from zebrafish embryos throughout the first day of development, with and without a knockout of chordin, an important developmental gene. + dataset_organism: danio_rerio + var_feature_name: index + var_feature_id: index + + +normalization_methods: [log_cp10k, sqrt_cp10k, l1_sqrt] +output_dataset: '$id/dataset.h5ad' +output_meta: '$id/dataset_metadata.yaml' +output_state: '$id/state.yaml' +output_raw: force_null +output_normalized: force_null +output_pca: force_null +output_hvg: force_null +output_knn: force_null +publish_dir: s3://openproblems-data/resources/datasets +HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/datasets/workflows/process_openproblems_v1/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file "$params_file" \ + --config /tmp/nextflow.config \ + --labels openproblems_v1,dataset_loader \ No newline at end of file diff --git a/src/datasets/resource_scripts/openproblems_v1_multimodal.sh b/src/datasets/resource_scripts/openproblems_v1_multimodal.sh new file mode 100755 index 0000000000..3efb960c45 --- /dev/null +++ b/src/datasets/resource_scripts/openproblems_v1_multimodal.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + 
+params_file="/tmp/datasets_openproblems_v1_multimodal_params.yaml" + +cat > "$params_file" << 'HERE' +param_list: + - id: openproblems_v1_multimodal/citeseq_cbmc + input_id: citeseq_cbmc + dataset_name: "CITE-Seq CBMC" + dataset_summary: "CITE-seq profiles of 8k Cord Blood Mononuclear Cells" + dataset_description: "8k cord blood mononuclear cells profiled by CITEseq using a panel of 13 antibodies." + dataset_reference: stoeckius2017simultaneous + dataset_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100866 + dataset_organism: homo_sapiens + layer_counts: counts + var_feature_name: index + mod1: GEX + mod2: ADT + + - id: openproblems_v1_multimodal/scicar_cell_lines + input_id: scicar_cell_lines + dataset_name: "sci-CAR Cell Lines" + dataset_summary: "sci-CAR profiles of 5k cell line cells (HEK293T, NIH/3T3, A549) across three treatment conditions (DEX 0h, 1h and 3h)" + dataset_description: "Single cell RNA-seq and ATAC-seq co-profiling for HEK293T cells, NIH/3T3 cells, A549 cells across three treatment conditions (DEX 0 hour, 1 hour and 3 hour treatment)." + dataset_reference: cao2018joint + dataset_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE117089 + dataset_organism: "[homo_sapiens, mus_musculus]" + obs_cell_type: cell_name + layer_counts: counts + var_feature_id: index + var_feature_name: gene_short_name + mod1: GEX + mod2: ATAC + + - id: openproblems_v1_multimodal/scicar_mouse_kidney + input_id: scicar_mouse_kidney + dataset_name: "sci-CAR Mouse Kidney" + dataset_summary: "sci-CAR profiles of 11k mouse kidney cells" + dataset_description: "Single cell RNA-seq and ATAC-seq co-profiling of 11k mouse kidney cells." 
+ dataset_reference: cao2018joint + dataset_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE117089 + dataset_organism: mus_musculus + obs_cell_type: cell_name + obs_batch: replicate + layer_counts: counts + var_feature_id: index + var_feature_name: gene_short_name + mod1: GEX + mod2: ATAC + +normalization_methods: [log_cp10k, sqrt_cp10k, l1_sqrt] +output_mod1: '$id/dataset_mod1.h5ad' +output_mod2: '$id/dataset_mod2.h5ad' +output_meta_mod1: '$id/dataset_metadata_mod1.yaml' +output_meta_mod2: '$id/dataset_metadata_mod2.yaml' +output_state: '$id/state.yaml' +publish_dir: s3://openproblems-data/resources/datasets +HERE + + +cat > /tmp/nextflow.config << HERE +process { + withName:'.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } + errorStrategy = "ignore" +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/datasets/workflows/process_openproblems_v1_multimodal/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file "$params_file" \ + --labels openproblems_v1_multimodal,dataset_loader \ + --config /tmp/nextflow.config \ No newline at end of file diff --git a/src/datasets/resource_scripts/openproblems_v1_multimodal_test.sh b/src/datasets/resource_scripts/openproblems_v1_multimodal_test.sh new file mode 100755 index 0000000000..268a17cf7d --- /dev/null +++ b/src/datasets/resource_scripts/openproblems_v1_multimodal_test.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +export TOWER_WORKSPACE_ID=53907369739130 + +OUTPUT_DIR="resources/datasets" + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +params_file="/tmp/datasets_openproblems_v1_multimodal_params.yaml" + +cat > "$params_file" << 'HERE' +param_list: + - id: openproblems_v1_multimodal/citeseq_cbmc + dataset_name: "CITE-Seq CBMC" + dataset_summary: "CITE-seq profiles of 8k Cord Blood Mononuclear Cells" + dataset_description: "8k cord blood mononuclear cells profiled by CITEseq using a panel of 13 antibodies." + dataset_reference: stoeckius2017simultaneous + dataset_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100866 + dataset_organism: homo_sapiens + layer_counts: counts + +normalization_methods: [log_cp10k, sqrt_cp10k, l1_sqrt] +output_mod1: '$id/dataset_mod1.h5ad' +output_mod2: '$id/dataset_mod2.h5ad' +output_meta_mod1: '$id/dataset_metadata_mod1.yaml' +output_meta_mod2: '$id/dataset_metadata_mod2.yaml' +output_state: '$id/state.yaml' +HERE + +export NXF_VER=22.04.5 +nextflow \ + run . \ + -main-script target/nextflow/datasets/workflows/process_openproblems_v1_multimodal/main.nf \ + -profile docker \ + -resume \ + -params-file "$params_file" \ + --publish_dir "$OUTPUT_DIR" diff --git a/src/datasets/resource_scripts/openproblems_v1_test.sh b/src/datasets/resource_scripts/openproblems_v1_test.sh new file mode 100755 index 0000000000..a79545f052 --- /dev/null +++ b/src/datasets/resource_scripts/openproblems_v1_test.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +export TOWER_WORKSPACE_ID=53907369739130 + +OUTPUT_DIR="resources/datasets" + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +params_file="/tmp/datasets_openproblems_v1_params.yaml" + +cat > "$params_file" << 'HERE' +param_list: + - id: openproblems_v1/pancreas + obs_cell_type: celltype + obs_batch: tech + layer_counts: counts + dataset_name: Human pancreas + dataset_url: https://theislab.github.io/scib-reproducibility/dataset_pancreas.html + dataset_reference: luecken2022benchmarking + dataset_summary: Human pancreas cells dataset from the scIB benchmarks + dataset_description: Human pancreatic islet scRNA-seq data from 6 datasets across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, and SMARTER-seq). + dataset_organism: homo_sapiens + +normalization_methods: [log_cp10k, sqrt_cp10k, l1_sqrt] +output_dataset: '$id/dataset.h5ad' +output_meta: '$id/dataset_metadata.yaml' +output_state: '$id/state.yaml' +output_raw: force_null +output_normalized: force_null +output_pca: force_null +output_hvg: force_null +output_knn: force_null +HERE + +export NXF_VER=23.04.2 +nextflow run . 
\ + -main-script target/nextflow/datasets/workflows/process_openproblems_v1/main.nf \ + -profile docker \ + -resume \ + -params-file "$params_file" \ + --publish_dir "$OUTPUT_DIR" + + # -with-tower diff --git a/src/datasets/resource_scripts/tenx_visium.sh b/src/datasets/resource_scripts/tenx_visium.sh new file mode 100755 index 0000000000..7993cebd4b --- /dev/null +++ b/src/datasets/resource_scripts/tenx_visium.sh @@ -0,0 +1,316 @@ +#!/bin/bash + +# cat > "/tmp/params.yaml" << 'HERE' +# param_list: +# - id: tenx_visium/mouse_brain_coronal_section1_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Mouse_Brain_Rep1/CytAssist_FFPE_Mouse_Brain_Rep1_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Mouse_Brain_Rep1/CytAssist_FFPE_Mouse_Brain_Rep1_spatial.tar.gz" +# dataset_name: 10X Visium - Mouse Brain Coronal +# dataset_url: "https://www.10xgenomics.com/datasets/mouse-brain-coronal-section-1-ffpe-2-standard" +# dataset_summary: Gene expression library of Mouse Brain (CytAssist FFPE) using the Mouse Whole Transcriptome Probe Set +# dataset_description: "FFPE Mouse Brain tissue blocks sectioned as described in Visium CytAssist Spatial Gene Expression for FFPE - Tissue Preparation Guide Demonstrated Protocol. The H&E stained glass slide with tissue section was processed via Visium CytAssist instrument to transfer analytes to a Visium CytAssist Spatial Gene Expression slide. The probe extension and library construction steps follow the standard Visium for FFPE workflow outside of the instrument. The H&E image was acquired using Olympus VS200 Slide Scanning Microscope. Sequencing depth was 53,497 reads per spot. Sequencing configuration: 28bp read 1 (16bp Visium spatial barcode, 12bp UMI), 90bp read 2 (transcript), 10bp i7 sample barcode and 10bp i5 sample barcode. 
Key metrics include: 2,310 spots detected under tissue; 6,736 median genes per spot; 24,862 median UMI counts per spot." +# dataset_reference: 10x2022brain +# dataset_organism: Mus musculus +# spot_filter_min_genes: 200 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: tenx_visium/human_colorectal_cancer_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.1/CytAssist_11mm_FFPE_Human_Colorectal_Cancer/CytAssist_11mm_FFPE_Human_Colorectal_Cancer_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.1/CytAssist_11mm_FFPE_Human_Colorectal_Cancer/CytAssist_11mm_FFPE_Human_Colorectal_Cancer_spatial.tar.gz" +# dataset_name: 10X Visium - Human Colorectal Cancer +# dataset_url: "https://www.10xgenomics.com/datasets/human-colorectal-cancer-11-mm-capture-area-ffpe-2-standard" +# dataset_summary: Gene expression library of Human Colorectal Cancer (CytAssist FFPE) using the Human Whole Transcriptome Probe Set +# dataset_description: "The tissue was sectioned as described in the Visium CytAssist Spatial Gene Expression for FFPE Tissue Preparation Guide (CG000518). Tissue section of 5 µm was placed on a standard glass slide, then stained following the Deparaffinization, H&E Staining, Imaging & Decrosslinking Demonstrated Protocol (CG000520). The glass slide with tissue section was processed via Visium CytAssist instrument to transfer analytes to a Visium CytAssist Spatial Gene Expression Slide v2, with 11 mm capture areas following the Visium CytAssist Spatial Gene Expression Reagent Kits User Guide (CG000495)." 
+# dataset_reference: 10x2023colorectal +# dataset_organism: Homo sapiens +# spot_filter_min_genes: 200 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: tenx_visium/human_heart_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.0.0/V1_Human_Heart/V1_Human_Heart_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.0.0/V1_Human_Heart/V1_Human_Heart_spatial.tar.gz" +# dataset_name: 10X Visium - Human Heart +# dataset_url: "https://www.10xgenomics.com/datasets/human-heart-1-standard-1-0-0" +# dataset_summary: V1_Human_Heart +# dataset_description: "10x Genomics obtained fresh frozen human heart tissue from BioIVT Asterand. The tissue was embedded and cryosectioned as described in Visium Spatial Protocols - Tissue Preparation Guide Demonstrated Protocol (CG000240). Tissue sections of 10 µm thickness were placed on Visium Gene Expression Slides." +# dataset_reference: 10x2019heart +# dataset_organism: Homo sapiens +# spot_filter_min_genes: 200 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: tenx_visium/mouse_embryo_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.1.0/CytAssist_11mm_FFPE_Mouse_Embryo/CytAssist_11mm_FFPE_Mouse_Embryo_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.1.0/CytAssist_11mm_FFPE_Mouse_Embryo/CytAssist_11mm_FFPE_Mouse_Embryo_spatial.tar.gz" +# dataset_name: 10X Visium - Mouse Embryo +# dataset_url: "https://www.10xgenomics.com/datasets/visium-cytassist-mouse-embryo-11-mm-capture-area-ffpe-2-standard" +# dataset_summary: Gene expression library of Mouse Embryo (CytAssist FFPE) using the Mouse Whole Transcriptome Probe Set +# dataset_description: "The tissue was sectioned as described in Visium CytAssist Spatial Gene Expression for FFPE Tissue Preparation Guide Demonstrated Protocol CG000518. 
Tissue sections of 5 µm was placed on a standard glass slide, and H&E-stained following deparaffinization. Sections were coverslipped with 85% glycerol, imaged, decoverslipped, followed by dehydration & decrosslinking (Demonstrated Protocol CG000520). The glass slide with the tissue section was processed with the Visium CytAssist instrument to transfer analytes to a Visium CytAssist Spatial Gene Expression slide (11 mm Capture Area). The probe extension and library construction steps follow the standard Visium for FFPE workflow outside of the instrument." +# dataset_reference: 10x2023embryo +# dataset_organism: Mus musculus +# spot_filter_min_genes: 200 +# gene_filter_min_spots: 50 +# remove_mitochondrial: false + +# - id: tenx_visium/mouse_olfactory_bulb_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_Mouse_Olfactory_Bulb/Visium_Mouse_Olfactory_Bulb_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_Mouse_Olfactory_Bulb/Visium_Mouse_Olfactory_Bulb_spatial.tar.gz" +# dataset_name: 10X Visium - Mouse Olfactory Bulb +# dataset_url: "https://www.10xgenomics.com/datasets/adult-mouse-olfactory-bulb-1-standard-1" +# dataset_summary: 10X Genomics obtained fresh frozen mouse olfactory bulb tissue from BioIVT. +# dataset_description: "The tissue was embedded and cryosectioned as described in Visium Spatial Protocols Tissue Preparation Guide (Demonstrated Protocol CG000240). Tissue sections of 10µm were placed on Visium Gene Expression slides, then fixed and stained following Methanol Fixation, H&E Staining & Imaging for Visium Spatial Protocols (CG000160)." 
+# dataset_reference: 10x2022olfactory +# dataset_organism: Mus musculus +# spot_filter_min_genes: 200 +# gene_filter_min_spots: 30 +# remove_mitochondrial: false + +# - id: tenx_visium/human_breast_cancer_1_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.2.0/Parent_Visium_Human_BreastCancer/Parent_Visium_Human_BreastCancer_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.2.0/Parent_Visium_Human_BreastCancer/Parent_Visium_Human_BreastCancer_spatial.tar.gz" +# dataset_name: 10X Visium - Human Breast Cancer 1 +# dataset_url: "https://www.10xgenomics.com/datasets/human-breast-cancer-whole-transcriptome-analysis-1-standard-1-2-0" +# dataset_summary: Whole transcriptome analysis, Adult Human Breast Cancer (Visium) +# dataset_description: "10X Genomics obtained fresh frozen human Invasive Lobular Carcinoma breast tissue from BioIVT Asterand. The tissue was embedded and cryosectioned as described in Visium Spatial Protocols Tissue Preparation Guide Demonstrated Protocol (CG000240). Tissue sections of 10µm were placed on Visium Gene Expression slides and fixed and stained following Methanol Fixation, H&E Staining & Imaging for Visium Spatial Protocols (CG000160)." 
+# dataset_reference: 10x2020breast +# dataset_organism: Homo sapiens +# spot_filter_min_genes: 100 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: tenx_visium/human_lymph_node_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.0.0/V1_Human_Lymph_Node/V1_Human_Lymph_Node_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.0.0/V1_Human_Lymph_Node/V1_Human_Lymph_Node_spatial.tar.gz" +# dataset_name: 10X Visium - Human Lymph Node +# dataset_url: "https://www.10xgenomics.com/datasets/human-lymph-node-1-standard-1-0-0" +# dataset_summary: Whole transcriptome analysis, Human Lymph Node +# dataset_description: "10x Genomics obtained fresh frozen human lymph node from BioIVT Asterand. The tissue was embedded and cryosectioned as described in Visium Spatial Protocols - Tissue Preparation Guide Demonstrated Protocol (CG000240). Tissue sections of 10 µm thickness were placed on Visium Gene Expression Slides." +# dataset_reference: 10x2019lymph +# dataset_organism: Homo sapiens +# spot_filter_min_genes: 100 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: tenx_visium/human_normal_prostate_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Normal_Prostate/Visium_FFPE_Human_Normal_Prostate_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Normal_Prostate/Visium_FFPE_Human_Normal_Prostate_spatial.tar.gz" +# dataset_name: 10X Visium - Human Normal Prostate +# dataset_url: "https://www.10xgenomics.com/datasets/normal-human-prostate-ffpe-1-standard-1-3-0" +# dataset_summary: Gene expression library of Human Normal Prostate (Visium FFPE) using the Human Whole Transcriptome Probe Set +# dataset_description: "10x Genomics obtained FFPE human prostate tissue from Indivumed Human Tissue Specimens. 
The tissue was sectioned as described in Visium Spatial Gene Expression for FFPE – Tissue Preparation Guide Demonstrated Protocol (CG000408). Tissue sections of 5 µm were placed on Visium Gene Expression slides, then stained following Deparaffinization, H&E Staining, Imaging & Decrosslinking Demonstrated Protocol (CG000409)." +# dataset_reference: 10x2021prostate +# dataset_organism: Homo sapiens +# spot_filter_min_genes: 100 +# gene_filter_min_spots: 30 +# remove_mitochondrial: true + +# - id: tenx_visium/human_prostate_cancer_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Prostate_IF/Visium_FFPE_Human_Prostate_IF_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Prostate_IF/Visium_FFPE_Human_Prostate_IF_spatial.tar.gz" +# dataset_name: 10X Visium - Human Prostate Cancer +# dataset_url: "https://www.10xgenomics.com/datasets/human-prostate-cancer-adjacent-normal-section-with-if-staining-ffpe-1-standard" +# dataset_summary: Gene expression library of Human Prostate Cancer (Visium FFPE) with an IF image using the Human Whole Transcriptome Probe Set +# dataset_description: "10x Genomics obtained FFPE human prostate tissue from Indivumed Human Tissue Specimens. Original diagnosis with adenocarcinoma. The tissue was sectioned as described in Visium Spatial Gene Expression for FFPE Tissue Preparation Guide Demonstrated Protocol (CG000408). Tissue sections of 10 µm were placed on Visium Gene Expression slides, then stained following Deparaffinization, Decrosslinking, Immunofluorescence Staining & Imaging Demonstrated Protocol (CG000410)." 
+# dataset_reference: 10x2022prostate +# dataset_organism: Homo sapiens +# spot_filter_min_genes: 100 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# normalization_methods: [log_cp10k] +# output_dataset: '$id/dataset.h5ad' +# output_meta: '$id/dataset_metadata.yaml' +# output_state: '$id/state.yaml' +# output_raw: force_null +# output_normalized: force_null +# publish_dir: resources/datasets +# HERE + +cat > "/tmp/params.yaml" << 'HERE' +param_list: + - id: tenx_visium/human_cerebellum_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.2.0/Parent_Visium_Human_Cerebellum/Parent_Visium_Human_Cerebellum_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.2.0/Parent_Visium_Human_Cerebellum/Parent_Visium_Human_Cerebellum_spatial.tar.gz" + dataset_name: 10X Visium - Adult Human Cerebellum + dataset_url: "https://www.10xgenomics.com/datasets/human-cerebellum-whole-transcriptome-analysis-1-standard-1-2-0" + dataset_summary: Human Cerebellum Whole Transcriptome Analysis + dataset_description: "10X Genomics obtained fresh frozen human cerebellum tissue from BioIVT Asterand. The tissue was embedded and cryosectioned as described in Visium Spatial Protocols Tissue Preparation Guide (Demonstrated Protocol CG000240). Tissue sections of 10µm were placed on Visium Gene Expression slides and fixed and stained following Methanol Fixation, H&E Staining & Imaging for Visium Spatial Protocols (CG000160)." 
+ dataset_reference: 10x2020cerebellum + dataset_organism: Homo sapiens + spot_filter_min_genes: 100 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: tenx_visium/mouse_kidney_v1_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.1.0/V1_Mouse_Kidney/V1_Mouse_Kidney_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.1.0/V1_Mouse_Kidney/V1_Mouse_Kidney_spatial.tar.gz" + dataset_name: 10X Visium - Mouse Kidney 1 + dataset_url: "https://www.10xgenomics.com/datasets/mouse-kidney-section-coronal-1-standard-1-1-0" + dataset_summary: Mouse Kidney Whole Transcriptome Analysis + dataset_description: "10x Genomics obtained fresh frozen mouse kidney tissue from BioIVT Asterand. The tissue was embedded and cryosectioned as described in Visium Spatial Protocols - Tissue Preparation Guide Demonstrated Protocol (CG000240). Tissue sections of 10 µm thickness from a slice of the coronal plane were placed on Visium Gene Expression slides, then stained following the Methanol Fixation, H&E Staining & Imaging Demonstrated Protocol (CG000160)." 
+ dataset_reference: 10x2020kidney + dataset_organism: Mus musculus + spot_filter_min_genes: 100 + gene_filter_min_spots: 30 + remove_mitochondrial: false + + - id: tenx_visium/human_lung_cancer_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.1/CytAssist_11mm_FFPE_Human_Lung_Cancer/CytAssist_11mm_FFPE_Human_Lung_Cancer_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.1/CytAssist_11mm_FFPE_Human_Lung_Cancer/CytAssist_11mm_FFPE_Human_Lung_Cancer_spatial.tar.gz" + dataset_name: 10X Visium - Human Lung Cancer + dataset_url: "https://www.10xgenomics.com/datasets/human-lung-cancer-11-mm-capture-area-ffpe-2-standard" + dataset_summary: Gene expression library of Human Lung Cancer (CytAssist FFPE) using the Human Whole Transcriptome Probe Set + dataset_description: "10x Genomics obtained FFPE human lung cancer tissue from Avaden Biosciences. The tissue was sectioned as described in the Visium CytAssist Spatial Gene Expression for FFPE Tissue Preparation Guide (CG000518). Tissue section of 5 µm was placed on a standard glass slide, then stained following the Deparaffinization, H&E Staining, Imaging & Decrosslinking Demonstrated Protocol (CG000520). The glass slide with tissue section was processed via Visium CytAssist instrument to transfer analytes to a Visium CytAssist Spatial Gene Expression Slide v2, with 11 mm capture areas following the Visium CytAssist Spatial Gene Expression Reagent Kits User Guide (CG000495)." 
+ dataset_reference: 10x2023lung + dataset_organism: Homo sapiens + spot_filter_min_genes: 100 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: tenx_visium/human_brain_cancer_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.1/CytAssist_11mm_FFPE_Human_Glioblastoma/CytAssist_11mm_FFPE_Human_Glioblastoma_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.1/CytAssist_11mm_FFPE_Human_Glioblastoma/CytAssist_11mm_FFPE_Human_Glioblastoma_spatial.tar.gz" + dataset_name: 10X Visium - Human Brain Cancer + dataset_url: "https://www.10xgenomics.com/datasets/human-brain-cancer-11-mm-capture-area-ffpe-2-standard" + dataset_summary: Gene expression library of Human Glioblastoma (CytAssist FFPE) using the Human Whole Transcriptome Probe Set + dataset_description: "10x Genomics obtained FFPE human brain cancer tissue from Avaden Biosciences. The tissue was sectioned as described in the Visium CytAssist Spatial Gene Expression for FFPE - Tissue Preparation Guide (CG000518). Tissue section of 5 µm was placed on a standard glass slide, then stained following the Deparaffinization, H&E Staining, Imaging & Decrosslinking Demonstrated Protocol (CG000520). The glass slide with tissue section was processed via Visium CytAssist instrument to transfer analytes to a Visium CytAssist Spatial Gene Expression Slide v2, with 11 mm capture areas following the Visium CytAssist Spatial Gene Expression Reagent Kits User Guide (CG000495)." 
+ dataset_reference: 10x2023brain + dataset_organism: Homo sapiens + spot_filter_min_genes: 100 + gene_filter_min_spots: 100 + remove_mitochondrial: true + + - id: tenx_visium/human_kidney_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.1/CytAssist_11mm_FFPE_Human_Kidney/CytAssist_11mm_FFPE_Human_Kidney_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.1/CytAssist_11mm_FFPE_Human_Kidney/CytAssist_11mm_FFPE_Human_Kidney_spatial.tar.gz" + dataset_name: 10X Visium - Human Kidney + dataset_url: "https://www.10xgenomics.com/datasets/human-kidney-11-mm-capture-area-ffpe-2-standard" + dataset_summary: Gene expression library of Human Kidney (CytAssist FFPE) using the Human Whole Transcriptome Probe Set + dataset_description: "10x Genomics obtained FFPE human kidney tissue from Avaden Biosciences. The tissue was sectioned as described in the Visium CytAssist Spatial Gene Expression for FFPE – Tissue Preparation Guide (CG000518). Tissue section of 5 µm was placed on a standard glass slide, then stained following the Deparaffinization, H&E Staining, Imaging & Decrosslinking Demonstrated Protocol (CG000520). The glass slide with tissue section was processed via Visium CytAssist instrument to transfer analytes to a Visium CytAssist Spatial Gene Expression Slide v2, with 11 mm capture areas following the Visium CytAssist Spatial Gene Expression Reagent Kits User Guide (CG000495)." 
+ dataset_reference: 10x2023kidney + dataset_organism: Homo sapiens + spot_filter_min_genes: 100 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: tenx_visium/human_intestinal_cancer_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Intestinal_Cancer/Visium_FFPE_Human_Intestinal_Cancer_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Intestinal_Cancer/Visium_FFPE_Human_Intestinal_Cancer_spatial.tar.gz" + dataset_name: 10X Visium - Human Intestine Cancer + dataset_url: "https://www.10xgenomics.com/datasets/human-intestine-cancer-1-standard" + dataset_summary: Gene expression library of Human Intestinal Cancer (Visium FFPE) using the Human Whole Transcriptome Probe Set + dataset_description: "5 µm section from Human Intestinal Cancer. FFPE tissue purchased from BioIVT Asterand Human Tissue Specimens. Libraries were prepared following the Visium Spatial Gene Expression Reagent Kits for FFPE User Guide (CG000407 Rev A)." + dataset_reference: 10x2022intestine + dataset_organism: Homo sapiens + spot_filter_min_genes: 100 + gene_filter_min_spots: 30 + remove_mitochondrial: true + + - id: tenx_visium/human_skin_melanoma_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Human_Skin_Melanoma/CytAssist_FFPE_Human_Skin_Melanoma_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Human_Skin_Melanoma/CytAssist_FFPE_Human_Skin_Melanoma_spatial.tar.gz" + dataset_name: 10X Visium - Human Skin Melanoma + dataset_url: "https://www.10xgenomics.com/datasets/human-melanoma-if-stained-ffpe-2-standard" + dataset_summary: Gene expression library of Human Skin Melanoma (CytAssist FFPE) using the Human Whole Transcriptome Probe Set + dataset_description: "10x Genomics obtained FFPE Human Melanoma tissue blocks from Avaden Biosciences. 
The tissue was sectioned as described in Visium CytAssist Spatial Gene Expression for FFPE Tissue Preparation Guide Demonstrated Protocol (CG000518). Tissue sections of 5 µm was placed on a standard glass slide, deparaffinized followed by immunofluorescence (IF) staining. Sections were coverslipped with 85% glycerol, imaged, decoverslipped, followed by dehydration & decrosslinking Demonstrated Protocol (CG000519). The glass slide with tissue section was processed via Visium CytAssist instrument to transfer analytes to a Visium CytAssist Spatial Gene Expression slide. The probe extension and library construction steps follow the standard Visium for FFPE workflow outside of the instrument." + dataset_reference: 10x2022melanoma + dataset_organism: Homo sapiens + spot_filter_min_genes: 100 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: tenx_visium/human_cervical_cancer_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Cervical_Cancer/Visium_FFPE_Human_Cervical_Cancer_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Cervical_Cancer/Visium_FFPE_Human_Cervical_Cancer_spatial.tar.gz" + dataset_name: 10X Visium - Human Cervical Cancer + dataset_url: "https://www.10xgenomics.com/datasets/human-cervical-cancer-1-standard" + dataset_summary: Gene expression library of Human Cervical Cancer (Visium FFPE) using the Human Whole Transcriptome Probe Set + dataset_description: "5 µm section from squamous cell carcinoma of human cervical cancer. FFPE tissue purchased from Discovery Life Sciences." 
+ dataset_reference: 10x2022cervical + dataset_organism: Homo sapiens + spot_filter_min_genes: 100 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: tenx_visium/human_breast_cancer_2_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Breast_Cancer/Visium_FFPE_Human_Breast_Cancer_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Breast_Cancer/Visium_FFPE_Human_Breast_Cancer_spatial.tar.gz" + dataset_name: 10X Visium - Human Breast Cancer 2 + dataset_url: "https://www.10xgenomics.com/datasets/human-breast-cancer-ductal-carcinoma-in-situ-invasive-carcinoma-ffpe-1-standard-1-3-0" + dataset_summary: Gene expression library of Human Breast Cancer (Visium FFPE) using the Human Whole Transcriptome Probe Set + dataset_description: "10x Genomics obtained FFPE human breast tissue from BioIVT Asterand Human Tissue Specimens. The tissue was annotated with Ductal Carcinoma In Situ, Invasive Carcinoma. The tissue was sectioned as described in Visium Spatial Gene Expression for FFPE – Tissue Preparation Guide Demonstrated Protocol (CG000408). Tissue sections of 5 µm were placed on Visium Gene Expression slides, then stained following Deparaffinization, H&E Staining, Imaging & Decrosslinking Demonstrated Protocol (CG000409)." 
+ dataset_reference: 10x2021breast + dataset_organism: Homo sapiens + spot_filter_min_genes: 100 + gene_filter_min_spots: 50 + remove_mitochondrial: true + +normalization_methods: [log_cp10k] +output_dataset: '$id/dataset.h5ad' +output_meta: '$id/dataset_metadata.yaml' +output_state: '$id/state.yaml' +output_raw: force_null +output_normalized: force_null +publish_dir: resources/datasets +HERE + +# cat > "/tmp/params.yaml" << 'HERE' +# param_list: +# - id: tenx_visium/human_colon_cancer_xenium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.1.0/CytAssist_FFPE_Human_Colon_Post_Xenium_Rep1/CytAssist_FFPE_Human_Colon_Post_Xenium_Rep1_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.1.0/CytAssist_FFPE_Human_Colon_Post_Xenium_Rep1/CytAssist_FFPE_Human_Colon_Post_Xenium_Rep1_spatial.tar.gz" +# dataset_name: 10X Xenium - Human Colon +# dataset_url: "https://www.10xgenomics.com/datasets/visium-cytassist-gene-expression-libraries-of-post-xenium-human-colon-cancer-ffpe-using-the-human-whole-transcriptome-probe-set-2-standard" +# dataset_summary: Gene expression library of Post Xenium Human Colon Cancer (CytAssist FFPE) using the Human Whole Transcriptome Probe Set - Replicate 1 +# dataset_description: "This dataset is provided as part of the Technical Note: Post-Xenium In Situ Applications: Immunofluorescence, H&E, and Visium CytAssist Spatial Gene Expression (CG000709). Post-Xenium samples were compared to controls (samples not processed through the Xenium workflow) using 5 µm (FFPE) serial sections." 
+# dataset_reference: 10x2023colon +# dataset_organism: Homo sapiens +# spot_filter_min_genes: 100 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: tenx_visium/mouse_brain_xenium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.1.0/CytAssist_FreshFrozen_Mouse_Brain_Post_Xenium_Rep1/CytAssist_FreshFrozen_Mouse_Brain_Post_Xenium_Rep1_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.1.0/CytAssist_FreshFrozen_Mouse_Brain_Post_Xenium_Rep1/CytAssist_FreshFrozen_Mouse_Brain_Post_Xenium_Rep1_spatial.tar.gz" +# dataset_name: 10X Xenium - Mouse Brain +# dataset_url: "https://www.10xgenomics.com/datasets/visium-cytassist-gene-expression-libraries-of-post-xenium-mouse-brain-ff-using-the-mouse-whole-transcriptome-probe-set-2-standard" +# dataset_summary: Gene expression library of Post Xenium Mouse Brain (CytAssist Fresh Frozen) using the Mouse Whole Transcriptome Probe Set - Replicate 1 +# dataset_description: "This dataset is provided as part of the Technical Note: Post-Xenium In Situ Applications: Immunofluorescence, H&E, and Visium CytAssist Spatial Gene Expression (CG000709). Post-Xenium samples were compared to controls (samples not processed through the Xenium workflow) using 10 µm fresh-frozen (FF) serial sections." 
+# dataset_reference: 10x2023mousebrain +# dataset_organism: Mus musculus +# spot_filter_min_genes: 100 +# gene_filter_min_spots: 50 +# remove_mitochondrial: false + +# normalization_methods: [log_cp10k] +# output_dataset: '$id/dataset.h5ad' +# output_meta: '$id/dataset_metadata.yaml' +# output_state: '$id/state.yaml' +# output_raw: force_null +# output_normalized: force_null +# publish_dir: resources/datasets +# HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' + withLabel: highmem { + memory = '350GB' + } + withName: '.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision integration_build \ + --pull-latest \ + --main-script target/nextflow/datasets/workflows/process_tenx_visium/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file "/tmp/params.yaml" \ + --config /tmp/nextflow.config diff --git a/src/datasets/resource_scripts/zenodo_spatial.sh.sh b/src/datasets/resource_scripts/zenodo_spatial.sh.sh new file mode 100755 index 0000000000..192cb8cc9c --- /dev/null +++ b/src/datasets/resource_scripts/zenodo_spatial.sh.sh @@ -0,0 +1,414 @@ +#!/bin/bash + +cat > "/tmp/params.yaml" << 'HERE' +param_list: + - id: zenodo_spatial/human_heart_myocardial_infarction_1_visium + input_data: "https://zenodo.org/records/13328275/files/10X0018.h5ad?download=1" + dataset_name: 10X Visium - Human Heart MI 1 + dataset_url: "https://www.nature.com/articles/s41586-022-05060-x" + dataset_summary: Gene expression library of human heart using 10x Visium. + dataset_description: "Frozen heart samples were embedded in OCT (Tissue-Tek) and cryosectioned (Thermo Cryostar). The 10-µm section was placed on the pre-chilled Optimization slides (Visium, 10X Genomics, PN-1000193) and the optimal lysis time was determined. 
The tissues were treated as recommended by 10X Genomics and the optimization procedure showed an optimal permeabilization time of 12 or 18 min of digestion and release of RNA from the tissue slide. Spatial gene expression slides (Visium, 10X Genomics, PN-1000187) were used for spatial transcriptomics following the Visium User Guides" + dataset_reference: kuppe2022spatial + dataset_organism: Homo sapiens + spot_filter_min_genes: 200 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: zenodo_spatial/human_heart_myocardial_infarction_2_visium + input_data: "https://zenodo.org/records/13328275/files/10X009.h5ad?download=1" + dataset_name: 10X Visium - Human Heart MI 2 + dataset_url: "https://www.nature.com/articles/s41586-022-05060-x" + dataset_summary: Gene expression library of human heart using 10x Visium. + dataset_description: "Frozen heart samples were embedded in OCT (Tissue-Tek) and cryosectioned (Thermo Cryostar). The 10-µm section was placed on the pre-chilled Optimization slides (Visium, 10X Genomics, PN-1000193) and the optimal lysis time was determined. The tissues were treated as recommended by 10X Genomics and the optimization procedure showed an optimal permeabilization time of 12 or 18 min of digestion and release of RNA from the tissue slide. 
Spatial gene expression slides (Visium, 10X Genomics, PN-1000187) were used for spatial transcriptomics following the Visium User Guides" + dataset_reference: kuppe2022spatial + dataset_organism: Homo sapiens + spot_filter_min_genes: 200 + gene_filter_min_spots: 50 + remove_mitochondrial: true + +normalization_methods: [log_cp10k] +output_dataset: '$id/dataset.h5ad' +output_meta: '$id/dataset_metadata.yaml' +output_state: '$id/state.yaml' +output_raw: force_null +output_normalized: force_null +publish_dir: resources/datasets +remove_mitochondrial: true +HERE + +# cat > "/tmp/params.yaml" << 'HERE' +# param_list: +# - id: zenodo_spatial/mouse_e10_brain_dbitseq +# input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E10_brain_gene_25um_data.h5ad?download=1" +# dataset_name: DBiT-seq - Mouse Brain (E10) +# dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" +# dataset_summary: High-Spatial-Resolution Multi-Omics Sequencing via Deterministic Barcoding in Tissue. +# dataset_description: "Gene expression library of an E10 whole mouse embryo tissue (brain in early-stage organogenesis) profiled using DBiT-seq." +# dataset_organism: Mus musculus +# dataset_reference: liu2020high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_e10_eye_dbitseq +# input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E10_eye_and_nearby_data.h5ad?download=1" +# dataset_name: DBiT-seq - Mouse Eye (E10) +# dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" +# dataset_summary: High-Spatial-Resolution Multi-Omics Sequencing via Deterministic Barcoding in Tissue. +# dataset_description: "Gene expression library of an E10 whole mouse embryo tissue (eye in early-stage organogenesis) profiled using DBiT-seq."
+# dataset_organism: Mus musculus +# dataset_reference: liu2020high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_e10_whole_body_dbitseq +# input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E10_whole_gene_best_data.h5ad?download=1" +# dataset_name: DBiT-seq - Mouse Whole Body (E10) +# dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" +# dataset_summary: High-Spatial-Resolution Multi-Omics Sequencing via Deterministic Barcoding in Tissue. +# dataset_description: "Gene expression library of an E10 whole mouse embryo tissue profiled using DBiT-seq." +# dataset_organism: Mus musculus +# dataset_reference: liu2020high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_e11_lower_body_dbitseq +# input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E11_lower_body_data.h5ad?download=1" +# dataset_name: DBiT-seq - Mouse Lower Body (E11) +# dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" +# dataset_summary: High-Spatial-Resolution Multi-Omics Sequencing via Deterministic Barcoding in Tissue. +# dataset_description: "Gene expression library of an E11 whole mouse embryo tissue (lower body in early-stage organogenesis) profiled using DBiT-seq." +# dataset_organism: Mus musculus +# dataset_reference: liu2020high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_e11_1_dbitseq +# input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_GSM4364244_E11-FL-1L_gene_data.h5ad?download=1" +# dataset_name: DBiT-seq - Mouse Whole Body 1 (E11) +# dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" +# dataset_summary: High-Spatial-Resolution Multi-Omics Sequencing via Deterministic Barcoding in Tissue. 
+# dataset_description: "Gene expression library of an E11 whole mouse embryo tissue profiled using DBiT-seq." +# dataset_organism: Mus musculus +# dataset_reference: liu2020high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_e11_2_dbitseq +# input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_GSM4364245_E11-FL-2L_gene_data.h5ad?download=1" +# dataset_name: DBiT-seq - Mouse Whole Body 2 (E11) +# dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" +# dataset_summary: High-Spatial-Resolution Multi-Omics Sequencing via Deterministic Barcoding in Tissue. +# dataset_description: "Gene expression library of an E11 whole mouse embryo tissue profiled using DBiT-seq." +# dataset_organism: Mus musculus +# dataset_reference: liu2020high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# normalization_methods: [log_cp10k] +# output_dataset: '$id/dataset.h5ad' +# output_meta: '$id/dataset_metadata.yaml' +# output_state: '$id/state.yaml' +# output_raw: force_null +# output_normalized: force_null +# publish_dir: resources/datasets +# HERE + +# cat > "/tmp/params.yaml" << 'HERE' +# param_list: +# - id: zenodo_spatial/human_cortex_1_merfish +# input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.250.expand.rep1_data.h5ad?download=1" +# dataset_name: MERFISH - Human Cortex 1 +# dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" +# dataset_summary: Spatially resolved profiling of human cerebral cortex using multiplexed error-robust fluorescence in situ hybridization (MERFISH). +# dataset_description: "Spatially resolved profiling of human cerebral cortex (middle temporal gyrus) replicate 1 using multiplexed error-robust fluorescence in situ hybridization (MERFISH) (250 gene panel)."
+# dataset_organism: Homo sapiens +# dataset_reference: fang2022conservation +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 100 +# remove_mitochondrial: false + +# - id: zenodo_spatial/human_cortex_2_merfish +# input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.4000.expand.rep1_data.h5ad?download=1" +# dataset_name: MERFISH - Human Cortex 2 +# dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" +# dataset_summary: Spatially resolved profiling of human cerebral cortex using multiplexed error-robust fluorescence in situ hybridization (MERFISH). +# dataset_description: "Spatially resolved profiling of human cerebral cortex (middle temporal gyrus) replicate 1 using multiplexed error-robust fluorescence in situ hybridization (MERFISH) (4000 gene panel)." +# dataset_organism: Homo sapiens +# dataset_reference: fang2022conservation +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: false + +# - id: zenodo_spatial/human_cortex_3_merfish +# input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.4000.expand.rep2_data.h5ad?download=1" +# dataset_name: MERFISH - Human Cortex 3 +# dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" +# dataset_summary: Spatially resolved profiling of human cerebral cortex using multiplexed error-robust fluorescence in situ hybridization (MERFISH). +# dataset_description: "Spatially resolved profiling of human cerebral cortex (middle temporal gyrus) replicate 2 using multiplexed error-robust fluorescence in situ hybridization (MERFISH) (4000 gene panel)."
+# dataset_organism: Homo sapiens +# dataset_reference: fang2022conservation +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: false + +# - id: zenodo_spatial/human_cortex_4_merfish +# input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.4000.expand.rep3_data.h5ad?download=1" +# dataset_name: MERFISH - Human Cortex 4 +# dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" +# dataset_summary: Spatially resolved profiling of human cerebral cortex using multiplexed error-robust fluorescence in situ hybridization (MERFISH). +# dataset_description: "Spatially resolved profiling of human cerebral cortex (middle temporal gyrus) replicate 3 using multiplexed error-robust fluorescence in situ hybridization (MERFISH) (4000 gene panel)." +# dataset_organism: Homo sapiens +# dataset_reference: fang2022conservation +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: false + +# - id: zenodo_spatial/mouse_cortex_merfish +# input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_mouse1.AUD_TEA_VIS.242.unexpand_data.h5ad?download=1" +# dataset_name: MERFISH - Mouse Cortex +# dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" +# dataset_summary: Spatially resolved profiling of mouse cerebral cortex using multiplexed error-robust fluorescence in situ hybridization (MERFISH). +# dataset_description: "Spatially resolved profiling of mouse cerebral cortex (visual cortex (VIS), auditory cortex (AUD) and temporal association area (TEa) unexpanded sections) using multiplexed error-robust fluorescence in situ hybridization (MERFISH)."
+# dataset_organism: Mus musculus +# dataset_reference: fang2022conservation +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# normalization_methods: [log_cp10k] +# output_dataset: '$id/dataset.h5ad' +# output_meta: '$id/dataset_metadata.yaml' +# output_state: '$id/state.yaml' +# output_raw: force_null +# output_normalized: force_null +# publish_dir: resources/datasets +# HERE + +# cat > "/tmp/params.yaml" << 'HERE' +# param_list: +# - id: zenodo_spatial/mouse_organogenesis_seqfish +# input_data: "https://zenodo.org/records/12785822/files/seqfish.h5ad?download=1" +# dataset_name: Seqfish - Mouse Organogenesis +# dataset_url: "https://www.nature.com/articles/s41587-021-01006-2" +# dataset_summary: Single-cell spatial expression of mouse organogenesis. +# dataset_description: "Sagittal sections from mouse embryo at the 8-12 ss was profiled by seqFISH." +# dataset_organism: Mus musculus +# dataset_reference: lohoff2021integration +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 10 +# remove_mitochondrial: true + +# normalization_methods: [log_cp10k] +# output_dataset: '$id/dataset.h5ad' +# output_meta: '$id/dataset_metadata.yaml' +# output_state: '$id/state.yaml' +# output_raw: force_null +# output_normalized: force_null +# publish_dir: resources/datasets +# remove_mitochondrial: true +# HERE + +# cat > "/tmp/params.yaml" << 'HERE' +# param_list: +# - id: zenodo_spatial/mouse_olfactory_bulb_puck_slideseqv2 +# input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1" +# dataset_name: Slide-seqV2 - Mouse Olfactory Bulb Puck +# dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" +# dataset_summary: Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2. 
+# dataset_description: "Gene expression library of mouse olfactory bulb puck profiled using Slide-seq V2." +# dataset_reference: stickels2020highly +# dataset_organism: Mus musculus +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 500 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_cortex_slideseqv2 +# input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_palla2021squidpy_Slide-seqV2_Mouse_Cortex_data_whole.h5ad?download=1" +# dataset_name: Slide-seqV2 - Mouse Cortex +# dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" +# dataset_summary: Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2. +# dataset_description: "Gene expression library of Mouse cortex profiled using Slide-seq V2." +# dataset_reference: stickels2020highly +# dataset_organism: Mus musculus +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 500 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_cerebellum_slideseqv2 +# input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_Slide-seqV2_Mouse_Cerebellum_SCP948_data_whole.h5ad?download=1" +# dataset_name: Slide-seqV2 - Mouse Cerebellum +# dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" +# dataset_summary: Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2. +# dataset_description: "Gene expression library of mouse cerebellum profiled using Slide-seq V2."
+# dataset_reference: stickels2020highly +# dataset_organism: Mus musculus +# spot_filter_min_genes: 100 +# gene_filter_min_spots: 500 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_hippocampus_puck_slideseqv2 +# input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_Slide-seqV2_Mouse_Hippocampus_Puck_200115_08_data_whole.h5ad?download=1" +# dataset_name: Slide-seqV2 - Mouse Hippocampus Puck +# dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" +# dataset_summary: Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2. +# dataset_description: "Gene expression library of mouse hippocampus puck profiled using Slide-seq V2." +# dataset_reference: stickels2020highly +# dataset_organism: Mus musculus +# spot_filter_min_genes: 200 +# gene_filter_min_spots: 500 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_somatosensory_cortex_puck_slideseqv2 +# input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_Slide-seqV2_Mouse_SomatosensoryCortex_Puck_200306_03_data_whole.h5ad?download=1" +# dataset_name: Slide-seqV2 - Mouse Somatosensory Cortex Puck +# dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" +# dataset_summary: Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2. +# dataset_description: "Gene expression library of mouse somatosensory cortex puck profiled using Slide-seq V2." 
+# dataset_reference: stickels2020highly +# dataset_organism: Mus musculus +# spot_filter_min_genes: 200 +# gene_filter_min_spots: 500 +# remove_mitochondrial: true + +# normalization_methods: [log_cp10k] +# output_dataset: '$id/dataset.h5ad' +# output_meta: '$id/dataset_metadata.yaml' +# output_state: '$id/state.yaml' +# output_raw: force_null +# output_normalized: force_null +# publish_dir: resources/datasets +# HERE + +# cat > "/tmp/params.yaml" << 'HERE' +# param_list: +# - id: zenodo_spatial/mouse_brain_2d_zstep10_0_starmap +# input_data: "https://zenodo.org/records/12785822/files/STARmap_Wang2018three_data_2D_zstep10_0_data.h5ad?download=1" +# dataset_name: STARmap - Mouse Brain 1 +# dataset_url: "https://www.science.org/doi/10.1126/science.aat5691" +# dataset_summary: Three-dimensional intact-tissue sequencing of single-cell transcriptional states. +# dataset_description: "3D architecture of cell types in visual cortex volumes." +# dataset_organism: Mus musculus +# dataset_reference: wang2018three +# spot_filter_min_genes: 1 +# gene_filter_min_spots: 1 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_brain_2d_zstep15_0_starmap +# input_data: "https://zenodo.org/records/12785822/files/STARmap_Wang2018three_data_2D_zstep15_0_data.h5ad?download=1" +# dataset_name: STARmap - Mouse Brain 2 +# dataset_url: "https://www.science.org/doi/10.1126/science.aat5691" +# dataset_summary: Three-dimensional intact-tissue sequencing of single-cell transcriptional states. +# dataset_description: "3D architecture of cell types in visual cortex volumes." 
+# dataset_organism: Mus musculus +# dataset_reference: wang2018three +# spot_filter_min_genes: 1 +# gene_filter_min_spots: 1 +# remove_mitochondrial: true + +# normalization_methods: [log_cp10k] +# output_dataset: '$id/dataset.h5ad' +# output_meta: '$id/dataset_metadata.yaml' +# output_state: '$id/state.yaml' +# output_raw: force_null +# output_normalized: force_null +# publish_dir: resources/datasets +# HERE + +# cat > "/tmp/params.yaml" << 'HERE' +# param_list: +# - id: zenodo_spatial/drosophila_embryo_e5_6_stereoseq +# input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_5.6.h5ad?download=1" +# dataset_name: Stereo-seq - Drosophila embryo E5_6 +# dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" +# dataset_summary: Stereo-seq faithfully captures Drosophila spatial transcriptomes with high resolution. +# dataset_description: "Drosophila has long been a successful model organism in multiple biomedical fields. Spatial gene expression patterns are critical for the understanding of complex pathways and interactions, whereas temporal gene expression changes are vital for studying highly dynamic physiological activities. Systematic studies in Drosophila are still impeded by the lack of spatiotemporal transcriptomic information. Here, utilizing spatial enhanced resolution omics-sequencing (Stereo-seq), we dissected the spatiotemporal transcriptomic changes of developing Drosophila with high resolution and sensitivity. 
(Data from an embryo collected 14-16 h after egg laying)" +# dataset_organism: Drosophila +# dataset_reference: wang2022high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/drosophila_embryo_e6_3_stereoseq +# input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_6.3.h5ad?download=1" +# dataset_name: Stereo-seq - Drosophila embryo E6_3 +# dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" +# dataset_summary: Stereo-seq faithfully captures Drosophila spatial transcriptomes with high resolution. +# dataset_description: "Drosophila has long been a successful model organism in multiple biomedical fields. Spatial gene expression patterns are critical for the understanding of complex pathways and interactions, whereas temporal gene expression changes are vital for studying highly dynamic physiological activities. Systematic studies in Drosophila are still impeded by the lack of spatiotemporal transcriptomic information. Here, utilizing spatial enhanced resolution omics-sequencing (Stereo-seq), we dissected the spatiotemporal transcriptomic changes of developing Drosophila with high resolution and sensitivity. (Data from an embryo collected 14-16 h after egg laying)" +# dataset_organism: Drosophila +# dataset_reference: wang2022high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/drosophila_embryo_e7_stereoseq +# input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_7.h5ad?download=1" +# dataset_name: Stereo-seq - Drosophila embryo E7 +# dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" +# dataset_summary: Stereo-seq faithfully captures Drosophila spatial transcriptomes with high resolution. 
+# dataset_description: "Drosophila has long been a successful model organism in multiple biomedical fields. Spatial gene expression patterns are critical for the understanding of complex pathways and interactions, whereas temporal gene expression changes are vital for studying highly dynamic physiological activities. Systematic studies in Drosophila are still impeded by the lack of spatiotemporal transcriptomic information. Here, utilizing spatial enhanced resolution omics-sequencing (Stereo-seq), we dissected the spatiotemporal transcriptomic changes of developing Drosophila with high resolution and sensitivity. (Data from an embryo collected 14-16 h after egg laying)" +# dataset_organism: Drosophila +# dataset_reference: wang2022high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/drosophila_embryo_e9_1_stereoseq +# input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_9.1.h5ad?download=1" +# dataset_name: Stereo-seq - Drosophila embryo E9_1 +# dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" +# dataset_summary: Stereo-seq faithfully captures Drosophila spatial transcriptomes with high resolution. +# dataset_description: "Drosophila has long been a successful model organism in multiple biomedical fields. Spatial gene expression patterns are critical for the understanding of complex pathways and interactions, whereas temporal gene expression changes are vital for studying highly dynamic physiological activities. Systematic studies in Drosophila are still impeded by the lack of spatiotemporal transcriptomic information. Here, utilizing spatial enhanced resolution omics-sequencing (Stereo-seq), we dissected the spatiotemporal transcriptomic changes of developing Drosophila with high resolution and sensitivity. 
(Data from an embryo collected 14-16 h after egg laying)" +# dataset_organism: Drosophila +# dataset_reference: wang2022high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/drosophila_embryo_e10_stereoseq +# input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_10.5.h5ad?download=1" +# dataset_name: Stereo-seq - Drosophila embryo E10 +# dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" +# dataset_summary: Stereo-seq faithfully captures Drosophila spatial transcriptomes with high resolution. +# dataset_description: "Drosophila has long been a successful model organism in multiple biomedical fields. Spatial gene expression patterns are critical for the understanding of complex pathways and interactions, whereas temporal gene expression changes are vital for studying highly dynamic physiological activities. Systematic studies in Drosophila are still impeded by the lack of spatiotemporal transcriptomic information. Here, utilizing spatial enhanced resolution omics-sequencing (Stereo-seq), we dissected the spatiotemporal transcriptomic changes of developing Drosophila with high resolution and sensitivity. 
(Data from an embryo collected 14-16 h after egg laying)" +# dataset_organism: Drosophila +# dataset_reference: wang2022high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# normalization_methods: [log_cp10k] +# output_dataset: '$id/dataset.h5ad' +# output_meta: '$id/dataset_metadata.yaml' +# output_state: '$id/state.yaml' +# output_raw: force_null +# output_normalized: force_null +# publish_dir: resources/datasets +# HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' + withLabel: highmem { + memory = '350GB' + } + withName: '.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/datasets/workflows/process_zenodo_spatial/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file "/tmp/params.yaml" \ + --config /tmp/nextflow.config diff --git a/src/datasets/resource_scripts/zenodo_spatial_slidetags.sh b/src/datasets/resource_scripts/zenodo_spatial_slidetags.sh new file mode 100755 index 0000000000..d8654ce439 --- /dev/null +++ b/src/datasets/resource_scripts/zenodo_spatial_slidetags.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +cat > "/tmp/params.yaml" << 'HERE' +param_list: + - id: zenodo_spatial_slidetags/human_cortex_slidetags + input_data: "https://zenodo.org/records/12785822/files/slidetag_human_cortex.tar.gz?download=1" + dataset_name: Slide-tags - Human Cortex + dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" + dataset_summary: Slide-tags enables single-nucleus barcoding for multimodal spatial genomics. + dataset_description: "A 100 mm2 region of the human prefrontal cortex from a neurotypical donor aged 78 years was profiled by Slide-tags." 
+ dataset_organism: Homo sapiens + dataset_reference: russell2023slide + spot_filter_min_genes: 200 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: zenodo_spatial_slidetags/human_skin_melanoma_slidetags + input_data: "https://zenodo.org/records/12785822/files/slidetag_human_skin_melanoma.tar.gz?download=1" + dataset_name: Slide-tags - Human Skin Melanoma + dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" + dataset_summary: Slide-tags enables single-nucleus barcoding for multimodal spatial genomics. + dataset_description: "A metastatic melanoma sample was profiled by Slide-tags." + dataset_organism: Homo sapiens + dataset_reference: russell2023slide + spot_filter_min_genes: 200 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: zenodo_spatial_slidetags/human_tonsil_slidetags + input_data: "https://zenodo.org/records/12785822/files/slidetag_human_tonsil.tar.gz?download=1" + dataset_name: Slide-tags - Human Tonsil + dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" + dataset_summary: Slide-tags enables single-nucleus barcoding for multimodal spatial genomics. + dataset_description: "A human tonsil was profiled by Slide-tags." + dataset_organism: Homo sapiens + dataset_reference: russell2023slide + spot_filter_min_genes: 200 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: zenodo_spatial_slidetags/mouse_embryo_slidetags + input_data: "https://zenodo.org/records/12785822/files/slidetag_mouse_embryo.tar.gz?download=1" + dataset_name: Slide-tags - Mouse Embryo + dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" + dataset_summary: Slide-tags enables single-nucleus barcoding for multimodal spatial genomics. + dataset_description: "A mouse embryo was profiled by Slide-tags."
+ dataset_organism: Mus musculus + dataset_reference: russell2023slide + spot_filter_min_genes: 200 + gene_filter_min_spots: 50 + remove_mitochondrial: false + +normalization_methods: [log_cp10k] +output_dataset: '$id/dataset.h5ad' +output_meta: '$id/dataset_metadata.yaml' +output_state: '$id/state.yaml' +output_raw: force_null +output_normalized: force_null +publish_dir: resources/datasets +HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' + withLabel: highmem { + memory = '350GB' + } + withName: '.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/datasets/workflows/process_zenodo_spatial_slidetags/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file "/tmp/params.yaml" \ + --config /tmp/nextflow.config diff --git a/src/datasets/resource_test_scripts/cxg_mouse_pancreas_atlas.sh b/src/datasets/resource_test_scripts/cxg_mouse_pancreas_atlas.sh new file mode 100755 index 0000000000..3b5d35ee5c --- /dev/null +++ b/src/datasets/resource_test_scripts/cxg_mouse_pancreas_atlas.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +DATASET_DIR=resources_test/common + + +mkdir -p $DATASET_DIR + +wget https://raw.githubusercontent.com/theislab/scib/c993ffd9ccc84ae0b1681928722ed21985fb91d1/scib/resources/g2m_genes_tirosh.txt -O $DATASET_DIR/temp_g2m_genes_tirosh_mm.txt +wget https://raw.githubusercontent.com/theislab/scib/c993ffd9ccc84ae0b1681928722ed21985fb91d1/scib/resources/s_genes_tirosh.txt -O $DATASET_DIR/temp_s_genes_tirosh_mm.txt +KEEP_FEATURES=`cat $DATASET_DIR/temp_g2m_genes_tirosh_mm.txt $DATASET_DIR/temp_s_genes_tirosh_mm.txt | paste -sd ":" -` + +cat > "/tmp/params.yaml" << HERE +param_list: + - id: cxg_mouse_pancreas_atlas + species: mus_musculus + census_version: "2023-07-25" + obs_value_filter: "dataset_id == 
'49e4ffcc-5444-406d-bdee-577127404ba8' and donor_id in ['mouse_pancreatic_islet_atlas_Hrovatin__Fltp_2y__MUC13974', 'mouse_pancreatic_islet_atlas_Hrovatin__Fltp_2y__MUC13975', 'mouse_pancreatic_islet_atlas_Hrovatin__Fltp_2y__MUC13976']" + obs_batch: donor_id + dataset_name: Mouse Pancreatic Islet Atlas + dataset_summary: Mouse pancreatic islet scRNA-seq atlas across sexes, ages, and stress conditions including diabetes + dataset_description: To better understand pancreatic β-cell heterogeneity we generated a mouse pancreatic islet atlas capturing a wide range of biological conditions. The atlas contains scRNA-seq datasets of over 300,000 mouse pancreatic islet cells, of which more than 100,000 are β-cells, from nine datasets with 56 samples, including two previously unpublished datasets. The samples vary in sex, age (ranging from embryonic to aged), chemical stress, and disease status (including T1D NOD model development and two T2D models, mSTZ and db/db) together with different diabetes treatments. Additional information about data fields is available in anndata uns field 'field_descriptions' and on https://github.com/theislab/mm_pancreas_atlas_rep/blob/main/resources/cellxgene.md. + dataset_url: https://cellxgene.cziscience.com/collections/296237e2-393d-4e31-b590-b03f74ac5070 + dataset_reference: hrovatin2023delineating + dataset_organism: mus_musculus + +normalization_methods: [log_cp10k] +n_obs: 600 +n_vars: 1500 +output_dataset: '\$id/dataset.h5ad' +output_meta: '\$id/dataset_metadata.yaml' +output_state: '\$id/state.yaml' +output_raw: force_null +output_normalized: force_null +output_pca: force_null +output_hvg: force_null +output_knn: force_null +publish_dir: $DATASET_DIR +do_subsample: true +keep_features: '$KEEP_FEATURES' +HERE + +nextflow run . 
\ + -main-script target/nextflow/datasets/workflows/process_cellxgene_census/main.nf \ + -c src/wf_utils/labels_ci.config \ + -profile docker \ + -params-file "/tmp/params.yaml" + +rm -r $DATASET_DIR/temp_* + +# src/tasks/batch_integration/resources_test_scripts/process.sh \ No newline at end of file diff --git a/src/datasets/resource_test_scripts/mouse_brain_coronal_section1.sh b/src/datasets/resource_test_scripts/mouse_brain_coronal_section1.sh new file mode 100755 index 0000000000..e4b889e063 --- /dev/null +++ b/src/datasets/resource_test_scripts/mouse_brain_coronal_section1.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +set -e + +cat > /tmp/params.yaml << 'HERE' +param_list: + - id: mouse_brain_coronal_section1 + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Mouse_Brain_Rep1/CytAssist_FFPE_Mouse_Brain_Rep1_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Mouse_Brain_Rep1/CytAssist_FFPE_Mouse_Brain_Rep1_spatial.tar.gz" + dataset_name: Mouse Brain Coronal Section 1 (FFPE) + dataset_url: "https://www.10xgenomics.com/datasets/mouse-brain-coronal-section-1-ffpe-2-standard" + dataset_summary: Gene expression library of Mouse Brain (CytAssist FFPE) using the Mouse Whole Transcriptome Probe Set + dataset_description: "FFPE Mouse Brain tissue blocks sectioned as described in Visium CytAssist Spatial Gene Expression for FFPE - Tissue Preparation Guide Demonstrated Protocol. The H&E stained glass slide with tissue section was processed via Visium CytAssist instrument to transfer analytes to a Visium CytAssist Spatial Gene Expression slide. The probe extension and library construction steps follow the standard Visium for FFPE workflow outside of the instrument. The H&E image was acquired using Olympus VS200 Slide Scanning Microscope. Sequencing depth was 53,497 reads per spot. 
Sequencing configuration: 28bp read 1 (16bp Visium spatial barcode, 12bp UMI), 90bp read 2 (transcript), 10bp i7 sample barcode and 10bp i5 sample barcode. Key metrics include: 2,310 spots detected under tissue; 6,736 median genes per spot; 24,862 median UMI counts per spot." + dataset_reference: 10x2022brain + dataset_organism: Mus musculus + +normalization_methods: [log_cp10k] +n_obs: 600 +n_vars: 500 +output_dataset: '$id/dataset.h5ad' +output_meta: '$id/dataset_metadata.yaml' +output_state: '$id/state.yaml' +output_raw: force_null +output_normalized: force_null +publish_dir: resources_test/common +do_subsample: true +spot_filter_min_genes: 200 +gene_filter_min_spots: 50 +remove_mitochondrial: true +HERE + +nextflow run . \ + -main-script target/nextflow/datasets/workflows/process_tenx_visium/main.nf \ + -c src/wf_utils/labels_ci.config \ + -profile docker \ + -params-file "/tmp/params.yaml" + diff --git a/src/datasets/resource_test_scripts/neurips2021_bmmc.sh b/src/datasets/resource_test_scripts/neurips2021_bmmc.sh new file mode 100755 index 0000000000..7922f634cb --- /dev/null +++ b/src/datasets/resource_test_scripts/neurips2021_bmmc.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +set -e + +params_file="/tmp/datasets_openproblems_neurips2021_params.yaml" + +cat > "$params_file" << 'HERE' +param_list: + - id: openproblems_neurips2021/bmmc_cite + # input: "/tmp/neurips2021_bmmc_cite.h5ad" + input: "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE194nnn/GSE194122/suppl/GSE194122%5Fopenproblems%5Fneurips2021%5Fcite%5FBMMC%5Fprocessed%2Eh5ad%2Egz" + mod1: GEX + mod2: ADT + dataset_name: OpenProblems NeurIPS2021 CITE-Seq + dataset_organism: homo_sapiens + dataset_summary: Single-cell CITE-Seq (GEX+ADT) data collected from bone marrow mononuclear cells of 12 healthy human donors. 
+ dataset_description: "Single-cell CITE-Seq data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X 3 prime Single-Cell Gene Expression kit with Feature Barcoding in combination with the BioLegend TotalSeq B Universal Human Panel v1.0. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2021. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site." + + - id: openproblems_neurips2021/bmmc_multiome + # input: "/tmp/neurips2021_bmmc_multiome.h5ad" + input: "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE194nnn/GSE194122/suppl/GSE194122%5Fopenproblems%5Fneurips2021%5Fmultiome%5FBMMC%5Fprocessed%2Eh5ad%2Egz" + mod1: GEX + mod2: ATAC + dataset_name: OpenProblems NeurIPS2021 Multiome + dataset_organism: homo_sapiens + dataset_summary: Single-cell Multiome (GEX+ATAC) data collected from bone marrow mononuclear cells of 12 healthy human donors. + dataset_description: "Single-cell Multiome data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X Multiome Gene Expression and Chromatin Accessibility kit. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2021. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site."
+ +dataset_url: "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE194122" +dataset_reference: luecken2021neurips +normalization_methods: [log_cp10k] +do_subsample: true +even: true +n_obs: 600 +n_vars: 1500 +output_mod1: '$id/dataset_mod1.h5ad' +output_mod2: '$id/dataset_mod2.h5ad' +output_meta_mod1: '$id/dataset_metadata_mod1.yaml' +output_meta_mod2: '$id/dataset_metadata_mod2.yaml' +output_state: '$id/state.yaml' +# publish_dir: s3://openproblems-data/resources_test/common +HERE + +# cat > /tmp/nextflow.config << HERE +# process { +# withName:'.*publishStatesProc' { +# memory = '16GB' +# disk = '100GB' +# } +# } +# HERE + +nextflow run . \ + -main-script target/nextflow/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf \ + -profile docker \ + -resume \ + --publish_dir resources_test/common \ + -params-file "$params_file" \ + -c src/wf_utils/labels.config + +# tw launch https://github.com/openproblems-bio/openproblems-v2.git \ +# --revision main_build \ +# --main-script target/nextflow/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf \ +# --workspace 53907369739130 \ +# --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ +# --params-file "$params_file" \ +# --config /tmp/nextflow.config \ +# --labels predict_modality + +# run task process dataset components +src/tasks/predict_modality/resources_test_scripts/neurips2021_bmmc.sh \ No newline at end of file diff --git a/src/datasets/resource_test_scripts/neurips2022_pbmc.sh b/src/datasets/resource_test_scripts/neurips2022_pbmc.sh new file mode 100755 index 0000000000..ef2e0523e1 --- /dev/null +++ b/src/datasets/resource_test_scripts/neurips2022_pbmc.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +set -e + +params_file="/tmp/datasets_openproblems_neurips2022_params.yaml" + +cat > "$params_file" << 'HERE' +param_list: + - id: openproblems_neurips2022/pbmc_cite + input_mod1: s3://openproblems-nextflow/datasets_private/neurips2022/cite_rna_merged.h5ad + input_mod2: 
s3://openproblems-nextflow/datasets_private/neurips2022/cite_prot_merged.h5ad + mod1: GEX + mod2: ADT + dataset_name: OpenProblems NeurIPS2022 CITE-Seq + dataset_organism: homo_sapiens + dataset_summary: Single-cell CITE-Seq (GEX+ADT) data collected from bone marrow mononuclear cells of 12 healthy human donors. + dataset_description: "Single-cell CITE-Seq data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X 3 prime Single-Cell Gene Expression kit with Feature Barcoding in combination with the BioLegend TotalSeq B Universal Human Panel v1.0. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2022. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site." + + - id: openproblems_neurips2022/pbmc_multiome + input_mod1: s3://openproblems-nextflow/datasets_private/neurips2022/multiome_rna_merged.h5ad + input_mod2: s3://openproblems-nextflow/datasets_private/neurips2022/multiome_atac_merged.h5ad + mod1: GEX + mod2: ATAC + dataset_name: OpenProblems NeurIPS2022 Multiome + dataset_organism: homo_sapiens + dataset_summary: Single-cell Multiome (GEX+ATAC) data collected from bone marrow mononuclear cells of 12 healthy human donors. + dataset_description: "Single-cell Multiome data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X Multiome Gene Expression and Chromatin Accessibility kit. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2022. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets.
The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site." + +dataset_url: "https://www.kaggle.com/competitions/open-problems-multimodal/data" +dataset_reference: lance2024predicting +normalization_methods: [log_cp10k] +do_subsample: true +even: true +n_obs: 600 +n_vars: 1500 +output_mod1: '$id/dataset_mod1.h5ad' +output_mod2: '$id/dataset_mod2.h5ad' +output_meta_mod1: '$id/dataset_metadata_mod1.yaml' +output_meta_mod2: '$id/dataset_metadata_mod2.yaml' +output_state: '$id/state.yaml' +publish_dir: s3://openproblems-data/resources_test/common +HERE + +# nextflow run . \ +# -main-script target/nextflow/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf \ +# -profile docker \ +# -resume \ +# --publish_dir resources_test/common \ +# -params-file "$params_file" \ +# -c src/wf_utils/labels.config + + +cat > /tmp/nextflow.config << HERE +process { + withName:'.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } +} +HERE + + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf \ + --workspace 53907369739130 \ + --compute-env 1pK56PjjzeraOOC2LDZvN2 \ + --params-file "$params_file" \ + --config /tmp/nextflow.config \ + --labels openproblems_neurips2022_pbmc,dataset_loader \ + + + +# run task process dataset components +# src/tasks/predict_modality/resources_test_scripts/neurips2022_pbmc.sh \ No newline at end of file diff --git a/src/datasets/resource_test_scripts/pancreas.sh b/src/datasets/resource_test_scripts/pancreas.sh new file mode 100755 index 0000000000..fb26f7ef30 --- /dev/null +++ b/src/datasets/resource_test_scripts/pancreas.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root 
of the repository +cd "$REPO_ROOT" + +DATASET_DIR=resources_test/common + +set -e + +mkdir -p $DATASET_DIR + +wget https://raw.githubusercontent.com/theislab/scib/c993ffd9ccc84ae0b1681928722ed21985fb91d1/scib/resources/g2m_genes_tirosh_hm.txt -O $DATASET_DIR/temp_g2m_genes_tirosh_hm.txt +wget https://raw.githubusercontent.com/theislab/scib/c993ffd9ccc84ae0b1681928722ed21985fb91d1/scib/resources/s_genes_tirosh_hm.txt -O $DATASET_DIR/temp_s_genes_tirosh_hm.txt +KEEP_FEATURES=`cat $DATASET_DIR/temp_g2m_genes_tirosh_hm.txt $DATASET_DIR/temp_s_genes_tirosh_hm.txt | paste -sd ":" -` + +# download dataset +nextflow run . \ + -main-script target/nextflow/datasets/workflows/process_openproblems_v1/main.nf \ + -profile docker \ + -c src/wf_utils/labels_ci.config \ + -resume \ + --id pancreas \ + --input_id pancreas \ + --obs_cell_type "celltype" \ + --obs_batch "tech" \ + --var_feature_name "index" \ + --layer_counts "counts" \ + --dataset_name "Human pancreas" \ + --dataset_url "https://theislab.github.io/scib-reproducibility/dataset_pancreas.html" \ + --dataset_reference "luecken2022benchmarking" \ + --dataset_summary "Human pancreas cells dataset from the scIB benchmarks" \ + --dataset_description "Human pancreatic islet scRNA-seq data from 6 datasets across technologies (CEL-seq, CEL-seq2, Smart-seq2, inDrop, Fluidigm C1, and SMARTER-seq)." 
\ + --dataset_organism "homo_sapiens" \ + --keep_cell_type_categories "acinar:beta" \ + --keep_batch_categories "celseq:inDrop4:smarter" \ + --keep_features "$KEEP_FEATURES" \ + --seed 123 \ + --normalization_methods log_cp10k \ + --do_subsample true \ + --n_obs 600 \ + --n_vars 1500 \ + --output_raw '$id/raw.h5ad' \ + --output_normalized '$id/normalized.h5ad' \ + --output_hvg '$id/hvg.h5ad' \ + --output_pca '$id/pca.h5ad' \ + --output_knn '$id/knn.h5ad' \ + --output_dataset '$id/dataset.h5ad' \ + --output_meta '$id/dataset_meta.yaml' \ + --output_state '$id/state.yaml' \ + --publish_dir "$DATASET_DIR" + +rm -r $DATASET_DIR/temp_* + +# run task process dataset components +src/tasks/batch_integration/resources_test_scripts/process.sh +src/tasks/denoising/resources_test_scripts/pancreas.sh +src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh +src/tasks/label_projection/resources_test_scripts/pancreas.sh \ No newline at end of file diff --git a/src/datasets/resource_test_scripts/scicar_cell_lines.sh b/src/datasets/resource_test_scripts/scicar_cell_lines.sh new file mode 100755 index 0000000000..f765744136 --- /dev/null +++ b/src/datasets/resource_test_scripts/scicar_cell_lines.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +DATASET_DIR=resources_test/common + +set -e + +mkdir -p $DATASET_DIR + +# download dataset +nextflow run . 
\ + -main-script target/nextflow/datasets/workflows/process_openproblems_v1_multimodal/main.nf \ + -profile docker \ + -resume \ + --id scicar_cell_lines \ + --input_id scicar_cell_lines \ + --obs_tissue "source" \ + --layer_counts "counts" \ + --obs_cell_type "cell_name" \ + --var_feature_id "index" \ + --var_feature_name "gene_short_name" \ + --dataset_name "sci-CAR cell lines" \ + --dataset_url "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE117089" \ + --dataset_reference "cao2018joint" \ + --dataset_summary "sciCAR is a combinatorial indexing-based assay that jointly measures cellular transcriptomes and the accessibility of cellular chromatin in the same cells" \ + --dataset_description "sciCAR is a combinatorial indexing-based assay that jointly measures cellular transcriptomes and the accessibility of cellular chromatin in the same cells. Here, we use two sciCAR datasets that were obtained from the same study. The first dataset contains 4,825 cells from three cell lines (HEK293T cells, NIH/3T3 cells, and A549 cells) at multiple timepoints (0, 1 hour, 3 hours) after dexamethasone treatment. The second dataset contains 11,233 cells from wild-type adult mouse kidney." 
\ + --dataset_organism "[homo_sapiens, mus_musculus]" \ + --mod1 GEX \ + --mod2 ATAC \ + --do_subsample true \ + --n_obs 600 \ + --n_vars 1500 \ + --seed 123 \ + --normalization_methods log_cp10k \ + --output_mod1 '$id/dataset_mod1.h5ad' \ + --output_mod2 '$id/dataset_mod2.h5ad' \ + --output_meta_mod1 '$id/dataset_metadata_mod1.yaml' \ + --output_meta_mod2 '$id/dataset_metadata_mod2.yaml' \ + --output_state '$id/state.yaml' \ + --publish_dir "$DATASET_DIR" + +# run task process dataset components +src/tasks/match_modalities/resources_test_scripts/scicar_cell_lines.sh \ No newline at end of file diff --git a/src/datasets/resource_test_scripts/slideseq_test.sh b/src/datasets/resource_test_scripts/slideseq_test.sh new file mode 100755 index 0000000000..a9050be40a --- /dev/null +++ b/src/datasets/resource_test_scripts/slideseq_test.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +set -e + +cat > /tmp/params.yaml << 'HERE' +param_list: + - id: mouse_cerebellum + input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1" + dataset_name: Mouse cerebellum + dataset_url: "..." + dataset_summary: ... + dataset_description: "..." + dataset_reference: ref + dataset_organism: Mus musculus + +normalization_methods: [log_cp10k] +n_obs: 600 +n_vars: 500 +output_dataset: '$id/dataset.h5ad' +output_meta: '$id/dataset_metadata.yaml' +output_state: '$id/state.yaml' +output_raw: force_null +output_normalized: force_null +publish_dir: resources_test/common +do_subsample: true +spot_filter_min_genes: 200 +gene_filter_min_spots: 50 +remove_mitochondrial: true +HERE + +nextflow run . 
\ + -main-script target/nextflow/datasets/workflows/process_spatial_from_zenodo/main.nf \ + -c src/wf_utils/labels_ci.config \ + -profile docker \ + -params-file "/tmp/params.yaml" + diff --git a/src/datasets/workflows/extract_dataset_info/config.vsh.yaml b/src/datasets/workflows/extract_dataset_info/config.vsh.yaml new file mode 100644 index 0000000000..58433db567 --- /dev/null +++ b/src/datasets/workflows/extract_dataset_info/config.vsh.yaml @@ -0,0 +1,34 @@ +functionality: + name: "extract_dataset_info" + namespace: "datasets/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--input" + __merge__: /src/datasets/api/file_raw.yaml + required: true + direction: input + - name: Filter arguments + arguments: + - name: "--filter_normalization_id" + type: string + required: false + direction: input + description: If defined, only the normalization with this ID will be included in the output. + multiple: true + example: [ log_cp10k ] + - name: Outputs + arguments: + - name: "--output" + type: file + required: true + direction: output + example: dataset_uns.yaml + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + dependencies: + - name: common/extract_metadata +platforms: + - type: nextflow diff --git a/src/datasets/workflows/extract_dataset_info/main.nf b/src/datasets/workflows/extract_dataset_info/main.nf new file mode 100644 index 0000000000..887812760e --- /dev/null +++ b/src/datasets/workflows/extract_dataset_info/main.nf @@ -0,0 +1,58 @@ +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + // extract the dataset metadata + | extract_metadata.run( + fromState: [input: "input"], + toState: { id, output, state -> + state + [ + dataset_uns: readYaml(output.output).uns + ] + } + ) + + // only keep one of the normalization methods + | filter{ id, state -> + if (state.filter_normalization_id) { + 
state.filter_normalization_id.contains(state.dataset_uns.normalization_id) + } else { + true + } + } + + | joinStates { ids, states -> + // remove normalization id + // TODO: make this optional through a parameter? + def dataset_uns = states.collect{state -> + def uns = state.dataset_uns.clone() + uns.remove("normalization_id") + uns + } + + // store data as yaml + def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) + def dataset_uns_file = tempFile("dataset_uns.yaml") + dataset_uns_file.write(dataset_uns_yaml_blob) + + def new_state = [ + output: dataset_uns_file, + _meta: [join_id: ids[0]] + ] + ["output", new_state] + } + + + emit: + output_ch +} diff --git a/src/datasets/workflows/extract_dataset_info/run_test.sh b/src/datasets/workflows/extract_dataset_info/run_test.sh new file mode 100755 index 0000000000..9723de008a --- /dev/null +++ b/src/datasets/workflows/extract_dataset_info/run_test.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +# export TOWER_WORKSPACE_ID=53907369739130 + +OUTPUT_DIR="output/temp" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +DATASETS_DIR="resources_test/common" + +export NXF_VER=22.04.5 +nextflow run . 
\ + -main-script target/nextflow/datasets/workflows/extract_dataset_info/main.nf \ + -profile docker \ + -resume \ + -c src/wf_utils/labels_ci.config \ + -entry auto \ + --input_states "$DATASETS_DIR/**/state.yaml" \ + --rename_keys 'input:output_dataset' \ + --settings '{"output": "dataset_info.yaml"}' \ + --publish_dir "$OUTPUT_DIR" \ + --output_state "state.yaml" \ No newline at end of file diff --git a/src/datasets/workflows/extract_dataset_meta/config.vsh.yaml b/src/datasets/workflows/extract_dataset_meta/config.vsh.yaml new file mode 100644 index 0000000000..26041b1039 --- /dev/null +++ b/src/datasets/workflows/extract_dataset_meta/config.vsh.yaml @@ -0,0 +1,25 @@ +functionality: + name: "extract_dataset_meta" + namespace: "datasets/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--input" + __merge__: /src/datasets/api/file_raw.yaml + required: true + direction: input + - name: Outputs + arguments: + - name: "--output" + type: file + required: true + direction: output + example: meta.yaml + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + dependencies: + - name: common/extract_metadata +platforms: + - type: nextflow diff --git a/src/datasets/workflows/extract_dataset_meta/main.nf b/src/datasets/workflows/extract_dataset_meta/main.nf new file mode 100644 index 0000000000..cbac67b571 --- /dev/null +++ b/src/datasets/workflows/extract_dataset_meta/main.nf @@ -0,0 +1,20 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + // extract the dataset metadata + | extract_metadata.run( + fromState: [input: "input"], + toState: [output: "output"] + ) + + | setState([ + "output", + ]) + + emit: + output_ch +} diff --git a/src/datasets/workflows/extract_dataset_meta/run_test.sh b/src/datasets/workflows/extract_dataset_meta/run_test.sh new file mode 100755 index 0000000000..4792938fee --- /dev/null +++ b/src/datasets/workflows/extract_dataset_meta/run_test.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# get 
the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +# export TOWER_WORKSPACE_ID=53907369739130 + +OUTPUT_DIR="output/temp" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +DATASETS_DIR="resources_test/common/pancreas/dataset.h5ad" + +export NXF_VER=22.04.5 +nextflow run . \ + -main-script target/nextflow/datasets/workflows/extract_dataset_meta/main.nf \ + -profile docker \ + -resume \ + -c src/wf_utils/labels_ci.config \ + --input $DATASETS_DIR \ + --output meta.yaml \ + --publish_dir "$OUTPUT_DIR" \ No newline at end of file diff --git a/src/datasets/workflows/process_cellxgene_census/config.vsh.yaml b/src/datasets/workflows/process_cellxgene_census/config.vsh.yaml new file mode 100644 index 0000000000..3e1fd5263b --- /dev/null +++ b/src/datasets/workflows/process_cellxgene_census/config.vsh.yaml @@ -0,0 +1,201 @@ +functionality: + name: process_cellxgene_census + namespace: datasets/workflows + description: | + Fetch and process datasets originating from the CELLxGENE census. + argument_groups: + - name: Input database + description: "Open CellxGene Census by version or URI." + arguments: + - name: "--input_uri" + type: string + description: "If specified, a URI containing the Census SOMA objects. If specified, will take precedence over the `--census_version` argument." + required: false + example: "s3://bucket/path" + - name: "--census_version" + description: "Which release of CellxGene census to use. Possible values are \"latest\", \"stable\", or the date of one of the releases (e.g. \"2023-07-25\"). For more information, check the documentation on [Census data releases](https://chanzuckerberg.github.io/cellxgene-census/cellxgene_census_docsite_data_release_info.html)." + type: string + example: "stable" + required: false + - name: Cell query + description: Arguments related to the query. 
+ arguments: + - name: "--species" + type: string + description: The organism to query, usually one of `Homo sapiens` or `Mus musculus`. + required: false + default: "homo_sapiens" + multiple: false + - name: "--obs_value_filter" + type: string + description: "Filter for selecting the `obs` metadata (i.e. cells). Value is a filter query written in the SOMA `value_filter` syntax." + required: false + example: "is_primary_data == True and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] and suspension_type == 'cell'" + - name: Cell filter + description: Filter the cells based on a minimum cell count per specified group + arguments: + - name: "--cell_filter_grouping" + type: string + description: | + A subset of 'obs' columns by which to group the cells for filtering. + Only groups surpassing or equal to the `--cell_filter_minimum_count` + threshold will be retained. Take care not to introduce a selection + bias against cells with more fine-grained ontology annotations. + required: false + example: ["dataset_id", "tissue", "assay", "disease", "cell_type"] + multiple: true + - name: "--cell_filter_minimum_count" + type: double + description: | + A minimum number of cells per group to retain. If `--cell_filter_grouping` + is defined, this parameter should also be provided and vice versa. + required: false + example: 100 + - name: Cell metadata + description: Cell metadata arguments + arguments: + - name: "--obs_batch" + type: string + description: | + Location of where to find the observation batch IDs. + + * If not specified, the `.obs["batch"]` field will not be included. + * If one or more values are specified, the `.obs["batch"]` field will be + set to the concatenated values of the specified fields, separated by + the `obs_batch_separator`. 
+ required: false + multiple: true + multiple_sep: "," + example: ["batch"] + - name: "--obs_batch_separator" + type: string + description: Separator to use when concatenating the values of the `--obs_batch` fields. + required: false + default: "+" + - name: Dataset metadata + description: Information about the dataset that will be stored in the `.uns` slot. + arguments: + - name: "--id" + type: string + description: The ID of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: true + - name: Sampling options + arguments: + - name: "--do_subsample" + type: boolean + default: false + description: "Whether or not to subsample the dataset" + - name: "--n_obs" + type: integer + description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + - name: "--n_vars" + type: integer + description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + - name: "--keep_features" + type: string + multiple: true + description: A list of genes to keep.
+ - name: "--keep_cell_type_categories" + type: "string" + multiple: true + description: "Categories indexes to be selected" + required: false + - name: "--keep_batch_categories" + type: "string" + multiple: true + description: "Categories indexes to be selected" + required: false + - name: "--even" + type: "boolean_true" + description: Subsample evenly from different batches + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." + example: 123 + - name: Normalization + arguments: + - name: "--normalization_methods" + type: string + multiple: true + choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] + default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] + description: "Which normalization methods to run." + - name: Outputs + arguments: + - name: "--output_dataset" + __merge__: /src/datasets/api/file_common_dataset.yaml + direction: "output" + required: true + - name: "--output_meta" + direction: "output" + type: file + description: "Dataset metadata" + default: "dataset_metadata.yaml" + - name: "--output_raw" + __merge__: /src/datasets/api/file_raw.yaml + direction: "output" + required: false + - name: "--output_normalized" + __merge__: /src/datasets/api/file_normalized.yaml + direction: "output" + required: false + - name: "--output_pca" + __merge__: /src/datasets/api/file_pca.yaml + direction: "output" + required: false + - name: "--output_hvg" + __merge__: /src/datasets/api/file_hvg.yaml + direction: "output" + required: false + - name: "--output_knn" + __merge__: /src/datasets/api/file_knn.yaml + direction: "output" + required: false + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: datasets/loaders/cellxgene_census + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - 
name: datasets/processors/subsample + - name: datasets/processors/pca + - name: datasets/processors/hvg + - name: datasets/processors/knn + - name: common/extract_metadata + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +platforms: + - type: nextflow diff --git a/src/datasets/workflows/process_cellxgene_census/main.nf b/src/datasets/workflows/process_cellxgene_census/main.nf new file mode 100644 index 0000000000..bd1fc813a9 --- /dev/null +++ b/src/datasets/workflows/process_cellxgene_census/main.nf @@ -0,0 +1,160 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // create different normalization methods by overriding the defaults + normalization_methods = [ + log_cp.run( + key: "log_cp10k", + args: [normalization_id: "log_cp10k", n_cp: 10000], + ), + log_cp.run( + key: "log_cpm", + args: [normalization_id: "log_cpm", n_cp: 1000000], + ), + sqrt_cp.run( + key: "sqrt_cp10k", + args: [normalization_id: "sqrt_cp10k", n_cp: 10000], + ), + sqrt_cp.run( + key: "sqrt_cpm", + args: [normalization_id: "sqrt_cpm", n_cp: 1000000], + ), + l1_sqrt.run( + key: "l1_sqrt", + args: [normalization_id: "l1_sqrt"], + ), + log_scran_pooling.run( + key: "log_scran_pooling", + args: [normalization_id: "log_scran_pooling"], + ) + ] + + output_ch = input_ch + + // store original id for later use + | map{ id, state -> + [id, state + [_meta: [join_id: id]]] + } + + // fetch data from the CELLxGENE census + | cellxgene_census.run( + fromState: [ + "input_uri": "input_uri", + "census_version": "census_version", + "species": "species", + "obs_value_filter": "obs_value_filter", + "cell_filter_grouping": "cell_filter_grouping", + "cell_filter_minimum_count": "cell_filter_minimum_count", + "obs_batch": "obs_batch", + "obs_batch_separator": "obs_batch_separator", +
"dataset_id": "id", + "dataset_name": "dataset_name", + "dataset_url": "dataset_url", + "dataset_reference": "dataset_reference", + "dataset_summary": "dataset_summary", + "dataset_description": "dataset_description", + "dataset_organism": "dataset_organism", + ], + toState: ["output_raw": "output"] + ) + + // subsample if so desired + | subsample.run( + runIf: { id, state -> state.do_subsample }, + fromState: [ + "input": "output_raw", + "n_obs": "n_obs", + "n_vars": "n_vars", + "keep_features": "keep_features", + "keep_cell_type_categories": "keep_cell_type_categories", + "keep_batch_categories": "keep_batch_categories", + "even": "even", + "seed": "seed" + ], + args: [output_mod2: null], + toState: ["output_raw": "output"] + ) + + | runEach( + components: normalization_methods, + id: { id, state, comp -> + if (state.normalization_methods.size() > 1) { + id + "/" + comp.name + } else { + id + } + }, + filter: { id, state, comp -> + comp.name in state.normalization_methods + }, + fromState: ["input": "output_raw"], + toState: { id, output, state, comp -> + state + [ + output_normalized: output.output, + normalization_id: comp.name + ] + } + ) + + | hvg.run( + fromState: ["input": "output_normalized"], + toState: ["output_hvg": "output"] + ) + + | pca.run( + fromState: ["input": "output_hvg"], + toState: ["output_pca": "output" ] + ) + + | knn.run( + fromState: ["input": "output_pca"], + toState: ["output_knn": "output"] + ) + + // add synonym + | map{ id, state -> + [id, state + [output_dataset: state.output_knn]] + } + + | extract_metadata.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "output_dataset") + // workaround: convert GString to String + schema = iterateMap(schema, { it instanceof GString ? 
it.toString() : it }) + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.output_dataset, + "schema": schemaYaml + ] + }, + toState: ["output_meta": "output"] + ) + + // only output the files for which an output file was specified + | setState([ + "output_dataset", + "output_meta", + "output_raw", + "output_normalized", + "output_pca", + "output_hvg", + "output_knn", + "_meta" + ]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/datasets/workflows/process_openproblems_neurips2021_bmmc/config.vsh.yaml b/src/datasets/workflows/process_openproblems_neurips2021_bmmc/config.vsh.yaml new file mode 100644 index 0000000000..8d3ca51d0b --- /dev/null +++ b/src/datasets/workflows/process_openproblems_neurips2021_bmmc/config.vsh.yaml @@ -0,0 +1,137 @@ +functionality: + name: process_openproblems_neurips2021_bmmc + namespace: datasets/workflows + description: | + Fetch and process Neurips 2021 multimodal datasets + argument_groups: + - name: Inputs + arguments: + - name: "--id" + type: "string" + description: "The ID of the dataset" + required: true + - name: "--input" + type: "file" + description: "Path to the input dataset" + required: true + - name: "--mod1" + type: string + description: Name of the first modality. + required: true + example: GEX + - name: "--mod2" + type: string + description: Name of the second modality. + required: true + example: ADT + - name: Metadata + arguments: + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. 
+ required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + - name: Sampling options + arguments: + - name: "--do_subsample" + type: boolean + default: false + description: "Whether or not to subsample the dataset" + - name: "--n_obs" + type: integer + description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + - name: "--n_vars" + type: integer + description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + - name: "--keep_features" + type: string + multiple: true + description: A list of genes to keep. + - name: "--keep_cell_type_categories" + type: "string" + multiple: true + description: "Categories indexes to be selected" + required: false + - name: "--keep_batch_categories" + type: "string" + multiple: true + description: "Categories indexes to be selected" + required: false + - name: "--even" + type: "boolean_true" + description: Subsample evenly from different batches + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." + example: 123 + - name: Normalization + arguments: + - name: "--normalization_methods" + type: string + multiple: true + choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] + default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] + description: "Which normalization methods to run." 
+ - name: Outputs + arguments: + - name: "--output_mod1" + direction: "output" + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: "--output_mod2" + direction: "output" + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: "--output_meta_mod1" + direction: "output" + type: file + description: "Dataset metadata" + example: "dataset_metadata_mod1.yaml" + - name: "--output_meta_mod2" + direction: "output" + type: file + description: "Dataset metadata" + example: "dataset_metadata_mod2.yaml" + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: datasets/loaders/openproblems_neurips2021_bmmc + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/normalization/prot_clr + - name: datasets/normalization/atac_tfidf + - name: datasets/processors/subsample + - name: datasets/processors/svd + - name: datasets/processors/hvg + - name: common/extract_metadata + - name: common/decompress_gzip + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +platforms: + - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf b/src/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf new file mode 100644 index 0000000000..5f3b9867c7 --- /dev/null +++ b/src/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf @@ -0,0 +1,196 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow run_wf { + take: + input_ch + + main: + + // create different normalization methods by overriding the defaults + normalization_methods = [ + log_cp.run( + key: "log_cp10k", + args: [normalization_id: "log_cp10k", n_cp: 10000] + ), + log_cp.run( + key: "log_cpm", + args: [normalization_id: "log_cpm", n_cp: 1000000] + ), + sqrt_cp.run( + 
key: "sqrt_cp10k", + args: [normalization_id: "sqrt_cp10k", n_cp: 10000] + ), + sqrt_cp.run( + key: "sqrt_cpm", + args: [normalization_id: "sqrt_cpm", n_cp: 1000000] + ), + l1_sqrt.run( + key: "l1_sqrt", + args: [normalization_id: "l1_sqrt"] + ), + log_scran_pooling.run( + key: "log_scran_pooling", + args: [normalization_id: "log_scran_pooling"] + ) + ] + + output_ch = input_ch + + // store original id for later use + | map{ id, state -> + [id, state + [_meta: [join_id: id]]] + } + + | decompress_gzip.run( + fromState: ["input": "input"], + toState: ["input_decompressed": "output"] + ) + + // process neurips downloaded dataset + | openproblems_neurips2021_bmmc.run( + fromState: [ + "input": "input_decompressed", + "mod1": "mod1", + "mod2": "mod2", + "dataset_id": "id", + "dataset_name": "dataset_name", + "dataset_url": "dataset_url", + "dataset_reference": "dataset_reference", + "dataset_summary": "dataset_summary", + "dataset_description": "dataset_description", + "dataset_organism": "dataset_organism" + ], + toState: [ + "raw_mod1": "output_mod1", + "raw_mod2": "output_mod2" + ] + ) + + // subsample if need be + | subsample.run( + runIf: { id, state -> state.do_subsample }, + fromState: [ + "input": "raw_mod1", + "input_mod2": "raw_mod2", + "n_obs": "n_obs", + "n_vars": "n_vars", + "keep_features": "keep_features", + "keep_cell_type_categories": "keep_cell_type_categories", + "keep_batch_categories": "keep_batch_categories", + "even": "even", + "seed": "seed" + ], + toState: [ + "raw_mod1": "output", + "raw_mod2": "output_mod2" + ] + ) + + // run mod1 normalization methods + | runEach( + components: normalization_methods, + id: { id, state, comp -> + if (state.normalization_methods.size() > 1) { + id + "/" + comp.name + } else { + id + } + }, + filter: { id, state, comp -> + comp.name in state.normalization_methods + }, + fromState: ["input": "raw_mod1"], + toState: { id, output, state, comp -> + state + [ + "normalization_id": comp.name, + "normalized_mod1": 
output.output + ] + } + ) + + // run normalization methods on second modality + // TODO: can we change this to DSB? + | prot_clr.run( + runIf: { id, state -> state.mod2 == "ADT" }, + args: [normalization_id: "prot_clr"], + fromState: ["input": "raw_mod2"], + toState: ["normalized_mod2": "output"] + ) + | atac_tfidf.run( + runIf: { id, state -> state.mod2 == "ATAC" }, + args: [normalization_id: "atac_tfidf"], + fromState: ["input": "raw_mod2"], + toState: ["normalized_mod2": "output"] + ) + + | svd.run( + fromState: [ + "input": "normalized_mod1", + "input_mod2": "normalized_mod2" + ], + toState: [ + "svd_mod1": "output", + "svd_mod2": "output_mod2" + ] + ) + + | hvg.run( + fromState: [ "input": "svd_mod1" ], + toState: [ "hvg_mod1": "output" ] + ) + + | hvg.run( + key: "hvg_mod2", + fromState: [ "input": "svd_mod2" ], + toState: [ "hvg_mod2": "output" ] + ) + + // add synonyms + | map{ id, state -> + [id, state + ["output_mod1": state.hvg_mod1, "output_mod2": state.hvg_mod2]] + } + + | extract_metadata.run( + key: "extract_metadata_mod1", + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "output_mod1") + // workaround: convert GString to String + schema = iterateMap(schema, { it instanceof GString ? it.toString() : it }) + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.output_mod1, + "schema": schemaYaml + ] + }, + toState: ["output_meta_mod1": "output"] + ) + + | extract_metadata.run( + key: "extract_metadata_mod2", + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "output_mod2") + // workaround: convert GString to String + schema = iterateMap(schema, { it instanceof GString ? 
it.toString() : it }) + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.output_mod2, + "schema": schemaYaml + ] + }, + toState: ["output_meta_mod2": "output"] + ) + + // only output the files for which an output file was specified + | setState([ + "output_mod1", + "output_mod2", + "output_meta_mod1", + "output_meta_mod2", + "_meta" + ]) + + emit: + output_ch +} diff --git a/src/datasets/workflows/process_openproblems_neurips2022_pbmc/config.vsh.yaml b/src/datasets/workflows/process_openproblems_neurips2022_pbmc/config.vsh.yaml new file mode 100644 index 0000000000..96bcc3ee2c --- /dev/null +++ b/src/datasets/workflows/process_openproblems_neurips2022_pbmc/config.vsh.yaml @@ -0,0 +1,143 @@ +functionality: + name: process_openproblems_neurips2022_pbmc + namespace: datasets/workflows + description: | + Fetch and process Neurips 2022 multimodal datasets + argument_groups: + - name: Inputs + arguments: + - name: "--id" + type: "string" + description: "The ID of the dataset" + required: true + - name: "--input_mod1" + type: file + description: "Processed RNA h5ad file" + required: true + example: cite_rna_merged.h5ad + - name: "--input_mod2" + type: file + description: "Processed ADT or ATAC h5ad file" + required: true + example: cite_prot_merged.h5ad + - name: "--mod1" + type: string + description: Name of the first modality. + required: true + example: GEX + - name: "--mod2" + type: string + description: Name of the second modality. + required: true + example: ADT + - name: Metadata + arguments: + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. 
+ required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + - name: Sampling options + arguments: + - name: "--do_subsample" + type: boolean + default: false + description: "Whether or not to subsample the dataset" + - name: "--n_obs" + type: integer + description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + - name: "--n_vars" + type: integer + description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + - name: "--keep_features" + type: string + multiple: true + description: A list of genes to keep. + - name: "--keep_cell_type_categories" + type: "string" + multiple: true + description: "Categories indexes to be selected" + required: false + - name: "--keep_batch_categories" + type: "string" + multiple: true + description: "Categories indexes to be selected" + required: false + - name: "--even" + type: "boolean_true" + description: Subsample evenly from different batches + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." + example: 123 + - name: Normalization + arguments: + - name: "--normalization_methods" + type: string + multiple: true + choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] + default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] + description: "Which normalization methods to run." 
+ - name: Outputs + arguments: + - name: "--output_mod1" + direction: "output" + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: "--output_mod2" + direction: "output" + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: "--output_meta_mod1" + direction: "output" + type: file + description: "Dataset metadata" + example: "dataset_metadata_mod1.yaml" + - name: "--output_meta_mod2" + direction: "output" + type: file + description: "Dataset metadata" + example: "dataset_metadata_mod2.yaml" + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: datasets/loaders/openproblems_neurips2022_pbmc + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/normalization/prot_clr + - name: datasets/normalization/atac_tfidf + - name: datasets/processors/subsample + - name: datasets/processors/svd + - name: datasets/processors/hvg + - name: common/extract_metadata + - name: common/decompress_gzip + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +platforms: + - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf b/src/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf new file mode 100644 index 0000000000..834d52bf63 --- /dev/null +++ b/src/datasets/workflows/process_openproblems_neurips2022_pbmc/main.nf @@ -0,0 +1,192 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow run_wf { + take: + input_ch + + main: + + // create different normalization methods by overriding the defaults + normalization_methods = [ + log_cp.run( + key: "log_cp10k", + args: [normalization_id: "log_cp10k", n_cp: 10000] + ), + log_cp.run( + key: "log_cpm", + args: [normalization_id: "log_cpm", n_cp: 1000000] + ), + sqrt_cp.run( + 
key: "sqrt_cp10k", + args: [normalization_id: "sqrt_cp10k", n_cp: 10000] + ), + sqrt_cp.run( + key: "sqrt_cpm", + args: [normalization_id: "sqrt_cpm", n_cp: 1000000] + ), + l1_sqrt.run( + key: "l1_sqrt", + args: [normalization_id: "l1_sqrt"] + ), + log_scran_pooling.run( + key: "log_scran_pooling", + args: [normalization_id: "log_scran_pooling"] + ) + ] + + output_ch = input_ch + + // store original id for later use + | map{ id, state -> + [id, state + [_meta: [join_id: id]]] + } + + // process neurips downloaded dataset + | openproblems_neurips2022_pbmc.run( + fromState: [ + "input_mod1": "input_mod1", + "input_mod2": "input_mod2", + "mod1": "mod1", + "mod2": "mod2", + "dataset_id": "id", + "dataset_name": "dataset_name", + "dataset_url": "dataset_url", + "dataset_reference": "dataset_reference", + "dataset_summary": "dataset_summary", + "dataset_description": "dataset_description", + "dataset_organism": "dataset_organism" + ], + toState: [ + "raw_mod1": "output_mod1", + "raw_mod2": "output_mod2" + ] + ) + + // subsample if need be + | subsample.run( + runIf: { id, state -> state.do_subsample }, + fromState: [ + "input": "raw_mod1", + "input_mod2": "raw_mod2", + "n_obs": "n_obs", + "n_vars": "n_vars", + "keep_features": "keep_features", + "keep_cell_type_categories": "keep_cell_type_categories", + "keep_batch_categories": "keep_batch_categories", + "even": "even", + "seed": "seed" + ], + toState: [ + "raw_mod1": "output", + "raw_mod2": "output_mod2" + ] + ) + + // run mod1 normalization methods + | runEach( + components: normalization_methods, + id: { id, state, comp -> + if (state.normalization_methods.size() > 1) { + id + "/" + comp.name + } else { + id + } + }, + filter: { id, state, comp -> + comp.name in state.normalization_methods + }, + fromState: ["input": "raw_mod1"], + toState: { id, output, state, comp -> + state + [ + "normalization_id": comp.name, + "normalized_mod1": output.output + ] + } + ) + + // run normalization methods on second modality + // 
TODO: can we change this to DSB? + | prot_clr.run( + runIf: { id, state -> state.mod2 == "ADT" }, + args: [normalization_id: "prot_clr"], + fromState: ["input": "raw_mod2"], + toState: ["normalized_mod2": "output"] + ) + | atac_tfidf.run( + runIf: { id, state -> state.mod2 == "ATAC" }, + args: [normalization_id: "atac_tfidf"], + fromState: ["input": "raw_mod2"], + toState: ["normalized_mod2": "output"] + ) + + | svd.run( + fromState: [ + "input": "normalized_mod1", + "input_mod2": "normalized_mod2" + ], + toState: [ + "svd_mod1": "output", + "svd_mod2": "output_mod2" + ] + ) + + | hvg.run( + fromState: [ "input": "svd_mod1" ], + toState: [ "hvg_mod1": "output" ] + ) + + | hvg.run( + key: "hvg_mod2", + fromState: [ "input": "svd_mod2" ], + toState: [ "hvg_mod2": "output" ] + ) + + // add synonyms + | map{ id, state -> + [id, state + ["output_mod1": state.hvg_mod1, "output_mod2": state.hvg_mod2]] + } + + | extract_metadata.run( + key: "extract_metadata_mod1", + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "output_mod1") + // workaround: convert GString to String + schema = iterateMap(schema, { it instanceof GString ? it.toString() : it }) + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.output_mod1, + "schema": schemaYaml + ] + }, + toState: ["output_meta_mod1": "output"] + ) + + | extract_metadata.run( + key: "extract_metadata_mod2", + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "output_mod2") + // workaround: convert GString to String + schema = iterateMap(schema, { it instanceof GString ? 
it.toString() : it }) + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.output_mod2, + "schema": schemaYaml + ] + }, + toState: ["output_meta_mod2": "output"] + ) + + // only output the files for which an output file was specified + | setState([ + "output_mod1", + "output_mod2", + "output_meta_mod1", + "output_meta_mod2", + "_meta" + ]) + + emit: + output_ch +} diff --git a/src/datasets/workflows/process_openproblems_v1/config.vsh.yaml b/src/datasets/workflows/process_openproblems_v1/config.vsh.yaml new file mode 100644 index 0000000000..fb0cd73a65 --- /dev/null +++ b/src/datasets/workflows/process_openproblems_v1/config.vsh.yaml @@ -0,0 +1,163 @@ +functionality: + name: process_openproblems_v1 + namespace: datasets/workflows + description: | + Fetch and process legacy OpenProblems v1 datasets + argument_groups: + - name: Inputs + arguments: + - name: "--id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--input_id" + type: "string" + description: "The ID of the dataset in OpenProblems v1" + required: true + - name: "--obs_cell_type" + type: "string" + description: "Location of where to find the observation cell types." + - name: "--obs_batch" + type: "string" + description: "Location of where to find the observation batch IDs." + - name: "--obs_tissue" + type: "string" + description: "Location of where to find the observation tissue information." + - name: "--layer_counts" + type: "string" + description: "In which layer to find the counts matrix. Leave undefined to use `.X`." + example: counts + - name: "--sparse" + type: boolean + default: true + description: Convert layers to a sparse CSR format. + - name: "--var_feature_id" + type: "string" + description: "Location of where to find the feature IDs. Can be set to index if the feature IDs are the index." 
+ example: gene_ids + - name: "--var_feature_name" + type: "string" + description: "Location of where to find the feature names. Can be set to index if the feature names are the index." + default: index + - name: Metadata + arguments: + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + - name: Sampling options + arguments: + - name: "--do_subsample" + type: boolean + default: false + description: "Whether or not to subsample the dataset" + - name: "--n_obs" + type: integer + description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + - name: "--n_vars" + type: integer + description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + - name: "--keep_features" + type: string + multiple: true + description: A list of genes to keep. + - name: "--keep_cell_type_categories" + type: "string" + multiple: true + description: "Categories indexes to be selected" + required: false + - name: "--keep_batch_categories" + type: "string" + multiple: true + description: "Categories indexes to be selected" + required: false + - name: "--even" + type: "boolean_true" + description: Subsample evenly from different batches + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." 
+ example: 123 + - name: Normalization + arguments: + - name: "--normalization_methods" + type: string + multiple: true + choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] + default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] + description: "Which normalization methods to run." + - name: Outputs + arguments: + - name: "--output_dataset" + __merge__: /src/datasets/api/file_common_dataset.yaml + direction: "output" + required: true + - name: "--output_meta" + direction: "output" + type: file + description: "Dataset metadata" + default: "dataset_metadata.yaml" + - name: "--output_raw" + __merge__: /src/datasets/api/file_raw.yaml + direction: "output" + required: false + - name: "--output_normalized" + __merge__: /src/datasets/api/file_normalized.yaml + direction: "output" + required: false + - name: "--output_pca" + __merge__: /src/datasets/api/file_pca.yaml + direction: "output" + required: false + - name: "--output_hvg" + __merge__: /src/datasets/api/file_hvg.yaml + direction: "output" + required: false + - name: "--output_knn" + __merge__: /src/datasets/api/file_knn.yaml + direction: "output" + required: false + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: datasets/loaders/openproblems_v1 + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: datasets/processors/pca + - name: datasets/processors/hvg + - name: datasets/processors/knn + - name: common/extract_metadata + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +platforms: + - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_v1/main.nf b/src/datasets/workflows/process_openproblems_v1/main.nf new file mode 100644 index 
0000000000..ad57d63029 --- /dev/null +++ b/src/datasets/workflows/process_openproblems_v1/main.nf @@ -0,0 +1,158 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // create different normalization methods by overriding the defaults + normalization_methods = [ + log_cp.run( + key: "log_cp10k", + args: [normalization_id: "log_cp10k", n_cp: 10000], + ), + log_cp.run( + key: "log_cpm", + args: [normalization_id: "log_cpm", n_cp: 1000000], + ), + sqrt_cp.run( + key: "sqrt_cp10k", + args: [normalization_id: "sqrt_cp10k", n_cp: 10000], + ), + sqrt_cp.run( + key: "sqrt_cpm", + args: [normalization_id: "sqrt_cpm", n_cp: 1000000], + ), + l1_sqrt.run( + key: "l1_sqrt", + args: [normalization_id: "l1_sqrt"], + ), + log_scran_pooling.run( + key: "log_scran_pooling", + args: [normalization_id: "log_scran_pooling"], + ) + ] + + output_ch = input_ch + + // store original id for later use + | map{ id, state -> + [id, state + [_meta: [join_id: id]]] + } + + // fetch data from legacy openproblems + | openproblems_v1.run( + fromState: [ + "input_id": "input_id", + "obs_cell_type": "obs_cell_type", + "obs_batch": "obs_batch", + "obs_tissue": "obs_tissue", + "layer_counts": "layer_counts", + "sparse": "sparse", + "dataset_id": "id", + "dataset_name": "dataset_name", + "dataset_url": "dataset_url", + "dataset_reference": "dataset_reference", + "dataset_summary": "dataset_summary", + "dataset_description": "dataset_description", + "dataset_organism": "dataset_organism", + ], + toState: ["output_raw": "output"] + ) + + // subsample if so desired + | subsample.run( + runIf: { id, state -> state.do_subsample }, + fromState: [ + "input": "output_raw", + "n_obs": "n_obs", + "n_vars": "n_vars", + "keep_features": "keep_features", + "keep_cell_type_categories": "keep_cell_type_categories", + 
"keep_batch_categories": "keep_batch_categories", + "even": "even", + "seed": "seed" + ], + args: [output_mod2: null], + toState: ["output_raw": "output"] + ) + + | runEach( + components: normalization_methods, + id: { id, state, comp -> + if (state.normalization_methods.size() > 1) { + id + "/" + comp.name + } else { + id + } + }, + filter: { id, state, comp -> + comp.name in state.normalization_methods + }, + fromState: ["input": "output_raw"], + toState: { id, output, state, comp -> + state + [ + output_normalized: output.output, + normalization_id: comp.name + ] + } + ) + + | hvg.run( + fromState: ["input": "output_normalized"], + toState: ["output_hvg": "output"] + ) + + | pca.run( + fromState: ["input": "output_hvg"], + toState: ["output_pca": "output" ] + ) + + | knn.run( + fromState: ["input": "output_pca"], + toState: ["output_knn": "output"] + ) + + // add synonym + | map{ id, state -> + [id, state + [output_dataset: state.output_knn]] + } + + | extract_metadata.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "output_dataset") + // workaround: convert GString to String + schema = iterateMap(schema, { it instanceof GString ? 
it.toString() : it }) + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.output_dataset, + "schema": schemaYaml + ] + }, + toState: ["output_meta": "output"] + ) + + // only output the files for which an output file was specified + | setState([ + "output_dataset", + "output_meta", + "output_raw", + "output_normalized", + "output_pca", + "output_hvg", + "output_knn", + "_meta" + ]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/datasets/workflows/process_openproblems_v1_multimodal/config.vsh.yaml b/src/datasets/workflows/process_openproblems_v1_multimodal/config.vsh.yaml new file mode 100644 index 0000000000..58b045cc3b --- /dev/null +++ b/src/datasets/workflows/process_openproblems_v1_multimodal/config.vsh.yaml @@ -0,0 +1,161 @@ +functionality: + name: process_openproblems_v1_multimodal + namespace: datasets/workflows + description: | + Fetch and process legacy OpenProblems v1 multimodal datasets + argument_groups: + - name: Inputs + arguments: + - name: "--id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--input_id" + type: "string" + description: "The ID of the dataset in OpenProblems v1" + required: true + - name: "--obs_cell_type" + type: "string" + description: "Location of where to find the observation cell types." + - name: "--obs_batch" + type: "string" + description: "Location of where to find the observation batch IDs." + - name: "--obs_tissue" + type: "string" + description: "Location of where to find the observation tissue information." + - name: "--layer_counts" + type: "string" + description: "In which layer to find the counts matrix. Leave undefined to use `.X`." + example: counts + - name: "--sparse" + type: boolean + default: true + description: Convert layers to a sparse CSR format. + - name: "--var_feature_id" + type: "string" + description: "Location of where to find the feature IDs. 
Can be set to index if the feature IDs are the index." + example: gene_ids + - name: "--var_feature_name" + type: "string" + description: "Location of where to find the feature names. Can be set to index if the feature names are the index." + default: index + - name: "--mod1" + type: string + description: Name of the first modality. + required: true + example: GEX + - name: "--mod2" + type: string + description: Name of the second modality. + required: true + example: ADT + - name: Metadata + arguments: + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + - name: Sampling options + arguments: + - name: "--do_subsample" + type: boolean + default: false + description: "Whether or not to subsample the dataset" + - name: "--n_obs" + type: integer + description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + - name: "--n_vars" + type: integer + description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + - name: "--keep_features" + type: string + multiple: true + description: A list of genes to keep. 
+ - name: "--keep_cell_type_categories" + type: "string" + multiple: true + description: "Categories indexes to be selected" + required: false + - name: "--keep_batch_categories" + type: "string" + multiple: true + description: "Categories indexes to be selected" + required: false + - name: "--even" + type: "boolean_true" + description: Subsample evenly from different batches + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." + example: 123 + - name: Normalization + arguments: + - name: "--normalization_methods" + type: string + multiple: true + choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] + default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] + description: "Which normalization methods to run." + - name: Outputs + arguments: + - name: "--output_mod1" + direction: "output" + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: "--output_mod2" + direction: "output" + __merge__: /src/datasets/api/file_multimodal_dataset.yaml + - name: "--output_meta_mod1" + direction: "output" + type: file + description: "Dataset metadata" + example: "dataset_metadata_mod1.yaml" + - name: "--output_meta_mod2" + direction: "output" + type: file + description: "Dataset metadata" + example: "dataset_metadata_mod2.yaml" + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: datasets/loaders/openproblems_v1_multimodal + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/normalization/prot_clr + - name: datasets/normalization/atac_tfidf + - name: datasets/processors/subsample + - name: datasets/processors/svd + - name: datasets/processors/hvg + - name: common/extract_metadata + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf 
+platforms: + - type: nextflow diff --git a/src/datasets/workflows/process_openproblems_v1_multimodal/main.nf b/src/datasets/workflows/process_openproblems_v1_multimodal/main.nf new file mode 100644 index 0000000000..96d37d6182 --- /dev/null +++ b/src/datasets/workflows/process_openproblems_v1_multimodal/main.nf @@ -0,0 +1,204 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // create different normalization methods by overriding the defaults + normalization_methods = [ + log_cp.run( + key: "log_cp10k", + args: [normalization_id: "log_cp10k", n_cp: 10000] + ), + log_cp.run( + key: "log_cpm", + args: [normalization_id: "log_cpm", n_cp: 1000000] + ), + sqrt_cp.run( + key: "sqrt_cp10k", + args: [normalization_id: "sqrt_cp10k", n_cp: 10000] + ), + sqrt_cp.run( + key: "sqrt_cpm", + args: [normalization_id: "sqrt_cpm", n_cp: 1000000] + ), + l1_sqrt.run( + key: "l1_sqrt", + args: [normalization_id: "l1_sqrt"] + ), + log_scran_pooling.run( + key: "log_scran_pooling", + args: [normalization_id: "log_scran_pooling"] + ) + ] + + output_ch = input_ch + + // store original id for later use + | map{ id, state -> + [id, state + [_meta: [join_id: id]]] + } + + // fetch data from legacy openproblems + | openproblems_v1_multimodal.run( + fromState: [ + "input_id": "input_id", + "obs_cell_type": "obs_cell_type", + "obs_batch": "obs_batch", + "obs_tissue": "obs_tissue", + "layer_counts": "layer_counts", + "sparse": "sparse", + "dataset_id": "id", + "dataset_name": "dataset_name", + "dataset_url": "dataset_url", + "dataset_reference": "dataset_reference", + "dataset_summary": "dataset_summary", + "dataset_description": "dataset_description", + "dataset_organism": "dataset_organism" + ], + toState: [ + "raw_mod1": "output_mod1", + "raw_mod2": "output_mod2" + ] + ) + + // subsample if need be + | 
subsample.run( + runIf: { id, state -> state.do_subsample }, + fromState: [ + "input": "raw_mod1", + "input_mod2": "raw_mod2", + "n_obs": "n_obs", + "n_vars": "n_vars", + "keep_features": "keep_features", + "keep_cell_type_categories": "keep_cell_type_categories", + "keep_batch_categories": "keep_batch_categories", + "even": "even", + "seed": "seed" + ], + toState: [ + "raw_mod1": "output", + "raw_mod2": "output_mod2" + ] + ) + + // run normalization methods + | runEach( + components: normalization_methods, + id: { id, state, comp -> + if (state.normalization_methods.size() > 1) { + id + "/" + comp.name + } else { + id + } + }, + filter: { id, state, comp -> + comp.name in state.normalization_methods + }, + fromState: ["input": "raw_mod1"], + toState: { id, output, state, comp -> + state + [ + "normalization_id": comp.name, + "normalized_mod1": output.output + ] + } + ) + + // run normalization methods on second modality + // TODO: can we change this to DSB? + | prot_clr.run( + runIf: { id, state -> state.mod2 == "ADT" }, + args: [normalization_id: "prot_clr"], + fromState: ["input": "raw_mod2"], + toState: ["normalized_mod2": "output"] + ) + | atac_tfidf.run( + runIf: { id, state -> state.mod2 == "ATAC" }, + args: [normalization_id: "atac_tfidf"], + fromState: ["input": "raw_mod2"], + toState: ["normalized_mod2": "output"] + ) + + | svd.run( + fromState: [ + "input": "normalized_mod1", + "input_mod2": "normalized_mod2" + ], + toState: [ + "svd_mod1": "output", + "svd_mod2": "output_mod2" + ] + ) + + | hvg.run( + fromState: [ "input": "svd_mod1" ], + toState: [ "hvg_mod1": "output" ] + ) + + | hvg.run( + key: "hvg_mod2", + fromState: [ "input": "svd_mod2" ], + toState: [ "hvg_mod2": "output" ] + ) + + // add synonyms + | map{ id, state -> + [id, state + [ + "output_mod1": state.hvg_mod1, + "output_mod2": state.hvg_mod2 + ]] + } + + | extract_metadata.run( + key: "extract_metadata_mod1", + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, 
"output_mod1") + // workaround: convert GString to String + schema = iterateMap(schema, { it instanceof GString ? it.toString() : it }) + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.output_mod1, + "schema": schemaYaml + ] + }, + toState: ["output_meta_mod1": "output"] + ) + + | extract_metadata.run( + key: "extract_metadata_mod2", + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "output_mod2") + // workaround: convert GString to String + schema = iterateMap(schema, { it instanceof GString ? it.toString() : it }) + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.output_mod2, + "schema": schemaYaml + ] + }, + toState: ["output_meta_mod2": "output"] + ) + + // only output the files for which an output file was specified + | setState([ + "output_mod1", + "output_mod2", + "output_meta_mod1", + "output_meta_mod2", + "_meta" + ]) + + emit: + output_ch +} diff --git a/src/datasets/workflows/process_tenx_visium/config.vsh.yaml b/src/datasets/workflows/process_tenx_visium/config.vsh.yaml new file mode 100644 index 0000000000..91a2867820 --- /dev/null +++ b/src/datasets/workflows/process_tenx_visium/config.vsh.yaml @@ -0,0 +1,142 @@ +functionality: + name: process_tenx_visium + namespace: datasets/workflows + description: | + Download and process datasets originating from 10x Genomics. + argument_groups: + - name: Input + arguments: + - name: "--input_expression" + type: string + description: URL to the feature / barcode matrix HDF5. + required: true + - name: "--input_spatial" + type: string + description: URL to the Spatial imaging data. 
+ required: true + - name: Outputs + arguments: + - name: "--output_dataset" + type: file + direction: output + description: Output h5ad file + required: true + __merge__: /src/datasets/api/file_raw.yaml + - name: "--output_meta" + direction: "output" + type: file + description: "Dataset metadata" + default: "dataset_metadata.yaml" + - name: Metadata + arguments: + - name: "--id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: "--spot_filter_min_genes" + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: "--spot_filter_min_counts" + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: "--gene_filter_min_spots" + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + example: 50 + - name: "--gene_filter_min_counts" + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: "--remove_mitochondrial" + type: boolean + description: Remove mitochondrial genes?
+ required: false + - name: Sampling options + arguments: + - name: "--do_subsample" + type: boolean + default: false + description: "Whether or not to subsample the dataset" + - name: "--n_obs" + type: integer + description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + - name: "--n_vars" + type: integer + description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + # - name: "--keep_features" + # type: string + # multiple: true + # description: A list of genes to keep. + # - name: "--keep_cell_type_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--keep_batch_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--even" + # type: "boolean_true" + # description: Subsample evenly from different batches + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." + example: 123 + - name: Normalization + arguments: + - name: "--normalization_methods" + type: string + multiple: true + choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] + default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] + description: "Which normalization methods to run." 
+ resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: datasets/loaders/tenx_visium + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: common/extract_metadata +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/datasets/workflows/process_tenx_visium/main.nf b/src/datasets/workflows/process_tenx_visium/main.nf new file mode 100644 index 0000000000..2ec0eae247 --- /dev/null +++ b/src/datasets/workflows/process_tenx_visium/main.nf @@ -0,0 +1,133 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // create different normalization methods by overriding the defaults + normalization_methods = [ + log_cp.run( + key: "log_cp10k", + args: [normalization_id: "log_cp10k", n_cp: 10000], + ), + log_cp.run( + key: "log_cpm", + args: [normalization_id: "log_cpm", n_cp: 1000000], + ), + sqrt_cp.run( + key: "sqrt_cp10k", + args: [normalization_id: "sqrt_cp10k", n_cp: 10000], + ), + sqrt_cp.run( + key: "sqrt_cpm", + args: [normalization_id: "sqrt_cpm", n_cp: 1000000], + ), + l1_sqrt.run( + key: "l1_sqrt", + args: [normalization_id: "l1_sqrt"], + ), + log_scran_pooling.run( + key: "log_scran_pooling", + args: [normalization_id: "log_scran_pooling"], + ) + ] + + output_ch = input_ch + + // store original id for later use + | map{ id, state -> + [id, state + [_meta: [join_id: id]]] + } + + // fetch data from legacy openproblems + | tenx_visium.run( + fromState: [ + "input_expression": "input_expression", + "input_spatial": "input_spatial", + "dataset_id": "id", + "dataset_name": "dataset_name", + "dataset_url": "dataset_url", + 
"dataset_reference": "dataset_reference", + "dataset_summary": "dataset_summary", + "dataset_description": "dataset_description", + "dataset_organism": "dataset_organism", + "spot_filter_min_genes": "spot_filter_min_genes", + "gene_filter_min_spots": "gene_filter_min_spots", + "remove_mitochondrial": "remove_mitochondrial" + ], + toState: ["output_raw": "dataset"] + ) + + // subsample if so desired + | subsample.run( + runIf: { id, state -> state.do_subsample }, + fromState: [ + "input": "output_raw", + "n_obs": "n_obs", + "n_vars": "n_vars", + "seed": "seed" + ], + args: [output_mod2: null], + toState: ["output_raw": "output"] + ) + + | runEach( + components: normalization_methods, + id: { id, state, comp -> + if (state.normalization_methods.size() > 1) { + id + "/" + comp.name + } else { + id + } + }, + filter: { id, state, comp -> + comp.name in state.normalization_methods + }, + fromState: ["input": "output_raw"], + toState: { id, output, state, comp -> + state + [ + output_normalized: output.output, + normalization_id: comp.name + ] + } + ) + + // add synonym + | map{ id, state -> + [id, state + [output_dataset: state.output_normalized]] + } + + | extract_metadata.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "output_dataset") + // workaround: convert GString to String + schema = iterateMap(schema, { it instanceof GString ? 
it.toString() : it }) + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.output_dataset, + "schema": schemaYaml + ] + }, + toState: ["output_meta": "output"] + ) + + // only output the files for which an output file was specified + | setState([ + "output_dataset", + "output_meta", + "_meta" + ]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml b/src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml new file mode 100644 index 0000000000..45b938b716 --- /dev/null +++ b/src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml @@ -0,0 +1,138 @@ +functionality: + name: process_zenodo_spatial + namespace: datasets/workflows + description: | + Download and process DBiT seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo. + argument_groups: + - name: Input + arguments: + - name: "--input_data" + type: string + description: URL to the Anndata file. + required: true + - name: Outputs + arguments: + - name: "--output_dataset" + type: file + direction: output + description: Output h5ad file + required: true + __merge__: /src/datasets/api/file_raw.yaml + - name: "--output_meta" + direction: "output" + type: file + description: "Dataset metadata" + default: "dataset_metadata.yaml" + - name: Metadata + arguments: + - name: "--id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. 
+ required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: "--spot_filter_min_genes" + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: "--spot_filter_min_counts" + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: "--gene_filter_min_spots" + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + example: 50 + - name: "--gene_filter_min_counts" + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: "--remove_mitochondrial" + type: boolean + description: Remove mitochondrial genes? + required: false + - name: Sampling options + arguments: + - name: "--do_subsample" + type: boolean + default: false + description: "Whether or not to subsample the dataset" + - name: "--n_obs" + type: integer + description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. + default: 600 + - name: "--n_vars" + type: integer + description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + # - name: "--keep_features" + # type: string + # multiple: true + # description: A list of genes to keep. 
+ # - name: "--keep_cell_type_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--keep_batch_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--even" + # type: "boolean_true" + # description: Subsample evenly from different batches + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." + example: 123 + - name: Normalization + arguments: + - name: "--normalization_methods" + type: string + multiple: true + choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] + default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] + description: "Which normalization methods to run." + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: datasets/loaders/zenodo_spatial + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: common/extract_metadata +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/datasets/workflows/process_zenodo_spatial/main.nf b/src/datasets/workflows/process_zenodo_spatial/main.nf new file mode 100644 index 0000000000..a5893c0ab4 --- /dev/null +++ b/src/datasets/workflows/process_zenodo_spatial/main.nf @@ -0,0 +1,132 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // create different normalization methods by overriding the defaults + normalization_methods = [ + log_cp.run( + key: "log_cp10k", + args: [normalization_id: "log_cp10k", n_cp: 10000], + ), + 
log_cp.run( + key: "log_cpm", + args: [normalization_id: "log_cpm", n_cp: 1000000], + ), + sqrt_cp.run( + key: "sqrt_cp10k", + args: [normalization_id: "sqrt_cp10k", n_cp: 10000], + ), + sqrt_cp.run( + key: "sqrt_cpm", + args: [normalization_id: "sqrt_cpm", n_cp: 1000000], + ), + l1_sqrt.run( + key: "l1_sqrt", + args: [normalization_id: "l1_sqrt"], + ), + log_scran_pooling.run( + key: "log_scran_pooling", + args: [normalization_id: "log_scran_pooling"], + ) + ] + + output_ch = input_ch + + // store original id for later use + | map{ id, state -> + [id, state + [_meta: [join_id: id]]] + } + + // fetch data from legacy openproblems + | zenodo_spatial.run( + fromState: [ + "input_data": "input_data", + "dataset_id": "id", + "dataset_name": "dataset_name", + "dataset_url": "dataset_url", + "dataset_reference": "dataset_reference", + "dataset_summary": "dataset_summary", + "dataset_description": "dataset_description", + "dataset_organism": "dataset_organism", + "spot_filter_min_genes": "spot_filter_min_genes", + "gene_filter_min_spots": "gene_filter_min_spots", + "remove_mitochondrial": "remove_mitochondrial" + ], + toState: ["output_raw": "dataset"] + ) + + // subsample if so desired + | subsample.run( + runIf: { id, state -> state.do_subsample }, + fromState: [ + "input": "output_raw", + "n_obs": "n_obs", + "n_vars": "n_vars", + "seed": "seed" + ], + args: [output_mod2: null], + toState: ["output_raw": "output"] + ) + + | runEach( + components: normalization_methods, + id: { id, state, comp -> + if (state.normalization_methods.size() > 1) { + id + "/" + comp.name + } else { + id + } + }, + filter: { id, state, comp -> + comp.name in state.normalization_methods + }, + fromState: ["input": "output_raw"], + toState: { id, output, state, comp -> + state + [ + output_normalized: output.output, + normalization_id: comp.name + ] + } + ) + + // add synonym + | map{ id, state -> + [id, state + [output_dataset: state.output_normalized]] + } + + | extract_metadata.run( + 
fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "output_dataset") + // workaround: convert GString to String + schema = iterateMap(schema, { it instanceof GString ? it.toString() : it }) + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.output_dataset, + "schema": schemaYaml + ] + }, + toState: ["output_meta": "output"] + ) + + // only output the files for which an output file was specified + | setState([ + "output_dataset", + "output_meta", + "_meta" + ]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml b/src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml new file mode 100644 index 0000000000..23934fe161 --- /dev/null +++ b/src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml @@ -0,0 +1,138 @@ +functionality: + name: process_zenodo_spatial_slidetags + namespace: datasets/workflows + description: | + Download and process slide tags datasets originating from Zenodo. + argument_groups: + - name: Input + arguments: + - name: "--input_data" + type: string + description: URL to the Anndata file. + required: true + - name: Outputs + arguments: + - name: "--output_dataset" + type: file + direction: output + description: Output h5ad file + required: true + __merge__: /src/datasets/api/file_raw.yaml + - name: "--output_meta" + direction: "output" + type: file + description: "Dataset metadata" + default: "dataset_metadata.yaml" + - name: Metadata + arguments: + - name: "--id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. 
+ required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: "--spot_filter_min_genes" + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: "--spot_filter_min_counts" + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: "--gene_filter_min_spots" + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + example: 50 + - name: "--gene_filter_min_counts" + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: "--remove_mitochondrial" + type: boolean + description: Remove mitochondrial genes? + required: false + - name: Sampling options + arguments: + - name: "--do_subsample" + type: boolean + default: false + description: "Whether or not to subsample the dataset" + - name: "--n_obs" + type: integer + description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. + default: 600 + - name: "--n_vars" + type: integer + description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + # - name: "--keep_features" + # type: string + # multiple: true + # description: A list of genes to keep. 
+ # - name: "--keep_cell_type_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--keep_batch_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--even" + # type: "boolean_true" + # description: Subsample evenly from different batches + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." + example: 123 + - name: Normalization + arguments: + - name: "--normalization_methods" + type: string + multiple: true + choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] + default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] + description: "Which normalization methods to run." + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: datasets/loaders/zenodo_spatial_slidetags + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: common/extract_metadata +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/datasets/workflows/process_zenodo_spatial_slidetags/main.nf b/src/datasets/workflows/process_zenodo_spatial_slidetags/main.nf new file mode 100644 index 0000000000..2bb6b9300a --- /dev/null +++ b/src/datasets/workflows/process_zenodo_spatial_slidetags/main.nf @@ -0,0 +1,132 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // create different normalization methods by overriding the defaults + normalization_methods = [ + log_cp.run( + key: "log_cp10k", + args: [normalization_id: 
"log_cp10k", n_cp: 10000], + ), + log_cp.run( + key: "log_cpm", + args: [normalization_id: "log_cpm", n_cp: 1000000], + ), + sqrt_cp.run( + key: "sqrt_cp10k", + args: [normalization_id: "sqrt_cp10k", n_cp: 10000], + ), + sqrt_cp.run( + key: "sqrt_cpm", + args: [normalization_id: "sqrt_cpm", n_cp: 1000000], + ), + l1_sqrt.run( + key: "l1_sqrt", + args: [normalization_id: "l1_sqrt"], + ), + log_scran_pooling.run( + key: "log_scran_pooling", + args: [normalization_id: "log_scran_pooling"], + ) + ] + + output_ch = input_ch + + // store original id for later use + | map{ id, state -> + [id, state + [_meta: [join_id: id]]] + } + + // fetch data from legacy openproblems + | zenodo_spatial_slidetags.run( + fromState: [ + "input_data": "input_data", + "dataset_id": "id", + "dataset_name": "dataset_name", + "dataset_url": "dataset_url", + "dataset_reference": "dataset_reference", + "dataset_summary": "dataset_summary", + "dataset_description": "dataset_description", + "dataset_organism": "dataset_organism", + "spot_filter_min_genes": "spot_filter_min_genes", + "gene_filter_min_spots": "gene_filter_min_spots", + "remove_mitochondrial": "remove_mitochondrial" + ], + toState: ["output_raw": "dataset"] + ) + + // subsample if so desired + | subsample.run( + runIf: { id, state -> state.do_subsample }, + fromState: [ + "input": "output_raw", + "n_obs": "n_obs", + "n_vars": "n_vars", + "seed": "seed" + ], + args: [output_mod2: null], + toState: ["output_raw": "output"] + ) + + | runEach( + components: normalization_methods, + id: { id, state, comp -> + if (state.normalization_methods.size() > 1) { + id + "/" + comp.name + } else { + id + } + }, + filter: { id, state, comp -> + comp.name in state.normalization_methods + }, + fromState: ["input": "output_raw"], + toState: { id, output, state, comp -> + state + [ + output_normalized: output.output, + normalization_id: comp.name + ] + } + ) + + // add synonym + | map{ id, state -> + [id, state + [output_dataset: 
state.output_normalized]] + } + + | extract_metadata.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "output_dataset") + // workaround: convert GString to String + schema = iterateMap(schema, { it instanceof GString ? it.toString() : it }) + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.output_dataset, + "schema": schemaYaml + ] + }, + toState: ["output_meta": "output"] + ) + + // only output the files for which an output file was specified + | setState([ + "output_dataset", + "output_meta", + "_meta" + ]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/migration/check_migration.sh b/src/migration/check_migration.sh new file mode 100644 index 0000000000..1ce39634f2 --- /dev/null +++ b/src/migration/check_migration.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# viash run src/common/get_git_sha/config.vsh.yaml -p native -- --input /home/kai/Documents/openroblems/openproblems --output output/op_git_sha.json + +TASK_IDS=`ls src/tasks` + +for task_id in $TASK_IDS; do + echo ">> Processing $task_id" + viash run src/common/get_method_info/config.vsh.yaml -- --input . --task_id $task_id --output output/${task_id}_method.json + viash run src/migration/check_migration_status/config.vsh.yaml -p native -- --git_sha resources_test/input_git_sha.json --comp_info output/${task_id}_method.json --output output/${task_id}_method_status.json + viash run src/common/get_metric_info/config.vsh.yaml -- --input . 
--task_id $task_id --output output/${task_id}_metric.json + viash run src/migration/check_migration_status/config.vsh.yaml -p native -- --git_sha resources_test/input_git_sha.json --comp_info output/${task_id}_metric.json --output output/${task_id}_metric_status.json + +done \ No newline at end of file diff --git a/src/migration/check_migration_status/config.vsh.yaml b/src/migration/check_migration_status/config.vsh.yaml new file mode 100644 index 0000000000..bd8107381c --- /dev/null +++ b/src/migration/check_migration_status/config.vsh.yaml @@ -0,0 +1,30 @@ +functionality: + name: "check_migration_status" + namespace: "migration" + description: "Check migration status" + arguments: + - name: "--git_sha" + type: "file" + example: git_sha.json + description: "a json with git sha info" + - name: "--comp_info" + type: "file" + example: comp_info.json + description: "a json with component info" + - name: "--output" + type: "file" + direction: "output" + default: "output.yaml" + description: "Output yaml file with migration status" + resources: + - type: python_script + path: script.py + test_resources: + - path: /resources_test + - type: python_script + path: test.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + - type: native diff --git a/src/migration/check_migration_status/script.py b/src/migration/check_migration_status/script.py new file mode 100644 index 0000000000..6e88b2d9ed --- /dev/null +++ b/src/migration/check_migration_status/script.py @@ -0,0 +1,60 @@ +import json +from typing import Dict, List + +## VIASH START +par = { + 'git_sha': 'resources_test/input_git_sha.json', + 'comp_info': 'output/denoising_metric.json', + 'output': 'output/denoising_metric_status.json' +} +## VIASH END + +def check_status(comp_item: List[Dict[str, str]], git_objects: List[Dict[str, str]]) -> str: + """Looks for the comp_item's matching git_object + based on the comp_item["v1"]["path"] and git_object["path"]. 
+ If found, checks whether the comp_item["v1_commit"] equals + git_object["sha"].""" + + v1_path = comp_item.get("v1", {}).get("path") + + if "metric_id" in comp_item: + v1_path = comp_item.get("v1.path") + + if not v1_path: + return "v1.path missing" + + v1_commit = comp_item.get("v1", {}).get("commit") + + if "metric_id" in comp_item: + v1_commit = comp_item.get("v1.commit") + + if not v1_commit: + return "v1.commit missing" + + git_object = [ obj for obj in git_objects if obj["path"] == v1_path ] + if not git_object: + return "v1.path does not exist in git repo" + + git_sha = git_object[0]["sha"] + if git_sha == v1_commit: + return "up to date" + else: + return f"out of date (sha: {git_sha})" + +with open(par['git_sha'], 'r') as f1: + git_objects = json.load(f1) + +with open(par['comp_info'], 'r') as f2: + comp_items = json.load(f2) + +output = [] +for comp_item in comp_items: + # get status + status = check_status(comp_item, git_objects) + + # store results + output.append(comp_item | {"status": status}) + +# write to file +with open(par['output'], 'w') as outf: + json.dump(output, outf, indent=2) diff --git a/src/migration/check_migration_status/test.py b/src/migration/check_migration_status/test.py new file mode 100644 index 0000000000..878a167215 --- /dev/null +++ b/src/migration/check_migration_status/test.py @@ -0,0 +1,34 @@ +import subprocess +from os import path +import json + +input_sha = meta["resources_dir"] + "/resources_test/common/task_metadata/input_git_sha.json" +input_method_info = meta["resources_dir"] + "/resources_test/common/task_metadata/method_info.json" +output_path = "output.json" + +cmd = [ + meta['executable'], + "--git_sha", input_sha, + "--comp_info", input_method_info, + "--output", output_path, +] + +print(">> Running script as test", flush=True) +out = subprocess.run(cmd, stderr=subprocess.STDOUT) + +if out.stdout: + print(out.stdout) + +if out.returncode: + print(f"script: '{cmd}' exited with an error.") + exit(out.returncode) + 
+print(">> Checking whether output file exists", flush=True) +assert path.exists(output_path), "Output does not exist" + +print(">> Reading json file", flush=True) +with open(output_path, 'r') as f: + out = json.load(f) + print(out) + +print("All checks succeeded!", flush=True) diff --git a/src/migration/list_git_shas/config.vsh.yaml b/src/migration/list_git_shas/config.vsh.yaml new file mode 100644 index 0000000000..c70366612a --- /dev/null +++ b/src/migration/list_git_shas/config.vsh.yaml @@ -0,0 +1,39 @@ +functionality: + name: list_git_shas + namespace: migration + description: "Extract git file info from a git repo" + arguments: + - name: --input + type: file + description: Path to a git repository + required: true + example: /path/to/repo + - name: --output + type: file + direction: output + description: | + A json containing a list of entries. Each entry must have the + following values: + + * "path" `string`: Path to a file in the repository + * "last_modified" `string`: Date of when the file was last modified, in `yyyy-mm-dd HH:mm:ss` format. + * "sha" `string`: Sha of the commit in which the file was last modified + * "history_sha" `string` (optional): A list of the SHAs of the commits in which the file was modified + required: true + example: output.json + - name: --show_history + type: boolean_true + description: Whether or not to include the full history of SHAs for each file. 
+ resources: + - type: python_script + path: script.py + test_resources: + - type: python_script + path: test.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + test_setup: + - type: docker + run: "git clone https://github.com/openproblems-bio/openproblems-v2.git" + - type: nextflow \ No newline at end of file diff --git a/src/migration/list_git_shas/script.py b/src/migration/list_git_shas/script.py new file mode 100644 index 0000000000..46c56990e8 --- /dev/null +++ b/src/migration/list_git_shas/script.py @@ -0,0 +1,61 @@ +import subprocess +import json + +## VIASH START +par = { + 'input': '.', + 'output': 'resources_test/input_git_sha.json', + 'show_history': True +} +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +#? to do: what to do with untracked files? + +output = [] + +def git_ls_files(directory): + cmd = ["git", "ls-files"] + cmd_out = subprocess.run(cmd, capture_output=True, text=True, cwd=directory).stdout + out = [ line for line in cmd_out.split("\n") if line != "" ] + return out + +def get_git_file_info(file, full_history=False): + # construct command + cmd = ["git", "log", "--no-merges", "--pretty=%H\t%ci"] + + if not full_history: + cmd.extend(["-n", "1"]) + + cmd.extend(["--", file]) + + # run command + out = subprocess.run(cmd, capture_output=True, text=True, cwd=par["input"]).stdout + + # split output + split = [line.split("\t") for line in out.split("\n") if "\t" in line] + + return split + +for relative_path in git_ls_files(par['input']): + + # get git file info + git_file_info = get_git_file_info(relative_path, full_history=par["show_history"]) + last = git_file_info[0] + out = { + "path": relative_path, + "last_modified": last[1], + "sha": last[0] + } + if par['show_history']: + out['history_sha'] = [info[0] for info in git_file_info] + + output.append(out) + +with open(par['output'], 'w') as f: + json.dump(output, f, indent=2) + + + diff --git a/src/migration/list_git_shas/test.py 
b/src/migration/list_git_shas/test.py new file mode 100644 index 0000000000..5b8d44906a --- /dev/null +++ b/src/migration/list_git_shas/test.py @@ -0,0 +1,32 @@ +import subprocess +from os import path +import json + +input_path = "/openproblems-v2" +output_path = "output.json" + +cmd = [ + meta['executable'], + "--input", input_path, + "--output", output_path +] + +print(">> Running script as test", flush=True) +out = subprocess.run(cmd, stderr=subprocess.STDOUT) + +if out.stdout: + print(out.stdout) + +if out.returncode: + print(f"script: '{cmd}' exited with an error.") + exit(out.returncode) + +print(">> Checking whether output file exists", flush=True) +assert path.exists(output_path), "Output path does not exist" + +print(">> Reading json file", flush=True) +with open(output_path, 'r') as f: + out = json.load(f) + print(out[0]) + +print("All checks succeeded!", flush=True) \ No newline at end of file diff --git a/src/migration/update_bibtex/config.vsh.yaml b/src/migration/update_bibtex/config.vsh.yaml new file mode 100644 index 0000000000..147e0b6c22 --- /dev/null +++ b/src/migration/update_bibtex/config.vsh.yaml @@ -0,0 +1,25 @@ +functionality: + name: update_bibtex + namespace: migration + arguments: + - name: --library + description: Path to bibtex file + type: file + default: src/common/library.bib + direction: output + - name: --library_v1 + description: Url of the v1 bibtex file + type: string + default: https://raw.githubusercontent.com/openproblems-bio/openproblems/main/main.bib + resources: + - type: python_script + path: script.py + test_resources: + - type: python_script + path: test.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: git+https://github.com/sciunto-org/python-bibtexparser@main diff --git a/src/migration/update_bibtex/script.py b/src/migration/update_bibtex/script.py new file mode 100644 index 0000000000..7d2b0d516e --- /dev/null +++ b/src/migration/update_bibtex/script.py @@ -0,0 
+1,41 @@ +import bibtexparser +from tempfile import NamedTemporaryFile +import urllib.request +import collections + +## VIASH START +par = { + 'library': 'src/common/library.bib', + 'library_v1': 'https://raw.githubusercontent.com/openproblems-bio/openproblems/main/main.bib' +} +## VIASH END + +# Load the BibTeX file +print(">> Read input bibtex file", flush=True) +bib_input = bibtexparser.parse_file(par["library"]) + +print(" Library keys: " + ', '.join(bib_input.entries_dict.keys()), flush=True) + +# Merge with v1 library +if par["library_v1"]: + print(">> Merge with v1 library", flush=True) + with NamedTemporaryFile("w", suffix=".bib") as tempfile: + _ = urllib.request.urlretrieve(par["library_v1"], tempfile.name) + bib_v1 = bibtexparser.parse_file(tempfile.name) + + print(" Library v1 keys: " + ', '.join(bib_v1.entries_dict.keys()), flush=True) + blocks = bib_input.blocks + bib_v1.blocks +else: + blocks = bib_input.blocks + +# Remove duplicates +print(">> Remove duplicates", flush=True) +unique_blocks = {block.key : block for block in blocks if not hasattr(block, "error")} +unique_blocks_sorted = collections.OrderedDict(sorted(unique_blocks.items())) +bib_new = bibtexparser.Library(unique_blocks_sorted.values()) + +print(" New keys: " + ', '.join(bib_new.entries_dict.keys()), flush=True) + +# Save to a new BibTeX file +print(">> Write to file", flush=True) +bibtexparser.write_file(par["library"], bib_new) diff --git a/src/migration/update_bibtex/test.py b/src/migration/update_bibtex/test.py new file mode 100644 index 0000000000..2104488fa2 --- /dev/null +++ b/src/migration/update_bibtex/test.py @@ -0,0 +1,62 @@ +import subprocess + +def test_with_duplicates(exec_path): + # Create a temporary file with duplicate entries + with open("test_with_duplicates.bib", mode="w") as file: + file.write("@article{duplicate,\n author = {Duplicate, A.},\n title = {Duplicate article},\n year = {2022},\n}\n@article{duplicate,\n author = {Duplicate, A.},\n title = {Duplicate 
article},\n year = {2022},\n}\n") + + # Test basic functionality without merging + result = subprocess.run( + [exec_path, "--library", "test_with_duplicates.bib", "--library_v1", ""], + capture_output=True, + check=True, + text=True + ) + print(result.stdout, flush=True) + + assert "Read input bibtex file" in result.stdout, "Reading input failed" + assert "Merge with v1 library" not in result.stdout, "Merge step ran although no v1 library was given" + assert "Remove duplicates" in result.stdout, "Duplicate removal failed" + assert "Write to file" in result.stdout, "Writing output failed" + + # Check the output file to make sure duplicates are removed + with open("test_with_duplicates.bib", "r") as f: + contents = f.read() + count = contents.count("article{") + assert count == 1, f"Count should be 1 but is {count}" + +def test_merge(exec_path, lib_v1_url): + # Create a library file with a single entry + with open("test_merge.bib", mode="w") as file: + file.write("@article{entry,\n author = {Duplicate, A.},\n title = {Duplicate article},\n year = {2022},\n}\n") + + # Test merging with the v1 library + result = subprocess.run( + [exec_path, "--library", "test_merge.bib", "--library_v1", lib_v1_url], + capture_output=True, + check=True, + text=True + ) + print(result.stdout, flush=True) + + assert "Read input bibtex file" in result.stdout, "Reading input failed" + assert "Merge with v1 library" in result.stdout, "Merging failed" + assert "Remove duplicates" in result.stdout, "Duplicate removal failed" + assert "Write to file" in result.stdout, "Writing output failed" + + # Check the output file to make sure the v1 entries were merged in + with open("test_merge.bib", "r") as f: + contents = f.read() + count = contents.count("article{") + + assert count > 1, f"Count should be greater than 1 but is {count}" + +test_merge( + exec_path=meta["executable"], + lib_v1_url="https://raw.githubusercontent.com/openproblems-bio/openproblems/main/main.bib" +) +test_with_duplicates( + 
exec_path=meta["executable"] +) + +print("All tests passed!", flush=True) \ No newline at end of file diff --git a/src/tasks/batch_integration/README.md b/src/tasks/batch_integration/README.md new file mode 100644 index 0000000000..7d525e9fc8 --- /dev/null +++ b/src/tasks/batch_integration/README.md @@ -0,0 +1,571 @@ +# Batch Integration + + +Remove unwanted batch effects from scRNA-seq data while retaining +biologically meaningful variation. + +Path: +[`src/tasks/batch_integration`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/tasks/batch_integration) + +## Motivation + +As single-cell technologies advance, single-cell datasets are growing +both in size and complexity. Especially in consortia such as the Human +Cell Atlas, individual studies combine data from multiple labs, each +sequencing multiple individuals possibly with different technologies. +This gives rise to complex batch effects in the data that must be +computationally removed to perform a joint analysis. These batch +integration methods must remove the batch effect while not removing +relevant biological information. Currently, over 200 tools exist that +aim to remove batch effects from scRNA-seq datasets \[@zappia2018exploring\]. +These methods balance the removal of batch effects with the conservation +of nuanced biological information in different ways. This abundance of +tools has complicated batch integration method choice, leading to +several benchmarks on this topic \[@luecken2020benchmarking; +@tran2020benchmark; @chazarragil2021flexible; @mereu2020benchmarking\]. +Yet, benchmarks use different metrics, method implementations and +datasets. Here we build a living benchmarking task for batch integration +methods with the vision of improving the consistency of method +evaluation. + +## Description + +In this task we evaluate batch integration methods on their ability to +remove batch effects in the data while conserving variation attributed +to biological effects. 
As input, methods require either normalised or +unnormalised data with multiple batches and consistent cell type labels. +The batch integrated output can be a feature matrix, a low dimensional +embedding and/or a neighbourhood graph. The respective batch-integrated +representation is then evaluated using sets of metrics that capture how +well batch effects are removed and whether biological variance is +conserved. We have based this particular task on the latest, and most +extensive benchmark of single-cell data integration methods. + +## Authors & contributors + +| name | roles | +|:------------------|:-------------------| +| Michaela Mueller | maintainer, author | +| Kai Waldrant | contributor | +| Robrecht Cannoodt | contributor | +| Daniel Strobl | author | + +## API + +``` mermaid +flowchart LR + file_common_dataset("Common Dataset") + comp_process_dataset[/"Data processor"/] + file_dataset("Dataset") + file_solution("Solution") + comp_control_method_embedding[/"Control method (embedding)"/] + comp_control_method_graaf[/"Control method (graph)"/] + comp_method_embedding[/"Method (embedding)"/] + comp_method_feature[/"Method (feature)"/] + comp_method_graaf[/"Method (graph)"/] + comp_metric_embedding[/"Metric (embedding)"/] + comp_metric_feature[/"Metric (feature)"/] + comp_metric_graaf[/"Metric (graph)"/] + file_integrated_embedding("Integrated embedding") + file_integrated_graaf("Integrated Graph") + file_integrated_feature("Integrated Feature") + file_score("Score") + comp_transformer_embedding_to_graaf[/"Embedding to Graph"/] + comp_transformer_feature_to_embedding[/"Feature to Embedding"/] + file_common_dataset---comp_process_dataset + comp_process_dataset-->file_dataset + comp_process_dataset-->file_solution + file_dataset---comp_control_method_embedding + file_dataset---comp_control_method_graaf + file_dataset---comp_method_embedding + file_dataset---comp_method_feature + file_dataset---comp_method_graaf + file_solution---comp_metric_embedding + 
file_solution---comp_metric_feature + file_solution---comp_metric_graaf + comp_control_method_embedding-->file_integrated_embedding + comp_control_method_graaf-->file_integrated_graaf + comp_method_embedding-->file_integrated_embedding + comp_method_feature-->file_integrated_feature + comp_method_graaf-->file_integrated_graaf + comp_metric_embedding-->file_score + comp_metric_feature-->file_score + comp_metric_graaf-->file_score + file_integrated_embedding---comp_metric_embedding + file_integrated_embedding---comp_transformer_embedding_to_graaf + file_integrated_graaf---comp_metric_graaf + file_integrated_feature---comp_metric_feature + file_integrated_feature---comp_transformer_feature_to_embedding + comp_transformer_embedding_to_graaf-->file_integrated_graaf + comp_transformer_feature_to_embedding-->file_integrated_embedding +``` + +## File format: Common Dataset + +A subset of the common dataset. + +Example file: `resources_test/common/pancreas/dataset.h5ad` + +Format: + +
+ + AnnData object + obs: 'cell_type', 'batch' + var: 'hvg', 'hvg_score', 'feature_name' + obsm: 'X_pca' + obsp: 'knn_distances', 'knn_connectivities' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id', 'knn' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------------|:----------|:-------------------------------------------------------------------------------| +| `obs["cell_type"]` | `string` | Cell type information. | +| `obs["batch"]` | `string` | Batch information. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | +| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["knn"]` | `object` | Supplementary K nearest neighbors data. | + +
+ +## Component type: Data processor + +Path: +[`src/batch_integration`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/batch_integration) + +A batch integration dataset processor. + +Arguments: + +
+ +| Name | Type | Description | +|:--------------------|:----------|:---------------------------------------------------------------------------| +| `--input` | `file` | A subset of the common dataset. | +| `--output_dataset` | `file` | (*Output*) Unintegrated AnnData HDF5 file. | +| `--output_solution` | `file` | (*Output*) Solution dataset. | +| `--obs_label` | `string` | (*Optional*) Which .obs slot to use as label. Default: `cell_type`. | +| `--obs_batch` | `string` | (*Optional*) Which .obs slot to use as batch covariate. Default: `batch`. | +| `--hvgs` | `integer` | (*Optional*) Number of highly variable genes. Default: `2000`. | +| `--subset_hvg` | `boolean` | (*Optional*) Whether to subset to highly variable genes. Default: `FALSE`. | + +
+ +## File format: Dataset + +Unintegrated AnnData HDF5 file. + +Example file: `resources_test/batch_integration/pancreas/dataset.h5ad` + +Format: + +
+ + AnnData object + obs: 'batch', 'label' + var: 'hvg', 'hvg_score', 'feature_name' + obsm: 'X_pca' + obsp: 'knn_distances', 'knn_connectivities' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'normalization_id', 'dataset_organism', 'knn' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------------|:----------|:-------------------------------------------------------------------------| +| `obs["batch"]` | `string` | Batch information. | +| `obs["label"]` | `string` | label information. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | +| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["knn"]` | `object` | Supplementary K nearest neighbors data. | + +
+ +## File format: Solution + +Solution dataset + +Example file: `resources_test/batch_integration/pancreas/solution.h5ad` + +Format: + +
+ + AnnData object + obs: 'batch', 'label' + var: 'hvg', 'hvg_score', 'feature_name' + obsm: 'X_pca' + obsp: 'knn_distances', 'knn_connectivities' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id', 'knn' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------------|:----------|:-------------------------------------------------------------------------------| +| `obs["batch"]` | `string` | Batch information. | +| `obs["label"]` | `string` | label information. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | +| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["knn"]` | `object` | Supplementary K nearest neighbors data. | + +
+ +## Component type: Control method (embedding) + +Path: +[`src/batch_integration/control_methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/batch_integration/control_methods) + +A batch integration embedding control method. + +Arguments: + +
+ +| Name | Type | Description | +|:-----------|:-------|:--------------------------------------------| +| `--input` | `file` | Unintegrated AnnData HDF5 file. | +| `--output` | `file` | (*Output*) An integrated AnnData HDF5 file. | + +
+ +## Component type: Control method (graph) + +Path: +[`src/batch_integration/control_methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/batch_integration/control_methods) + +A batch integration graph control method. + +Arguments: + +
+ +| Name | Type | Description | +|:-----------|:-------|:-----------------------------------------| +| `--input` | `file` | Unintegrated AnnData HDF5 file. | +| `--output` | `file` | (*Output*) Integrated AnnData HDF5 file. | + +
+ +## Component type: Method (embedding) + +Path: +[`src/batch_integration/methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/batch_integration/methods) + +A batch integration embedding method. + +Arguments: + +
+ +| Name | Type | Description | +|:-----------|:-------|:--------------------------------------------| +| `--input` | `file` | Unintegrated AnnData HDF5 file. | +| `--output` | `file` | (*Output*) An integrated AnnData HDF5 file. | + +
+ +## Component type: Method (feature) + +Path: +[`src/batch_integration/methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/batch_integration/methods) + +A batch integration feature method. + +Arguments: + +
+ +| Name | Type | Description | +|:-----------|:-------|:-----------------------------------------| +| `--input` | `file` | Unintegrated AnnData HDF5 file. | +| `--output` | `file` | (*Output*) Integrated AnnData HDF5 file. | + +
+ +## Component type: Method (graph) + +Path: +[`src/batch_integration/methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/batch_integration/methods) + +A batch integration graph method. + +Arguments: + +
+ +| Name | Type | Description | +|:-----------|:-------|:-----------------------------------------| +| `--input` | `file` | Unintegrated AnnData HDF5 file. | +| `--output` | `file` | (*Output*) Integrated AnnData HDF5 file. | + +
+ +## Component type: Metric (embedding) + +Path: +[`src/batch_integration/metrics`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/batch_integration/metrics) + +A batch integration embedding metric. + +Arguments: + +
+ +| Name | Type | Description | +|:---------------------|:-------|:---------------------------------| +| `--input_integrated` | `file` | An integrated AnnData HDF5 file. | +| `--input_solution` | `file` | Solution dataset. | +| `--output` | `file` | (*Output*) Metric score file. | + +
+ +## Component type: Metric (feature) + +Path: +[`src/batch_integration/metrics`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/batch_integration/metrics) + +A batch integration feature metric. + +Arguments: + +
+ +| Name | Type | Description | +|:---------------------|:-------|:------------------------------| +| `--input_integrated` | `file` | Integrated AnnData HDF5 file. | +| `--input_solution` | `file` | Solution dataset. | +| `--output` | `file` | (*Output*) Metric score file. | + +
+ +## Component type: Metric (graph) + +Path: +[`src/batch_integration/metrics`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/batch_integration/metrics) + +A batch integration graph metric. + +Arguments: + +
+ +| Name | Type | Description | +|:---------------------|:-------|:------------------------------| +| `--input_integrated` | `file` | Integrated AnnData HDF5 file. | +| `--input_solution` | `file` | Solution dataset. | +| `--output` | `file` | (*Output*) Metric score file. | + +
+ +## File format: Integrated embedding + +An integrated AnnData HDF5 file. + +Example file: +`resources_test/batch_integration/pancreas/integrated_embedding.h5ad` + +Format: + +
+ + AnnData object + obsm: 'X_emb' + uns: 'dataset_id', 'normalization_id', 'dataset_organism', 'method_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------|:---------|:--------------------------------------------------------| +| `obsm["X_emb"]` | `double` | Integration embedding prediction. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | + +
+ +## File format: Integrated Graph + +Integrated AnnData HDF5 file. + +Example file: +`resources_test/batch_integration/pancreas/integrated_graph.h5ad` + +Format: + +
+ + AnnData object + obsp: 'connectivities', 'distances' + uns: 'dataset_id', 'normalization_id', 'dataset_organism', 'method_id', 'neighbors' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------|:---------|:--------------------------------------------------------| +| `obsp["connectivities"]` | `double` | Neighbors connectivities matrix. | +| `obsp["distances"]` | `double` | Neighbors distance matrix. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["neighbors"]` | `object` | Supplementary K nearest neighbors data. | + +
+ +## File format: Integrated Feature + +Integrated AnnData HDF5 file. + +Example file: +`resources_test/batch_integration/pancreas/integrated_feature.h5ad` + +Format: + +
+ + AnnData object + layers: 'corrected_counts' + uns: 'dataset_id', 'normalization_id', 'dataset_organism', 'method_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------------|:---------|:--------------------------------------------------------| +| `layers["corrected_counts"]` | `double` | Corrected counts after integration. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | + +
+ +## File format: Score + +Metric score file + +Example file: `score.h5ad` + +Format: + +
+ + AnnData object + uns: 'dataset_id', 'normalization_id', 'method_id', 'metric_ids', 'metric_values' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------|:---------|:---------------------------------------------------------------------------------------------| +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | +| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | + +
+ +## Component type: Embedding to Graph + +Path: +[`src/batch_integration/transformers`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/batch_integration/transformers) + +Transform an embedding to a graph output. + +Arguments: + +
+ +| Name | Type | Description | +|:-----------|:-------|:-----------------------------------------| +| `--input` | `file` | An integrated AnnData HDF5 file. | +| `--output` | `file` | (*Output*) Integrated AnnData HDF5 file. | + +
+ +## Component type: Feature to Embedding + +Path: +[`src/batch_integration/transformers`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/batch_integration/transformers) + +Transform a feature output to an embedding. + +Arguments: + +
+ +| Name | Type | Description | +|:-----------|:-------|:--------------------------------------------| +| `--input` | `file` | Integrated AnnData HDF5 file. | +| `--output` | `file` | (*Output*) An integrated AnnData HDF5 file. | + +
+ diff --git a/src/tasks/batch_integration/api/comp_control_method_embedding.yaml b/src/tasks/batch_integration/api/comp_control_method_embedding.yaml new file mode 100644 index 0000000000..9c4bc65ce5 --- /dev/null +++ b/src/tasks/batch_integration/api/comp_control_method_embedding.yaml @@ -0,0 +1,26 @@ +functionality: + namespace: batch_integration/control_methods + info: + type: control_method + subtype: embedding + type_info: + label: Control method (embedding) + summary: A batch integration embedding control method. + description: | + A batch integration control method which outputs a batch-corrected embedding. + arguments: + - name: --input + __merge__: file_dataset.yaml + direction: input + required: true + - name: --output + direction: output + __merge__: file_integrated_embedding.yaml + required: true + test_resources: + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/batch_integration/pancreas + dest: resources_test/batch_integration/pancreas diff --git a/src/tasks/batch_integration/api/comp_control_method_feature.yaml b/src/tasks/batch_integration/api/comp_control_method_feature.yaml new file mode 100644 index 0000000000..3d2ac9853d --- /dev/null +++ b/src/tasks/batch_integration/api/comp_control_method_feature.yaml @@ -0,0 +1,26 @@ +functionality: + namespace: batch_integration/control_methods + info: + type: control_method + subtype: feature + type_info: + label: Control method (feature) + summary: A batch integration feature control method. + description: | + A batch integration control method which outputs a batch-corrected feature space. 
+ arguments: + - name: --input + __merge__: file_dataset.yaml + direction: input + required: true + - name: --output + direction: output + __merge__: file_integrated_feature.yaml + required: true + test_resources: + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/batch_integration/pancreas + dest: resources_test/batch_integration/pancreas diff --git a/src/tasks/batch_integration/api/comp_control_method_graph.yaml b/src/tasks/batch_integration/api/comp_control_method_graph.yaml new file mode 100644 index 0000000000..cba6f48f7a --- /dev/null +++ b/src/tasks/batch_integration/api/comp_control_method_graph.yaml @@ -0,0 +1,26 @@ +functionality: + namespace: batch_integration/control_methods + info: + type: control_method + subtype: graph + type_info: + label: Control method (graph) + summary: A batch integration graph control method. + description: | + A batch integration control method which outputs a batch-corrected cell graph. 
+ arguments: + - __merge__: file_dataset.yaml + name: --input + direction: input + required: true + - __merge__: file_integrated_graph.yaml + name: --output + direction: output + required: true + test_resources: + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/batch_integration/pancreas + dest: resources_test/batch_integration/pancreas diff --git a/src/tasks/batch_integration/api/comp_method_embedding.yaml b/src/tasks/batch_integration/api/comp_method_embedding.yaml new file mode 100644 index 0000000000..86e7d7caf3 --- /dev/null +++ b/src/tasks/batch_integration/api/comp_method_embedding.yaml @@ -0,0 +1,29 @@ +functionality: + namespace: batch_integration/methods + info: + type: method + subtype: embedding + type_info: + label: Method (embedding) + summary: A batch integration embedding method. + description: | + A batch integration method which outputs a batch-corrected embedding. 
+ arguments: + - name: --input + __merge__: file_dataset.yaml + direction: input + required: true + - name: --output + __merge__: file_integrated_embedding.yaml + direction: output + required: true + test_resources: + # check method component + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - path: /src/common/library.bib + # auto-run component + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/batch_integration/pancreas + dest: resources_test/batch_integration/pancreas diff --git a/src/tasks/batch_integration/api/comp_method_feature.yaml b/src/tasks/batch_integration/api/comp_method_feature.yaml new file mode 100644 index 0000000000..d609c2dd5b --- /dev/null +++ b/src/tasks/batch_integration/api/comp_method_feature.yaml @@ -0,0 +1,29 @@ +functionality: + namespace: batch_integration/methods + info: + type: method + subtype: feature + type_info: + label: Method (feature) + summary: A batch integration feature method. + description: | + A batch integration method which outputs a batch-corrected feature-space. 
+ arguments: + - name: --input + __merge__: file_dataset.yaml + direction: input + required: true + - name: --output + __merge__: file_integrated_feature.yaml + direction: output + required: true + test_resources: + # check method component + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - path: /src/common/library.bib + # auto-run component + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/batch_integration/pancreas + dest: resources_test/batch_integration/pancreas diff --git a/src/tasks/batch_integration/api/comp_method_graph.yaml b/src/tasks/batch_integration/api/comp_method_graph.yaml new file mode 100644 index 0000000000..2f37146e24 --- /dev/null +++ b/src/tasks/batch_integration/api/comp_method_graph.yaml @@ -0,0 +1,29 @@ +functionality: + namespace: batch_integration/methods + info: + type: method + subtype: graph + type_info: + label: Method (graph) + summary: A batch integration graph method. + description: | + A batch integration method which outputs a batch-corrected cell graph. 
+ arguments: + - name: --input + __merge__: file_dataset.yaml + direction: input + required: true + - name: --output + __merge__: file_integrated_graph.yaml + direction: output + required: true + test_resources: + # check method component + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - path: /src/common/library.bib + # auto-run component + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/batch_integration/pancreas + dest: resources_test/batch_integration/pancreas diff --git a/src/tasks/batch_integration/api/comp_metric_embedding.yaml b/src/tasks/batch_integration/api/comp_metric_embedding.yaml new file mode 100644 index 0000000000..7443fca8b4 --- /dev/null +++ b/src/tasks/batch_integration/api/comp_metric_embedding.yaml @@ -0,0 +1,38 @@ +functionality: + namespace: batch_integration/metrics + info: + type: metric + subtype: embedding + type_info: + label: Metric (embedding) + summary: A batch integration embedding metric. + description: | + A metric for evaluating batch corrected embeddings. 
+ test_setup: + pancreas: + input_integrated: resources_test/batch_integration/pancreas/integrated_embedding.h5ad + input_solution: resources_test/batch_integration/pancreas/solution.h5ad + cellxgene_census: + input_integrated: resources_test/batch_integration/cxg_mouse_pancreas_atlas/integrated_embedding.h5ad + input_solution: resources_test/batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad + arguments: + - name: --input_integrated + __merge__: file_integrated_embedding.yaml + direction: input + required: true + - name: --input_solution + __merge__: file_solution.yaml + direction: input + required: true + - name: --output + __merge__: file_score.yaml + direction: output + required: true + test_resources: + - path: /resources_test/batch_integration/ + dest: resources_test/batch_integration/ + # - type: python_script + # path: /src/common/comp_tests/check_metric_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /src/common/library.bib diff --git a/src/tasks/batch_integration/api/comp_metric_feature.yaml b/src/tasks/batch_integration/api/comp_metric_feature.yaml new file mode 100644 index 0000000000..2f741d0aa2 --- /dev/null +++ b/src/tasks/batch_integration/api/comp_metric_feature.yaml @@ -0,0 +1,31 @@ +functionality: + namespace: batch_integration/metrics + info: + type: metric + subtype: feature + type_info: + label: Metric (feature) + summary: A batch integration feature metric. + description: | + A metric for evaluating batch corrected feature spaces. 
+ arguments: + - name: --input_integrated + __merge__: file_integrated_feature.yaml + direction: input + required: true + - name: --input_solution + __merge__: file_solution.yaml + direction: input + required: true + - name: --output + __merge__: file_score.yaml + direction: output + required: true + test_resources: + - path: /resources_test/batch_integration/pancreas + dest: resources_test/batch_integration/pancreas + - type: python_script + path: /src/common/comp_tests/check_metric_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /src/common/library.bib diff --git a/src/tasks/batch_integration/api/comp_metric_graph.yaml b/src/tasks/batch_integration/api/comp_metric_graph.yaml new file mode 100644 index 0000000000..66935b9663 --- /dev/null +++ b/src/tasks/batch_integration/api/comp_metric_graph.yaml @@ -0,0 +1,31 @@ +functionality: + namespace: batch_integration/metrics + info: + type: metric + subtype: graph + type_info: + label: Metric (graph) + summary: A batch integration graph metric. + description: | + A metric for evaluating batch corrected cell graphs. 
+ arguments: + - name: --input_integrated + __merge__: file_integrated_graph.yaml + direction: input + required: true + - name: --input_solution + __merge__: file_solution.yaml + direction: input + required: true + - name: --output + __merge__: file_score.yaml + direction: output + required: true + test_resources: + - path: /resources_test/batch_integration/pancreas + dest: resources_test/batch_integration/pancreas + - type: python_script + path: /src/common/comp_tests/check_metric_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /src/common/library.bib diff --git a/src/tasks/batch_integration/api/comp_process_dataset.yaml b/src/tasks/batch_integration/api/comp_process_dataset.yaml new file mode 100644 index 0000000000..715ef6d3c3 --- /dev/null +++ b/src/tasks/batch_integration/api/comp_process_dataset.yaml @@ -0,0 +1,45 @@ +functionality: + namespace: batch_integration + info: + type: process_dataset + type_info: + label: Data processor + summary: A batch integration dataset processor. + description: | + A component for processing a Common Dataset into a task-specific dataset. + arguments: + - name: "--input" + __merge__: file_common_dataset.yaml + direction: input + required: true + - name: "--output_dataset" + __merge__: file_dataset.yaml + direction: output + required: true + - name: "--output_solution" + __merge__: file_solution.yaml + direction: output + required: true + - name: "--obs_label" + type: "string" + description: "Which .obs slot to use as label." + default: "cell_type" + - name: "--obs_batch" + type: "string" + description: "Which .obs slot to use as batch covariate." 
+ default: "batch" + - name: --hvgs + type: integer + description: Number of highly variable genes + default: 2000 + required: false + - name: --subset_hvg + type: boolean + description: Whether to subset to highly variable genes + default: false + required: false + test_resources: + - path: /resources_test/common/pancreas/ + dest: resources_test/common/pancreas/ + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py \ No newline at end of file diff --git a/src/tasks/batch_integration/api/comp_transformer_embedding_to_graph.yaml b/src/tasks/batch_integration/api/comp_transformer_embedding_to_graph.yaml new file mode 100644 index 0000000000..d8e815dad5 --- /dev/null +++ b/src/tasks/batch_integration/api/comp_transformer_embedding_to_graph.yaml @@ -0,0 +1,25 @@ +functionality: + namespace: batch_integration/transformers + info: + type: transformer + subtype: graph + type_info: + label: Embedding to Graph + summary: Transform an embedding to a graph output. + description: | + Transform an embedding to a graph output by applying the k nearest neighbors algorithm. 
+ arguments: + - name: --input + __merge__: file_integrated_embedding.yaml + direction: input + required: true + - name: --output + __merge__: file_integrated_graph.yaml + direction: output + required: true + test_resources: + # auto-run component + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/batch_integration/pancreas + dest: resources_test/batch_integration/pancreas \ No newline at end of file diff --git a/src/tasks/batch_integration/api/comp_transformer_feature_to_embedding.yaml b/src/tasks/batch_integration/api/comp_transformer_feature_to_embedding.yaml new file mode 100644 index 0000000000..788e4b965a --- /dev/null +++ b/src/tasks/batch_integration/api/comp_transformer_feature_to_embedding.yaml @@ -0,0 +1,25 @@ +functionality: + namespace: batch_integration/transformers + info: + type: transformer + subtype: embedding + type_info: + label: Feature to Embedding + summary: Transform a feature output to an embedding. + description: | + Transform a feature output to an embedding by computing a PCA on the corrected counts. + arguments: + - name: --input + __merge__: file_integrated_feature.yaml + direction: input + required: true + - name: --output + __merge__: file_integrated_embedding.yaml + direction: output + required: true + test_resources: + # auto-run component + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/batch_integration/pancreas + dest: resources_test/batch_integration/pancreas \ No newline at end of file diff --git a/src/tasks/batch_integration/api/file_common_dataset.yaml b/src/tasks/batch_integration/api/file_common_dataset.yaml new file mode 100644 index 0000000000..097a6794a1 --- /dev/null +++ b/src/tasks/batch_integration/api/file_common_dataset.yaml @@ -0,0 +1,92 @@ +# This file is based on the spec of the common dataset located at +# `src/datasets/api/file_common_dataset.yaml`. 
However, some fields +# such as obs.cell_type and obs.batch are now required +type: file +example: "resources_test/common/pancreas/dataset.h5ad" +info: + label: "Common Dataset" + summary: A subset of the common dataset. + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: cell_type + description: Cell type information + required: true + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + obsp: + - type: double + name: knn_distances + description: K nearest neighbors distance matrix. + required: true + - type: double + name: knn_connectivities + description: K nearest neighbors connectivities matrix. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. 
+ required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - type: object + name: knn + description: Supplementary K nearest neighbors data. + required: true + diff --git a/src/tasks/batch_integration/api/file_dataset.yaml b/src/tasks/batch_integration/api/file_dataset.yaml new file mode 100644 index 0000000000..6d1eb928d8 --- /dev/null +++ b/src/tasks/batch_integration/api/file_dataset.yaml @@ -0,0 +1,69 @@ +type: file +example: "resources_test/batch_integration/pancreas/dataset.h5ad" +info: + label: "Dataset" + summary: Unintegrated AnnData HDF5 file. + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + - type: string + name: label + description: label information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + obsp: + - type: double + name: knn_distances + description: K nearest neighbors distance matrix. + required: true + - type: double + name: knn_connectivities + description: K nearest neighbors connectivities matrix. 
+ required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: object + name: knn + description: Supplementary K nearest neighbors data. + required: true + diff --git a/src/tasks/batch_integration/api/file_integrated_embedding.yaml b/src/tasks/batch_integration/api/file_integrated_embedding.yaml new file mode 100644 index 0000000000..aa526abe71 --- /dev/null +++ b/src/tasks/batch_integration/api/file_integrated_embedding.yaml @@ -0,0 +1,29 @@ +type: file +example: "resources_test/batch_integration/pancreas/integrated_embedding.h5ad" +info: + prediction_type: embedding + label: "Integrated embedding" + summary: An integrated AnnData HDF5 file. + slots: + obsm: + - type: double + name: X_emb + description: integration embedding prediction + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: string + name: method_id + description: "A unique identifier for the method" + required: true diff --git a/src/tasks/batch_integration/api/file_integrated_feature.yaml b/src/tasks/batch_integration/api/file_integrated_feature.yaml new file mode 100644 index 0000000000..b89e16f907 --- /dev/null +++ b/src/tasks/batch_integration/api/file_integrated_feature.yaml @@ -0,0 +1,29 @@ +type: file +example: "resources_test/batch_integration/pancreas/integrated_feature.h5ad" +info: + prediction_type: feature + label: "Integrated Feature" + summary: Integrated AnnData HDF5 file. 
+ slots: + layers: + - type: double + name: corrected_counts + description: Corrected counts after integration + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: string + name: method_id + description: "A unique identifier for the method" + required: true \ No newline at end of file diff --git a/src/tasks/batch_integration/api/file_integrated_graph.yaml b/src/tasks/batch_integration/api/file_integrated_graph.yaml new file mode 100644 index 0000000000..8c09147d0d --- /dev/null +++ b/src/tasks/batch_integration/api/file_integrated_graph.yaml @@ -0,0 +1,37 @@ +type: file +example: "resources_test/batch_integration/pancreas/integrated_graph.h5ad" +info: + prediction_type: graph + label: "Integrated Graph" + summary: Integrated AnnData HDF5 file. + slots: + obsp: + - type: double + name: connectivities + description: Neighbors connectivities matrix. + required: true + - type: double + name: distances + description: Neighbors distances matrix. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: object + name: neighbors + description: Supplementary K nearest neighbors data. 
+ required: true diff --git a/src/tasks/batch_integration/api/file_score.yaml b/src/tasks/batch_integration/api/file_score.yaml new file mode 100644 index 0000000000..9b4dac654f --- /dev/null +++ b/src/tasks/batch_integration/api/file_score.yaml @@ -0,0 +1,29 @@ +type: file +example: "score.h5ad" +info: + label: "Score" + summary: "Metric score file" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + multiple: true + required: true \ No newline at end of file diff --git a/src/tasks/batch_integration/api/file_solution.yaml b/src/tasks/batch_integration/api/file_solution.yaml new file mode 100644 index 0000000000..7e8b07ea4c --- /dev/null +++ b/src/tasks/batch_integration/api/file_solution.yaml @@ -0,0 +1,89 @@ +type: file +example: "resources_test/batch_integration/pancreas/solution.h5ad" +info: + label: "Solution" + summary: Solution dataset + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + - type: string + name: label + description: label information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. 
+ required: true + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + obsp: + - type: double + name: knn_distances + description: K nearest neighbors distance matrix. + required: true + - type: double + name: knn_connectivities + description: K nearest neighbors connectivities matrix. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - type: object + name: knn + description: Supplementary K nearest neighbors data. 
+ required: true + diff --git a/src/tasks/batch_integration/api/task_info.yaml b/src/tasks/batch_integration/api/task_info.yaml new file mode 100644 index 0000000000..bc3a575029 --- /dev/null +++ b/src/tasks/batch_integration/api/task_info.yaml @@ -0,0 +1,41 @@ +name: batch_integration +label: Batch Integration +v1: + path: openproblems/tasks/batch_integration/README.md + commit: 637163fba7d74ab5393c2adbee5354dcf4d46f85 +summary: Remove unwanted batch effects from scRNA data while retaining biologically meaningful variation. +image: thumbnail.svg +motivation: | + As single-cell technologies advance, single-cell datasets are growing both in size and complexity. + Especially in consortia such as the Human Cell Atlas, individual studies combine data from multiple labs, each sequencing multiple individuals possibly with different technologies. + This gives rise to complex batch effects in the data that must be computationally removed to perform a joint analysis. + These batch integration methods must remove the batch effect while not removing relevant biological information. + Currently, over 200 tools exist that aim to remove batch effects from scRNA-seq datasets [@zappia2018exploring]. + These methods balance the removal of batch effects with the conservation of nuanced biological information in different ways. + This abundance of tools has complicated batch integration method choice, leading to several benchmarks on this topic [@luecken2020benchmarking; @tran2020benchmark; @chazarragil2021flexible; @mereu2020benchmarking]. + Yet, benchmarks use different metrics, method implementations and datasets. Here we build a living benchmarking task for batch integration methods with the vision of improving the consistency of method evaluation. +description: | + In this task we evaluate batch integration methods on their ability to remove batch effects in the data while conserving variation attributed to biological effects. 
+ As input, methods require either normalised or unnormalised data with multiple batches and consistent cell type labels. + The batch integrated output can be a feature matrix, a low dimensional embedding and/or a neighbourhood graph. + The respective batch-integrated representation is then evaluated using sets of metrics that capture how well batch effects are removed and whether biological variance is conserved. + We have based this particular task on the latest, and most extensive benchmark of single-cell data integration methods. +authors: + - name: Michaela Mueller + roles: [ maintainer, author ] + info: + github: mumichae + - name: Kai Waldrant + roles: [ contributor ] + info: + github: KaiWaldrant + orcid: "0009-0003-8555-1361" + - name: Robrecht Cannoodt + roles: [ contributor ] + info: + github: rcannood + orcid: "0000-0003-3641-729X" + - name: Daniel Strobl + roles: [ author ] + info: + github: danielStrobl diff --git a/src/tasks/batch_integration/api/thumbnail.svg b/src/tasks/batch_integration/api/thumbnail.svg new file mode 100644 index 0000000000..77626c5bfb --- /dev/null +++ b/src/tasks/batch_integration/api/thumbnail.svg @@ -0,0 +1 @@ +Batch 1Batch 2dim-2dim-1dim-2dim-1 \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml new file mode 100644 index 0000000000..c2484fbaa2 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml @@ -0,0 +1,24 @@ +# use method api spec +__merge__: ../../../api/comp_control_method_embedding.yaml +functionality: + name: batch_embed + namespace: batch_integration/control_methods/no_integration + info: + label: No integration by Batch + summary: "Cells are embedded by computing PCA independently on each batch" + description: "Cells are embedded by computing PCA independently on each batch" + v1: + path: 
openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py b/src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py new file mode 100644 index 0000000000..801440ce65 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py @@ -0,0 +1,49 @@ +import sys +import scanpy as sc +import numpy as np + +## VIASH START + +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality': 'foo', + 'config': 'bar' +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) +adata.var["highly_variable"] = adata.var["hvg"] + +print("Process dataset", flush=True) +adata.obsm["X_emb"] = np.zeros((adata.shape[0], 50), dtype=float) +for batch in adata.obs["batch"].unique(): + batch_idx = adata.obs["batch"] == batch + n_comps = min(50, np.sum(batch_idx)) + solver = "full" if n_comps == np.sum(batch_idx) else "arpack" + adata.obsm["X_emb"][batch_idx, :n_comps] = sc.tl.pca( + adata[batch_idx].copy(), + n_comps=n_comps, + use_highly_variable=True, + svd_solver=solver, + copy=True, + ).obsm["X_pca"] + +print("Store outputs", flush=True) +adata.uns['method_id'] = meta['functionality_name'] +adata.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git 
a/src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml new file mode 100644 index 0000000000..95212518c5 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml @@ -0,0 +1,24 @@ +# use method api spec +__merge__: ../../../api/comp_control_method_embedding.yaml +functionality: + name: global_embed + namespace: batch_integration/control_methods/no_integration + info: + label: No integration + summary: "Cells are embedded by PCA on the unintegrated data" + description: "Cells are embedded by PCA on the unintegrated data" + v1: + path: openproblems/tasks/_batch_integration/_common/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py b/src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py new file mode 100644 index 0000000000..f45038806b --- /dev/null +++ b/src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py @@ -0,0 +1,36 @@ +import sys +import scanpy as sc + +## VIASH START + +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality': 'foo', + 'config': 'bar', + "resources_dir": "src/tasks/batch_integration/control_methods/" +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + obs='obs', + 
obsm='obsm', + uns='uns' +) + +print("process dataset", flush=True) +adata.obsm["X_emb"] = adata.obsm["X_pca"] + +print("Store outputs", flush=True) +adata.uns['method_id'] = meta['functionality_name'] +adata.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml new file mode 100644 index 0000000000..b20701c8f1 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml @@ -0,0 +1,24 @@ +# use method api spec +__merge__: ../../../api/comp_control_method_feature.yaml +functionality: + name: global_feature + namespace: batch_integration/control_methods/no_integration + info: + label: No integration + summary: "Original feature space is not modified" + description: "Original feature space is not modified" + v1: + path: openproblems/tasks/_batch_integration/_common/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py b/src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py new file mode 100644 index 0000000000..2acdbf9b7a --- /dev/null +++ b/src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py @@ -0,0 +1,38 @@ +import sys +import scanpy as sc + +## VIASH START + +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality': 'foo', + 'config': 
'bar', + "resources_dir": "src/tasks/batch_integration/control_methods/" +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + +# no processing, subset matrix to highly variable genes +adata_hvg = adata[:, adata.var["hvg"]].copy() +adata.layers['corrected_counts'] = adata_hvg.X.copy() + +print("Store outputs", flush=True) +adata.uns['method_id'] = meta['functionality_name'] +adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml new file mode 100644 index 0000000000..86886ce263 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml @@ -0,0 +1,25 @@ +# use method api spec +__merge__: ../../../api/comp_control_method_graph.yaml +functionality: + name: global_graph + namespace: batch_integration/control_methods/no_integration + info: + label: No integration + summary: "kNN graph is built on the PCA of the unintegrated data" + description: "Cells are embedded by PCA on the unintegrated data. A kNN graph is built on this PCA." 
+ v1: + path: openproblems/tasks/_batch_integration/_common/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py + - path: ../../utils.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py b/src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py new file mode 100644 index 0000000000..4824c8f443 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py @@ -0,0 +1,41 @@ +import scanpy as sc +import sys + +## VIASH START + +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality': 'foo', + 'config': 'bar', + "resources_dir": "src/tasks/batch_integration/control_methods/" +} + +## VIASH END + +# add helper scripts to path +sys.path.append(meta["resources_dir"]) +from utils import _set_uns +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + obs='obs', + obsp='obsp', + uns='uns' +) + +print("process dataset", flush=True) +neighbors_map = adata.uns['knn'] +adata.obsp['connectivities'] = adata.obsp[neighbors_map['connectivities_key']] +adata.obsp['distances'] = adata.obsp[neighbors_map['distances_key']] +_set_uns(adata, neighbors_key='knn') + +print("Store outputs", flush=True) +adata.uns['method_id'] = meta['functionality_name'] +adata.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml 
b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml new file mode 100644 index 0000000000..6c853a7719 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml @@ -0,0 +1,25 @@ +# use method api spec +__merge__: ../../../api/comp_control_method_embedding.yaml +functionality: + name: celltype_embed + namespace: batch_integration/control_methods/perfect_integration + info: + label: Perfect embedding by cell type + summary: "Cells are embedded as a one-hot encoding of celltype labels" + description: "Cells are embedded as a one-hot encoding of celltype labels" + v1: + path: openproblems/tasks/_batch_integration/_common/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py + - path: ../../utils.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py new file mode 100644 index 0000000000..ca16a60ab2 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py @@ -0,0 +1,34 @@ +import anndata as ad +import sys + +## VIASH START + +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality': 'foo', + 'config': 'bar' +} + +## VIASH END +sys.path.append(meta["resources_dir"]) +from utils import _perfect_embedding +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + obs='obs', + 
uns='uns' +) + +print('Process data...', flush=True) +adata.obsm["X_emb"] = _perfect_embedding(partition=adata.obs["label"]) + +print("Store outputs", flush=True) +adata.uns['method_id'] = meta['functionality_name'] +adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml new file mode 100644 index 0000000000..e945e3bc58 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml @@ -0,0 +1,29 @@ +# use method api spec +__merge__: ../../../api/comp_control_method_embedding.yaml +functionality: + name: celltype_jitter_embed + namespace: batch_integration/control_methods/perfect_integration + info: + label: Perfect embedding by celltype with jitter + summary: "Cells are embedded as a one-hot encoding of celltype labels, with a small amount of random noise added to the embedding" + description: "Cells are embedded as a one-hot encoding of celltype labels, with a small amount of random noise added to the embedding" + v1: + path: openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + arguments: + - name: "--jitter" + type: double + default: 0.01 + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py + - path: ../../utils.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py new file 
mode 100644 index 0000000000..8f88f77472 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py @@ -0,0 +1,38 @@ +import anndata as ad +import sys + +## VIASH START + +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', + 'jitter': 0.01, +} + +meta = { + 'functionality': 'foo', + 'config': 'bar' +} + +## VIASH END +sys.path.append(meta["resources_dir"]) +from utils import _perfect_embedding +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + obs='obs', + uns='uns' +) + +print('Process data...', flush=True) +adata.obsm["X_emb"] = _perfect_embedding( + partition=adata.obs["label"], + jitter=par["jitter"] +) + +print("Store outputs", flush=True) +adata.uns['method_id'] = meta['functionality_name'] +adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml new file mode 100644 index 0000000000..d8bcee01d4 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml @@ -0,0 +1,25 @@ +# use method api spec +__merge__: ../../../api/comp_control_method_embedding.yaml +functionality: + name: batch_embed + namespace: batch_integration/control_methods/random_integration + info: + label: Random integration by batch + summary: "Embedding coordinates are randomly permuted within each batch" + description: "Embedding coordinates are randomly permuted within each batch" + v1: + path: openproblems/tasks/_batch_integration/_common/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + resources: + - type: python_script + path: script.py + - type: python_script + path: 
/src/common/helper_functions/read_anndata_partial.py + - path: ../../utils.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py b/src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py new file mode 100644 index 0000000000..175a449a49 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py @@ -0,0 +1,40 @@ +import sys +import scanpy as sc + +## VIASH START + +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality': 'foo', + 'config': 'bar', + "resources_dir": "src/tasks/batch_integration/control_methods/" +} + +## VIASH END + +# add helper scripts to path +sys.path.append(meta["resources_dir"]) +from utils import _randomize_features +from read_anndata_partial import read_anndata + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + obs='obs', + obsm='obsm', + uns='uns' +) + +print("process dataset", flush=True) +adata.obsm["X_emb"] = _randomize_features( + adata.obsm["X_pca"], + partition=adata.obs["batch"], +) + +print("Store outputs", flush=True) +adata.uns['method_id'] = meta['functionality_name'] +adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml new file mode 100644 index 0000000000..5f98284bb9 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml @@ -0,0 +1,25 @@ +# use method api spec +__merge__: ../../../api/comp_control_method_feature.yaml +functionality: + name: batch_feature + namespace: 
batch_integration/control_methods/random_integration + info: + label: Random integration by batch + summary: "Feature values are randomly permuted within each batch" + description: "Feature values are randomly permuted within each batch" + v1: + path: openproblems/tasks/_batch_integration/_common/methods/baseline.py + commit: acf5c95a7306b819c4a13972783433d0a48f769b + preferred_normalization: log_cp10k + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py + - path: ../../utils.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [ "midtime", "lowmem", "lowcpu"] \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py b/src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py new file mode 100644 index 0000000000..630871e780 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py @@ -0,0 +1,41 @@ +import anndata as ad +import sys + + +## VIASH START + +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad' +} + +meta = { + 'functionality_name': 'foo', + 'config': 'bar', +} + +## VIASH END + +# add helper scripts to path +sys.path.append(meta["resources_dir"]) +from utils import _randomize_features +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + +adata.layers['corrected_counts'] = _randomize_features( + adata.X, + partition=adata.obs["batch"], +) + +print("Store outputs", flush=True) +adata.uns['method_id'] = meta['functionality_name'] +adata.write_h5ad(par['output'], compression='gzip') diff --git 
a/src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml new file mode 100644 index 0000000000..72a12c5031 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml @@ -0,0 +1,25 @@ +# use method api spec +__merge__: ../../../api/comp_control_method_graph.yaml +functionality: + name: batch_graph + namespace: batch_integration/control_methods/random_integration + info: + label: Random integration by batch + summary: "Graph connectivity values are randomly permuted within each batch" + description: "Graph connectivity values are randomly permuted within each batch" + v1: + path: openproblems/tasks/_batch_integration/_common/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py + - path: ../../utils.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_graph/script.py b/src/tasks/batch_integration/control_methods/random_integration/batch_graph/script.py new file mode 100644 index 0000000000..d5c20aa185 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_graph/script.py @@ -0,0 +1,41 @@ +import anndata as ad +import sys + +## VIASH START + +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad' +} + +meta = { + 'functionality_name': 'foo', + 'config': 'bar', +} + +## VIASH END + +# add helper scripts to path +sys.path.append(meta["resources_dir"]) +from utils import _randomize_graph +from read_anndata_partial import read_anndata + + +print('Read
input', flush=True) +adata = read_anndata( + par['input'], + obs='obs', + obsp='obsp', + uns='uns' +) + +print('Randomize graph...', flush=True) +adata = _randomize_graph( + adata, + neighbors_key="knn", + partition=adata.obs["batch"], +) + +print("Store outputs", flush=True) +adata.uns['method_id'] = meta['functionality_name'] +adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/config.vsh.yaml new file mode 100644 index 0000000000..b4457498c9 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/config.vsh.yaml @@ -0,0 +1,25 @@ +# use method api spec +__merge__: ../../../api/comp_control_method_embedding.yaml +functionality: + name: celltype_embed + namespace: batch_integration/control_methods/random_integration + info: + label: Random embedding by cell type + summary: "Embedding coordinates are randomized within celltype labels" + description: "Embedding coordinates are randomized within celltype labels" + v1: + path: openproblems/tasks/_batch_integration/_common/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py + - path: ../../utils.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/script.py b/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/script.py new file mode 100644 index 0000000000..bf26568079 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/script.py @@ -0,0 +1,38 @@ 
+import anndata as ad +import sys + +## VIASH START + +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality': 'foo', + 'config': 'bar' +} + +## VIASH END +sys.path.append(meta["resources_dir"]) +from utils import _randomize_features +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + obs='obs', + obsm='obsm', + uns='uns' +) + +print('Process data...', flush=True) +adata.obsm["X_emb"] = _randomize_features( + adata.obsm["X_pca"], + partition=adata.obs["label"] +) + +print("Store outputs", flush=True) +adata.uns['method_id'] = meta['functionality_name'] +adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/config.vsh.yaml new file mode 100644 index 0000000000..7c483739c2 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/config.vsh.yaml @@ -0,0 +1,25 @@ +# use method api spec +__merge__: ../../../api/comp_control_method_feature.yaml +functionality: + name: celltype_feature + namespace: batch_integration/control_methods/random_integration + info: + label: Random feature by cell type + summary: "Features are randomized within celltype labels" + description: "Features are randomized within celltype labels" + v1: + path: openproblems/tasks/_batch_integration/_common/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py + - path: ../../utils.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [ "midtime", "lowmem", "lowcpu"] 
diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/script.py b/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/script.py new file mode 100644 index 0000000000..9f1302df0d --- /dev/null +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/script.py @@ -0,0 +1,42 @@ +import sys +import scanpy as sc + +## VIASH START + +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality': 'foo', + 'config': 'bar', + "resources_dir": "src/tasks/batch_integration/control_methods/" +} + +## VIASH END + +# add helper scripts to path +sys.path.append(meta["resources_dir"]) +from utils import _randomize_features +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + +print("Process data...", flush=True) +adata.layers['corrected_counts'] = _randomize_features( + adata.X, + partition=adata.obs["label"] +) + +print("Store outputs", flush=True) +adata.uns['method_id'] = meta['functionality_name'] +adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/config.vsh.yaml new file mode 100644 index 0000000000..6015185616 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/config.vsh.yaml @@ -0,0 +1,25 @@ +# use method api spec +__merge__: ../../../api/comp_control_method_graph.yaml +functionality: + name: celltype_graph + namespace: batch_integration/control_methods/random_integration + info: + label: Random graph by cell type + summary: "Graph connectivities are randomized within celltype labels" + description: "Graph connectivities are 
randomized within celltype labels" + v1: + path: openproblems/tasks/_batch_integration/_common/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py + - path: ../../utils.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/script.py b/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/script.py new file mode 100644 index 0000000000..3634d55dbd --- /dev/null +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/script.py @@ -0,0 +1,41 @@ +import sys +import scanpy as sc + +## VIASH START + +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality': 'foo', + 'config': 'bar', + "resources_dir": "src/tasks/batch_integration/control_methods/" +} + +## VIASH END + +# add helper scripts to path +sys.path.append(meta["resources_dir"]) +from utils import _randomize_graph +from read_anndata_partial import read_anndata + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + obs='obs', + obsp='obsp', + uns='uns' +) + +print("Process data...", flush=True) +adata = _randomize_graph( + adata, + neighbors_key="knn", + partition=adata.obs["label"], +) + +print("Store outputs", flush=True) +adata.uns['method_id'] = meta['functionality_name'] +adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/global_embed/config.vsh.yaml new file mode 100644 index 0000000000..0343c37817 --- /dev/null 
+++ b/src/tasks/batch_integration/control_methods/random_integration/global_embed/config.vsh.yaml @@ -0,0 +1,25 @@ +# use method api spec +__merge__: ../../../api/comp_control_method_embedding.yaml +functionality: + name: global_embed + namespace: batch_integration/control_methods/random_integration + info: + label: Random integration + summary: "Embedding coordinates are randomly permuted" + description: "Embedding coordinates are randomly permuted" + v1: + path: openproblems/tasks/_batch_integration/_common/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py + - path: ../../utils.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_embed/script.py b/src/tasks/batch_integration/control_methods/random_integration/global_embed/script.py new file mode 100644 index 0000000000..ca626600b8 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/random_integration/global_embed/script.py @@ -0,0 +1,37 @@ +import sys +import scanpy as sc + +## VIASH START + +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality': 'foo', + 'config': 'bar', + "resources_dir": "src/tasks/batch_integration/control_methods/" +} + +## VIASH END + +# add helper scripts to path +sys.path.append(meta["resources_dir"]) +from utils import _randomize_features +from read_anndata_partial import read_anndata + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + obs='obs', + obsm='obsm', + uns='uns' +) + +print("process dataset", flush=True) +adata.obsm["X_emb"] = _randomize_features(adata.obsm["X_pca"]) + +print("Store 
outputs", flush=True) +adata.uns['method_id'] = meta['functionality_name'] +adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/global_feature/config.vsh.yaml new file mode 100644 index 0000000000..f49ee146a1 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/random_integration/global_feature/config.vsh.yaml @@ -0,0 +1,25 @@ +# use method api spec +__merge__: ../../../api/comp_control_method_feature.yaml +functionality: + name: global_feature + namespace: batch_integration/control_methods/random_integration + info: + label: Random integration + summary: "Feature values are randomly permuted" + description: "Feature values are randomly permuted" + v1: + path: openproblems/tasks/_batch_integration/_common/methods/baseline.py + commit: acf5c95a7306b819c4a13972783433d0a48f769b + preferred_normalization: log_cp10k + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py + - path: ../../utils.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [ "midtime", "lowmem", "lowcpu"] \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_feature/script.py b/src/tasks/batch_integration/control_methods/random_integration/global_feature/script.py new file mode 100644 index 0000000000..c74c7d2a5e --- /dev/null +++ b/src/tasks/batch_integration/control_methods/random_integration/global_feature/script.py @@ -0,0 +1,37 @@ +import anndata as ad +import sys + + +## VIASH START + +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad' +} + +meta = { + 'functionality_name': 'foo', + 'config': 'bar', +} + +## VIASH END + +# add helper scripts to path 
+sys.path.append(meta["resources_dir"]) +from utils import _randomize_features +from read_anndata_partial import read_anndata + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + +adata.layers['corrected_counts'] = _randomize_features(adata.X) + +print("Store outputs", flush=True) +adata.uns['method_id'] = meta['functionality_name'] +adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/global_graph/config.vsh.yaml new file mode 100644 index 0000000000..1b92cbc70a --- /dev/null +++ b/src/tasks/batch_integration/control_methods/random_integration/global_graph/config.vsh.yaml @@ -0,0 +1,25 @@ +# use method api spec +__merge__: ../../../api/comp_control_method_graph.yaml +functionality: + name: global_graph + namespace: batch_integration/control_methods/random_integration + info: + label: Random integration + summary: "Graph connectivity values are randomly permuted" + description: "Graph connectivity values are randomly permuted" + v1: + path: openproblems/tasks/_batch_integration/_common/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py + - path: ../../utils.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_graph/script.py b/src/tasks/batch_integration/control_methods/random_integration/global_graph/script.py new file mode 100644 index 0000000000..cd4d64f043 --- /dev/null +++ 
b/src/tasks/batch_integration/control_methods/random_integration/global_graph/script.py @@ -0,0 +1,37 @@ +import anndata as ad +import sys + +## VIASH START + +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad' +} + +meta = { + 'functionality_name': 'foo', + 'config': 'bar', +} + +## VIASH END + +# add helper scripts to path +sys.path.append(meta["resources_dir"]) +from utils import _randomize_graph +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + obs='obs', + obsp='obsp', + uns='uns' +) + +print('Randomize graph...', flush=True) +adata = _randomize_graph(adata, neighbors_key="knn") + +print("Store outputs", flush=True) +adata.uns['method_id'] = meta['functionality_name'] +adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/utils.py b/src/tasks/batch_integration/control_methods/utils.py new file mode 100644 index 0000000000..954e24af26 --- /dev/null +++ b/src/tasks/batch_integration/control_methods/utils.py @@ -0,0 +1,56 @@ +import numpy as np + + +def _set_uns(adata, neighbors_key): + adata.uns["neighbors"] = adata.uns[neighbors_key] + adata.uns["neighbors"]["connectivities_key"] = "connectivities" + adata.uns["neighbors"]["distances_key"] = "distances" + + +def _randomize_features(X, partition=None): + """ + Taken and adapted from opsca-v1: + https://github.com/openproblems-bio/openproblems/blob/acf5c95a7306b819c4a13972783433d0a48f769b/openproblems/tasks/_batch_integration/_common/methods/baseline.py#L13 + """ + X_out = X.copy() + if partition is None: + partition = np.full(X.shape[0], 0) + else: + partition = np.asarray(partition) + for partition_name in np.unique(partition): + partition_idx = np.argwhere(partition == partition_name).flatten() + X_out[partition_idx] = X[np.random.permutation(partition_idx)] + return X_out + + +def _randomize_graph(adata, partition=None, 
neighbors_key="neighbors"): + """ + Taken and adapted from opsca-v1: + https://github.com/openproblems-bio/openproblems/blob/acf5c95a7306b819c4a13972783433d0a48f769b/openproblems/tasks/_batch_integration/_common/methods/baseline.py#L25 + """ + knn_map = adata.uns[neighbors_key] + distances, connectivities = ( + adata.obsp[knn_map["distances_key"]], + adata.obsp[knn_map["connectivities_key"]], + ) + new_idx = _randomize_features(np.arange(distances.shape[0]), partition=partition) + adata.obsp["distances"] = distances[new_idx][:, new_idx] + adata.obsp["connectivities"] = connectivities[new_idx][:, new_idx] + _set_uns(adata, neighbors_key) + return adata + + +def _perfect_embedding(partition, jitter=0.01): + """ + Taken and adapted from opsca-v1: + https://github.com/openproblems-bio/openproblems/blob/acf5c95a7306b819c4a13972783433d0a48f769b/openproblems/tasks/_batch_integration/_common/methods/baseline.py#L37 + """ + from sklearn.preprocessing import LabelEncoder + from sklearn.preprocessing import OneHotEncoder + + embedding = OneHotEncoder().fit_transform( + LabelEncoder().fit_transform(partition)[:, None] + ) + if jitter is not None: + embedding = embedding + np.random.uniform(-1 * jitter, jitter, embedding.shape) + return np.asarray(embedding) diff --git a/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml b/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml new file mode 100644 index 0000000000..8eff37339f --- /dev/null +++ b/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml @@ -0,0 +1,51 @@ +# use method api spec +__merge__: ../../api/comp_method_graph.yaml +functionality: + name: bbknn + info: + label: BBKNN + summary: "BBKNN creates k nearest neighbours graph by identifying neighbours within batches, then combining and processing them with UMAP for visualization." 
+ description: | + BBKNN or batch balanced k nearest neighbours graph is built for each cell by + identifying its k nearest neighbours within each defined batch separately, + creating independent neighbour sets for each cell in each batch. These sets + are then combined and processed with the UMAP algorithm for visualisation. + reference: "polanski2020bbknn" + repository_url: "https://github.com/Teichlab/bbknn" + documentation_url: "https://github.com/Teichlab/bbknn#readme" + v1: + path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + variants: + bbknn_full_unscaled: + bbknn_full_scaled: + preferred_normalization: log_cp10k_scaled + arguments: + - name: --annoy_n_trees + type: integer + default: 10 + description: Number of trees to use in the annoy forest. + - name: --neighbors_within_batch + type: integer + default: 3 + description: Number of neighbors to report within each batch. + - name: --n_hvg + type: integer + default: 2000 + description: Number of highly variable genes to use.
+ resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - bbknn + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/methods/bbknn/script.py b/src/tasks/batch_integration/methods/bbknn/script.py new file mode 100644 index 0000000000..1496fda0bb --- /dev/null +++ b/src/tasks/batch_integration/methods/bbknn/script.py @@ -0,0 +1,63 @@ +import sys +import anndata as ad +import scanpy as sc +import bbknn + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', + 'annoy_n_trees': 10, + 'neighbors_within_batch': 3, + 'n_hvg': 2000, +} +meta = { + 'functionality_name': 'foo', + 'config': 'bar' +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + +if par['n_hvg']: + print(f"Select top {par['n_hvg']} high variable genes", flush=True) + idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] + adata = adata[:, idx].copy() + sc.pp.pca(adata) + +print('Run BBKNN', flush=True) +kwargs = dict(batch_key='batch', copy=True) +kwargs['annoy_n_trees'] = par['annoy_n_trees'] +kwargs['neighbors_within_batch'] = par['neighbors_within_batch'] + +ad_bbknn = bbknn.bbknn(adata, **kwargs) + +print("Store output", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsp={ + 'connectivities': ad_bbknn.obsp['connectivities'], + 'distances': ad_bbknn.obsp['distances'], + }, + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': meta['functionality_name'], + 'neighbors': 
ad_bbknn.uns['neighbors'] + } +) + +print("Store outputs", flush=True) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/methods/combat/config.vsh.yaml b/src/tasks/batch_integration/methods/combat/config.vsh.yaml new file mode 100644 index 0000000000..f94333627d --- /dev/null +++ b/src/tasks/batch_integration/methods/combat/config.vsh.yaml @@ -0,0 +1,42 @@ +# use method api spec +__merge__: ../../api/comp_method_feature.yaml +functionality: + name: combat + info: + label: ComBat + summary: "Adjusting batch effects in microarray expression data using + empirical Bayes methods" + description: | + An Empirical Bayes (EB) approach to correct for batch effects. It + estimates batch-specific parameters by pooling information across genes in + each batch and shrinks the estimates towards the overall mean of the batch + effect estimates across all genes. These parameters are then used to adjust + the data for batch effects, leading to more accurate and reproducible + results. + reference: "hansen2012removing" + repository_url: "https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html" + documentation_url: "https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html" + v1: + path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + variants: + combat_full_unscaled: + combat_full_scaled: + preferred_normalization: log_cp10k_scaled + arguments: + - name: --n_hvg + type: integer + default: 2000 + description: Number of highly variable genes to use.
+ resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [midtime, highmem, lowcpu] diff --git a/src/tasks/batch_integration/methods/combat/script.py b/src/tasks/batch_integration/methods/combat/script.py new file mode 100644 index 0000000000..9f282efb9c --- /dev/null +++ b/src/tasks/batch_integration/methods/combat/script.py @@ -0,0 +1,57 @@ +import sys +import scanpy as sc +from scipy.sparse import csr_matrix + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', + 'n_hvg': 2000, +} + +meta = { + 'functionality_name': 'foo', + 'config': 'bar' +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + +if par['n_hvg']: + print(f"Select top {par['n_hvg']} high variable genes", flush=True) + idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] + adata = adata[:, idx].copy() + + +print('Run Combat', flush=True) +adata.X = sc.pp.combat(adata, key='batch', inplace=False) + + +print("Store output", flush=True) +output = sc.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': meta['functionality_name'], + }, + layers={ + 'corrected_counts': csr_matrix(adata.X), + } +) + +print("Store outputs", flush=True) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/methods/fastmnn_embedding/config.vsh.yaml b/src/tasks/batch_integration/methods/fastmnn_embedding/config.vsh.yaml new file mode 100644 index 0000000000..cd885da3cd --- /dev/null +++ 
b/src/tasks/batch_integration/methods/fastmnn_embedding/config.vsh.yaml @@ -0,0 +1,36 @@ +# use method api spec +__merge__: ../../api/comp_method_embedding.yaml +functionality: + name: fastmnn_embedding + info: + label: fastMnn (embedding) + summary: "A simpler version of the original mnnCorrect algorithm." + description: | + The fastMNN() approach is much simpler than the original mnnCorrect() algorithm, and proceeds in several steps. + + 1. Perform a multi-sample PCA on the (cosine-)normalized expression values to reduce dimensionality. + 2. Identify MNN pairs in the low-dimensional space between a reference batch and a target batch. + 3. Remove variation along the average batch vector in both reference and target batches. + 4. Correct the cells in the target batch towards the reference, using locally weighted correction vectors. + 5. Merge the corrected target batch with the reference, and repeat with the next target batch. + + reference: "haghverdi2018batch" + repository_url: "https://code.bioconductor.org/browse/batchelor/" + documentation_url: "https://bioconductor.org/packages/batchelor/" + preferred_normalization: log_cp10k + v1: + path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/fastmnn.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + resources: + - type: r_script + path: ../fastmnn_feature/script.R +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + bioc: + - batchelor + - type: nextflow + directives: + label: [midtime, lowcpu, highmem] diff --git a/src/tasks/batch_integration/methods/fastmnn_feature/config.vsh.yaml b/src/tasks/batch_integration/methods/fastmnn_feature/config.vsh.yaml new file mode 100644 index 0000000000..e28406eb54 --- /dev/null +++ b/src/tasks/batch_integration/methods/fastmnn_feature/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: ../../api/comp_method_feature.yaml +functionality: + name: fastmnn_feature + info: + label: fastMnn (feature) + summary: "A simpler version 
of the original mnnCorrect algorithm." + description: | + The fastMNN() approach is much simpler than the original mnnCorrect() algorithm, and proceeds in several steps. + + 1. Perform a multi-sample PCA on the (cosine-)normalized expression values to reduce dimensionality. + 2. Identify MNN pairs in the low-dimensional space between a reference batch and a target batch. + 3. Remove variation along the average batch vector in both reference and target batches. + 4. Correct the cells in the target batch towards the reference, using locally weighted correction vectors. + 5. Merge the corrected target batch with the reference, and repeat with the next target batch. + + reference: "haghverdi2018batch" + repository_url: "https://code.bioconductor.org/browse/batchelor/" + documentation_url: "https://bioconductor.org/packages/batchelor/" + preferred_normalization: log_cp10k + v1: + path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/fastmnn.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + bioc: batchelor + - type: nextflow + directives: + label: [midtime, lowcpu, highmem] diff --git a/src/tasks/batch_integration/methods/fastmnn_feature/script.R b/src/tasks/batch_integration/methods/fastmnn_feature/script.R new file mode 100644 index 0000000000..dbccd52d29 --- /dev/null +++ b/src/tasks/batch_integration/methods/fastmnn_feature/script.R @@ -0,0 +1,51 @@ +cat("Loading dependencies\n") +suppressPackageStartupMessages({ + requireNamespace("anndata", quietly = TRUE) + library(Matrix, warn.conflicts = FALSE) + requireNamespace("batchelor", quietly = TRUE) + library(SingleCellExperiment, warn.conflicts = FALSE) +}) +## VIASH START +par <- list( + input = 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + output = 'output.h5ad' +) +meta <- list( + functionality_name = "mnn_correct_feature" +) +## VIASH 
END + +cat("Read input\n") +adata <- anndata::read_h5ad(par$input) + +# TODO: pass output of 'multiBatchNorm' to fastMNN + +cat("Run mnn\n") +out <- suppressWarnings(batchelor::fastMNN( + t(adata$layers[["normalized"]]), + batch = adata$obs[["batch"]] +)) + +cat("Reformat output\n") +# reusing the same script for fastmnn_embedding and fastmnn_feature +return_type <- gsub("fastmnn_", "", meta[["functionality_name"]]) + +output <- anndata::AnnData( + shape = adata$shape, + uns = list( + dataset_id = adata$uns[["dataset_id"]], + normalization_id = adata$uns[["normalization_id"]], + method_id = meta$functionality_name + ) +) + +if (return_type == "feature") { + layer <- as(SummarizedExperiment::assay(out, "reconstructed"), "sparseMatrix") + output$layers[["corrected_counts"]] <- t(layer) +} else if (return_type == "embedding") { + obsm <- SingleCellExperiment::reducedDim(out, "corrected") + output$obsm[["X_emb"]] <- obsm +} + +cat("Write output to file\n") +zzz <- output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/batch_integration/methods/liger/config.vsh.yaml b/src/tasks/batch_integration/methods/liger/config.vsh.yaml new file mode 100644 index 0000000000..4c638d467b --- /dev/null +++ b/src/tasks/batch_integration/methods/liger/config.vsh.yaml @@ -0,0 +1,31 @@ +# use method api spec +__merge__: ../../api/comp_method_embedding.yaml +functionality: + name: liger + info: + label: LIGER + summary: Linked Inference of Genomic Experimental Relationships + description: | + LIGER or linked inference of genomic experimental relationships uses iNMF + deriving and implementing a novel coordinate descent algorithm to efficiently + do the factorization. Joint clustering is performed and factor loadings are + normalised. 
+ reference: welch2019single + repository_url: https://github.com/welch-lab/liger + documentation_url: https://github.com/welch-lab/liger + preferred_normalization: log_cp10k + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: apt + packages: cmake + - type: r + cran: rliger + github: welch-lab/RcppPlanc + - type: nextflow + directives: + label: [lowcpu, highmem, midtime] diff --git a/src/tasks/batch_integration/methods/liger/script.R b/src/tasks/batch_integration/methods/liger/script.R new file mode 100644 index 0000000000..b7159063ff --- /dev/null +++ b/src/tasks/batch_integration/methods/liger/script.R @@ -0,0 +1,108 @@ +cat(">> Load dependencies\n") +requireNamespace("anndata", quietly = TRUE) +requireNamespace("rliger", quietly = TRUE) + +## VIASH START +par <- list( + input = "resources_test/batch_integration/pancreas/dataset.h5ad", + output = "output.h5ad" +) +meta <- list( + functionality_name = "liger" +) +## VIASH END + +cat("Read input\n") +adata <- anndata::read_h5ad(par$input) + +anndataToLiger <- function(adata) { + # fetch batch names + batch <- adata$obs$batch + batch_names <- as.character(unique(batch)) + + # restructure data + raw_data <- lapply(batch_names, function(batch_name) { + Matrix::t(adata$layers[["counts"]][batch == batch_name, , drop = FALSE]) + }) + names(raw_data) <- batch_names + + rliger::createLiger(rawData = raw_data, removeMissing = FALSE) +} + +addNormalizedDataToLiger <- function(adata, lobj) { + norm_data <- lapply(names(rliger::rawData(lobj)), function(name) { + norm <- adata$layers[["normalized"]] + + # subset + col_names <- colnames(rliger::rawData(lobj)[[name]]) + row_names <- rownames(rliger::rawData(lobj)[[name]]) + prefix <- paste0(name, "_") + col_names <- sub(prefix, "", col_names) + + norm <- norm[ + col_names, + row_names, + drop = FALSE + ] + + # add prefix + rownames(norm) <- paste0(prefix, rownames(norm)) + + # transpose + norm <- 
Matrix::t(norm) + + # turn into dgcMatrix + as(as(norm, "denseMatrix"), "CsparseMatrix") + }) + names(norm_data) <- names(rliger::rawData(lobj)) + + for (name in names(rliger::rawData(lobj))) { + lobj@datasets[[name]]@normData <- norm_data[[name]] + } + + lobj +} + +cat(">> Create Liger Data object\n") +lobj <- anndataToLiger(adata) + +cat(">> Normalize data\n") +lobj <- addNormalizedDataToLiger(adata, lobj) + +# could also use the rliger normalization instead +# lobj <- rliger::normalize(lobj) + +cat(">> Select genes\n") +# lobj <- rliger::selectGenes(lobj) +# overwrite gene selection to include all genes +lobj@varFeatures <- adata$var_names + +cat(">> Perform scaling\n") +lobj <- rliger::scaleNotCenter(lobj, removeMissing = FALSE) + +cat(">> Joint Matrix Factorization\n") +lobj <- rliger::runIntegration(lobj, k = 20) + +cat(">> Quantile normalization\n") +lobj <- rliger::quantileNorm(lobj) + +cat(">> Store output\n") +# remove dataset names from rownames +for (name in names(rliger::rawData(lobj))) { + rownames(lobj@H.norm) <- sub(paste0(name, "_"), "", rownames(lobj@H.norm)) +} + +output <- anndata::AnnData( + uns = list( + dataset_id = adata$uns[["dataset_id"]], + normalization_id = adata$uns[["normalization_id"]], + method_id = meta$functionality_name + ), + obsm = list( + X_emb = lobj@H.norm[rownames(adata), , drop = FALSE] + ), + shape = adata$shape +) + +cat(">> Write AnnData to file\n") +zzz <- output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml b/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml new file mode 100644 index 0000000000..1c999fa540 --- /dev/null +++ b/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml @@ -0,0 +1,27 @@ +# use method api spec +__merge__: ../../api/comp_method_feature.yaml +functionality: + name: mnn_correct + info: + label: mnnCorrect + summary: "Correct for batch effects in single-cell expression data using the mutual nearest 
neighbors method." + description: | + We present a strategy for batch correction based on the detection of mutual nearest neighbors (MNNs) in the high-dimensional expression space. + Our approach does not rely on predefined or equal population compositions across batches; instead, it requires only that a subset of the population be shared between batches. + reference: "haghverdi2018batch" + repository_url: "https://code.bioconductor.org/browse/batchelor/" + documentation_url: "https://bioconductor.org/packages/batchelor/" + preferred_normalization: log_cp10k + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + bioc: + - batchelor + - type: nextflow + directives: + label: [midtime, lowcpu, highmem] diff --git a/src/tasks/batch_integration/methods/mnn_correct/script.R b/src/tasks/batch_integration/methods/mnn_correct/script.R new file mode 100644 index 0000000000..0e6dfa2606 --- /dev/null +++ b/src/tasks/batch_integration/methods/mnn_correct/script.R @@ -0,0 +1,47 @@ +cat("Loading dependencies\n") +suppressPackageStartupMessages({ + requireNamespace("anndata", quietly = TRUE) + library(Matrix, warn.conflicts = FALSE) + requireNamespace("batchelor", quietly = TRUE) + library(SingleCellExperiment, warn.conflicts = FALSE) +}) +## VIASH START +par <- list( + input = 'resources_test/batch_integration/pancreas/dataset.h5ad', + output = 'output.h5ad' +) +meta <- list( + functionality_name = "mnn_correct_feature" +) +## VIASH END + +cat("Read input\n") +adata <- anndata::read_h5ad(par$input) + +cat("Run mnn\n") +out <- suppressWarnings(batchelor::mnnCorrect( + t(adata$layers[["normalized"]]), + batch = adata$obs[["batch"]] +)) + +cat("Reformat output\n") +layer <- SummarizedExperiment::assay(out, "corrected") +as(t(layer), "sparseMatrix") + + + +cat("Store outputs\n") +output <- anndata::AnnData( + uns = list( + dataset_id = adata$uns[["dataset_id"]], + normalization_id = 
adata$uns[["normalization_id"]], + method_id = meta$functionality_name + ), + layers = list( + corrected_counts = as(t(layer), "sparseMatrix") + ), + shape = adata$shape +) + +cat("Write output to file\n") +zzz <- output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml b/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml new file mode 100644 index 0000000000..2c5075534b --- /dev/null +++ b/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml @@ -0,0 +1,52 @@ +# use method api spec +__merge__: ../../api/comp_method_feature.yaml +functionality: + name: mnnpy + info: + label: mnnpy + summary: "Batch effect correction by matching mutual nearest neighbors, Python implementation." + description: | + An implementation of MNN correct in python featuring low memory usage, full multicore support and compatibility with the scanpy framework. + + Batch effect correction by matching mutual nearest neighbors (Haghverdi et al, 2018) has been implemented as a function 'mnnCorrect' in the R package scran. Sadly it's extremely slow for big datasets and doesn't make full use of the parallel architecture of modern CPUs. + + This project is a python implementation of the MNN correct algorithm which takes advantage of python's extendability and hackability. It seamlessly integrates with the scanpy framework and has multicore support in its bones. + reference: "hie2019efficient" + repository_url: "https://github.com/chriscainx/mnnpy" + documentation_url: "https://github.com/chriscainx/mnnpy#readme" + v1: + path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py + commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + preferred_normalization: log_cp10k + variants: + mnn_full_unscaled: + mnn_full_scaled: + preferred_normalization: log_cp10k_scaled + arguments: + - name: --n_hvg + type: integer + default: 2000 + description: Number of highly variable genes to use. 
+ resources: + - type: python_script + path: script.py +platforms: + # Due to a [ gcc-8 ] dependency in the mnnpy package, we need to use a python:3.8 image + - type: docker + image: python:3.8 + setup: + - type: apt + packages: + - procps + - type: python + pypi: + - anndata~=0.8.0 + - scanpy + - pyyaml + - requests + - jsonschema + github: + - chriscainx/mnnpy + - type: nextflow + directives: + label: [ midtime, lowcpu, lowmem ] diff --git a/src/tasks/batch_integration/methods/mnnpy/script.py b/src/tasks/batch_integration/methods/mnnpy/script.py new file mode 100644 index 0000000000..1551573650 --- /dev/null +++ b/src/tasks/batch_integration/methods/mnnpy/script.py @@ -0,0 +1,55 @@ +import anndata as ad +import mnnpy + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', + 'n_hvg': 2000, +} +meta = { + 'functionality_name': 'foo', + 'config': 'bar' +} +## VIASH END + +print('Read input', flush=True) +adata = ad.read_h5ad(par['input']) +adata.X = adata.layers['normalized'] +del adata.layers['normalized'] +del adata.layers['counts'] + +if par['n_hvg']: + print(f"Select top {par['n_hvg']} high variable genes", flush=True) + idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] + adata = adata[:, idx].copy() + +print('Run mnn', flush=True) +split = [] +batch_categories = adata.obs['batch'].cat.categories +for i in batch_categories: + split.append(adata[adata.obs['batch'] == i].copy()) +corrected, _, _ = mnnpy.mnn_correct( + *split, + batch_key='batch', + batch_categories=batch_categories, + index_unique=None + ) + +print("Store outputs", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': meta['functionality_name'], + }, + layers={ + 'corrected_counts': corrected.X, + } +) + + +print("Store outputs", flush=True) 
+output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/methods/pyliger/config.vsh.yaml b/src/tasks/batch_integration/methods/pyliger/config.vsh.yaml new file mode 100644 index 0000000000..cf16b2e684 --- /dev/null +++ b/src/tasks/batch_integration/methods/pyliger/config.vsh.yaml @@ -0,0 +1,37 @@ +# use method api spec +__merge__: ../../api/comp_method_embedding.yaml +functionality: + name: pyliger + info: + label: pyliger + summary: Python implementation of LIGER (Linked Inference of Genomic Experimental Relationships) + description: | + LIGER (installed as rliger) is a package for integrating and analyzing multiple + single-cell datasets, developed by the Macosko lab and maintained/extended by the + Welch lab. It relies on integrative non-negative matrix factorization to identify + shared and dataset-specific factors. + reference: welch2019single + repository_url: https://github.com/welch-lab/pyliger + documentation_url: https://github.com/welch-lab/pyliger + preferred_normalization: log_cp10k + variants: + liger_unscaled: + liger_scaled: + preferred_normalization: log_cp10k_scaled + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - umap-learn[plot] + - pyliger + - dask-expr + - type: nextflow + directives: + label: [lowcpu, highmem, midtime] diff --git a/src/tasks/batch_integration/methods/pyliger/script.py b/src/tasks/batch_integration/methods/pyliger/script.py new file mode 100644 index 0000000000..2066e6965b --- /dev/null +++ b/src/tasks/batch_integration/methods/pyliger/script.py @@ -0,0 +1,86 @@ +import sys +import anndata as ad +import numpy as np +import pyliger + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/dataset.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'pyliger' 
+} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('>> Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/counts', + obs='obs', + var='var', + uns='uns' +) +adata.layers['norm_data'] = read_anndata(par['input'], X='layers/normalized').X + +print('>> Prepare data', flush=True) +adata_per_batch = [] +for batch in adata.obs['batch'].unique(): + adb = adata[adata.obs['batch'] == batch].copy() + + # save row sum and sum of squares for further use + norm_sum = np.ravel(np.sum(adb.layers["norm_data"], axis=0)) + norm_sum_sq = np.ravel(np.sum(adb.layers["norm_data"].power(2), axis=0)) + adb.var["norm_sum"] = norm_sum + adb.var["norm_sum_sq"] = norm_sum_sq + adb.var["norm_mean"] = norm_sum / adb.shape[0] + + # set more metadata + adb.obs.index.name = 'cell_barcode' + adb.var.index.name = 'gene_id' + adb.uns['sample_name'] = batch + + # append to list + adata_per_batch.append(adb) + +print('Create liger object', flush=True) +lobj = pyliger.create_liger( + adata_per_batch, + remove_missing=False +) + +# do not select genes +lobj.var_genes = adata.var_names + +print('>> Scaling', flush=True) +pyliger.scale_not_center(lobj, remove_missing=False) + +print('>> Optimize ALS', flush=True) +pyliger.optimize_ALS(lobj, k=20) + +print('>> Quantile normalization', flush=True) +pyliger.quantile_norm(lobj) + +print('>> Concatenate outputs', flush=True) +ad_out = ad.concat(lobj.adata_list) + +print('Store output', flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsm={ + 'X_emb': ad_out[adata.obs_names, :].obsm['H_norm'] + }, + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': meta['functionality_name'], + } +) + +print("Write output to file", flush=True) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/methods/scalex_embed/config.vsh.yaml 
b/src/tasks/batch_integration/methods/scalex_embed/config.vsh.yaml new file mode 100644 index 0000000000..3437df19c9 --- /dev/null +++ b/src/tasks/batch_integration/methods/scalex_embed/config.vsh.yaml @@ -0,0 +1,41 @@ +__merge__: ../../api/comp_method_embedding.yaml +functionality: + name: scalex_embed + info: + label: SCALEX (embedding) + summary: Online single-cell data integration through projecting heterogeneous datasets into a common cell-embedding space + description : | + SCALEX is a method for integrating heterogeneous single-cell data online using a VAE framework. Its generalised encoder disentangles batch-related components from batch-invariant biological components, which are then projected into a common cell-embedding space. + reference: xiong2021online + repository_url: https://github.com/jsxlei/SCALEX + documentation_url: https://scalex.readthedocs.io + v1: + path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + variants: + scalex_feature_unscaled: + scanorama_feature_scaled: + preferred_normalization: log_cp10k_scaled + arguments: + - name: --n_hvg + type: integer + default: 2000 + description: Number of highly variable genes to use. 
+ resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scalex + - numpy<1.24 + - torch<2.1 + - type: nextflow + directives: + label: [lowmem, lowcpu, midtime] diff --git a/src/tasks/batch_integration/methods/scalex_embed/script.py b/src/tasks/batch_integration/methods/scalex_embed/script.py new file mode 100644 index 0000000000..9974eba4b3 --- /dev/null +++ b/src/tasks/batch_integration/methods/scalex_embed/script.py @@ -0,0 +1,70 @@ +import sys +import anndata as ad +import scalex + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', + 'hvg': True, +} +meta = { + 'functionality_name' : 'foo', + 'config': 'bar' +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + + +if par['n_hvg']: + print(f"Select top {par['n_hvg']} high variable genes", flush=True) + idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] + adata = adata[:, idx].copy() + +print('Run SCALEX', flush=True) +adata = scalex.SCALEX( + adata, + batch_key="batch", + ignore_umap=True, + impute=adata.obs["batch"].cat.categories[0], + processed=True, + max_iteration=40, + min_features=None, + min_cells=None, + n_top_features=0, + outdir=None, + gpu=0, +) +adata.obsm["X_emb"] = adata.obsm["latent"] + +print("Store outputs", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + layers={ + 'corrected_counts': adata.layers["impute"], + }, + obsm={ + 'X_emb': adata.obsm['latent'], + }, + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': 
meta['functionality_name'], + } +) + +print("Write output to file", flush=True) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/methods/scalex_feature/config.vsh.yaml b/src/tasks/batch_integration/methods/scalex_feature/config.vsh.yaml new file mode 100644 index 0000000000..1874bc190e --- /dev/null +++ b/src/tasks/batch_integration/methods/scalex_feature/config.vsh.yaml @@ -0,0 +1,41 @@ +__merge__: ../../api/comp_method_feature.yaml +functionality: + name: scalex_feature + info: + label: SCALEX (feature) + summary: Online single-cell data integration through projecting heterogeneous datasets into a common cell-embedding space + description : | + SCALEX is a method for integrating heterogeneous single-cell data online using a VAE framework. Its generalised encoder disentangles batch-related components from batch-invariant biological components, which are then projected into a common cell-embedding space. + reference: xiong2021online + repository_url: https://github.com/jsxlei/SCALEX + documentation_url: https://scalex.readthedocs.io + v1: + path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + variants: + scalex_feature_unscaled: + scanorama_feature_scaled: + preferred_normalization: log_cp10k_scaled + arguments: + - name: --n_hvg + type: integer + default: 2000 + description: Number of highly variable genes to use. 
+ resources: + - type: python_script + path: ../scalex_embed/script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scalex + - numpy<1.24 + - torch<2.1 + - type: nextflow + directives: + label: [lowmem, lowcpu, midtime] diff --git a/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml b/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml new file mode 100644 index 0000000000..b5dcd8f54a --- /dev/null +++ b/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml @@ -0,0 +1,41 @@ +# use method api spec +__merge__: ../../api/comp_method_embedding.yaml +functionality: + name: scanorama_embed + info: + label: Scanorama (embedding) + summary: "Efficient integration of heterogeneous single-cell + transcriptomes using Scanorama" + description: | + "Scanorama is an extension of the MNN method. Unlike MNN, it finds mutual nearest neighbours over all batches and embeds observations into a joint hyperplane." + reference: "hie2019efficient" + repository_url: "https://github.com/brianhie/scanorama" + documentation_url: "https://github.com/brianhie/scanorama#readme" + v1: + path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + variants: + scanorama_embed_full_unscaled: + scanorama_embed_full_scaled: + preferred_normalization: log_cp10k_scaled + arguments: + - name: --n_hvg + type: integer + default: 2000 + description: Number of highly variable genes to use. 
+ resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scanorama + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] \ No newline at end of file diff --git a/src/tasks/batch_integration/methods/scanorama_embed/script.py b/src/tasks/batch_integration/methods/scanorama_embed/script.py new file mode 100644 index 0000000000..db12b458d5 --- /dev/null +++ b/src/tasks/batch_integration/methods/scanorama_embed/script.py @@ -0,0 +1,87 @@ +import sys +import anndata as ad +import scanorama + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', + 'n_hvg': 2000, +} +meta = { + 'functionality_name': 'foo', + 'config': 'bar' +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +# based on scib +# -> https://github.com/theislab/scib/blob/59ae6eee5e611d9d3db067685ec96c28804e9127/scib/utils.py#L51C1-L72C62 +def merge_adata(*adata_list, **kwargs): + """Merge adatas from list while remove duplicated ``obs`` and ``var`` columns + + :param adata_list: ``anndata`` objects to be concatenated + :param kwargs: arguments to be passed to ``anndata.AnnData.concatenate`` + """ + + if len(adata_list) == 1: + return adata_list[0] + + # Make sure that adatas do not contain duplicate columns + for _adata in adata_list: + for attr in ("obs", "var"): + df = getattr(_adata, attr) + dup_mask = df.columns.duplicated() + if dup_mask.any(): + print( + f"Deleting duplicated keys `{list(df.columns[dup_mask].unique())}` from `adata.{attr}`." 
+ ) + setattr(_adata, attr, df.loc[:, ~dup_mask]) + + return ad.AnnData.concatenate(*adata_list, **kwargs) + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + +if par['n_hvg']: + print(f"Select top {par['n_hvg']} high variable genes", flush=True) + idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] + adata = adata[:, idx].copy() + +print('Run scanorama', flush=True) +split = [] +batch_categories = adata.obs['batch'].cat.categories +for i in batch_categories: + split.append(adata[adata.obs['batch'] == i].copy()) +corrected = scanorama.correct_scanpy(split, return_dimred=True) +corrected = merge_adata(*corrected, batch_key='batch', batch_categories=batch_categories, index_unique=None) + +print("Store output", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': meta['functionality_name'], + }, + layers={ + 'corrected_counts': corrected.X, + }, + obsm={ + 'X_emb': corrected.obsm["X_scanorama"], + } +) + +print("Write output to file", flush=True) +output.write(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml b/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml new file mode 100644 index 0000000000..3f735ddffd --- /dev/null +++ b/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml @@ -0,0 +1,41 @@ +# use method api spec +__merge__: ../../api/comp_method_feature.yaml +functionality: + name: scanorama_feature + info: + label: Scanorama (feature) + summary: "Efficient integration of heterogeneous single-cell + transcriptomes using Scanorama" + description: | + "Scanorama is an extension of the MNN method. Unlike MNN, it finds mutual nearest neighbours over all batches and embeds observations into a joint hyperplane." 
+ reference: "hie2019efficient" + repository_url: "https://github.com/brianhie/scanorama" + documentation_url: "https://github.com/brianhie/scanorama#readme" + v1: + path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + variants: + scanorama_feature_full_unscaled: + scanorama_feature_full_scaled: + preferred_normalization: log_cp10k_scaled + arguments: + - name: --n_hvg + type: integer + default: 2000 + description: Number of highly variable genes to use. + resources: + - type: python_script + path: ../scanorama_embed/script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scanorama + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml b/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml new file mode 100644 index 0000000000..5615fd72cd --- /dev/null +++ b/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml @@ -0,0 +1,61 @@ +__merge__: ../../api/comp_method_embedding.yaml + +functionality: + name: scanvi + info: + label: scANVI + summary: "scANVI is a deep learning method that considers cell type labels." + description : | + scANVI (single-cell ANnotation using Variational Inference; Python class SCANVI) is a semi-supervised model for single-cell transcriptomics data. In a sense, it can be seen as a scVI extension that can leverage the cell type knowledge for a subset of the cells present in the data sets to infer the states of the rest of the cells. 
+ reference: "lopez2018deep" + repository_url: "https://github.com/scverse/scvi-tools" + documentation_url: "https://docs.scvi-tools.org/en/stable/user_guide/models/scanvi.html" + v1: + path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py + commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf + preferred_normalization: counts + variants: + scanvi_full_unscaled: + arguments: + - name: --n_hvg + type: integer + default: 2000 + description: Number of highly variable genes to use. + - name: --n_latent + type: integer + default: 30 + description: Number of latent dimensions. + - name: --n_hidden + type: integer + default: 128 + description: Number of hidden units. + - name: --n_layers + type: integer + default: 2 + description: Number of layers. + - name: --max_epochs_scvi + type: integer + example: 400 + description: Maximum number of training epochs for scVI. + - name: --max_epochs_scanvi + type: integer + example: 10 + description: Maximum number of training epochs for scANVI. 
+ resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scvi-tools>=1.1.0 + - type: docker + run: | + pip install -U "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu, gpu] diff --git a/src/tasks/batch_integration/methods/scanvi/script.py b/src/tasks/batch_integration/methods/scanvi/script.py new file mode 100644 index 0000000000..35d5b80f32 --- /dev/null +++ b/src/tasks/batch_integration/methods/scanvi/script.py @@ -0,0 +1,76 @@ +import sys +import anndata as ad +from scvi.model import SCVI, SCANVI + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/dataset.h5ad', + 'output': 'output.h5ad', + 'n_hvg': 2000, + 'n_latent': 30, + 'n_hidden': 128, + 'n_layers': 2, + 'max_epochs_scvi': 20, + 'max_epochs_scanvi': 20 +} +meta = { + 'functionality_name' : 'scanvi', +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/counts', + obs='obs', + var='var', + uns='uns' +) + +if par["n_hvg"]: + print(f"Select top {par['n_hvg']} high variable genes", flush=True) + idx = adata.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] + adata = adata[:, idx].copy() + +print("Processing data", flush=True) +SCVI.setup_anndata(adata, batch_key="batch") + +print("Run scVI", flush=True) +model_kwargs = { + key: par[key] + for key in ["n_latent", "n_hidden", "n_layers"] + if par[key] is not None +} + +vae = SCVI(adata, **model_kwargs) + +vae.train(max_epochs=par["max_epochs_scvi"], train_size=1.0) + +print('Run SCANVI', flush=True) +scanvae = SCANVI.from_scvi_model( + scvi_model=vae, + labels_key="label", + 
+ unlabeled_category="UnknownUnknown", # pick anything definitely not in a dataset +) +scanvae.train(max_epochs=par["max_epochs_scanvi"], train_size=1.0) + +print("Store outputs", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsm={ + "X_emb": scanvae.get_latent_representation(), + }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["functionality_name"], + }, +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/batch_integration/methods/scvi/config.vsh.yaml b/src/tasks/batch_integration/methods/scvi/config.vsh.yaml new file mode 100644 index 0000000000..45eb09d5cf --- /dev/null +++ b/src/tasks/batch_integration/methods/scvi/config.vsh.yaml @@ -0,0 +1,59 @@ +# use method api spec +__merge__: ../../api/comp_method_embedding.yaml +functionality: + name: scvi + info: + label: scVI + summary: "scVI combines a variational autoencoder with a hierarchical Bayesian model." + description: | + scVI combines a variational autoencoder with a hierarchical Bayesian model. It uses the negative binomial distribution to describe gene expression of each cell, conditioned on unobserved factors and the batch variable. ScVI is run as implemented in Luecken et al. + reference: "lopez2018deep" + repository_url: "https://github.com/scverse/scvi-tools" + documentation_url: "https://docs.scvi-tools.org/en/stable/user_guide/models/scvi.html" + v1: + path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: counts + variants: + scvi_full_unscaled: + # defaults are derived from the scvi tutorial: + # https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scrna/harmonization.html + arguments: + - name: --n_hvg + type: integer + default: 2000 + description: Number of highly variable genes to use. 
+ - name: --n_latent + type: integer + default: 30 + description: Number of latent dimensions. + - name: --n_hidden + type: integer + default: 128 + description: Number of hidden units. + - name: --n_layers + type: integer + default: 2 + description: Number of layers. + - name: --max_epochs + type: integer + example: 400 + description: Maximum number of epochs. + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scvi-tools>=1.1.0 + - type: docker + run: | + pip install -U "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + - type: nextflow + directives: + label: [midtime, midmem, lowcpu, gpu] diff --git a/src/tasks/batch_integration/methods/scvi/script.py b/src/tasks/batch_integration/methods/scvi/script.py new file mode 100644 index 0000000000..26490737a5 --- /dev/null +++ b/src/tasks/batch_integration/methods/scvi/script.py @@ -0,0 +1,66 @@ +import sys +import anndata as ad +from scvi.model import SCVI + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/dataset.h5ad', + 'output': 'output.h5ad', + 'n_hvg': 2000, + 'n_latent': 30, + 'n_hidden': 128, + 'n_layers': 2, + 'max_epochs': 400 +} +meta = { + 'functionality_name' : 'scvi', +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/counts', + obs='obs', + var='var', + uns='uns' +) + +if par["n_hvg"]: + print(f"Select top {par['n_hvg']} high variable genes", flush=True) + idx = adata.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] + adata = adata[:, idx].copy() + +print("Processing data", flush=True) +SCVI.setup_anndata(adata, batch_key="batch") + +print("Run scVI", flush=True) +model_kwargs = { + key: par[key] 
+ for key in ["n_latent", "n_hidden", "n_layers"] + if par[key] is not None +} + +vae = SCVI(adata, **model_kwargs) + +vae.train(max_epochs=par["max_epochs"], train_size=1.0) + +print("Store outputs", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsm={ + "X_emb": vae.get_latent_representation(), + }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["functionality_name"], + }, +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml b/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml new file mode 100644 index 0000000000..be6567271c --- /dev/null +++ b/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml @@ -0,0 +1,50 @@ +# use metric api spec +__merge__: ../../api/comp_metric_embedding.yaml +functionality: + name: asw_batch + info: + metrics: + - name: asw_batch + label: ASW batch + summary: Average silhouette of batches per cell identity label (cell type) + description: | + We consider the absolute silhouette width, s(i), on + batch labels per cell i. Here, 0 indicates that batches are well mixed, and any + deviation from 0 indicates a batch effect: + $s_{\text{batch}}(i) = |s(i)|$. + + To ensure higher scores indicate better batch mixing, these scores are scaled by + subtracting them from 1. As we expect batches to integrate within cell identity + clusters, we compute the $\text{batchASW}_j$ score for each cell label j separately, + using the equation: + $\text{batchASW}_j = \frac{1}{|C_j|} \sum_{i \in C_j} \left(1 - s_{\text{batch}}(i)\right)$, + + where $C_j$ is the set of cells with the cell label j and $|C_j|$ denotes the number of cells + in that set. + + To obtain the final batchASW score, the label-specific $\text{batchASW}_j$ scores are averaged: + $\text{batchASW} = \frac{1}{|M|} \sum_{j \in M} \text{batchASW}_j$. + + Here, M is the set of unique cell labels. 
+ reference: luecken2022benchmarking + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scib==1.1.5 + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/asw_batch/script.py b/src/tasks/batch_integration/metrics/asw_batch/script.py new file mode 100644 index 0000000000..35b110b895 --- /dev/null +++ b/src/tasks/batch_integration/metrics/asw_batch/script.py @@ -0,0 +1,44 @@ +import sys +import anndata as ad +from scib.metrics import silhouette_batch + +## VIASH START +par = { + 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', + 'output': 'output.h5ad', +} +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns + +print('compute score', flush=True) +score = silhouette_batch( + adata, + batch_key='batch', + label_key='label', + embed='X_emb', +) + +print('Create output AnnData object', flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], + 'metric_ids': [ meta['functionality_name'] ], + 'metric_values': [ score ] + } +) + +print('Write data to file', flush=True) +output.write_h5ad(par['output'], compression='gzip') diff --git 
a/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml b/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml new file mode 100644 index 0000000000..068381b9e3 --- /dev/null +++ b/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml @@ -0,0 +1,38 @@ +# use metric api spec +__merge__: ../../api/comp_metric_embedding.yaml +functionality: + name: asw_label + info: + metrics: + - name: asw_label + label: ASW Label + summary: Average silhouette of cell identity labels (cell types) + description: | + For the bio-conservation score, the ASW was computed on cell identity labels and + scaled to a value between 0 and 1 using the equation: + celltypeASW=(ASW_C+1)/2, + + where C denotes the set of all cell identity labels. + For information about the batch silhouette score, check sil_batch. + reference: luecken2022benchmarking + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scib==1.1.5 + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/asw_label/script.py b/src/tasks/batch_integration/metrics/asw_label/script.py new file mode 100644 index 0000000000..01a7a2ad41 --- /dev/null +++ b/src/tasks/batch_integration/metrics/asw_label/script.py @@ -0,0 +1,44 @@ +import sys +import anndata as ad +from scib.metrics import silhouette + +## VIASH START +par = { + 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import 
read_anndata + + +print('Read input', flush=True) +adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns + +print('compute score', flush=True) +score = silhouette( + adata, + label_key='label', + embed='X_emb' +) + +print("Create output AnnData object", flush=True) +output = ad.AnnData( + uns={ + "dataset_id": adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + "method_id": adata.uns['method_id'], + "metric_ids": [meta['functionality_name']], + "metric_values": [score] + } +) + +print("Write data to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml b/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml new file mode 100644 index 0000000000..3852029a60 --- /dev/null +++ b/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml @@ -0,0 +1,47 @@ +# use metric api spec +__merge__: ../../api/comp_metric_embedding.yaml +functionality: + name: cell_cycle_conservation + info: + metrics: + - name: cell_cycle_conservation + label: Cell Cycle Conservation + summary: Cell cycle conservation score based on principal component regression on cell cycle gene scores + description: | + The cell-cycle conservation score evaluates how well the cell-cycle effect can be + captured before and after integration. We computed cell-cycle scores using Scanpy’s + score_genes_cell_cycle function with a reference gene set from Tirosh et al. for the + respective cell-cycle phases. We used the same set of cell-cycle genes for mouse and + human data (using capitalization to convert between the gene symbols). 
We then computed + the variance contribution of the resulting S and G2/M phase scores using principal + component regression (PCR), which was performed for each + batch separately. The differences in variance before, $\text{Var}_{\text{before}}$, and after, $\text{Var}_{\text{after}}$, + integration were aggregated into a final score between 0 and 1, using the equation: + $\text{CC conservation} = 1 - \frac{|\text{Var}_{\text{after}} - \text{Var}_{\text{before}}|}{\text{Var}_{\text{before}}}$. + + In this equation, values close to 0 indicate lower conservation and 1 indicates complete + conservation of the variance explained by cell cycle. In other words, the variance + remains unchanged within each batch for complete conservation, while any deviation from + the preintegration variance contribution reduces the score. + reference: luecken2022benchmarking + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scib==1.1.5 + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/cell_cycle_conservation/script.py b/src/tasks/batch_integration/metrics/cell_cycle_conservation/script.py new file mode 100644 index 0000000000..fa432a21c6 --- /dev/null +++ b/src/tasks/batch_integration/metrics/cell_cycle_conservation/script.py @@ -0,0 +1,69 @@ +import sys +import anndata as ad +from scib.metrics import cell_cycle +import numpy as np + +## VIASH START +par = { + 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', + 'output': 'output.h5ad' +} + +meta = { + 'functionality_name': 'foo' +} +## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + 
+print('Read input', flush=True) +adata_solution = read_anndata( + par['input_solution'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) +adata_integrated = read_anndata( + par['input_integrated'], + obs='obs', + obsm='obsm', + uns='uns' +) + +print('Use gene symbols for features', flush=True) +adata_solution.var_names = adata_solution.var['feature_name'] + +translator = { + "homo_sapiens": "human", + "mus_musculus": "mouse", +} + +print('Compute score', flush=True) +if adata_solution.uns['dataset_organism'] not in translator: + score = np.nan +else: + organism = translator[adata_solution.uns['dataset_organism']] + score = cell_cycle( + adata_solution, + adata_integrated, + batch_key='batch', + embed='X_emb', + organism=organism, + ) + +print('Create output AnnData object', flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': adata_solution.uns['dataset_id'], + 'normalization_id': adata_solution.uns['normalization_id'], + 'method_id': adata_integrated.uns['method_id'], + 'metric_ids': [ meta['functionality_name'] ], + 'metric_values': [ score ] + } +) + + +print('Write data to file', flush=True) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml b/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml new file mode 100644 index 0000000000..8d92033e40 --- /dev/null +++ b/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml @@ -0,0 +1,61 @@ +# use metric api spec +__merge__: ../../api/comp_metric_graph.yaml +functionality: + name: clustering_overlap + info: + metrics: + - name: ari + label: ARI + summary: Adjusted Rand Index compares clustering overlap, correcting for random labels and considering correct overlaps and disagreements. 
+ description: | + The Adjusted Rand Index (ARI) compares the overlap of two clusterings; + it considers both correct clustering overlaps while also counting correct + disagreements between two clusterings. + We compared the cell-type labels with the NMI-optimized + Louvain clustering computed on the integrated dataset. + The adjustment of the Rand index corrects for randomly correct labels. + An ARI of 0 or 1 corresponds to random labeling or a perfect match, + respectively. + reference: + - hubert1985comparing + - luecken2022benchmarking + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + - name: nmi + label: NMI + summary: "NMI compares overlap by scaling using mean entropy terms and optimizing Louvain clustering to obtain the best match between clusters and labels." + description: | + Normalized Mutual Information (NMI) compares the overlap of two clusterings. + We used NMI to compare the cell-type labels with Louvain clusters computed on + the integrated dataset. The overlap was scaled using the mean of the entropy terms + for cell-type and cluster labels. Thus, NMI scores of 0 or 1 correspond to uncorrelated + clustering or a perfect match, respectively. We performed optimized Louvain clustering + for this metric to obtain the best match between clusters and labels. 
+ reference: + - amelio2015normalized + - luecken2022benchmarking + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scib==1.1.5 + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/clustering_overlap/script.py b/src/tasks/batch_integration/metrics/clustering_overlap/script.py new file mode 100644 index 0000000000..7bb9e533c8 --- /dev/null +++ b/src/tasks/batch_integration/metrics/clustering_overlap/script.py @@ -0,0 +1,53 @@ +import sys +import anndata as ad +import scanpy as sc +from scib.metrics.clustering import cluster_optimal_resolution +from scib.metrics import ari, nmi + +## VIASH START +par = { + 'adata_integrated': 'resources_test/batch_integration/pancreas/integrated_graph.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality_name': 'foo' +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns + +print('Run optimal Leiden clustering', flush=True) +cluster_optimal_resolution( + adata=adata, + label_key='label', + cluster_key='cluster', + cluster_function=sc.tl.leiden, +) + +print('Compute ARI score', flush=True) +ari_score = ari(adata, cluster_key='cluster', label_key='label') + +print('Compute NMI score', flush=True) +nmi_score = nmi(adata, cluster_key='cluster', label_key='label') + +print("Create output AnnData object", 
flush=True) +output = ad.AnnData( + uns={ + "dataset_id": adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + "method_id": adata.uns['method_id'], + "metric_ids": [ "ari", "nmi" ], + "metric_values": [ ari_score, nmi_score ] + } +) + +print("Write data to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/batch_integration/metrics/graph_connectivity/config.vsh.yaml b/src/tasks/batch_integration/metrics/graph_connectivity/config.vsh.yaml new file mode 100644 index 0000000000..6384feca62 --- /dev/null +++ b/src/tasks/batch_integration/metrics/graph_connectivity/config.vsh.yaml @@ -0,0 +1,47 @@ +# use metric api spec +__merge__: ../../api/comp_metric_graph.yaml +functionality: + name: graph_connectivity + info: + metrics: + - name: graph_connectivity + label: Graph Connectivity + summary: Connectivity of the subgraph per cell type label + description: | + The graph connectivity metric assesses whether the kNN graph representation, + G, of the integrated data directly connects all cells with the same cell + identity label. For each cell identity label c, we created the subset kNN + graph G(Nc;Ec) to contain only cells from a given label. Using these subset + kNN graphs, we computed the graph connectivity score using the equation: + + gc =1/|C| Σc∈C |LCC(G(Nc;Ec))|/|Nc|. + + Here, C represents the set of cell identity labels, |LCC()| is the number + of nodes in the largest connected component of the graph, and |Nc| is the + number of nodes with cell identity c. The resultant score has a range + of (0;1], where 1 indicates that all cells with the same cell identity + are connected in the integrated kNN graph, and the lowest possible score + indicates a graph where no cell is connected. As this score is computed + on the kNN graph, it can be used to evaluate all integration outputs. 
+ reference: luecken2022benchmarking + min: 0 + max: 1 + maximize: true + v1: + path: https://github.com/openproblems-bio/openproblems/blob/main/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scib==1.1.5 + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/graph_connectivity/script.py b/src/tasks/batch_integration/metrics/graph_connectivity/script.py new file mode 100644 index 0000000000..ead8f146bc --- /dev/null +++ b/src/tasks/batch_integration/metrics/graph_connectivity/script.py @@ -0,0 +1,42 @@ +import sys +import anndata as ad +import scib + +## VIASH START +par = { + 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', + 'output': 'output.h5ad', +} +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns + +print('compute score', flush=True) +score = scib.metrics.graph_connectivity( + adata, + label_key='label' +) + +print('Create output AnnData object', flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], + 'metric_ids': [ meta['functionality_name'] ], + 'metric_values': [ score ] + } +) + +print('Write data to file', flush=True) +output.write_h5ad(par['output'], 
compression='gzip') diff --git a/src/tasks/batch_integration/metrics/hvg_overlap/config.vsh.yaml b/src/tasks/batch_integration/metrics/hvg_overlap/config.vsh.yaml new file mode 100644 index 0000000000..a8025783d6 --- /dev/null +++ b/src/tasks/batch_integration/metrics/hvg_overlap/config.vsh.yaml @@ -0,0 +1,46 @@ +# use metric api spec +__merge__: ../../api/comp_metric_feature.yaml +functionality: + name: hvg_overlap + info: + metrics: + - name: hvg_overlap + label: HVG overlap + summary: Overlap of highly variable genes per batch before and after integration. + description: | + The HVG conservation score is a proxy for the preservation of + the biological signal. If the data integration method returned + a corrected data matrix, we computed the number of HVGs before + and after correction for each batch via Scanpy’s + highly_variable_genes function (using the ‘cell ranger’ flavor). + If available, we computed 500 HVGs per batch. If fewer than 500 + genes were present in the integrated object for a batch, + the number of HVGs was set to half the total genes in that batch. + The overlap coefficient is as follows: + overlap(𝑋,𝑌)=|𝑋∩𝑌|/min(|𝑋|,|𝑌|), + + where X and Y denote the fraction of preserved informative genes. + The overall HVG score is the mean of the per-batch HVG overlap + coefficients. 
+ reference: luecken2022benchmarking + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scib==1.1.5 + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/hvg_overlap/script.py b/src/tasks/batch_integration/metrics/hvg_overlap/script.py new file mode 100644 index 0000000000..b7d177e991 --- /dev/null +++ b/src/tasks/batch_integration/metrics/hvg_overlap/script.py @@ -0,0 +1,55 @@ +import sys +import anndata as ad +from scib.metrics import hvg_overlap + +## VIASH START +par = { + 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata_solution = read_anndata( + par['input_solution'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) +adata_integrated = read_anndata( + par['input_integrated'], + X='layers/corrected_counts', + obs='obs', + var='var', + uns='uns' +) + +print('compute score', flush=True) +score = hvg_overlap( + adata_solution, + adata_integrated, + batch_key="batch" +) + +print("Create output AnnData object", flush=True) +output = ad.AnnData( + uns={ + "dataset_id": adata_solution.uns['dataset_id'], + 'normalization_id': adata_solution.uns['normalization_id'], + "method_id": adata_integrated.uns['method_id'], + "metric_ids": [meta['functionality_name']], + "metric_values": [score] + } +) + +print("Write data to file", flush=True) 
+output.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/batch_integration/metrics/isolated_label_asw/config.vsh.yaml b/src/tasks/batch_integration/metrics/isolated_label_asw/config.vsh.yaml new file mode 100644 index 0000000000..65e1970c4f --- /dev/null +++ b/src/tasks/batch_integration/metrics/isolated_label_asw/config.vsh.yaml @@ -0,0 +1,40 @@ +# use metric api spec +__merge__: ../../api/comp_metric_embedding.yaml +functionality: + name: isolated_label_asw + info: + metrics: + - name: isolated_label_asw + label: Isolated label ASW + summary: Evaluate how well isolated labels separate by average silhouette width + description: | + Isolated cell labels are defined as the labels present in the least number + of batches in the integration task. The score evaluates how well these isolated labels + separate from other cell identities. + + The isolated label ASW score is obtained by computing the + ASW of isolated versus non-isolated labels on the PCA embedding (ASW metric above) and + scaling this score to be between 0 and 1. The final score for each metric version + consists of the mean isolated score of all isolated labels. 
+ reference: luecken2022benchmarking + v1: + path: openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_sil.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + min: 0 + max: 1 + maximize: true + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scib==1.1.5 + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/isolated_label_asw/script.py b/src/tasks/batch_integration/metrics/isolated_label_asw/script.py new file mode 100644 index 0000000000..094937e687 --- /dev/null +++ b/src/tasks/batch_integration/metrics/isolated_label_asw/script.py @@ -0,0 +1,49 @@ +import sys +import anndata as ad +from scib.metrics import isolated_labels_asw + +## VIASH START +par = { + 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns + +print('compute score', flush=True) + +score = isolated_labels_asw( + adata, + label_key='label', + batch_key='batch', + embed='X_emb', + iso_threshold=None, + verbose=True, +) +print(score, flush=True) + +print('Create output AnnData object', flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], + 'metric_ids': [ meta['functionality_name'] ], + 'metric_values': [ score ] + } +) + +print('Write 
data to file', flush=True) +output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/tasks/batch_integration/metrics/isolated_label_f1/config.vsh.yaml b/src/tasks/batch_integration/metrics/isolated_label_f1/config.vsh.yaml new file mode 100644 index 0000000000..6b8f0703bf --- /dev/null +++ b/src/tasks/batch_integration/metrics/isolated_label_f1/config.vsh.yaml @@ -0,0 +1,52 @@ +# use metric api spec +__merge__: ../../api/comp_metric_graph.yaml +functionality: + name: isolated_label_f1 + info: + metrics: + - name: isolated_label_f1 + label: Isolated label F1 score + summary: Evaluate how well isolated labels coincide with clusters + description: | + We developed two isolated label scores to evaluate how well the data integration methods + dealt with cell identity labels shared by few batches. Specifically, we identified + isolated cell labels as the labels present in the fewest batches in the + integration task. + The score evaluates how well these isolated labels separate from other cell identities. + We implemented the isolated label metric in two versions: + (1) the best clustering of the isolated label (F1 score) and + (2) the global ASW of the isolated label. For the cluster-based score, + we first optimize the cluster assignment of the isolated label using the F1 score + across Louvain clustering resolutions ranging from 0.1 to 2 in resolution steps of 0.1. + The optimal F1 score for the isolated label is then used as the metric score. + The F1 score is a weighted mean of precision and recall given by the equation: + $F_1 = 2 \times \frac{\text{precision} \times \text{recall}}{\text{precision} + \text{recall}}$. + + It returns a value between 0 and 1, + where 1 shows that all of the isolated label cells and no others are captured in + the cluster. For the isolated label ASW score, we compute the ASW of isolated + versus nonisolated labels on the PCA embedding (ASW metric above) and scale this + score to be between 0 and 1. 
The final score for each metric version consists of + the mean isolated score of all isolated labels. + reference: luecken2022benchmarking + v1: + path: openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + min: 0 + max: 1 + maximize: true + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scib==1.1.5 + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/isolated_label_f1/script.py b/src/tasks/batch_integration/metrics/isolated_label_f1/script.py new file mode 100644 index 0000000000..30fe25bccf --- /dev/null +++ b/src/tasks/batch_integration/metrics/isolated_label_f1/script.py @@ -0,0 +1,48 @@ +import sys +import anndata as ad +from scib.metrics import isolated_labels_f1 + +## VIASH START +par = { + 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns + +print('compute score', flush=True) +score = isolated_labels_f1( + adata, + label_key='label', + batch_key='batch', + embed=None, + iso_threshold=None, + verbose=True, +) +print(score, flush=True) + +print('Create output AnnData object', flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], + 
'metric_ids': [ meta['functionality_name'] ], + 'metric_values': [ score ] + } +) + +print('Write data to file', flush=True) +output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/tasks/batch_integration/metrics/kbet/config.vsh.yaml b/src/tasks/batch_integration/metrics/kbet/config.vsh.yaml new file mode 100644 index 0000000000..aca556a8fc --- /dev/null +++ b/src/tasks/batch_integration/metrics/kbet/config.vsh.yaml @@ -0,0 +1,57 @@ +# use metric api spec +__merge__: ../../api/comp_metric_embedding.yaml +functionality: + name: kbet + info: + metrics: + - name: kbet + label: kBET + summary: kBET algorithm to determine how well batches are mixed within a cell type + description: | + The kBET algorithm (v.0.99.6, release 4c9dafa) determines whether the label composition + of a k nearest neighborhood of a cell is similar to the expected (global) label + composition (Buettner et al., Nat Meth 2019). The test is repeated for a random subset + of cells, and the results are summarized as a rejection rate over all tested + neighborhoods. Thus, kBET works on a kNN graph. + + We compute kNN graphs where k = 50 for joint embeddings and corrected feature outputs + via Scanpy preprocessing steps. To test for technical effects and to account for + cell-type frequency shifts across datasets, we applied kBET + separately on the batch variable for each cell identity label. Using the kBET defaults, + a k equal to the median of the number of cells per batch within each label is used for + this computation. Additionally, we set the minimum and maximum thresholds of k to 10 and + 100, respectively. As kNN graphs that have been subset by cell identity labels may no + longer be connected, we compute kBET per connected component. If >25% of cells were + assigned to connected components too small for kBET computation (smaller than k × 3), + we assigned a kBET score of 1 to denote poor batch removal. 
Subsequently, kBET scores + for each label were averaged and subtracted from 1 to give a final kBET score. + + In Open Problems we do not run kBET on graph outputs to avoid computation-intensive + diffusion processes being run. + reference: luecken2022benchmarking + v1: + path: openproblems/tasks/_batch_integration/batch_integration_embed/metrics/kBET.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + min: 0 + max: 1 + maximize: true + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + github: theislab/kBET + - type: python + pypi: + - scib==1.1.5 + - rpy2>=3 + - anndata2ri + - scipy<=1.13 + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/kbet/script.py b/src/tasks/batch_integration/metrics/kbet/script.py new file mode 100644 index 0000000000..9834f525d5 --- /dev/null +++ b/src/tasks/batch_integration/metrics/kbet/script.py @@ -0,0 +1,49 @@ +import sys +import anndata as ad +from scib.metrics import kBET + +## VIASH START +par = { + 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns + +print('compute score', flush=True) +score = kBET( + adata, + batch_key="batch", + label_key="label", + type_="embed", + embed="X_emb", + scaled=True, + verbose=False, +) +print(score, flush=True) + +print('Create output AnnData object', flush=True) +output = ad.AnnData( + uns={ + 
'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], + 'metric_ids': [ meta['functionality_name'] ], + 'metric_values': [ score ] + } +) + +print('Write data to file', flush=True) +output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/tasks/batch_integration/metrics/lisi/config.vsh.yaml b/src/tasks/batch_integration/metrics/lisi/config.vsh.yaml new file mode 100644 index 0000000000..750574f84a --- /dev/null +++ b/src/tasks/batch_integration/metrics/lisi/config.vsh.yaml @@ -0,0 +1,56 @@ +# use metric api spec +__merge__: ../../api/comp_metric_graph.yaml +functionality: + status: disabled + name: lisi + info: + metrics: + - name: ilisi + label: iLISI + summary: Local inverse Simpson's Index + description: | + Local Inverse Simpson's Index metrics adapted from Korsunsky et al. 2019 to run on + all full feature, embedding and kNN integration outputs via shortest path-based + distance computation on single-cell kNN graphs. The metric assesses whether clusters + of cells in a single-cell RNA-seq dataset are well-mixed across a categorical batch + variable. + + The original LISI score ranges from 0 to the number of categories, with the latter + indicating good cell mixing. This is rescaled to a score between 0 and 1. + reference: luecken2022benchmarking + min: 0 + max: 1 + maximize: true + repository_url: https://github.com/theislab/scib/blob/ed3e2846414ca1e3dc07552c0eef1e68d82230d4/scib/metrics/lisi.py + documentation_url: https://scib.readthedocs.io/en/latest/api/scib.metrics.ilisi_graph.html + - name: clisi + label: cLISI + summary: Local inverse Simpson's Index + description: | + Local Inverse Simpson's Index metrics adapted from Korsunsky et al. 2019 to run on + all full feature, embedding and kNN integration outputs via shortest path-based + distance computation on single-cell kNN graphs. 
The metric assesses whether clusters + of cells in a single-cell RNA-seq dataset are well-mixed across a categorical cell type variable. + + The original LISI score ranges from 0 to the number of categories, with the latter indicating good cell mixing. This is rescaled to a score between 0 and 1. + reference: luecken2022benchmarking + min: 0 + max: 1 + maximize: true + repository_url: https://github.com/theislab/scib/blob/ed3e2846414ca1e3dc07552c0eef1e68d82230d4/scib/metrics/lisi.py + documentation_url: https://scib.readthedocs.io/en/latest/api/scib.metrics.clisi_graph.html + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - git+https://github.com/theislab/scib.git@v1.1.5 + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/lisi/script.py b/src/tasks/batch_integration/metrics/lisi/script.py new file mode 100644 index 0000000000..44181dab71 --- /dev/null +++ b/src/tasks/batch_integration/metrics/lisi/script.py @@ -0,0 +1,64 @@ +import sys +import numpy as np +import anndata as ad +from scib.metrics.lisi import lisi_graph_py + +## VIASH START +par = { + 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', + 'output': 'output.h5ad', +} +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns + +print('compute iLISI score...', flush=True) +ilisi_scores = lisi_graph_py( + adata=adata, + obs_key='batch', + n_neighbors=90, + perplexity=None, + 
subsample=None, + n_cores=1, + verbose=False, +) +ilisi = np.nanmedian(ilisi_scores) +ilisi = (ilisi - 1) / (adata.obs['batch'].nunique() - 1) + +print('compute cLISI scores...', flush=True) +clisi_scores = lisi_graph_py( + adata=adata, + obs_key='label', + n_neighbors=90, + perplexity=None, + subsample=None, + n_cores=1, + verbose=False, +) +clisi = np.nanmedian(clisi_scores) +nlabs = adata.obs['label'].nunique() +clisi = (nlabs - clisi) / (nlabs - 1) + +print('Create output AnnData object', flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], + 'metric_ids': [ 'ilisi', 'clisi' ], + 'metric_values': [ ilisi, clisi ] + } +) + +print('Write data to file', flush=True) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml b/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml new file mode 100644 index 0000000000..d3391fb528 --- /dev/null +++ b/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml @@ -0,0 +1,44 @@ +# use metric api spec +__merge__: ../../api/comp_metric_embedding.yaml +functionality: + name: pcr + info: + metrics: + - name: pcr + label: PCR + summary: Compare explained variance by batch before and after integration + description: | + Principal component regression, derived from PCA, has previously been used to quantify + batch removal. Briefly, the R2 was calculated from a linear regression of the + covariate of interest (for example, the batch variable B) onto each principal component. + The variance contribution of the batch effect per principal component was then + calculated as the product of the variance explained by the ith principal component (PC) + and the corresponding R2(PCi|B). 
The sum across all variance contributions by the batch + effects in all principal components gives the total variance explained by the batch + variable as follows: + $\mathrm{Var}(C \mid B) = \sum_{i=1}^{G} \mathrm{Var}(C \mid \mathrm{PC}_i) \times R^2(\mathrm{PC}_i \mid B)$, + + where Var(C|PCi) is the variance of the data matrix C explained by the ith principal + component. + reference: luecken2022benchmarking + v1: + path: openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + min: 0 + max: 1 + maximize: true + resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scib==1.1.5 + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/pcr/script.py b/src/tasks/batch_integration/metrics/pcr/script.py new file mode 100644 index 0000000000..512b3dff6b --- /dev/null +++ b/src/tasks/batch_integration/metrics/pcr/script.py @@ -0,0 +1,59 @@ +import sys +import anndata as ad +from scib.metrics import pcr_comparison + +## VIASH START +par = { + 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', + 'output': 'output.h5ad', +} + +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata_solution = read_anndata( + par['input_solution'], + X='layers/normalized', + obs='obs', + var='var', + # obsm='obsm', + # varm='varm', + uns='uns' +) +adata_integrated = read_anndata( + par['input_integrated'], + obs='obs', + obsm='obsm', + uns='uns' +) + +print('compute score', flush=True) +score = pcr_comparison( + adata_solution, + adata_integrated, + embed='X_emb', + covariate='batch', + verbose=False +) + +print('Create output AnnData object', flush=True) +output =
ad.AnnData( + uns={ + 'dataset_id': adata_solution.uns['dataset_id'], + 'normalization_id': adata_solution.uns['normalization_id'], + 'method_id': adata_integrated.uns['method_id'], + 'metric_ids': [ meta['functionality_name'] ], + 'metric_values': [ score ] + } +) + + +print('Write data to file', flush=True) +output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/tasks/batch_integration/process_dataset/config.vsh.yaml b/src/tasks/batch_integration/process_dataset/config.vsh.yaml new file mode 100644 index 0000000000..73ea5815c3 --- /dev/null +++ b/src/tasks/batch_integration/process_dataset/config.vsh.yaml @@ -0,0 +1,18 @@ +__merge__: ../api/comp_process_dataset.yaml +functionality: + name: process_dataset + description: Preprocess adata object for data integration + resources: + - type: python_script + path: script.py + - path: /src/common/helper_functions/subset_anndata.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scib==1.1.5 + - type: nextflow + directives: + label: [highmem, midcpu , midtime] diff --git a/src/tasks/batch_integration/process_dataset/script.py b/src/tasks/batch_integration/process_dataset/script.py new file mode 100644 index 0000000000..cf8af4c4b7 --- /dev/null +++ b/src/tasks/batch_integration/process_dataset/script.py @@ -0,0 +1,66 @@ +import sys +import anndata as ad + +## VIASH START +par = { + 'input': 'resources_test/common/pancreas/dataset.h5ad', + 'hvgs': 2000, + 'obs_label': 'cell_type', + 'obs_batch': 'batch', + 'subset_hvg': False, + 'output': 'output.h5ad' +} +meta = { + "config": "target/nextflow/batch_integration/process_dataset/.config.vsh.yaml", + "resources_dir": "src/common/helper_functions" +} +## VIASH END + +# import helper functions +sys.path.append(meta['resources_dir']) +from subset_anndata import read_config_slots_info, subset_anndata + +print('Read input', flush=True) +input = ad.read_h5ad(par['input']) + +def 
compute_batched_hvg(adata, n_hvgs): + adata = adata.copy() + adata.X = adata.layers['normalized'].copy() + if n_hvgs > adata.n_vars or n_hvgs <= 0: + hvg_list = adata.var_names.tolist() + else: + import scib + hvg_list = scib.pp.hvg_batch( + adata, + batch_key='batch', + target_genes=n_hvgs, + adataOut=False + ) + adata.var['hvg'] = adata.var_names.isin(hvg_list) + del adata.X + return adata + +print(f'Select {par["hvgs"]} highly variable genes', flush=True) +adata_with_hvg = compute_batched_hvg(input, n_hvgs=par['hvgs']) + +if par['subset_hvg']: + print('Subsetting to HVG dimensions', flush=True) + adata_with_hvg = adata_with_hvg[:, adata_with_hvg.var['hvg']].copy() + +print(">> Figuring out which data needs to be copied to which output file", flush=True) +# use par arguments to look for label and batch value in different slots +slot_mapping = { + "obs": { + "label": par["obs_label"], + "batch": par["obs_batch"], + } +} +slot_info = read_config_slots_info(meta["config"], slot_mapping) + +print(">> Create output object", flush=True) +output_dataset = subset_anndata(adata_with_hvg, slot_info["output_dataset"]) +output_solution = subset_anndata(adata_with_hvg, slot_info["output_solution"]) + +print('Writing adatas to file', flush=True) +output_dataset.write(par['output_dataset'], compression='gzip') +output_solution.write(par['output_solution'], compression='gzip') diff --git a/src/tasks/batch_integration/resources_scripts/process_datasets.sh b/src/tasks/batch_integration/resources_scripts/process_datasets.sh new file mode 100755 index 0000000000..97e6b2c61c --- /dev/null +++ b/src/tasks/batch_integration/resources_scripts/process_datasets.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +cat > /tmp/params.yaml << 'HERE' +input_states: s3://openproblems-data/resources/datasets/**/state.yaml +rename_keys: 'input:output_dataset' +settings: '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad"}' +output_state: "$id/state.yaml" +publish_dir: 
s3://openproblems-data/resources/batch_integration/datasets +HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' + withName:'.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } + withLabel:highmem { + memory = '350GB' + } +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/batch_integration/workflows/process_datasets/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config /tmp/nextflow.config \ + --labels batch_integration,process_datasets \ No newline at end of file diff --git a/src/tasks/batch_integration/resources_scripts/run_benchmark.sh b/src/tasks/batch_integration/resources_scripts/run_benchmark.sh new file mode 100755 index 0000000000..f48a5ccdd1 --- /dev/null +++ b/src/tasks/batch_integration/resources_scripts/run_benchmark.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="s3://openproblems-data/resources/batch_integration/results/${RUN_ID}" + +cat > /tmp/params.yaml << HERE +input_states: s3://openproblems-data/resources/batch_integration/datasets/**/state.yaml +rename_keys: 'input_dataset:output_dataset,input_solution:output_solution' +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/batch_integration/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config src/wf_utils/labels_tw.config \ + --labels batch_integration,full \ No newline at end of file diff --git a/src/tasks/batch_integration/resources_scripts/run_benchmark_test.sh b/src/tasks/batch_integration/resources_scripts/run_benchmark_test.sh new file mode 
100755 index 0000000000..b9b80a38ea --- /dev/null +++ b/src/tasks/batch_integration/resources_scripts/run_benchmark_test.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cat > /tmp/params.yaml << 'HERE' +input_states: s3://openproblems-data/resources_test/batch_integration/**/state.yaml +rename_keys: 'input_dataset:output_dataset,input_solution:output_solution' +output_state: "state.yaml" +publish_dir: s3://openproblems-nextflow/temp/batch_integration/ +HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/batch_integration/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config /tmp/nextflow.config \ + --labels batch_integration,test \ No newline at end of file diff --git a/src/tasks/batch_integration/resources_test_scripts/process.sh b/src/tasks/batch_integration/resources_test_scripts/process.sh new file mode 100755 index 0000000000..3ab0dd2a4d --- /dev/null +++ b/src/tasks/batch_integration/resources_test_scripts/process.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +RAW_DATA=resources_test/common +DATASET_DIR=resources_test/batch_integration + +mkdir -p $DATASET_DIR + +# process dataset +echo Running process_dataset +nextflow run . 
\ + -main-script target/nextflow/batch_integration/workflows/process_datasets/main.nf \ + -profile docker \ + -entry auto \ + --input_states "$RAW_DATA/**/state.yaml" \ + --rename_keys 'input:output_dataset' \ + --settings '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad"}' \ + --publish_dir "$DATASET_DIR" \ + --output_state '$id/state.yaml' +# output_state should be moved to settings once workaround is solved + +for id in pancreas cxg_mouse_pancreas_atlas; do + if [ ! -f $DATASET_DIR/$id/dataset.h5ad ]; then + echo "Dataset $id not found" + exit 1 + fi + + echo Running BBKNN on $id + viash run src/tasks/batch_integration/methods/bbknn/config.vsh.yaml -- \ + --input $DATASET_DIR/$id/dataset.h5ad \ + --output $DATASET_DIR/$id/integrated_graph.h5ad + + echo Running SCVI on $id + viash run src/tasks/batch_integration/methods/scvi/config.vsh.yaml -- \ + --input $DATASET_DIR/$id/dataset.h5ad \ + --output $DATASET_DIR/$id/integrated_embedding.h5ad + + echo Running combat on $id + viash run src/tasks/batch_integration/methods/combat/config.vsh.yaml -- \ + --input $DATASET_DIR/$id/dataset.h5ad \ + --output $DATASET_DIR/$id/integrated_feature.h5ad +done \ No newline at end of file diff --git a/src/tasks/batch_integration/transformers/embed_to_graph/config.vsh.yaml b/src/tasks/batch_integration/transformers/embed_to_graph/config.vsh.yaml new file mode 100644 index 0000000000..e841081a91 --- /dev/null +++ b/src/tasks/batch_integration/transformers/embed_to_graph/config.vsh.yaml @@ -0,0 +1,19 @@ +__merge__: ../../api/comp_transformer_embedding_to_graph.yaml +functionality: + name: embed_to_graph + info: + label: Embedding to Graph + summary: Transform an embedding to a graph output. + description: | + Transform an embedding to a graph output by applying the k nearest neighbors algorithm. 
+ resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/transformers/embed_to_graph/script.py b/src/tasks/batch_integration/transformers/embed_to_graph/script.py new file mode 100644 index 0000000000..74166eb77c --- /dev/null +++ b/src/tasks/batch_integration/transformers/embed_to_graph/script.py @@ -0,0 +1,33 @@ +import sys +import scanpy as sc + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', + 'output': 'output.h5ad' # key must match the par['output'] lookup in the final write +} + +meta = { + 'functionality': 'foo', + 'config': 'bar' +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + obs='obs', + obsm='obsm', + uns='uns' +) + + +print('Run kNN...', flush=True) +sc.pp.neighbors(adata, use_rep='X_emb') + +print("Store outputs", flush=True) +adata.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/tasks/batch_integration/transformers/feature_to_embed/config.vsh.yaml b/src/tasks/batch_integration/transformers/feature_to_embed/config.vsh.yaml new file mode 100644 index 0000000000..e08013c63b --- /dev/null +++ b/src/tasks/batch_integration/transformers/feature_to_embed/config.vsh.yaml @@ -0,0 +1,20 @@ +__merge__: ../../api/comp_transformer_feature_to_embedding.yaml +functionality: + name: feature_to_embed + info: + type: transformer + label: Feature to Embedding + summary: Transform a feature output to an embedding. + description: | + Transform a feature output to an embedding by computing a PCA on the corrected counts.
+ resources: + - type: python_script + path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/transformers/feature_to_embed/script.py b/src/tasks/batch_integration/transformers/feature_to_embed/script.py new file mode 100644 index 0000000000..0e022db8b1 --- /dev/null +++ b/src/tasks/batch_integration/transformers/feature_to_embed/script.py @@ -0,0 +1,41 @@ +import sys +import scanpy as sc + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/integrated_feature.h5ad', + 'output': 'output.h5ad' # key must match the par['output'] lookup used when the result is written +} + +meta = { + 'functionality': 'foo', + 'config': 'bar' +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/corrected_counts', + obs='obs', + var='var', + uns='uns' +) + + +print('Run PCA', flush=True) +adata.obsm['X_emb'] = sc.pp.pca( + adata.X, + n_comps=50, + use_highly_variable=False, # Do we want to set this to True?
+ svd_solver='arpack', + return_info=False +) + +print('Store outputs', flush=True) +adata.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/tasks/batch_integration/workflows/process_datasets/config.vsh.yaml b/src/tasks/batch_integration/workflows/process_datasets/config.vsh.yaml new file mode 100644 index 0000000000..3273e84165 --- /dev/null +++ b/src/tasks/batch_integration/workflows/process_datasets/config.vsh.yaml @@ -0,0 +1,30 @@ +functionality: + name: "process_datasets" + namespace: "batch_integration/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--input" + __merge__: "/src/tasks/batch_integration/api/file_common_dataset.yaml" + required: true + direction: input + - name: Outputs + arguments: + - name: "--output_dataset" + __merge__: /src/tasks/batch_integration/api/file_dataset.yaml + required: true + direction: output + - name: "--output_solution" + __merge__: /src/tasks/batch_integration/api/file_solution.yaml + required: true + direction: output + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: common/check_dataset_schema + - name: batch_integration/process_dataset +platforms: + - type: nextflow diff --git a/src/tasks/batch_integration/workflows/process_datasets/main.nf b/src/tasks/batch_integration/workflows/process_datasets/main.nf new file mode 100644 index 0000000000..59cfee9f47 --- /dev/null +++ b/src/tasks/batch_integration/workflows/process_datasets/main.nf @@ -0,0 +1,54 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + | check_dataset_schema.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "input") + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, 
schemaYaml) + [ + "input": state.input, + "schema": schemaYaml + ] + }, + toState: { id, output, state -> + // read the output to see if dataset passed the qc + def checks = readYaml(output.output) + state + [ + "dataset": checks["exit_code"] == 0 ? state.input : null, + ] + } + ) + + // remove datasets which didn't pass the schema check + | filter { id, state -> + state.dataset != null + } + + | process_dataset.run( + fromState: [ input: "dataset" ], + toState: [ + output_dataset: "output_dataset", + output_solution: "output_solution" + ] + ) + + // only output the files for which an output file was specified + | setState(["output_dataset", "output_solution"]) + + emit: + output_ch +} diff --git a/src/tasks/batch_integration/workflows/process_datasets/run_nextflow.sh b/src/tasks/batch_integration/workflows/process_datasets/run_nextflow.sh new file mode 100755 index 0000000000..28e9382879 --- /dev/null +++ b/src/tasks/batch_integration/workflows/process_datasets/run_nextflow.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Run this prior to executing this script: +# bin/viash_build -q 'batch_integration' + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +export NXF_VER=22.04.5 + +nextflow run . 
\ + -main-script target/nextflow/batch_integration/workflows/process_datasets/main.nf \ + -profile docker \ + -entry auto \ + -c src/wf_utils/labels_ci.config \ + --id resources_test \ + --input_states "resources_test/common/**/state.yaml" \ + --rename_keys 'input:output_dataset' \ + --settings '{"output_dataset": "dataset.h5ad", "output_solution": "solution.h5ad"}' \ + --publish_dir "output/test" \ No newline at end of file diff --git a/src/tasks/batch_integration/workflows/run_benchmark/config.vsh.yaml b/src/tasks/batch_integration/workflows/run_benchmark/config.vsh.yaml new file mode 100644 index 0000000000..fd6f6811d2 --- /dev/null +++ b/src/tasks/batch_integration/workflows/run_benchmark/config.vsh.yaml @@ -0,0 +1,115 @@ +functionality: + name: "run_benchmark" + namespace: "batch_integration/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--input_dataset" + __merge__: /src/tasks/batch_integration/api/file_dataset.yaml + required: true + direction: input + - name: "--input_solution" + __merge__: /src/tasks/batch_integration/api/file_solution.yaml + required: true + direction: input + - name: Outputs + arguments: + - name: "--output_scores" + type: file + required: true + direction: output + description: A yaml file containing the scores of each of the methods + default: score_uns.yaml + - name: "--output_method_configs" + type: file + required: true + direction: output + default: method_configs.yaml + - name: "--output_metric_configs" + type: file + required: true + direction: output + default: metric_configs.yaml + - name: "--output_dataset_info" + type: file + required: true + direction: output + default: dataset_uns.yaml + - name: "--output_task_info" + type: file + required: true + direction: output + default: task_info.yaml + - name: Methods + arguments: + - name: "--method_ids" + type: string + multiple: true + description: A list of method ids to run. If not specified, all methods will be run. 
+ resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: ../../api/task_info.yaml + dependencies: + - name: common/check_dataset_schema + - name: common/extract_metadata + - name: batch_integration/methods/bbknn + - name: batch_integration/methods/combat + - name: batch_integration/methods/fastmnn_embedding + - name: batch_integration/methods/fastmnn_feature + - name: batch_integration/methods/liger + - name: batch_integration/methods/mnn_correct + - name: batch_integration/methods/mnnpy + - name: batch_integration/methods/pyliger + - name: batch_integration/methods/scalex_embed + - name: batch_integration/methods/scalex_feature + - name: batch_integration/methods/scanorama_embed + - name: batch_integration/methods/scanorama_feature + - name: batch_integration/methods/scanvi + - name: batch_integration/methods/scvi + - name: batch_integration/control_methods/no_integration/batch_embed + alias: no_integration_batch_embed + - name: batch_integration/control_methods/no_integration/global_embed + alias: no_integration_global_embed + - name: batch_integration/control_methods/no_integration/global_feature + alias: no_integration_global_feature + - name: batch_integration/control_methods/no_integration/global_graph + alias: no_integration_global_graph + - name: batch_integration/control_methods/perfect_integration/celltype_embed + alias: perfect_integration_celltype_embed + - name: batch_integration/control_methods/perfect_integration/celltype_jitter_embed + alias: perfect_integration_celltype_jitter_embed + - name: batch_integration/control_methods/random_integration/batch_embed + alias: random_integration_batch_embed + - name: batch_integration/control_methods/random_integration/batch_feature + alias: random_integration_batch_feature + - name: batch_integration/control_methods/random_integration/batch_graph + alias: random_integration_batch_graph + - name: batch_integration/control_methods/random_integration/celltype_embed + 
alias: random_integration_celltype_embed + - name: batch_integration/control_methods/random_integration/celltype_feature + alias: random_integration_celltype_feature + - name: batch_integration/control_methods/random_integration/celltype_graph + alias: random_integration_celltype_graph + - name: batch_integration/control_methods/random_integration/global_embed + alias: random_integration_global_embed + - name: batch_integration/control_methods/random_integration/global_feature + alias: random_integration_global_feature + - name: batch_integration/control_methods/random_integration/global_graph + alias: random_integration_global_graph + - name: batch_integration/transformers/feature_to_embed + - name: batch_integration/transformers/embed_to_graph + - name: batch_integration/metrics/asw_batch + - name: batch_integration/metrics/asw_label + - name: batch_integration/metrics/cell_cycle_conservation + - name: batch_integration/metrics/clustering_overlap + - name: batch_integration/metrics/graph_connectivity + - name: batch_integration/metrics/hvg_overlap + - name: batch_integration/metrics/isolated_label_asw + - name: batch_integration/metrics/isolated_label_f1 + - name: batch_integration/metrics/kbet + - name: batch_integration/metrics/pcr +platforms: + - type: nextflow diff --git a/src/tasks/batch_integration/workflows/run_benchmark/main.nf b/src/tasks/batch_integration/workflows/run_benchmark/main.nf new file mode 100644 index 0000000000..d86293f2a5 --- /dev/null +++ b/src/tasks/batch_integration/workflows/run_benchmark/main.nf @@ -0,0 +1,258 @@ +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // construct list of methods + methods = [ + bbknn, + combat, + fastmnn_embedding, + fastmnn_feature, + liger, + mnn_correct, + mnnpy, + pyliger, + scalex_embed, + scalex_feature, + scanorama_embed, + scanorama_feature, + scanvi, + scvi, + no_integration_batch_embed, 
+ no_integration_global_embed, + no_integration_global_feature, + no_integration_global_graph, + perfect_integration_celltype_embed, + perfect_integration_celltype_jitter_embed, + random_integration_batch_embed, + random_integration_batch_feature, + random_integration_batch_graph, + random_integration_celltype_embed, + random_integration_celltype_feature, + random_integration_celltype_graph, + random_integration_global_embed, + random_integration_global_feature, + random_integration_global_graph, + ] + + // construct list of metrics + metrics = [ + asw_batch, + asw_label, + cell_cycle_conservation, + clustering_overlap, + graph_connectivity, + hvg_overlap, + isolated_label_asw, + isolated_label_f1, + kbet, + pcr + ] + + /**************************** + * EXTRACT DATASET METADATA * + ****************************/ + dataset_ch = input_ch + // store join id + | map{ id, state -> + [id, state + ["_meta": [join_id: id]]] + } + + // extract the dataset metadata + | extract_metadata.run( + fromState: [input: "input_solution"], + toState: { id, output, state -> + state + [ + dataset_uns: readYaml(output.output).uns + ] + } + ) + + /*************************** + * RUN METHODS AND METRICS * + ***************************/ + // run all methods + method_out_ch1 = dataset_ch + | runEach( + components: methods, + + // use the 'filter' argument to only run a method on the normalisation the component is asking for + filter: { id, state, comp -> + def norm = state.dataset_uns.normalization_id + def pref = comp.config.functionality.info.preferred_normalization + // if the preferred normalisation is none at all, + // we can pass whichever dataset we want + def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref + def method_check = !state.method_ids || state.method_ids.contains(comp.config.functionality.name) + + method_check && norm_check + }, + + // define a new 'id' by appending the method name to the dataset id + id: { id, state, comp -> + id + "." 
+ comp.config.functionality.name + }, + + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: [input: "input_dataset"], + + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + method_id: comp.config.functionality.name, + method_output: output.output, + method_subtype: comp.config.functionality.info.subtype + ] + } + ) + + + // append feature->embed transformations + method_out_ch2 = method_out_ch1 + | runEach( + components: feature_to_embed, + id: { id, state, comp -> + id + "_f2e" + }, + filter: { id, state, comp -> state.method_subtype == "feature"}, + fromState: [ input: "method_output" ], + toState: { id, output, state, comp -> + state + [ + method_output: output.output, + method_subtype: comp.config.functionality.info.subtype + ] + } + ) + | mix(method_out_ch1) + + // append embed->graph transformations + method_out_ch3 = method_out_ch2 + | runEach( + components: embed_to_graph, + id: { id, state, comp -> + id + "_e2g" + }, + filter: { id, state, comp -> state.method_subtype == "embedding"}, + fromState: [ input: "method_output" ], + toState: { id, output, state, comp -> + state + [ + method_output: output.output, + method_subtype: comp.config.functionality.info.subtype + ] + } + ) + | mix(method_out_ch2) + + // run metrics + score_ch = method_out_ch3 + | runEach( + components: metrics, + id: { id, state, comp -> + id + "." 
+ comp.config.functionality.name + }, + filter: { id, state, comp -> + state.method_subtype == comp.config.functionality.info.subtype + }, + fromState: [ + input_integrated: "method_output", + input_solution: "input_solution" + ], + toState: { id, output, state, comp -> + state + [ + metric_id: comp.config.functionality.name, + metric_output: output.output + ] + } + ) + + + /****************************** + * GENERATE OUTPUT YAML FILES * + ******************************/ + // TODO: can we store everything below in a separate helper function? + + // extract the dataset metadata + dataset_meta_ch = dataset_ch + // only keep one of the normalization methods + | filter{ id, state -> + state.dataset_uns.normalization_id == "log_cp10k" + } + | joinStates { ids, states -> + // store the dataset metadata in a file + def dataset_uns = states.collect{state -> + def uns = state.dataset_uns.clone() + uns.remove("normalization_id") + uns + } + def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) + def dataset_uns_file = tempFile("dataset_uns.yaml") + dataset_uns_file.write(dataset_uns_yaml_blob) + + ["output", [output_dataset_info: dataset_uns_file]] + } + + output_ch = score_ch + // extract scores + | extract_metadata.run( + key: "extract_scores", + fromState: [input: "metric_output"], + toState: { id, output, state -> + state + [ + score_uns: readYaml(output.output).uns + ] + } + ) + + | joinStates { ids, states -> + // store the method configs in a file + def method_configs = methods.collect{it.config} + def method_configs_yaml_blob = toYamlBlob(method_configs) + def method_configs_file = tempFile("method_configs.yaml") + method_configs_file.write(method_configs_yaml_blob) + + // store the metric configs in a file + def metric_configs = metrics.collect{it.config} + def metric_configs_yaml_blob = toYamlBlob(metric_configs) + def metric_configs_file = tempFile("metric_configs.yaml") + metric_configs_file.write(metric_configs_yaml_blob) + + // store the task info in a file + def 
task_info_file = meta.resources_dir.resolve("task_info.yaml") + + // store the scores in a file + def score_uns = states.collect{it.score_uns} + def score_uns_yaml_blob = toYamlBlob(score_uns) + def score_uns_file = tempFile("score_uns.yaml") + score_uns_file.write(score_uns_yaml_blob) + + // create state + def new_state = [ + output_method_configs: method_configs_file, + output_metric_configs: metric_configs_file, + output_task_info: task_info_file, + output_scores: score_uns_file, + _meta: states[0]._meta + ] + + ["output", new_state] + } + + // merge all of the output data + | mix(dataset_meta_ch) + | joinStates{ ids, states -> + def mergedStates = states.inject([:]) { acc, m -> acc + m } + [ids[0], mergedStates] + } + + emit: + output_ch +} diff --git a/src/tasks/batch_integration/workflows/run_benchmark/run_test.sh b/src/tasks/batch_integration/workflows/run_benchmark/run_test.sh new file mode 100755 index 0000000000..a24ebb706f --- /dev/null +++ b/src/tasks/batch_integration/workflows/run_benchmark/run_test.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +# export TOWER_WORKSPACE_ID=53907369739130 + +DATASETS_DIR="resources_test/batch_integration" +OUTPUT_DIR="output/temp" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +export NXF_VER=22.04.5 +nextflow run . 
\ + -main-script target/nextflow/batch_integration/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -c src/wf_utils/labels_ci.config \ + -entry auto \ + --input_states "$DATASETS_DIR/**/state.yaml" \ + --rename_keys 'input_dataset:output_dataset,input_solution:output_solution' \ + --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \ + --publish_dir "$OUTPUT_DIR" \ + --output_state "state.yaml" \ No newline at end of file diff --git a/src/tasks/denoising/README.md b/src/tasks/denoising/README.md new file mode 100644 index 0000000000..da9d9b1912 --- /dev/null +++ b/src/tasks/denoising/README.md @@ -0,0 +1,357 @@ +# Denoising + + +Removing noise in sparse single-cell RNA-sequencing count data + +Path: +[`src/tasks/denoising`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/tasks/denoising) + +## Motivation + +Single-cell RNA-Seq protocols only detect a fraction of the mRNA +molecules present in each cell. As a result, the measurements (UMI +counts) observed for each gene and each cell are associated with +generally high levels of technical noise ([Grün et al., +2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes +the task of estimating the true expression level of each gene in each +cell. In the single-cell literature, this task is also referred to as +*imputation*, a term which is typically used for missing data problems +in statistics. Similar to the use of the terms “dropout”, “missing +data”, and “technical zeros”, this terminology can create confusion +about the underlying measurement process ([Sarkar and Stephens, +2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)). + +## Description + +A key challenge in evaluating denoising methods is the general lack of a +ground truth. 
A recent benchmark study ([Hou et al., +2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x)) +relied on flow-sorted datasets, mixture control experiments ([Tian et +al., 2019](https://www.nature.com/articles/s41592-019-0425-8)), and +comparisons with bulk RNA-Seq data. Since each of these approaches +suffers from specific limitations, it is difficult to combine these +different approaches into a single quantitative measure of denoising +accuracy. Here, we instead rely on an approach termed molecular +cross-validation (MCV), which was specifically developed to quantify +denoising accuracy in the absence of a ground truth ([Batson et al., +2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the +observed molecules in a given scRNA-Seq dataset are first partitioned +between a *training* and a *test* dataset. Next, a denoising method is +applied to the training dataset. Finally, denoising accuracy is measured +by comparing the result to the test dataset. The authors show that both +in theory and in practice, the measured denoising accuracy is +representative of the accuracy that would be obtained on a ground truth +dataset. 
+ +## Authors & contributors + +| name | roles | +|:------------------|:-------------------| +| Wesley Lewis | author, maintainer | +| Scott Gigante | author, maintainer | +| Robrecht Cannoodt | author | +| Kai Waldrant | author | + +## API + +``` mermaid +flowchart LR + file_common_dataset("Common dataset") + comp_process_dataset[/"Data processor"/] + file_train("Training data") + file_test("Test data") + comp_control_method[/"Control method"/] + comp_method[/"Method"/] + comp_metric[/"Metric"/] + file_denoised("Denoised data") + file_score("Score") + file_common_dataset---comp_process_dataset + comp_process_dataset-->file_train + comp_process_dataset-->file_test + file_train---comp_control_method + file_train---comp_method + file_test---comp_control_method + file_test---comp_metric + comp_control_method-->file_denoised + comp_method-->file_denoised + comp_metric-->file_score + file_denoised---comp_metric +``` + +## File format: Common dataset + +A dataset processed by the common dataset processing pipeline. + +Example file: `resources_test/common/pancreas/dataset.h5ad` + +Description: + +This dataset contains both raw counts and normalized data matrices, as +well as a PCA embedding, HVG selection and a kNN graph. + +Format: + +
+ + AnnData object + obs: 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'organism', 'organism_ontology_term_id', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'batch', 'soma_joinid', 'size_factors' + var: 'feature_id', 'feature_name', 'soma_joinid', 'hvg', 'hvg_score' + obsm: 'X_pca' + obsp: 'knn_distances', 'knn_connectivities' + varm: 'pca_loadings' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id', 'pca_variance', 'knn' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------------------------------|:----------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `obs["dataset_id"]` | `string` | (*Optional*) Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. | +| `obs["assay"]` | `string` | (*Optional*) Type of assay used to generate the cell data, indicating the methodology or technique employed. | +| `obs["assay_ontology_term_id"]` | `string` | (*Optional*) Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. | +| `obs["cell_type"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_ontology_term_id"]` | `string` | (*Optional*) Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. | +| `obs["development_stage"]` | `string` | (*Optional*) Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. | +| `obs["development_stage_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the developmental stage, providing a standardized reference to the organism’s developmental phase. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. 
If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. Otherwise, the Uberon (`UBERON:`) ontology is used. | +| `obs["disease"]` | `string` | (*Optional*) Information on any disease or pathological condition associated with the cell or donor. | +| `obs["disease_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the disease, enabling standardized disease classification and referencing. Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). | +| `obs["donor_id"]` | `string` | (*Optional*) Identifier for the donor from whom the cell sample is obtained. | +| `obs["is_primary_data"]` | `boolean` | (*Optional*) Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. | +| `obs["organism"]` | `string` | (*Optional*) Organism from which the cell sample is obtained. | +| `obs["organism_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the organism, providing a standardized reference for the organism. Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. | +| `obs["self_reported_ethnicity"]` | `string` | (*Optional*) Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. | +| `obs["self_reported_ethnicity_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. | +| `obs["sex"]` | `string` | (*Optional*) Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. 
| +| `obs["sex_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. | +| `obs["suspension_type"]` | `string` | (*Optional*) Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. | +| `obs["tissue"]` | `string` | (*Optional*) Specific tissue from which the cells were derived, key for context and specificity in cell studies. | +| `obs["tissue_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the tissue, providing a standardized reference for the tissue type. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | +| `obs["tissue_general"]` | `string` | (*Optional*) General category or classification of the tissue, useful for broader grouping and comparison of cell data. | +| `obs["tissue_general_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | +| `obs["batch"]` | `string` | (*Optional*) A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. | +| `obs["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. 
| +| `obs["size_factors"]` | `double` | (*Optional*) The size factors created by the normalisation method, if any. | +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `var["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A score for the feature indicating how highly variable it is. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | +| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | +| `varm["pca_loadings"]` | `double` | The PCA loadings matrix. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalised expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. | +| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. 
| +| `uns["pca_variance"]` | `double` | The PCA variance objects. | +| `uns["knn"]` | `object` | Supplementary K nearest neighbors data. | + +
+ +## Component type: Data processor + +Path: +[`src/tasks/denoising`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/tasks/denoising) + +A denoising dataset processor. + +Arguments: + +
+ +| Name | Type | Description | +|:-----------------|:-------|:------------------------------------------------------------------| +| `--input` | `file` | A dataset processed by the common dataset processing pipeline. | +| `--output_train` | `file` | (*Output*) The subset of molecules used for the training dataset. | +| `--output_test` | `file` | (*Output*) The subset of molecules used for the test dataset. | + +
+ +## File format: Training data + +The subset of molecules used for the training dataset + +Example file: `resources_test/denoising/pancreas/train.h5ad` + +Format: + +
+ + AnnData object + layers: 'counts' + uns: 'dataset_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------|:----------|:-------------------------------------| +| `layers["counts"]` | `integer` | Raw counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | + +
+ +## File format: Test data + +The subset of molecules used for the test dataset + +Example file: `resources_test/denoising/pancreas/test.h5ad` + +Format: + +
+ + AnnData object + layers: 'counts' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'train_sum' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------------|:----------|:-------------------------------------------------------------------------------| +| `layers["counts"]` | `integer` | Raw counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["train_sum"]` | `integer` | The total number of counts in the training dataset. | + +
+ +## Component type: Control method + +Path: +[`src/tasks/denoising/control_methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/tasks/denoising/control_methods) + +Quality control methods for verifying the pipeline. + +Arguments: + +
+ +| Name | Type | Description | +|:----------------|:-------|:---------------------------------------------------------------| +| `--input_train` | `file` | The subset of molecules used for the training dataset. | +| `--input_test` | `file` | The subset of molecules used for the test dataset. | +| `--output` | `file` | (*Output*) A denoised dataset as output by a denoising method. | + +
+ +## Component type: Method + +Path: +[`src/tasks/denoising/methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/tasks/denoising/methods) + +A denoising method. + +Arguments: + +
+ +| Name | Type | Description | +|:----------------|:-------|:---------------------------------------------------------------| +| `--input_train` | `file` | The subset of molecules used for the training dataset. | +| `--output` | `file` | (*Output*) A denoised dataset as output by a denoising method. | + +
+ +## Component type: Metric + +Path: +[`src/tasks/denoising/metrics`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/tasks/denoising/metrics) + +A denoising metric. + +Arguments: + +
+ +| Name | Type | Description | +|:-------------------|:-------|:----------------------------------------------------| +| `--input_test` | `file` | The subset of molecules used for the test dataset. | +| `--input_denoised` | `file` | A denoised dataset as output by a denoising method. | +| `--output` | `file` | (*Output*) Metric score file. | + +
+ +## File format: Denoised data + +A denoised dataset as output by a denoising method. + +Example file: `resources_test/denoising/pancreas/denoised.h5ad` + +Format: + +
+ + AnnData object + layers: 'denoised' + uns: 'dataset_id', 'method_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:---------------------|:----------|:-------------------------------------| +| `layers["denoised"]` | `integer` | Denoised data. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | + +
+ +## File format: Score + +File containing the scores of one or more metrics for a given dataset and method. + +Example file: `resources_test/denoising/pancreas/score.h5ad` + +Description: + +Metric score file + +Format: + +
+ + AnnData object + uns: 'dataset_id', 'method_id', 'metric_ids', 'metric_values' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------|:---------|:---------------------------------------------------------------------------------------------| +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | +| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | + +
+ diff --git a/src/tasks/denoising/api/comp_control_method.yaml b/src/tasks/denoising/api/comp_control_method.yaml new file mode 100644 index 0000000000..6fe13f2a35 --- /dev/null +++ b/src/tasks/denoising/api/comp_control_method.yaml @@ -0,0 +1,33 @@ +functionality: + namespace: "denoising/control_methods" + info: + type: control_method + type_info: + label: Control method + summary: Quality control methods for verifying the pipeline. + description: | + These components have the same interface as the regular methods + but also receive the solution object as input. It serves as a + starting point to test the relative accuracy of new methods in + the task, and also as a quality control for the metrics defined + in the task. + arguments: + - name: "--input_train" + __merge__: file_train.yaml + direction: input + required: true + - name: "--input_test" + __merge__: file_test.yaml + direction: input + required: true + - name: "--output" + __merge__: file_denoised.yaml + direction: output + required: true + test_resources: + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/denoising/pancreas + dest: resources_test/denoising/pancreas \ No newline at end of file diff --git a/src/tasks/denoising/api/comp_method.yaml b/src/tasks/denoising/api/comp_method.yaml new file mode 100644 index 0000000000..517723772d --- /dev/null +++ b/src/tasks/denoising/api/comp_method.yaml @@ -0,0 +1,26 @@ +functionality: + namespace: "denoising/methods" + info: + type: method + type_info: + label: Method + summary: A denoising method. + description: | + A denoising method to remove noise (i.e. technical artifacts) from a dataset. 
+ arguments: + - name: "--input_train" + __merge__: file_train.yaml + direction: input + required: true + - name: "--output" + __merge__: file_denoised.yaml + direction: output + required: true + test_resources: + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/denoising/pancreas + dest: resources_test/denoising/pancreas + - path: /src/common/library.bib \ No newline at end of file diff --git a/src/tasks/denoising/api/comp_metric.yaml b/src/tasks/denoising/api/comp_metric.yaml new file mode 100644 index 0000000000..c2ef922239 --- /dev/null +++ b/src/tasks/denoising/api/comp_metric.yaml @@ -0,0 +1,31 @@ +functionality: + namespace: "denoising/metrics" + info: + type: metric + type_info: + label: Metric + summary: A denoising metric. + description: | + A metric for evaluating denoised datasets. + arguments: + - name: "--input_test" + __merge__: file_test.yaml + direction: input + required: true + - name: "--input_denoised" + __merge__: file_denoised.yaml + direction: input + required: true + - name: "--output" + __merge__: file_score.yaml + direction: output + required: true + test_resources: + - type: python_script + path: /src/common/comp_tests/check_metric_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/denoising/pancreas + dest: resources_test/denoising/pancreas + - path: /src/common/library.bib + \ No newline at end of file diff --git a/src/tasks/denoising/api/comp_process_dataset.yaml b/src/tasks/denoising/api/comp_process_dataset.yaml new file mode 100644 index 0000000000..ce6874c0ea --- /dev/null +++ b/src/tasks/denoising/api/comp_process_dataset.yaml @@ -0,0 +1,27 @@ +functionality: + namespace: "denoising" + info: + type: process_dataset + type_info: + label: Data processor + summary: A denoising dataset processor. 
+ description: | + A component for processing a Common Dataset into a task-specific dataset. + arguments: + - name: "--input" + __merge__: /src/datasets/api/file_common_dataset.yaml + direction: input + required: true + - name: "--output_train" + __merge__: file_train.yaml + direction: output + required: true + - name: "--output_test" + __merge__: file_test.yaml + direction: output + required: true + test_resources: + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas diff --git a/src/tasks/denoising/api/file_common_dataset.yaml b/src/tasks/denoising/api/file_common_dataset.yaml new file mode 100644 index 0000000000..ff913ce0de --- /dev/null +++ b/src/tasks/denoising/api/file_common_dataset.yaml @@ -0,0 +1,40 @@ +type: file +example: "resources_test/common/pancreas/dataset.h5ad" +info: + label: "Common Dataset" + summary: A subset of the common dataset. + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. 
+ required: false diff --git a/src/tasks/denoising/api/file_denoised.yaml b/src/tasks/denoising/api/file_denoised.yaml new file mode 100644 index 0000000000..fc79694028 --- /dev/null +++ b/src/tasks/denoising/api/file_denoised.yaml @@ -0,0 +1,21 @@ +type: file +example: "resources_test/denoising/pancreas/denoised.h5ad" +info: + label: "Denoised data" + summary: A denoised dataset as output by a denoising method. + slots: + layers: + - type: integer + name: denoised + description: denoised data + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + \ No newline at end of file diff --git a/src/tasks/denoising/api/file_score.yaml b/src/tasks/denoising/api/file_score.yaml new file mode 100644 index 0000000000..4f34eeb7f7 --- /dev/null +++ b/src/tasks/denoising/api/file_score.yaml @@ -0,0 +1,21 @@ +type: file +description: "Metric score file" +example: "resources_test/denoising/pancreas/score.h5ad" +info: + label: "Score" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + - type: string + name: method_id + description: "A unique identifier for the method" + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." 
+ multiple: true diff --git a/src/tasks/denoising/api/file_test.yaml b/src/tasks/denoising/api/file_test.yaml new file mode 100644 index 0000000000..371b3054f7 --- /dev/null +++ b/src/tasks/denoising/api/file_test.yaml @@ -0,0 +1,44 @@ +type: file +example: "resources_test/denoising/pancreas/test.h5ad" +info: + label: "Test data" + summary: The subset of molecules used for the test dataset + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - name: train_sum + type: integer + description: The total number of counts in the training dataset. 
+ required: true \ No newline at end of file diff --git a/src/tasks/denoising/api/file_train.yaml b/src/tasks/denoising/api/file_train.yaml new file mode 100644 index 0000000000..302eae2d5c --- /dev/null +++ b/src/tasks/denoising/api/file_train.yaml @@ -0,0 +1,16 @@ +type: file +example: "resources_test/denoising/pancreas/train.h5ad" +info: + label: "Training data" + summary: The subset of molecules used for the training dataset + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true \ No newline at end of file diff --git a/src/tasks/denoising/api/task_info.yaml b/src/tasks/denoising/api/task_info.yaml new file mode 100644 index 0000000000..f7de1118f2 --- /dev/null +++ b/src/tasks/denoising/api/task_info.yaml @@ -0,0 +1,54 @@ +name: denoising +label: Denoising +v1: + path: openproblems/tasks/denoising/README.md + commit: 3fe9251ba906061b6769eed2ac9da0db5f8e26bb +summary: "Removing noise in sparse single-cell RNA-sequencing count data" +image: "thumbnail.svg" +motivation: | + Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present + in each cell. As a result, the measurements (UMI counts) observed for each gene and each + cell are associated with generally high levels of technical noise ([Grün et al., + 2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of + estimating the true expression level of each gene in each cell. In the single-cell + literature, this task is also referred to as *imputation*, a term which is typically + used for missing data problems in statistics. Similar to the use of the terms "dropout", + "missing data", and "technical zeros", this terminology can create confusion about the + underlying measurement process ([Sarkar and Stephens, + 2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)). 
+description: | + A key challenge in evaluating denoising methods is the general lack of a ground truth. A + recent benchmark study ([Hou et al., + 2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x)) + relied on flow-sorted datasets, mixture control experiments ([Tian et al., + 2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk + RNA-Seq data. Since each of these approaches suffers from specific limitations, it is + difficult to combine these different approaches into a single quantitative measure of + denoising accuracy. Here, we instead rely on an approach termed molecular + cross-validation (MCV), which was specifically developed to quantify denoising accuracy + in the absence of a ground truth ([Batson et al., + 2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules + in a given scRNA-Seq dataset are first partitioned between a *training* and a *test* + dataset. Next, a denoising method is applied to the training dataset. Finally, denoising + accuracy is measured by comparing the result to the test dataset. The authors show that + both in theory and in practice, the measured denoising accuracy is representative of the + accuracy that would be obtained on a ground truth dataset. 
+authors: + - name: "Wesley Lewis" + roles: [ author, maintainer ] + info: + github: wes-lewis + - name: "Scott Gigante" + roles: [ author, maintainer ] + info: + github: scottgigante + orcid: "0000-0002-4544-2764" + - name: Robrecht Cannoodt + roles: [ author ] + info: + github: rcannood + orcid: "0000-0003-3641-729X" + - name: Kai Waldrant + roles: [ author ] + info: + github: KaiWaldrant \ No newline at end of file diff --git a/src/tasks/denoising/api/thumbnail.svg b/src/tasks/denoising/api/thumbnail.svg new file mode 100644 index 0000000000..65936f0e1e --- /dev/null +++ b/src/tasks/denoising/api/thumbnail.svg @@ -0,0 +1 @@ +dim-2dim-1dim-2dim-1 \ No newline at end of file diff --git a/src/tasks/denoising/control_methods/no_denoising/config.vsh.yaml b/src/tasks/denoising/control_methods/no_denoising/config.vsh.yaml new file mode 100644 index 0000000000..64a35f9986 --- /dev/null +++ b/src/tasks/denoising/control_methods/no_denoising/config.vsh.yaml @@ -0,0 +1,22 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: "no_denoising" + info: + label: No Denoising + summary: "negative control by copying train counts" + description: "This method serves as a negative control, where the denoised data is a copy of the unaltered training data. This represents the scoring threshold if denoising was not performed on the data." 
+ v1: + path: openproblems/tasks/denoising/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + variants: + no_denoising: + preferred_normalization: counts + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/denoising/control_methods/no_denoising/script.py b/src/tasks/denoising/control_methods/no_denoising/script.py new file mode 100644 index 0000000000..97c9a4184c --- /dev/null +++ b/src/tasks/denoising/control_methods/no_denoising/script.py @@ -0,0 +1,22 @@ +import anndata as ad + +## VIASH START +par = { + 'input_train': 'output_train.h5ad', + 'output': 'output_ND.h5ad', +} +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par['input_train']) + +print("Process data", flush=True) +input_train.layers["denoised"] = input_train.layers['counts'] + +input_train.uns["method_id"] = meta['functionality_name'] + +print("Write Data", flush=True) +input_train.write_h5ad(par['output'],compression="gzip") diff --git a/src/tasks/denoising/control_methods/perfect_denoising/config.vsh.yaml b/src/tasks/denoising/control_methods/perfect_denoising/config.vsh.yaml new file mode 100644 index 0000000000..b16862360b --- /dev/null +++ b/src/tasks/denoising/control_methods/perfect_denoising/config.vsh.yaml @@ -0,0 +1,22 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: "perfect_denoising" + info: + label: Perfect Denoising + summary: "Positive control by copying the test counts" + description: "This method serves as a positive control, where the test data is copied 1-to-1 to the denoised data. This makes it seem as if the data is perfectly denoised as it will be compared to the test data in the metrics." 
+ v1: + path: openproblems/tasks/denoising/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + variants: + perfect_denoising: + preferred_normalization: counts + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/denoising/control_methods/perfect_denoising/script.py b/src/tasks/denoising/control_methods/perfect_denoising/script.py new file mode 100644 index 0000000000..c280a4a3bc --- /dev/null +++ b/src/tasks/denoising/control_methods/perfect_denoising/script.py @@ -0,0 +1,24 @@ +import anndata as ad + +## VIASH START +par = { + 'input_train': 'resources_test/denoising/pancreas/train.h5ad', + 'input_test': 'resources_test/denoising/pancreas/test.h5ad', + 'output': 'output_PD.h5ad', +} +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print("Process data", flush=True) +input_train.layers["denoised"] = input_test.layers['counts'] + +input_train.uns["method_id"] = meta['functionality_name'] + +print("Write Data", flush=True) +input_train.write_h5ad(par['output'],compression="gzip") diff --git a/src/tasks/denoising/methods/alra/config.vsh.yaml b/src/tasks/denoising/methods/alra/config.vsh.yaml new file mode 100644 index 0000000000..374d317fce --- /dev/null +++ b/src/tasks/denoising/methods/alra/config.vsh.yaml @@ -0,0 +1,43 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "alra" + info: + label: ALRA + summary: "ALRA imputes missing values in scRNA-seq data by computing rank-k approximation, thresholding by gene, and rescaling the matrix." + description: | + Adaptively-thresholded Low Rank Approximation (ALRA). 
+ + ALRA is a method for imputation of missing values in single cell RNA-sequencing data, + described in the preprint, "Zero-preserving imputation of scRNA-seq data using low-rank approximation" + available [here](https://www.biorxiv.org/content/early/2018/08/22/397588). Given a + scRNA-seq expression matrix, ALRA first computes its rank-k approximation using randomized SVD. + Next, each row (gene) is thresholded by the magnitude of the most negative value of that gene. + Finally, the matrix is rescaled. + reference: "linderman2018zero" + repository_url: "https://github.com/KlugerLab/ALRA" + documentation_url: https://github.com/KlugerLab/ALRA/blob/master/README.md + v1: + path: openproblems/tasks/denoising/methods/alra.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + variants: + alra: + preferred_normalization: counts + arguments: + - name: "--norm" + type: string + choices: ["sqrt", "log"] + default: "log" + description: Normalization method + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ Matrix, rsvd ] + github: KlugerLab/ALRA + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/denoising/methods/alra/script.R b/src/tasks/denoising/methods/alra/script.R new file mode 100644 index 0000000000..9a5b237c6f --- /dev/null +++ b/src/tasks/denoising/methods/alra/script.R @@ -0,0 +1,53 @@ +cat(">> Loading dependencies\n") +library(anndata, warn.conflicts = FALSE) +library(ALRA, warn.conflicts = FALSE) + +## VIASH START +par <- list( + input_train = "resources_test/denoising/pancreas/train.h5ad", + norm = "log", + output = "output.h5ad" +) +meta <- list( + functionality_name = "alra" +) +## VIASH END + +cat(">> Load input data\n") +input_train <- read_h5ad(par$input_train, backed = "r") + +cat(">> Set normalization method\n") +if (par$norm == "sqrt") { + norm_fn <- sqrt + denorm_fn <- function(x) x^2 +} else if (par$norm == "log") 
{ + norm_fn <- log1p + denorm_fn <- expm1 +} else { + stop("Unknown normalization method: ", par$norm) +} + +cat(">> Normalize data\n") +data <- as.matrix(input_train$layers[["counts"]]) +totalPerCell <- rowSums(data) +data <- sweep(data, 1, totalPerCell, "/") +data <- norm_fn(data) + +cat(">> Run ALRA\n") +data <- alra(data)$A_norm_rank_k_cor_sc +data <- denorm_fn(data) +data <- sweep(data, 1, totalPerCell, "*") + +cat(">> Store output\n") +output <- AnnData( + layers = list(denoised = data), + obs = input_train$obs[, c(), drop = FALSE], + var = input_train$var[, c(), drop = FALSE], + uns = list( + dataset_id = input_train$uns[["dataset_id"]], + method_id = meta$functionality_name + ) +) + +cat(">> Write output to file\n") +output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/denoising/methods/dca/config.vsh.yaml b/src/tasks/denoising/methods/dca/config.vsh.yaml new file mode 100644 index 0000000000..33c6079866 --- /dev/null +++ b/src/tasks/denoising/methods/dca/config.vsh.yaml @@ -0,0 +1,45 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "dca" + info: + label: DCA + summary: "A deep autoencoder with ZINB loss function to address the dropout effect in count data" + description: | + Deep Count Autoencoder + + Removes the dropout effect by taking the count structure, overdispersed nature and sparsity of the data into account + using a deep autoencoder with zero-inflated negative binomial (ZINB) loss function.
+ reference: "eraslan2019single" + documentation_url: "https://github.com/theislab/dca#readme" + repository_url: "https://github.com/theislab/dca" + v1: + path: openproblems/tasks/denoising/methods/dca.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + variants: + dca: + preferred_normalization: counts + arguments: + - name: "--epochs" + type: "integer" + default: 300 + description: "Number of total epochs in training" + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: python:3.9 + setup: + - type: apt + packages: procps + - type: python + packages: + - anndata~=0.8.0 + - scanpy + - pyyaml + - requests + - jsonschema + - "git+https://github.com/scottgigante-immunai/dca.git@patch-1" + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/denoising/methods/dca/script.py b/src/tasks/denoising/methods/dca/script.py new file mode 100644 index 0000000000..d35f3c00a5 --- /dev/null +++ b/src/tasks/denoising/methods/dca/script.py @@ -0,0 +1,39 @@ +import anndata as ad +from dca.api import dca + +## VIASH START +par = { + 'input_train': 'resources_test/denoising/pancreas/train.h5ad', + 'output': 'output_dca.h5ad', + 'epochs': 300, +} +meta = { + 'functionality_name': 'dca', +} +## VIASH END + +print("load input data", flush=True) +input_train = ad.read_h5ad(par['input_train'], backed="r") + +print("Remove unneeded data", flush=True) +output = ad.AnnData( + X=input_train.layers["counts"], + obs=input_train.obs[[]], + var=input_train.var[[]], + uns={ + "dataset_id": input_train.uns["dataset_id"], + "method_id": meta["functionality_name"] + } +) + +del input_train + +print("Run DCA", flush=True) +dca(output, epochs=par["epochs"]) + +print("Move output to correct location", flush=True) +output.layers["denoised"] = output.X +del output.X + +print("Writing data", flush=True) +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/denoising/methods/knn_smoothing/config.vsh.yaml 
b/src/tasks/denoising/methods/knn_smoothing/config.vsh.yaml new file mode 100644 index 0000000000..b0c55ae0d8 --- /dev/null +++ b/src/tasks/denoising/methods/knn_smoothing/config.vsh.yaml @@ -0,0 +1,41 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "knn_smoothing" + info: + label: KNN Smoothing + summary: "Iterative kNN-smoothing denoises scRNA-seq data by iteratively increasing the size of neighbourhoods for smoothing until a maximum k value is reached." + description: "Iterative kNN-smoothing is a method to repair or denoise noisy scRNA-seq + expression matrices. Given a scRNA-seq expression matrix, KNN-smoothing first + applies initial normalisation and smoothing. Then, a chosen number of + principal components is used to calculate Euclidean distances between cells. + Minimally sized neighbourhoods are initially determined from these Euclidean + distances, and expression profiles are shared between neighbouring cells. + Then, the resultant smoothed matrix is used as input to the next step of + smoothing, where the size (k) of the considered neighbourhoods is increased, + leading to greater smoothing. This process continues until a chosen maximum k + value has been reached, at which point the iteratively smoothed object is + then optionally scaled to yield a final result." 
+ reference: "wagner2018knearest" + documentation_url: "https://github.com/yanailab/knn-smoothing#readme" + repository_url: "https://github.com/yanailab/knn-smoothing" + v1: + path: openproblems/tasks/denoising/methods/knn_smoothing.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + variants: + knn_smoothing: + preferred_normalization: counts + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: + - scipy + github: + - scottgigante-immunai/knn-smoothing@python_package + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/denoising/methods/knn_smoothing/script.py b/src/tasks/denoising/methods/knn_smoothing/script.py new file mode 100644 index 0000000000..450da2012a --- /dev/null +++ b/src/tasks/denoising/methods/knn_smoothing/script.py @@ -0,0 +1,39 @@ +import knn_smooth +import anndata as ad + +## VIASH START +par = { + 'input_train': 'resources_test/denoising/pancreas/train.h5ad', + 'output': 'output_knn.h5ad', +} +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par["input_train"], backed="r") + +print("Remove unneeded data", flush=True) +X = input_train.layers["counts"].astype(float).transpose().toarray() + +# Create output AnnData for later use +output = ad.AnnData( + obs=input_train.obs[[]], + var=input_train.var[[]], + uns={ + "dataset_id": input_train.uns["dataset_id"], + "method_id": meta["functionality_name"] + } +) + +del input_train + +print("Run KNN smoothing", flush=True) +X = knn_smooth.knn_smoothing(X, k=10).transpose() + +print("Process data", flush=True) +output.layers["denoised"] = X + +print("Writing data", flush=True) +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/denoising/methods/magic/config.vsh.yaml b/src/tasks/denoising/methods/magic/config.vsh.yaml new file mode 100644 index 
0000000000..380666a1b5 --- /dev/null +++ b/src/tasks/denoising/methods/magic/config.vsh.yaml @@ -0,0 +1,63 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "magic" + info: + label: MAGIC + summary: "MAGIC imputes and denoises scRNA-seq data that is noisy or dropout-prone." + description: "MAGIC (Markov Affinity-based Graph Imputation of Cells) is a method for + imputation and denoising of noisy or dropout-prone single cell RNA-sequencing + data. Given a normalised scRNA-seq expression matrix, it first calculates + Euclidean distances between each pair of cells in the dataset, which is then + augmented using a Gaussian kernel (function) and row-normalised to give a + normalised affinity matrix. A t-step markov process is then calculated, by + powering this affinity matrix t times. Finally, the powered affinity matrix + is right-multiplied by the normalised data, causing the final imputed values + to take the value of a per-gene average weighted by the affinities of cells. + The resultant imputed matrix is then rescaled, to more closely match the + magnitude of measurements in the normalised (input) matrix." + reference: "van2018recovering" + documentation_url: "https://github.com/KrishnaswamyLab/MAGIC#readme" + repository_url: "https://github.com/KrishnaswamyLab/MAGIC" + v1: + path: openproblems/tasks/denoising/methods/magic.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + variants: + magic: + magic_approx: + solver: approximate + magic_knn_naive: + norm: log + decay: none + t: 1 + preferred_normalization: counts + arguments: + - name: "--solver" + type: "string" + choices: ["exact", "approximate"] + default: "exact" + description: Which solver to use. 
+ - name: "--norm" + type: string + choices: ["sqrt", "log"] + default: "log" + description: Normalization method + - name: "--decay" + type: integer + default: 1 + description: sets decay rate of kernel tails + - name: "--t" + type: integer + default: 3 + description: power to which the diffusion operator is powered + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pip: [scprep, magic-impute, scipy, scikit-learn<1.2] + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/denoising/methods/magic/script.py b/src/tasks/denoising/methods/magic/script.py new file mode 100644 index 0000000000..075d2e21cd --- /dev/null +++ b/src/tasks/denoising/methods/magic/script.py @@ -0,0 +1,76 @@ +import anndata as ad +import numpy as np +import scprep +from magic import MAGIC +import scipy + + +## VIASH START +par = { + "input_train": "resources_test/denoising/pancreas/train.h5ad", + "output": "output_magic.h5ad", + "solver": "exact", + "norm": "sqrt", + "decay": 1, + "t": 3, +} +meta = { + "functionality_name": "foo", +} +## VIASH END + +print("Load data", flush=True) +input_train = ad.read_h5ad(par["input_train"], backed="r") + +print("Set normalization method", flush=True) +if par["norm"] == "sqrt": + norm_fn = np.sqrt + denorm_fn = np.square +elif par["norm"] == "log": + norm_fn = np.log1p + denorm_fn = np.expm1 +else: + raise ValueError("Unknown normalization method: " + par["norm"] + ".") + +print("Remove unneeded data", flush=True) +X = input_train.layers["counts"] + +# Create output AnnData for later use +output = ad.AnnData( + obs=input_train.obs[[]], + var=input_train.var[[]], + uns={ + "dataset_id": input_train.uns["dataset_id"], + "method_id": meta["functionality_name"] + } +) + +del input_train + +print("Normalize data", flush=True) +X, libsize = scprep.normalize.library_size_normalize( + X, + rescale=1, + return_library_size=True 
+) +X = scprep.utils.matrix_transform(X, norm_fn) + +print("Run MAGIC", flush=True) +magic = MAGIC( + solver=par["solver"], + decay=par["decay"], + t=par["t"], + verbose=False, +) +X = magic.fit_transform(X, genes="all_genes") + +print("Denormalizing data", flush=True) +X = scprep.utils.matrix_transform(X, denorm_fn) +X = scprep.utils.matrix_vector_elementwise_multiply(X, libsize, axis=0) + +print("Create output AnnData", flush=True) +output.layers["denoised"] = X + +print("Write Data", flush=True) +output.write_h5ad(par["output"], compression="gzip") + diff --git a/src/tasks/denoising/methods/saver/config.vsh.yaml b/src/tasks/denoising/methods/saver/config.vsh.yaml new file mode 100644 index 0000000000..3c997fc36f --- /dev/null +++ b/src/tasks/denoising/methods/saver/config.vsh.yaml @@ -0,0 +1,32 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: saver + status: disabled + info: + label: SAVER + summary: SAVER (Single-cell Analysis Via Expression Recovery) implements a regularized regression prediction and empirical Bayes method to recover the true gene expression profile. + description: | + SAVER takes advantage of gene-to-gene relationships to recover the true expression level of each gene in each cell, + removing technical variation while retaining biological variation across cells (https://github.com/mohuangx/SAVER). + SAVER uses a post-quality-control scRNA-seq dataset with UMI counts as input. SAVER assumes that the count of each + gene in each cell follows a Poisson-gamma mixture, also known as a negative binomial model. Instead of specifying + the gamma prior, we estimate the prior parameters in an empirical Bayes-like approach with a Poisson LASSO regression, + using the expression of other genes as predictors. Once the prior parameters are estimated, SAVER outputs the + posterior distribution of the true expression, which quantifies estimation uncertainty, and the posterior mean is + used as the SAVER recovered expression value. 
+ reference: huang2018savergene + repository_url: https://github.com/mohuangx/SAVER + documentation_url: https://mohuangx.github.io/SAVER/index.html + preferred_normalization: counts + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + github: mohuangx/SAVER + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/denoising/methods/saver/script.R b/src/tasks/denoising/methods/saver/script.R new file mode 100644 index 0000000000..f6a44f4c3a --- /dev/null +++ b/src/tasks/denoising/methods/saver/script.R @@ -0,0 +1,39 @@ +cat(">> Loading dependencies\n") +library(anndata, warn.conflicts = FALSE) +library(SAVER, warn.conflicts = FALSE) +library(Matrix, warn.conflicts = FALSE) + +## VIASH START +par <- list( + input_train = "resources_test/denoising/pancreas/train.h5ad", + norm = "log", + output = "output.h5ad" +) +meta <- list( + functionality_name = "saver", + ncpus = 30 +) +## VIASH END + +cat(">> Load input data\n") +input_train <- read_h5ad(par$input_train, backed = "r") + +cat(">> Normalize data\n") +data <- as(t(input_train$layers[["counts"]]), "CsparseMatrix") + +cat(">> Run SAVER\n") +data <- t(saver(data, ncores = meta$ncpus, estimates.only = TRUE)) + +cat(">> Store output\n") +output <- AnnData( + layers = list(denoised = data), + obs = input_train$obs[, c(), drop = FALSE], + var = input_train$var[, c(), drop = FALSE], + uns = list( + dataset_id = input_train$uns[["dataset_id"]], + method_id = meta$functionality_name + ) +) + +cat(">> Write output to file\n") +output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/denoising/metrics/mse/config.vsh.yaml b/src/tasks/denoising/metrics/mse/config.vsh.yaml new file mode 100644 index 0000000000..8330a8de31 --- /dev/null +++ b/src/tasks/denoising/metrics/mse/config.vsh.yaml @@ -0,0 +1,30 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: "mse" + info: + metrics: + - 
name: mse + label: Mean-squared error + summary: "The mean squared error between the denoised counts and the true counts." + description: "The mean squared error between the denoised counts of the training dataset and the true counts of the test dataset, after reweighting for the train/test ratio by normalizing both to 10,000 counts per cell and log1p-transforming them." + reference: batson2019molecular + v1: + path: openproblems/tasks/denoising/metrics/mse.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + maximize: false + min: 0 + max: "+.inf" + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: + - scikit-learn + - scprep + - type: nextflow + directives: + label: [midtime, highmem, midcpu] diff --git a/src/tasks/denoising/metrics/mse/script.py b/src/tasks/denoising/metrics/mse/script.py new file mode 100644 index 0000000000..eba964f132 --- /dev/null +++ b/src/tasks/denoising/metrics/mse/script.py @@ -0,0 +1,51 @@ +import anndata as ad +import scanpy as sc +import sklearn.metrics +import scprep + +## VIASH START +par = { + 'input_test': 'resources_test/denoising/pancreas/test.h5ad', + 'input_denoised': 'resources_test/denoising/pancreas/magic.h5ad', + 'output': 'output_mse.h5ad' +} +meta = { + 'functionality_name': 'mse' +} +## VIASH END + +print("Load data", flush=True) +input_denoised = ad.read_h5ad(par['input_denoised'], backed="r") +input_test = ad.read_h5ad(par['input_test'], backed="r") + +test_data = ad.AnnData(X=input_test.layers["counts"], dtype="float") +denoised_data = ad.AnnData(X=input_denoised.layers["denoised"], dtype="float") + +print("Normalize data", flush=True) + +# scaling and transformation +target_sum = 10000 + +sc.pp.normalize_total(test_data, target_sum) +sc.pp.log1p(test_data) + +sc.pp.normalize_total(denoised_data, target_sum) +sc.pp.log1p(denoised_data) + +print("Compute mse value", flush=True) +error = sklearn.metrics.mean_squared_error( + scprep.utils.toarray(test_data.X),
scprep.utils.toarray(denoised_data.X) +) + +print("Store mse value", flush=True) +output = ad.AnnData( + uns={ key: val for key, val in input_test.uns.items() }, +) + +output.uns["method_id"] = input_denoised.uns["method_id"] +output.uns["metric_ids"] = meta['functionality_name'] +output.uns["metric_values"] = error + +print("Write adata to file", flush=True) +output.write_h5ad(par['output'], compression="gzip") + diff --git a/src/tasks/denoising/metrics/poisson/config.vsh.yaml b/src/tasks/denoising/metrics/poisson/config.vsh.yaml new file mode 100644 index 0000000000..e523a9306e --- /dev/null +++ b/src/tasks/denoising/metrics/poisson/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: "poisson" + info: + metrics: + - name: poisson + label: Poisson Loss + summary: "The Poisson negative log-likelihood of the true counts under the distribution given by the denoised counts" + description: "The Poisson negative log-likelihood of observing the true counts of the test dataset given the distribution given in the denoised dataset."
+ reference: batson2019molecular + v1: + path: openproblems/tasks/denoising/metrics/poisson.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + maximize: false + min: 0 + max: "+.inf" + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pip: scprep + - type: nextflow + directives: + label: [midtime, highmem, midcpu] \ No newline at end of file diff --git a/src/tasks/denoising/metrics/poisson/script.py b/src/tasks/denoising/metrics/poisson/script.py new file mode 100644 index 0000000000..537ccf0119 --- /dev/null +++ b/src/tasks/denoising/metrics/poisson/script.py @@ -0,0 +1,46 @@ +import anndata as ad +import scprep +import numpy as np + +## VIASH START +par = { + 'input_denoised': 'output_magic.h5ad', + 'input_test': 'output_test.h5ad', + 'output': 'output_poisson.h5ad' +} +meta = { + 'functionality_name': 'poisson' +} +## VIASH END + +print("Load Data", flush=True) +input_denoised = ad.read_h5ad(par['input_denoised'], backed="r") +input_test = ad.read_h5ad(par['input_test'], backed="r") + +test_data = scprep.utils.toarray(input_test.layers["counts"]) +denoised_data = scprep.utils.toarray(input_denoised.layers["denoised"]) + +print("Compute metric value", flush=True) +# scaling +initial_sum = input_test.uns["train_sum"] +target_sum = test_data.sum() +denoised_data = denoised_data * target_sum / initial_sum + +# from molecular_cross_validation.mcv_sweep import poisson_nll_loss +# copied from: https://github.com/czbiohub/molecular-cross-validation/blob/master/src/molecular_cross_validation/mcv_sweep.py +def poisson_nll_loss(y_pred: np.ndarray, y_true: np.ndarray) -> float: + return (y_pred - y_true * np.log(y_pred + 1e-6)).mean() + +error = poisson_nll_loss(denoised_data, test_data)  # y_pred: rescaled denoised matrix; y_true: held-out test counts + +print("Store poisson value", flush=True) +output = ad.AnnData( + uns={ key: val for key, val in input_test.uns.items() }, +) + +output.uns["method_id"] =
input_denoised.uns["method_id"] +output.uns["metric_ids"] = meta['functionality_name'] +output.uns["metric_values"] = error + +print("Write adata to file", flush=True) +output.write_h5ad(par['output'], compression="gzip") diff --git a/src/tasks/denoising/process_dataset/config.vsh.yaml b/src/tasks/denoising/process_dataset/config.vsh.yaml new file mode 100644 index 0000000000..c9b5b06c1a --- /dev/null +++ b/src/tasks/denoising/process_dataset/config.vsh.yaml @@ -0,0 +1,37 @@ +__merge__: ../api/comp_process_dataset.yaml +functionality: + name: "process_dataset" + description: | + Split data using molecular cross-validation. + + Splits molecules into two (potentially overlapping) groups using a fraction ratio. + These are output as two separate AnnData objects. + arguments: + - name: "--method" + type: "string" + description: "The process method to assign train/test." + choices: ["mcv"] + default: "mcv" + - name: "--train_frac" + type: "double" + description: "The fraction the molecules need to be split to train dataset" + default: 0.9 + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." 
+ example: 123 + resources: + - type: python_script + path: script.py + - path: helper.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: + - numpy + - scipy + - type: nextflow + directives: + label: [highmem, midcpu , midtime] diff --git a/src/tasks/denoising/process_dataset/helper.py b/src/tasks/denoising/process_dataset/helper.py new file mode 100644 index 0000000000..2044ed4c6e --- /dev/null +++ b/src/tasks/denoising/process_dataset/helper.py @@ -0,0 +1,55 @@ +# MIT License + +# Copyright (c) 2019 Chan Zuckerberg Biohub + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# Copied from https://github.com/czbiohub/molecular-cross-validation/blob/master/src/molecular_cross_validation/util.py + + +from typing import Tuple + +import numpy as np + +def split_molecules( + umis: np.ndarray, + data_split: float, + overlap_factor: float = 0.0, + random_state: np.random.RandomState = None, +) -> Tuple[np.ndarray, np.ndarray]: + """Splits molecules into two (potentially overlapping) groups. + :param umis: Array of molecules to split + :param data_split: Proportion of molecules to assign to the first group + :param overlap_factor: Overlap correction factor, if desired + :param random_state: For reproducible sampling + :return: umis_X and umis_Y, representing ``split`` and ``~(1 - split)`` counts + sampled from the input array + """ + if random_state is None: + random_state = np.random.RandomState() + + umis_X_disjoint = random_state.binomial(umis, data_split - overlap_factor) + umis_Y_disjoint = random_state.binomial( + umis - umis_X_disjoint, (1 - data_split) / (1 - data_split + overlap_factor) + ) + overlap_factor = umis - umis_X_disjoint - umis_Y_disjoint + umis_X = umis_X_disjoint + overlap_factor + umis_Y = umis_Y_disjoint + overlap_factor + + return umis_X, umis_Y \ No newline at end of file diff --git a/src/tasks/denoising/process_dataset/script.py b/src/tasks/denoising/process_dataset/script.py new file mode 100644 index 0000000000..94a5884046 --- /dev/null +++ b/src/tasks/denoising/process_dataset/script.py @@ -0,0 +1,75 @@ +import sys +import anndata as ad +import numpy as np + +## VIASH START +par = { + 'input': "resources_test/common/pancreas/dataset.h5ad", + 'output_train': "train.h5ad", + 'output_test': "test.h5ad", + 'train_frac': 0.9, + 'seed': 0 +} +meta = { + "functionality_name": "process_dataset", + "resources_dir": "src/tasks/denoising/process_dataset" +} +## VIASH END + +# add helper scripts to path +sys.path.append(meta["resources_dir"]) +from helper import split_molecules + +# set random state +random_state = 
np.random.RandomState(par['seed']) + +print(">> Load Data", flush=True) +adata = ad.read_h5ad(par["input"]) + +# remove all layers except for counts +for key in list(adata.layers.keys()): + if key != "counts": + del adata.layers[key] + +# round counts and convert to int +counts = np.array(adata.layers["counts"]).round().astype(int) + +print(">> process and split data", flush=True) +train_data, test_data = split_molecules( + counts.data, par["train_frac"], 0.0, random_state +) + +X_train = counts.copy() +X_test = counts.copy() +X_train.data = train_data +X_test.data = test_data +X_train.eliminate_zeros() +X_test.eliminate_zeros() + +# copy adata to train_set, test_set +output_train = ad.AnnData( + layers={"counts": X_train}, + obs=adata.obs[[]], + var=adata.var[[]], + uns={"dataset_id": adata.uns["dataset_id"]} +) +test_uns_keys = ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"] +output_test = ad.AnnData( + layers={"counts": X_test}, + obs=adata.obs[[]], + var=adata.var[[]], + uns={key: adata.uns[key] for key in test_uns_keys} +) + +# store the total count of the train split in the uns of the test set +output_test.uns["train_sum"] = X_train.sum() + +# remove variables (columns) that have zero counts in the train split, from both outputs +is_missing = np.array(X_train.sum(axis=0) == 0) + +output_train = output_train[:, ~is_missing.flatten()] +output_test = output_test[:, ~is_missing.flatten()] + +print(">> Write to file", flush=True) +output_train.write_h5ad(par["output_train"]) +output_test.write_h5ad(par["output_test"]) diff --git a/src/tasks/denoising/resources_scripts/process_datasets.sh b/src/tasks/denoising/resources_scripts/process_datasets.sh new file mode 100755 index 0000000000..44060a8f66 --- /dev/null +++ b/src/tasks/denoising/resources_scripts/process_datasets.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +cat > /tmp/params.yaml << 'HERE' +id: denoising_process_datasets +input_states: s3://openproblems-data/resources/datasets/**/log_cp10k/state.yaml 
+rename_keys: 'input:output_dataset' +settings: '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad"}' +output_state: "$id/state.yaml" +publish_dir: s3://openproblems-data/resources/denoising/datasets +HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' + withName:'.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } + withLabel:highmem { + memory = '350GB' + } +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/denoising/workflows/process_datasets/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config /tmp/nextflow.config \ + --labels denoising,process_datasets \ No newline at end of file diff --git a/src/tasks/denoising/resources_scripts/run_benchmark.sh b/src/tasks/denoising/resources_scripts/run_benchmark.sh new file mode 100755 index 0000000000..983c42cc56 --- /dev/null +++ b/src/tasks/denoising/resources_scripts/run_benchmark.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="s3://openproblems-data/resources/denoising/results/${RUN_ID}" + +# make sure only log_cp10k is used +cat > /tmp/params.yaml << HERE +input_states: s3://openproblems-data/resources/denoising/datasets/**/log_cp10k/state.yaml +rename_keys: 'input_train:output_train,input_test:output_test' +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/denoising/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config src/wf_utils/labels_tw.config \ + --labels denoising,full \ No newline at end of file diff --git 
a/src/tasks/denoising/resources_scripts/run_benchmark_test.sh b/src/tasks/denoising/resources_scripts/run_benchmark_test.sh new file mode 100755 index 0000000000..7f3ecbd3d2 --- /dev/null +++ b/src/tasks/denoising/resources_scripts/run_benchmark_test.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cat > /tmp/params.yaml << 'HERE' +input_states: s3://openproblems-data/resources_test/denoising/**/state.yaml +rename_keys: 'input_train:output_train,input_test:output_test' +output_state: "state.yaml" +publish_dir: s3://openproblems-nextflow/temp/denoising/ +HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/denoising/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config /tmp/nextflow.config \ + --labels denoising,test \ No newline at end of file diff --git a/src/tasks/denoising/resources_test_scripts/pancreas.sh b/src/tasks/denoising/resources_test_scripts/pancreas.sh new file mode 100755 index 0000000000..c737b39c2e --- /dev/null +++ b/src/tasks/denoising/resources_test_scripts/pancreas.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +RAW_DATA=resources_test/common +DATASET_DIR=resources_test/denoising + +mkdir -p $DATASET_DIR + +# process dataset +echo Running process_dataset +nextflow run . 
\ + -main-script target/nextflow/denoising/workflows/process_datasets/main.nf \ + -profile docker \ + -entry auto \ + --input_states "$RAW_DATA/**/state.yaml" \ + --rename_keys 'input:output_dataset' \ + --settings '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad"}' \ + --publish_dir "$DATASET_DIR" \ + --output_state '$id/state.yaml' + +# run one method +viash run src/tasks/denoising/methods/magic/config.vsh.yaml -- \ + --input_train $DATASET_DIR/pancreas/train.h5ad \ + --output $DATASET_DIR/pancreas/denoised.h5ad + +# run one metric +viash run src/tasks/denoising/metrics/poisson/config.vsh.yaml -- \ + --input_denoised $DATASET_DIR/pancreas/denoised.h5ad \ + --input_test $DATASET_DIR/pancreas/test.h5ad \ + --output $DATASET_DIR/pancreas/score.h5ad + +# # run benchmark +# export NXF_VER=22.04.5 + +# nextflow \ +# run . \ +# -main-script src/tasks/denoising/workflows/run/main.nf \ +# -profile docker \ +# -resume \ +# --id pancreas \ +# --input_train $DATASET_DIR/train.h5ad \ +# --input_test $DATASET_DIR/test.h5ad \ +# --output scores.tsv \ +# --publish_dir $DATASET_DIR/ \ No newline at end of file diff --git a/src/tasks/denoising/workflows/process_datasets/config.vsh.yaml b/src/tasks/denoising/workflows/process_datasets/config.vsh.yaml new file mode 100644 index 0000000000..6fc095704b --- /dev/null +++ b/src/tasks/denoising/workflows/process_datasets/config.vsh.yaml @@ -0,0 +1,30 @@ +functionality: + name: "process_datasets" + namespace: "denoising/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--input" + required: true + example: dataset.h5ad + __merge__: "/src/tasks/denoising/api/file_common_dataset.yaml" + - name: Outputs + arguments: + - name: "--output_train" + __merge__: "/src/tasks/denoising/api/file_train.yaml" + direction: output + required: true + - name: "--output_test" + __merge__: "/src/tasks/denoising/api/file_test.yaml" + direction: output + required: true + resources: + - type: nextflow_script + path: main.nf + 
entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: common/check_dataset_schema + - name: denoising/process_dataset +platforms: + - type: nextflow diff --git a/src/tasks/denoising/workflows/process_datasets/main.nf b/src/tasks/denoising/workflows/process_datasets/main.nf new file mode 100644 index 0000000000..4437206b09 --- /dev/null +++ b/src/tasks/denoising/workflows/process_datasets/main.nf @@ -0,0 +1,54 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + | check_dataset_schema.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "input") + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.input, + "schema": schemaYaml + ] + }, + toState: { id, output, state -> + // read the output to see if dataset passed the qc + def checks = readYaml(output.output) + state + [ + "dataset": checks["exit_code"] == 0 ? 
state.input : null, + ] + } + ) + + // remove datasets which didn't pass the schema check + | filter { id, state -> + state.dataset != null + } + + | process_dataset.run( + fromState: [ input: "dataset" ], + toState: [ + output_train: "output_train", + output_test: "output_test" + ] + ) + + // only output the files for which an output file was specified + | setState(["output_train", "output_test"]) + + emit: + output_ch +} diff --git a/src/tasks/denoising/workflows/process_datasets/run_test.sh b/src/tasks/denoising/workflows/process_datasets/run_test.sh new file mode 100755 index 0000000000..ed8484693b --- /dev/null +++ b/src/tasks/denoising/workflows/process_datasets/run_test.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Run this prior to executing this script: +# bin/viash_build -q 'denoising' + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +export NXF_VER=22.04.5 + +nextflow run . 
\ + -main-script target/nextflow/denoising/workflows/process_datasets/main.nf \ + -profile docker \ + -entry auto \ + -c src/wf_utils/labels_ci.config \ + --id run_test \ + --input_states "resources_test/common/**/state.yaml" \ + --rename_keys 'input:output_dataset' \ + --settings '{"output_train": "train.h5ad", "output_test": "test.h5ad"}' \ + --publish_dir "resources_test/denoising" \ No newline at end of file diff --git a/src/tasks/denoising/workflows/run_benchmark/config.vsh.yaml b/src/tasks/denoising/workflows/run_benchmark/config.vsh.yaml new file mode 100644 index 0000000000..5b1cf3dd04 --- /dev/null +++ b/src/tasks/denoising/workflows/run_benchmark/config.vsh.yaml @@ -0,0 +1,67 @@ +functionality: + name: "run_benchmark" + namespace: "denoising/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--input_train" + __merge__: "/src/tasks/denoising/api/file_train.yaml" + required: true + direction: input + - name: "--input_test" + __merge__: "/src/tasks/denoising/api/file_test.yaml" + required: true + direction: input + - name: Outputs + arguments: + - name: "--output_scores" + type: file + required: true + direction: output + description: A yaml file containing the scores of each of the methods + default: score_uns.yaml + - name: "--output_method_configs" + type: file + required: true + direction: output + default: method_configs.yaml + - name: "--output_metric_configs" + type: file + required: true + direction: output + default: metric_configs.yaml + - name: "--output_dataset_info" + type: file + required: true + direction: output + default: dataset_uns.yaml + - name: "--output_task_info" + type: file + required: true + direction: output + default: task_info.yaml + - name: Methods + arguments: + - name: "--method_ids" + type: string + multiple: true + description: A list of method ids to run. If not specified, all methods will be run. 
+ resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: "../../api/task_info.yaml" + dependencies: + - name: common/check_dataset_schema + - name: common/extract_metadata + - name: denoising/control_methods/no_denoising + - name: denoising/control_methods/perfect_denoising + - name: denoising/methods/alra + - name: denoising/methods/dca + - name: denoising/methods/knn_smoothing + - name: denoising/methods/magic + - name: denoising/metrics/mse + - name: denoising/metrics/poisson +platforms: + - type: nextflow diff --git a/src/tasks/denoising/workflows/run_benchmark/main.nf b/src/tasks/denoising/workflows/run_benchmark/main.nf new file mode 100644 index 0000000000..8b8f6ebd8d --- /dev/null +++ b/src/tasks/denoising/workflows/run_benchmark/main.nf @@ -0,0 +1,184 @@ +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // construct list of methods + methods = [ + no_denoising, + perfect_denoising, + alra, + dca, + knn_smoothing, + magic + ] + + // construct list of metrics + metrics = [ + mse, + poisson + ] + + /**************************** + * EXTRACT DATASET METADATA * + ****************************/ + dataset_ch = input_ch + // store join id + | map{ id, state -> + [id, state + ["_meta": [join_id: id]]] + } + + // extract the dataset metadata + | extract_metadata.run( + fromState: [input: "input_test"], + toState: { id, output, state -> + state + [ + dataset_uns: readYaml(output.output).uns + ] + } + ) + + /*************************** + * RUN METHODS AND METRICS * + ***************************/ + score_ch = dataset_ch + + // run all methods + | runEach( + components: methods, + + // use the 'filter' argument to only run a defined method or all methods + filter: { id, state, comp -> + def method_check = !state.method_ids || state.method_ids.contains(comp.config.functionality.name) + + method_check + }, + + // define a 
new 'id' by appending the method name to the dataset id + id: { id, state, comp -> + id + "." + comp.config.functionality.name + }, + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: [ + input_train: "input_train", + input_test: "input_test" + ], + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + method_id: comp.config.functionality.name, + method_output: output.output + ] + } + ) + + // run all metrics + | runEach( + components: metrics, + id: { id, state, comp -> + id + "." + comp.config.functionality.name + }, + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: [ + input_test: "input_test", + input_denoised: "method_output" + ], + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + metric_id: comp.config.functionality.name, + metric_output: output.output + ] + } + ) + + /****************************** + * GENERATE OUTPUT YAML FILES * + ******************************/ + // TODO: can we store everything below in a separate helper function? 
+ // NOTE: the 'denoising' task doesn't use normalized data, + // so code related to normalization_ids is commented out + + // extract the dataset metadata + dataset_meta_ch = dataset_ch + // // only keep one of the normalization methods + // | filter{ id, state -> + // state.dataset_uns.normalization_id == "log_cp10k" + // } + | joinStates { ids, states -> + // store the dataset metadata in a file + def dataset_uns = states.collect{state -> + def uns = state.dataset_uns.clone() + // uns.remove("normalization_id") + uns + } + def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) + def dataset_uns_file = tempFile("dataset_uns.yaml") + dataset_uns_file.write(dataset_uns_yaml_blob) + + ["output", [output_dataset_info: dataset_uns_file]] + } + + output_ch = score_ch + + // extract the scores + | extract_metadata.run( + key: "extract_scores", + fromState: [input: "metric_output"], + toState: { id, output, state -> + state + [ + score_uns: readYaml(output.output).uns + ] + } + ) + + | joinStates { ids, states -> + // store the method configs in a file + def method_configs = methods.collect{it.config} + def method_configs_yaml_blob = toYamlBlob(method_configs) + def method_configs_file = tempFile("method_configs.yaml") + method_configs_file.write(method_configs_yaml_blob) + + // store the metric configs in a file + def metric_configs = metrics.collect{it.config} + def metric_configs_yaml_blob = toYamlBlob(metric_configs) + def metric_configs_file = tempFile("metric_configs.yaml") + metric_configs_file.write(metric_configs_yaml_blob) + + def task_info_file = meta.resources_dir.resolve("task_info.yaml") + + // store the scores in a file + def score_uns = states.collect{it.score_uns} + def score_uns_yaml_blob = toYamlBlob(score_uns) + def score_uns_file = tempFile("score_uns.yaml") + score_uns_file.write(score_uns_yaml_blob) + + def new_state = [ + output_method_configs: method_configs_file, + output_metric_configs: metric_configs_file, + output_task_info: task_info_file, + 
output_scores: score_uns_file, + _meta: states[0]._meta + ] + + ["output", new_state] + } + + // merge all of the output data + | mix(dataset_meta_ch) + | joinStates{ ids, states -> + def mergedStates = states.inject([:]) { acc, m -> acc + m } + [ids[0], mergedStates] + } + + emit: + output_ch +} \ No newline at end of file diff --git a/src/tasks/denoising/workflows/run_benchmark/run_test.sh b/src/tasks/denoising/workflows/run_benchmark/run_test.sh new file mode 100755 index 0000000000..9b31877c52 --- /dev/null +++ b/src/tasks/denoising/workflows/run_benchmark/run_test.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +DATASETS_DIR="resources_test/denoising" +OUTPUT_DIR="output/temp" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +export NXF_VER=22.04.5 +nextflow run . \ + -main-script target/nextflow/denoising/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -entry auto \ + -c src/wf_utils/labels_ci.config \ + --input_states "$DATASETS_DIR/**/state.yaml" \ + --rename_keys 'input_train:output_train,input_test:output_test' \ + --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \ + --publish_dir "$OUTPUT_DIR" \ + --output_state "state.yaml" diff --git a/src/tasks/dimensionality_reduction/README.md b/src/tasks/dimensionality_reduction/README.md new file mode 100644 index 0000000000..c18c5dc5ba --- /dev/null +++ b/src/tasks/dimensionality_reduction/README.md @@ -0,0 +1,376 @@ +# Dimensionality reduction for 2D visualization + + +Reduction of high-dimensional datasets to 2D for visualization & +interpretation + +Path: 
+[`src/tasks/dimensionality_reduction`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/tasks/dimensionality_reduction) + +## Motivation + +Data visualisation is an important part of all stages of single-cell +analysis, from initial quality control to interpretation and +presentation of final results. For bulk RNA-seq studies, linear +dimensionality reduction techniques such as PCA and MDS are commonly +used to visualise the variation between samples. While these methods are +highly effective they can only be used to show the first few components +of variation which cannot fully represent the increased complexity and +number of observations in single-cell datasets. For this reason +non-linear techniques (most notably t-SNE and UMAP) have become the +standard for visualising single-cell studies. These methods attempt to +compress a dataset into a two-dimensional space while attempting to +capture as much of the variance between observations as possible. Many +methods for solving this problem now exist. In general these methods try +to preserve distances, while some additionally consider aspects such as +density within the embedded space or conservation of continuous +trajectories. Despite almost every single-cell study using one of these +visualisations there has been debate as to whether they can effectively +capture the variation in single-cell datasets \[@chari2023speciousart\]. + +## Description + +The dimensionality reduction task attempts to quantify the ability of +methods to embed the information present in complex single-cell studies +into a two-dimensional space. Thus, this task is specifically designed +for dimensionality reduction for visualisation and does not consider +other uses of dimensionality reduction in standard single-cell workflows +such as improving the signal-to-noise ratio (and in fact several of the +methods use PCA as a pre-processing step for this reason). 
Unlike most +tasks, methods for the dimensionality reduction task must accept a +matrix containing expression values normalised to 10,000 counts per cell +and log transformed (log-10k) and produce a two-dimensional coordinate +for each cell. Pre-normalised matrices are required to enforce +consistency between the metric evaluation (which generally requires +normalised data) and the method runs. When these are not consistent, +methods that use the same normalisation as used in the metric tend to +score more highly. For some methods we also evaluate the pre-processing +recommended by the method. + +## Authors & contributors + +| name | roles | +|:-----------------------|:-------------------| +| Luke Zappia | maintainer, author | +| Michal Klein | author | +| Scott Gigante | author | +| Ben DeMeo | author | +| Robrecht Cannoodt | author | +| Kai Waldrant | contributor | +| Sai Nirmayi Yasa | contributor | +| Juan A. Cordero Varela | contributor | + +## API + +``` mermaid +flowchart LR + file_common_dataset("Common dataset") + comp_process_dataset[/"Data processor"/] + file_dataset("Dataset") + file_solution("Test data") + comp_control_method[/"Control method"/] + comp_method[/"Method"/] + comp_metric[/"Metric"/] + file_embedding("Embedding") + file_score("Score") + file_common_dataset---comp_process_dataset + comp_process_dataset-->file_dataset + comp_process_dataset-->file_solution + file_dataset---comp_control_method + file_dataset---comp_method + file_solution---comp_control_method + file_solution---comp_metric + comp_control_method-->file_embedding + comp_method-->file_embedding + comp_metric-->file_score + file_embedding---comp_metric +``` + +## File format: Common dataset + +A dataset processed by the common dataset processing pipeline. + +Example file: `resources_test/common/pancreas/dataset.h5ad` + +Description: + +This dataset contains both raw counts and normalized data matrices, as +well as a PCA embedding, HVG selection and a kNN graph. + +Format: + +
+ + AnnData object + obs: 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'organism', 'organism_ontology_term_id', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'batch', 'soma_joinid', 'size_factors' + var: 'feature_id', 'feature_name', 'soma_joinid', 'hvg', 'hvg_score' + obsm: 'X_pca' + obsp: 'knn_distances', 'knn_connectivities' + varm: 'pca_loadings' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id', 'pca_variance', 'knn' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------------------------------|:----------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `obs["dataset_id"]` | `string` | (*Optional*) Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. | +| `obs["assay"]` | `string` | (*Optional*) Type of assay used to generate the cell data, indicating the methodology or technique employed. | +| `obs["assay_ontology_term_id"]` | `string` | (*Optional*) Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. | +| `obs["cell_type"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `obs["cell_type_ontology_term_id"]` | `string` | (*Optional*) Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. | +| `obs["development_stage"]` | `string` | (*Optional*) Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. | +| `obs["development_stage_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the developmental stage, providing a standardized reference to the organism’s developmental phase. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. 
If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. Otherwise, the Uberon (`UBERON:`) ontology is used. | +| `obs["disease"]` | `string` | (*Optional*) Information on any disease or pathological condition associated with the cell or donor. | +| `obs["disease_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the disease, enabling standardized disease classification and referencing. Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). | +| `obs["donor_id"]` | `string` | (*Optional*) Identifier for the donor from whom the cell sample is obtained. | +| `obs["is_primary_data"]` | `boolean` | (*Optional*) Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. | +| `obs["organism"]` | `string` | (*Optional*) Organism from which the cell sample is obtained. | +| `obs["organism_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the organism, providing a standardized reference for the organism. Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. | +| `obs["self_reported_ethnicity"]` | `string` | (*Optional*) Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. | +| `obs["self_reported_ethnicity_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. | +| `obs["sex"]` | `string` | (*Optional*) Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. 
| +| `obs["sex_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. | +| `obs["suspension_type"]` | `string` | (*Optional*) Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. | +| `obs["tissue"]` | `string` | (*Optional*) Specific tissue from which the cells were derived, key for context and specificity in cell studies. | +| `obs["tissue_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the tissue, providing a standardized reference for the tissue type. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | +| `obs["tissue_general"]` | `string` | (*Optional*) General category or classification of the tissue, useful for broader grouping and comparison of cell data. | +| `obs["tissue_general_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | +| `obs["batch"]` | `string` | (*Optional*) A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. | +| `obs["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. 
| +| `obs["size_factors"]` | `double` | (*Optional*) The size factors created by the normalisation method, if any. | +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually an ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `var["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A score for the feature indicating how highly variable it is. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | +| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | +| `varm["pca_loadings"]` | `double` | The PCA loadings matrix. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalised expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. | +| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. 
| +| `uns["pca_variance"]` | `double` | The PCA variance objects. | +| `uns["knn"]` | `object` | Supplementary K nearest neighbors data. | + +
+ +## Component type: Data processor + +Path: +[`src/dimensionality_reduction`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/dimensionality_reduction) + +A dimensionality reduction dataset processor. + +Arguments: + +
+ +| Name | Type | Description | +|:--------------------|:-------|:---------------------------------------------------------------| +| `--input` | `file` | A dataset processed by the common dataset processing pipeline. | +| `--output_dataset` | `file` | (*Output*) The dataset to pass to a method. | +| `--output_solution` | `file` | (*Output*) The data for evaluating a dimensionality reduction. | + +
+ +## File format: Dataset + +The dataset to pass to a method. + +Example file: +`resources_test/dimensionality_reduction/pancreas/dataset.h5ad` + +Format: + +
+ + AnnData object + var: 'hvg_score' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'normalization_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------|:----------|:-------------------------------------------------------------------------------------| +| `var["hvg_score"]` | `double` | High variability gene score (normalized dispersion). The greater, the more variable. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
+ +## File format: Test data + +The data for evaluating a dimensionality reduction. + +Example file: +`resources_test/dimensionality_reduction/pancreas/solution.h5ad` + +Format: + +
+ + AnnData object + obs: 'cell_type' + var: 'hvg_score' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------------|:----------|:---------------------------------------------------------------------------------------------------------| +| `obs["cell_type"]` | `string` | Classification of the cell type based on its characteristics and function within the tissue or organism. | +| `var["hvg_score"]` | `double` | High variability gene score (normalized dispersion). The greater, the more variable. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
+ +## Component type: Control method + +Path: +[`src/dimensionality_reduction/control_methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/dimensionality_reduction/control_methods) + +Quality control methods for verifying the pipeline. + +Arguments: + +
+ +| Name | Type | Description | +|:-------------------|:-------|:--------------------------------------------------------------| +| `--input` | `file` | The dataset to pass to a method. | +| `--input_solution` | `file` | The data for evaluating a dimensionality reduction. | +| `--output` | `file` | (*Output*) A dataset with dimensionality reduction embedding. | + +
+ +## Component type: Method + +Path: +[`src/dimensionality_reduction/methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/dimensionality_reduction/methods) + +A dimensionality reduction method. + +Arguments: + +
+ +| Name | Type | Description | +|:-----------|:-------|:--------------------------------------------------------------| +| `--input` | `file` | The dataset to pass to a method. | +| `--output` | `file` | (*Output*) A dataset with dimensionality reduction embedding. | + +
+ +## Component type: Metric + +Path: +[`src/dimensionality_reduction/metrics`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/dimensionality_reduction/metrics) + +A dimensionality reduction metric. + +Arguments: + +
+ +| Name | Type | Description | +|:--------------------|:-------|:----------------------------------------------------| +| `--input_embedding` | `file` | A dataset with dimensionality reduction embedding. | +| `--input_solution` | `file` | The data for evaluating a dimensionality reduction. | +| `--output` | `file` | (*Output*) Metric score file. | + +
+ +## File format: Embedding + +A dataset with dimensionality reduction embedding. + +Example file: +`resources_test/dimensionality_reduction/pancreas/embedding.h5ad` + +Format: + +
+ + AnnData object + obsm: 'X_emb' + uns: 'dataset_id', 'method_id', 'normalization_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------|:---------|:-------------------------------------| +| `obsm["X_emb"]` | `double` | The dimensionally reduced embedding. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
+ +## File format: Score + +Metric score file + +Example file: +`resources_test/dimensionality_reduction/pancreas/score.h5ad` + +Format: + +
+ + AnnData object + uns: 'dataset_id', 'normalization_id', 'method_id', 'metric_ids', 'metric_values' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------|:---------|:---------------------------------------------------------------------------------------------| +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | +| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | + +
+ diff --git a/src/tasks/dimensionality_reduction/api/comp_control_method.yaml b/src/tasks/dimensionality_reduction/api/comp_control_method.yaml new file mode 100644 index 0000000000..dfa346752f --- /dev/null +++ b/src/tasks/dimensionality_reduction/api/comp_control_method.yaml @@ -0,0 +1,33 @@ +functionality: + namespace: dimensionality_reduction/control_methods + info: + type: control_method + type_info: + label: Control method + summary: Quality control methods for verifying the pipeline. + description: | + Control methods have the same interface as the regular methods + but also receive the solution object as input. They serve as a + starting point to test the relative accuracy of new methods in + the task, and also as a quality control for the metrics defined + in the task. + arguments: + - name: "--input" + __merge__: file_dataset.yaml + direction: input + required: true + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true + - name: "--output" + __merge__: file_embedding.yaml + direction: output + required: true + test_resources: + - path: /resources_test/dimensionality_reduction/pancreas/ + dest: resources_test/dimensionality_reduction/pancreas/ + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/api/comp_method.yaml b/src/tasks/dimensionality_reduction/api/comp_method.yaml new file mode 100644 index 0000000000..34d63607a4 --- /dev/null +++ b/src/tasks/dimensionality_reduction/api/comp_method.yaml @@ -0,0 +1,27 @@ +functionality: + namespace: dimensionality_reduction/methods + info: + type: method + type_info: + label: Method + summary: A dimensionality reduction method. + description: | + A dimensionality reduction method to summarise the biological + information in a dataset in as few dimensions as possible. 
+ arguments: + - name: "--input" + __merge__: file_dataset.yaml + direction: input + required: true + - name: "--output" + __merge__: file_embedding.yaml + direction: output + required: true + test_resources: + - path: /resources_test/dimensionality_reduction/pancreas/ + dest: resources_test/dimensionality_reduction/pancreas/ + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /src/common/library.bib diff --git a/src/tasks/dimensionality_reduction/api/comp_metric.yaml b/src/tasks/dimensionality_reduction/api/comp_metric.yaml new file mode 100644 index 0000000000..8cd90e4ca1 --- /dev/null +++ b/src/tasks/dimensionality_reduction/api/comp_metric.yaml @@ -0,0 +1,30 @@ +functionality: + namespace: dimensionality_reduction/metrics + info: + type: metric + type_info: + label: Metric + summary: A dimensionality reduction metric. + description: | + A metric for evaluating dimensionality reductions. 
+ arguments: + - name: "--input_embedding" + direction: input + __merge__: file_embedding.yaml + required: true + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true + - name: "--output" + __merge__: file_score.yaml + direction: output + required: true + test_resources: + - path: /resources_test/dimensionality_reduction/pancreas/ + dest: resources_test/dimensionality_reduction/pancreas/ + - type: python_script + path: /src/common/comp_tests/check_metric_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /src/common/library.bib diff --git a/src/tasks/dimensionality_reduction/api/comp_process_dataset.yaml b/src/tasks/dimensionality_reduction/api/comp_process_dataset.yaml new file mode 100644 index 0000000000..1f7b150871 --- /dev/null +++ b/src/tasks/dimensionality_reduction/api/comp_process_dataset.yaml @@ -0,0 +1,27 @@ +functionality: + namespace: dimensionality_reduction + info: + type: process_dataset + type_info: + label: Data processor + summary: A dimensionality reduction dataset processor. + description: | + A component for processing a Common Dataset into a task-specific dataset. 
+ arguments: + - name: "--input" + __merge__: /src/datasets/api/file_common_dataset.yaml + direction: input + required: true + - name: "--output_dataset" + __merge__: file_dataset.yaml + direction: output + required: true + - name: "--output_solution" + __merge__: file_solution.yaml + direction: output + required: true + test_resources: + - path: /resources_test/common/pancreas/ + dest: resources_test/common/pancreas/ + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/api/file_common_dataset.yaml b/src/tasks/dimensionality_reduction/api/file_common_dataset.yaml new file mode 100644 index 0000000000..dba599da9a --- /dev/null +++ b/src/tasks/dimensionality_reduction/api/file_common_dataset.yaml @@ -0,0 +1,58 @@ +type: file +example: "resources_test/dimensionality_reduction/pancreas/dataset.h5ad" +info: + label: "Dataset" + summary: "The dataset to pass to a method." + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: cell_type + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: true + var: + - type: double + name: hvg_score + description: High variability gene score (normalized dispersion). The greater, the more variable. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. 
+ required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/tasks/dimensionality_reduction/api/file_dataset.yaml b/src/tasks/dimensionality_reduction/api/file_dataset.yaml new file mode 100644 index 0000000000..8061f8f0c5 --- /dev/null +++ b/src/tasks/dimensionality_reduction/api/file_dataset.yaml @@ -0,0 +1,29 @@ +type: file +example: "resources_test/dimensionality_reduction/pancreas/dataset.h5ad" +info: + label: "Dataset" + summary: "The dataset to pass to a method." + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + var: + - type: double + name: hvg_score + description: High variability gene score (normalized dispersion). The greater, the more variable. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/tasks/dimensionality_reduction/api/file_embedding.yaml b/src/tasks/dimensionality_reduction/api/file_embedding.yaml new file mode 100644 index 0000000000..c33d76ae8f --- /dev/null +++ b/src/tasks/dimensionality_reduction/api/file_embedding.yaml @@ -0,0 +1,25 @@ +type: file +example: "resources_test/dimensionality_reduction/pancreas/embedding.h5ad" +info: + label: "Embedding" + summary: "A dataset with dimensionality reduction embedding." 
+ slots: + obsm: + - type: double + name: X_emb + description: The dimensionally reduced embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + diff --git a/src/tasks/dimensionality_reduction/api/file_score.yaml b/src/tasks/dimensionality_reduction/api/file_score.yaml new file mode 100644 index 0000000000..71200ef9e1 --- /dev/null +++ b/src/tasks/dimensionality_reduction/api/file_score.yaml @@ -0,0 +1,29 @@ +type: file +example: "resources_test/dimensionality_reduction/pancreas/score.h5ad" +info: + label: "Score" + summary: "Metric score file" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + multiple: true + required: true \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/api/file_solution.yaml b/src/tasks/dimensionality_reduction/api/file_solution.yaml new file mode 100644 index 0000000000..9d08f8fb7a --- /dev/null +++ b/src/tasks/dimensionality_reduction/api/file_solution.yaml @@ -0,0 +1,58 @@ +type: file +example: "resources_test/dimensionality_reduction/pancreas/solution.h5ad" +info: + label: "Test data" + summary: "The data for evaluating a dimensionality reduction." 
+ slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: cell_type + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: true + var: + - type: double + name: hvg_score + description: High variability gene score (normalized dispersion). The greater, the more variable. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. 
+ required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/tasks/dimensionality_reduction/api/task_info.yaml b/src/tasks/dimensionality_reduction/api/task_info.yaml new file mode 100644 index 0000000000..4f24ae9764 --- /dev/null +++ b/src/tasks/dimensionality_reduction/api/task_info.yaml @@ -0,0 +1,73 @@ +name: dimensionality_reduction +label: "Dimensionality reduction for 2D visualization" +v1: + path: openproblems/tasks/dimensionality_reduction/README.md + commit: b353a462f6ea353e0fc43d0f9fcbbe621edc3a0b +summary: Reduction of high-dimensional datasets to 2D for visualization & interpretation +image: "thumbnail.svg" +motivation: | + Data visualisation is an important part of all stages of single-cell analysis, from + initial quality control to interpretation and presentation of final results. For bulk RNA-seq + studies, linear dimensionality reduction techniques such as PCA and MDS are commonly used + to visualise the variation between samples. While these methods are highly effective they + can only be used to show the first few components of variation which cannot fully represent + the increased complexity and number of observations in single-cell datasets. For this reason + non-linear techniques (most notably t-SNE and UMAP) have become the standard for visualising + single-cell studies. These methods attempt to compress a dataset into a two-dimensional space + while attempting to capture as much of the variance between observations as possible. Many + methods for solving this problem now exist. In general these methods try to preserve distances, + while some additionally consider aspects such as density within the embedded space or conservation + of continuous trajectories. Despite almost every single-cell study using one of these visualisations + there has been debate as to whether they can effectively capture the variation in single-cell + datasets [@chari2023speciousart]. 
+description: | + The dimensionality reduction task attempts to quantify the ability of methods to embed the + information present in complex single-cell studies into a two-dimensional space. Thus, this task + is specifically designed for dimensionality reduction for visualisation and does not consider other + uses of dimensionality reduction in standard single-cell workflows such as improving the + signal-to-noise ratio (and in fact several of the methods use PCA as a pre-processing step for this + reason). Unlike most tasks, methods for the dimensionality reduction task must accept a matrix + containing expression values normalised to 10,000 counts per cell and log transformed (log-10k) and + produce a two-dimensional coordinate for each cell. Pre-normalised matrices are required to + enforce consistency between the metric evaluation (which generally requires normalised data) and + the method runs. When these are not consistent, methods that use the same normalisation as used in + the metric tend to score more highly. For some methods we also evaluate the pre-processing + recommended by the method. +authors: + - name: Luke Zappia + roles: [ maintainer, author ] + info: + github: lazappi + - name: Michal Klein + roles: [ author ] + info: + github: michalk8 + - name: Scott Gigante + roles: [ author ] + info: + github: scottgigante + orcid: "0000-0002-4544-2764" + - name: Ben DeMeo + roles: [ author ] + info: + github: bendemeo + - name: Robrecht Cannoodt + roles: [ author ] + info: + github: rcannood + orcid: 0000-0003-3641-729X + - name: Kai Waldrant + roles: [ contributor ] + info: + github: KaiWaldrant + orcid: 0009-0003-8555-1361 + - name: Sai Nirmayi Yasa + roles: [ contributor ] + info: + github: sainirmayi + orcid: 0009-0003-6319-9803 + - name: Juan A. 
Cordero Varela + roles: [ contributor ] + info: + github: jacorvar + orcid: 0000-0002-7373-5433 diff --git a/src/tasks/dimensionality_reduction/api/thumbnail.svg b/src/tasks/dimensionality_reduction/api/thumbnail.svg new file mode 100644 index 0000000000..62911379a1 --- /dev/null +++ b/src/tasks/dimensionality_reduction/api/thumbnail.svg @@ -0,0 +1 @@ +dim-2dim-1 \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/control_methods/random_features/config.vsh.yaml b/src/tasks/dimensionality_reduction/control_methods/random_features/config.vsh.yaml new file mode 100644 index 0000000000..6c0d36ad44 --- /dev/null +++ b/src/tasks/dimensionality_reduction/control_methods/random_features/config.vsh.yaml @@ -0,0 +1,22 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: "random_features" + info: + label: Random Features + summary: "Negative control by randomly embedding into a 2D space." + description: "This method serves as a negative control, where the data is randomly embedded into a two-dimensional space, with no attempt to preserve the original structure." 
+ v1: + path: openproblems/tasks/dimensionality_reduction/methods/baseline.py + commit: 80b37e7a6aa27df4436f400397564c01276817e0 + preferred_normalization: counts + variants: + random_features: + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [midtime, highmem, highcpu] \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/control_methods/random_features/script.py b/src/tasks/dimensionality_reduction/control_methods/random_features/script.py new file mode 100644 index 0000000000..7908207bda --- /dev/null +++ b/src/tasks/dimensionality_reduction/control_methods/random_features/script.py @@ -0,0 +1,34 @@ +import anndata as ad +import numpy as np + +## VIASH START +par = { + "input": "resources_test/dimensionality_reduction/pancreas/test.h5ad", + "output": "reduced.h5ad", +} +meta = { + "functionality_name": "random_features", +} +## VIASH END + +print("Load input data", flush=True) +input = ad.read_h5ad(par["input"]) + +print("Create random embedding", flush=True) +X_emb = np.random.normal(0, 1, (input.shape[0], 2)) + +print("Create output AnnData", flush=True) +output = ad.AnnData( + obs=input.obs[[]], + obsm={ + "X_emb": X_emb + }, + uns={ + "dataset_id": input.uns["dataset_id"], + "normalization_id": input.uns["normalization_id"], + "method_id": meta["functionality_name"] + } +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/control_methods/spectral_features/config.vsh.yaml b/src/tasks/dimensionality_reduction/control_methods/spectral_features/config.vsh.yaml new file mode 100644 index 0000000000..b3ae5aa95b --- /dev/null +++ b/src/tasks/dimensionality_reduction/control_methods/spectral_features/config.vsh.yaml @@ -0,0 +1,41 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: 
"spectral_features" + info: + label: Spectral Features + summary: "Positive control using 1000-dimensional diffusion maps as an embedding." + description: "This serves as a positive control since it uses 1000-dimensional diffusion maps as an embedding" + v1: + path: openproblems/tasks/dimensionality_reduction/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + variants: + spectral_features: + arguments: + - name: "--n_comps" + type: integer + default: 1000 + description: "Number of components to use for the embedding." + - name: t + type: integer + default: 1 + description: "Number to power the eigenvalues by." + - name: n_retries + type: integer + default: 1 + description: "Number of times to retry if the embedding fails, each time adding noise." + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - umap-learn + - scipy + - numpy + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/control_methods/spectral_features/script.py b/src/tasks/dimensionality_reduction/control_methods/spectral_features/script.py new file mode 100644 index 0000000000..cf8633120c --- /dev/null +++ b/src/tasks/dimensionality_reduction/control_methods/spectral_features/script.py @@ -0,0 +1,77 @@ +import anndata as ad +import umap + +## VIASH START +par = { + "input": "resources_test/dimensionality_reduction/pancreas/test.h5ad", + "output": "reduced.h5ad", + "n_comps": 2, +} +meta = { + "functionality_name": "foo", +} +## VIASH END + +def diffusion_map(graph, n_comps, t, n_retries): + import numpy as np + import scipy.sparse.linalg + + diag_data = np.asarray(graph.sum(axis=0)) + identity = scipy.sparse.identity(graph.shape[0], dtype=np.float64) + diag = scipy.sparse.spdiags( + 1.0 / np.sqrt(diag_data), 0, graph.shape[0], graph.shape[0] + ) + laplacian = identity - 
diag * graph * diag + num_lanczos_vectors = max(2 * n_comps + 1, int(np.sqrt(graph.shape[0]))) + try: + eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh( + laplacian, + n_comps, + which="SM", + ncv=num_lanczos_vectors, + tol=1e-4, + v0=np.ones(laplacian.shape[0]), + maxiter=graph.shape[0] * 5, + ) + return (eigenvalues**t) * eigenvectors + except scipy.sparse.linalg.ArpackNoConvergence: + if n_retries > 0: + # add some noise and try again + graph_rand = graph.copy().tocoo() + graph_rand.row = np.random.choice( + graph_rand.shape[0], len(graph_rand.row), replace=True + ) + graph_rand.data *= 0.01 + return diffusion_map( + graph + graph_rand, n_comps, t, n_retries=n_retries - 1 + ) + else: + raise + +print("Load input data", flush=True) +input = ad.read_h5ad(par["input"]) + +print("Create high dimensionally embedding with all features", flush=True) + +n_comps = min(par["n_comps"], min(input.shape) - 2) + +graph = umap.UMAP(transform_mode="graph").fit_transform(input.layers["normalized"]) + +X_emb = diffusion_map(graph, n_comps, t=par["t"], n_retries=par["n_retries"]) + + +print("Create output AnnData", flush=True) +output = ad.AnnData( + obs=input.obs[[]], + obsm={ + "X_emb": X_emb + }, + uns={ + "dataset_id": input.uns["dataset_id"], + "normalization_id": input.uns["normalization_id"], + "method_id": meta["functionality_name"] + } +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/control_methods/true_features/config.vsh.yaml b/src/tasks/dimensionality_reduction/control_methods/true_features/config.vsh.yaml new file mode 100644 index 0000000000..a83d393072 --- /dev/null +++ b/src/tasks/dimensionality_reduction/control_methods/true_features/config.vsh.yaml @@ -0,0 +1,22 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: "true_features" + info: + label: True Features + summary: "Positive control by retaining the 
dimensionality without loss of information." + description: "This serves as a positive control since the original high-dimensional data is retained as is, without any loss of information" + v1: + path: openproblems/tasks/dimensionality_reduction/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + variants: + true_features: + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/control_methods/true_features/script.py b/src/tasks/dimensionality_reduction/control_methods/true_features/script.py new file mode 100644 index 0000000000..1a58cd4984 --- /dev/null +++ b/src/tasks/dimensionality_reduction/control_methods/true_features/script.py @@ -0,0 +1,33 @@ +import anndata as ad + +## VIASH START +par = { + "input": "resources_test/dimensionality_reduction/pancreas/test.h5ad", + "output": "reduced.h5ad", +} +meta = { + "functionality_name": "true_features", +} +## VIASH END + +print("Load input data", flush=True) +input = ad.read_h5ad(par["input"]) + +print("Create high dimensionally embedding with all features", flush=True) +X_emb = input.layers["normalized"].toarray() + +print("Create output AnnData", flush=True) +output = ad.AnnData( + obs=input.obs[[]], + obsm={ + "X_emb": X_emb + }, + uns={ + "dataset_id": input.uns["dataset_id"], + "normalization_id": input.uns["normalization_id"], + "method_id": meta["functionality_name"] + } +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml new file mode 100644 index 0000000000..ff5764a561 --- /dev/null +++ 
b/src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml @@ -0,0 +1,45 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "densmap" + info: + label: densMAP + summary: "Modified UMAP with preservation of local density information" + description: "A modification of UMAP that adds an extra cost term in order to preserve information about the relative local density of the data. It is performed on the same inputs as UMAP." + reference: "narayan2021assessing" + repository_url: https://github.com/lmcinnes/umap + documentation_url: https://github.com/lmcinnes/umap#readme + v1: + path: openproblems/tasks/dimensionality_reduction/methods/umap.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + variants: + densmap_logCP10k: + densmap_pca_logCP10k: + n_pca_dims: 50 + densmap_logCP10k_1kHVG: + n_hvg: 1000 + densmap_pca_logCP10k_1kHVG: + n_pca_dims: 50 + n_hvg: 1000 + arguments: + - name: "--n_hvg" + type: integer + description: Number of highly variable genes to subset to. If not specified, the input matrix will not be subset. + - name: "--n_pca_dims" + type: integer + description: Number of PCA dimensions to use. If not specified, no PCA will be performed. 
+ resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: + - umap-learn + - pynndescent==0.5.11 + - type: native + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/densmap/script.py b/src/tasks/dimensionality_reduction/methods/densmap/script.py new file mode 100644 index 0000000000..985c95d78a --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/densmap/script.py @@ -0,0 +1,54 @@ +import anndata as ad +from umap import UMAP +import scanpy as sc + +## VIASH START +par = { + "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad", + "output": "reduced.h5ad", + "n_pca_dims": 50, + "n_hvg": 1000 +} +meta = { + "functionality_name": "foo", +} +## VIASH END + +print("Load input data", flush=True) +input = ad.read_h5ad(par["input"]) +X_mat = input.layers["normalized"] + +if par["n_hvg"]: + print(f"Select top {par['n_hvg']} high variable genes", flush=True) + idx = input.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] + X_mat = X_mat[:, idx] + +if par["n_pca_dims"]: + print("Apply PCA to normalized data", flush=True) + umap_input = sc.tl.pca( + X_mat, + n_comps=par["n_pca_dims"], + svd_solver="arpack" + ) +else: + print("Use normalized data as input for UMAP", flush=True) + umap_input = X_mat + +print("Run densMAP", flush=True) +X_emb = UMAP(densmap=True, random_state=42).fit_transform(umap_input) + +print("Create output AnnData", flush=True) +output = ad.AnnData( + obs=input.obs[[]], + obsm={ + "X_emb": X_emb + }, + uns={ + "dataset_id": input.uns["dataset_id"], + "normalization_id": input.uns["normalization_id"], + "method_id": meta["functionality_name"] + } +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git 
a/src/tasks/dimensionality_reduction/methods/diffusion_map/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/diffusion_map/config.vsh.yaml new file mode 100644 index 0000000000..ced082c708 --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/diffusion_map/config.vsh.yaml @@ -0,0 +1,31 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: diffusion_map + info: + label: Diffusion Map + summary: Finding meaningful geometric descriptions of datasets using diffusion maps. + description: Implements diffusion map method of data parametrization, including creation and visualization of diffusion map, clustering with diffusion K-means and regression using adaptive regression model. + reference: coifman2006diffusion + documentation_url: https://bioconductor.org/packages/release/bioc/html/destiny.html + repository_url: https://github.com/theislab/destiny + v1: + path: openproblems/tasks/dimensionality_reduction/methods/diffusion_map.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + resources: + - type: r_script + path: script.R + arguments: + - name: "--n_dim" + type: integer + description: Number of dimensions. 
+ default: 3 +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + bioc: destiny + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/diffusion_map/script.R b/src/tasks/dimensionality_reduction/methods/diffusion_map/script.R new file mode 100644 index 0000000000..a9146c8db9 --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/diffusion_map/script.R @@ -0,0 +1,37 @@ +requireNamespace("anndata", quietly = TRUE) +requireNamespace("diffusionMap", quietly = TRUE) + +## VIASH START +par <- list( + input = "resources_test/dimensionality_reduction/pancreas/dataset.h5ad", + output = "output.h5ad", + n_dim = 3 +) +## VIASH END + +cat("Reading input files\n") +input <- anndata::read_h5ad(par$input) + +cat("Running destiny diffusion map\n") +# create SummarizedExperiment object +sce <- SingleCellExperiment::SingleCellExperiment( + assays = list( + logcounts = t(as.matrix(input$layers[["normalized"]])) + ) +) +dm <- destiny::DiffusionMap(sce) +X_emb <- destiny::eigenvectors(dm)[, seq_len(par$n_dim)] + +cat("Write output AnnData to file\n") +output <- anndata::AnnData( + uns = list( + dataset_id = input$uns[["dataset_id"]], + normalization_id = input$uns[["normalization_id"]], + method_id = meta$functionality_name + ), + obsm = list( + X_emb = X_emb + ), + shape = input$shape +) +output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/dimensionality_reduction/methods/ivis/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/ivis/config.vsh.yaml new file mode 100644 index 0000000000..aa3c5ca0b4 --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/ivis/config.vsh.yaml @@ -0,0 +1,44 @@ +# see https://github.com/openproblems-bio/openproblems/blob/9ebb777b3b76337e731a3b99f4bf39462a15c4cc/openproblems/tasks/dimensionality_reduction/methods/ivis.py + +__merge__: ../../api/comp_method.yaml +functionality: + name: "ivis" + info: + label: 
"ivis" + summary: "Structure-preserving dimensionality reduction using a siamese neural network trained on triplets." + description: | + ivis is a machine learning library for reducing dimensionality of very large datasets using Siamese Neural Networks. + ivis preserves global data structures in a low-dimensional space, adds new data points to existing embeddings using + a parametric mapping function, and scales linearly to millions of observations. + reference: szubert2019structurepreserving + repository_url: "https://github.com/beringresearch/ivis" + documentation_url: "https://github.com/beringresearch/ivis#readme" + v1: + path: openproblems/tasks/dimensionality_reduction/methods/ivis.py + commit: 93d2161a08da3edf249abedff5111fb5ce527552 + preferred_normalization: log_cp10k + variants: + ivis_logCPM_1kHVG: + arguments: + - name: '--n_pca_dims' + type: integer + default: 50 + description: Number of principal components of PCA to use. + - name: "--n_hvg" + type: integer + description: Number of highly variable genes to subset to. If not specified, the input matrix will not be subset. + default: 1000 + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: + - ivis[cpu] + - tensorflow<2.16 + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/ivis/script.py b/src/tasks/dimensionality_reduction/methods/ivis/script.py new file mode 100644 index 0000000000..1eade8b74d --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/ivis/script.py @@ -0,0 +1,57 @@ +import anndata as ad +import scanpy as sc +from ivis import Ivis + +# todo: allow using gpus instead! 
+ +## VIASH START +par = { + "input": "resources_test/dimensionality_reduction/pancreas/dataset.h5ad", + "output": "reduced.h5ad", + "n_hvg": 1000, + "n_pca_dims": 50 +} +meta = { + "functionality_name": "foo", +} +## VIASH END + +print("Load input data", flush=True) +input = ad.read_h5ad(par["input"]) +X_mat = input.layers["normalized"] + +if par["n_hvg"]: + print(f"Select top {par['n_hvg']} high variable genes", flush=True) + idx = input.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] + X_mat = X_mat[:, idx] + +print(f"Running PCA with {par['n_pca_dims']} dimensions", flush=True) +X_pca = sc.tl.pca(X_mat, n_comps=par["n_pca_dims"], svd_solver="arpack") + +print("Run ivis", flush=True) +# parameters taken from: +# https://bering-ivis.readthedocs.io/en/latest/scanpy_singlecell.html#reducing-dimensionality-using-ivis +ivis = Ivis( + k=15, + model="maaten", + n_epochs_without_progress=5, + verbose=0, + embedding_dims=2, +) +X_emb = ivis.fit_transform(X_pca) + +print("Create output AnnData", flush=True) +output = ad.AnnData( + obs=input.obs[[]], + obsm={ + "X_emb": X_emb + }, + uns={ + "dataset_id": input.uns["dataset_id"], + "normalization_id": input.uns["normalization_id"], + "method_id": meta["functionality_name"] + } +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/methods/lmds/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/lmds/config.vsh.yaml new file mode 100644 index 0000000000..2b651271a9 --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/lmds/config.vsh.yaml @@ -0,0 +1,44 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: lmds + + info: + label: LMDS + summary: Landmark Multi-Dimensional Scaling + description: | + Landmark Multi-Dimensional Scaling (LMDS) is a method for dimensionality reduction that is based on the concept of multi-dimensional scaling. 
+ LMDS is a non-linear dimensionality reduction method that is based on the concept of multi-dimensional scaling. + preferred_normalization: log_cp10k + reference: saelens2019comparison + documentation_url: https://dynverse.org/lmds/ + repository_url: https://github.com/dynverse/lmds + + arguments: + - name: "--n_dim" + type: integer + description: Number of dimensions. + default: 2 + - name: "--n_landmarks" + type: integer + description: Number of landmarks. + default: 1000 + - name: "--distance_method" + type: string + description: Distance metric to use. + choices: ["euclidean", "pearson", "spearman", "cosine", "chisquared", "hamming", "kullback", "manhattan", "maximum", "canberra", "minkowski"] + default: "pearson" + + resources: + - type: r_script + path: script.R + +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ Matrix, lmds ] + - type: nextflow + directives: + label: [midtime, highmem, midcpu] diff --git a/src/tasks/dimensionality_reduction/methods/lmds/script.R b/src/tasks/dimensionality_reduction/methods/lmds/script.R new file mode 100644 index 0000000000..ae9461c496 --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/lmds/script.R @@ -0,0 +1,39 @@ +requireNamespace("anndata", quietly = TRUE) +requireNamespace("lmds", quietly = TRUE) + +## VIASH START +par <- list( + input = "resources_test/dimensionality_reduction/pancreas/dataset.h5ad", + output = "output.h5ad", + n_dim = 3, + n_landmarks = 1000, + distance_method = "pearson" +) +## VIASH END + +cat("Reading input files\n") +input <- anndata::read_h5ad(par$input) + +# TODO: if we wanted to, we could compute the distance +# matrix in batches. This would be useful for large datasets. 
+cat("Running LMDS\n") +X_emb <- lmds::lmds( + input$layers[["normalized"]], + ndim = par$n_dim, + num_landmarks = par$n_landmarks, + distance_method = par$distance_method +) + +cat("Write output AnnData to file\n") +output <- anndata::AnnData( + uns = list( + dataset_id = input$uns[["dataset_id"]], + method_id = meta$functionality_name, + normalization_id = input$uns[["normalization_id"]] + ), + obsm = list( + X_emb = X_emb + ), + shape = input$shape +) +output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/dimensionality_reduction/methods/neuralee/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/neuralee/config.vsh.yaml new file mode 100644 index 0000000000..0d3d0234c4 --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/neuralee/config.vsh.yaml @@ -0,0 +1,55 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "neuralee" + info: + label: NeuralEE + summary: "Non-linear method that uses a neural network to preserve pairwise distances between data points in a high-dimensional space." + description: | + A neural network implementation of elastic embedding. It is a + non-linear method that preserves pairwise distances between data points. + NeuralEE uses a neural network to optimize an objective function that + measures the difference between pairwise distances in the original + high-dimensional space and the two-dimensional space. It is computed on both + the recommended input from the package authors of 500 HVGs selected from a + logged expression matrix (without sequencing depth scaling) and the default + logCPM matrix with 1000 HVGs. 
+ reference: "xiong2020neuralee" + repository_url: "https://github.com/HiBearME/NeuralEE" + documentation_url: "https://github.com/HiBearME/NeuralEE#readme" + v1: + path: openproblems/tasks/dimensionality_reduction/methods/neuralee.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + variants: + neuralee_default: + normalize: true + n_hvg: 500 + neuralee_logCP10k_1kHVG: + normalize: false + n_hvg: 1000 + arguments: + - name: "--n_iter" + type: integer + description: Number of iterations. + - name: "--n_hvg" + type: integer + description: Number of highly variable genes to subset to. If not specified, the input matrix will not be subset. + default: 1000 + - name: "--normalize" + type: boolean + default: false + description: Whether to perform own normalization + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: + - torch + - "git+https://github.com/michalk8/neuralee@8946abf" + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/neuralee/script.py b/src/tasks/dimensionality_reduction/methods/neuralee/script.py new file mode 100644 index 0000000000..bd13a2f34d --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/neuralee/script.py @@ -0,0 +1,78 @@ +import anndata as ad +import torch +from neuralee.embedding import NeuralEE +from neuralee.dataset import GeneExpressionDataset + +# todo: allow gpu +device = torch.device("cpu") + +## VIASH START +par = { + "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad", + "output": "reduced.h5ad", + "n_hvg": 1000, + "n_iter": 10, + "normalize": True +} +meta = { + "functionality_name": "foo", +} +## VIASH END + +print("Load input data", flush=True) +input = ad.read_h5ad(par["input"]) + +if par["normalize"]: + print("Performing own normalization", flush=True) + # perform own normalization 
based on the "recommended" preprocessing taken from example notebooks, e.g.: + # https://github.com/HiBearME/NeuralEE/blob/master/tests/notebooks/retina_dataset.ipynb + dataset = GeneExpressionDataset(input.layers["counts"]) + dataset.log_shift() + if par["n_hvg"]: + dataset.subsample_genes(par["n_hvg"]) + dataset.standardscale() + +else: + X_mat = input.layers["normalized"] + + if par["n_hvg"]: + print(f"Select top {par['n_hvg']} high variable genes", flush=True) + idx = input.var["hvg_score"].to_numpy().argsort()[-par["n_hvg"]:] + X_mat = X_mat[:, idx] + + print("Using pre-normalized data", flush=True) + dataset = GeneExpressionDataset(X_mat) + + +# estimate the affinity matrix +batch_size = min(1000, input.n_obs) +print(f"Use {batch_size} cells as batch to estimate the affinity matrix", flush=True) +dataset.affinity_split(N_small=batch_size) + +print("Create NeuralEE object", flush=True) +NEE = NeuralEE(dataset, d=2, device=device) +fine_tune_kwargs = dict(verbose=False) + +if par["n_iter"]: + fine_tune_kwargs["maxit"] = par["n_iter"] + +print("Run NeuralEE", flush=True) +res = NEE.fine_tune(**fine_tune_kwargs) + +X_emb = res["X"].detach().cpu().numpy() + +print("Create output AnnData", flush=True) +output = ad.AnnData( + obs=input.obs[[]], + obsm={ + "X_emb": X_emb + }, + uns={ + "dataset_id": input.uns["dataset_id"], + "normalization_id": input.uns["normalization_id"], + "method_id": meta["functionality_name"] + } +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/methods/pca/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/pca/config.vsh.yaml new file mode 100644 index 0000000000..11d3841fb6 --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/pca/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "pca" + info: + label: "PCA" + summary: A linear method that finds 
orthogonal directions to compute the two-dimensional embedding. + description: | + Principal Component Analysis is a linear method that finds orthogonal + directions in the data that capture the most variance. The first two + principal components are chosen as the two-dimensional embedding. We select + only the first two principal components as the two-dimensional embedding. PCA + is calculated on the logCPM expression matrix with and without selecting 1000 + HVGs. + reference: pearson1901pca + repository_url: https://github.com/scikit-learn/scikit-learn + documentation_url: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html + v1: + path: openproblems/tasks/dimensionality_reduction/methods/pca.py + commit: 154ccb9fd99113f3d28d9c3f139194539a0290f9 + preferred_normalization: log_cp10k + variants: + pca_logCP10k: + pca_logCP10k_1kHVG: + n_hvg: 1000 + arguments: + - name: "--n_hvg" + type: integer + description: Number of highly variable genes to subset to. If not specified, the input matrix will not be subset. 
+ resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: scanpy + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/pca/script.py b/src/tasks/dimensionality_reduction/methods/pca/script.py new file mode 100644 index 0000000000..81cff3441f --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/pca/script.py @@ -0,0 +1,41 @@ +import anndata as ad +import scanpy as sc + +## VIASH START +par = { + "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad", + "output": "reduced.h5ad", + "n_hvg": 1000 +} +meta = { + "functionality_name": "foo", +} +## VIASH END + +print("Load input data", flush=True) +input = ad.read_h5ad(par["input"]) +X_mat = input.layers["normalized"] + +if par["n_hvg"]: + print(f"Select top {par['n_hvg']} high variable genes", flush=True) + idx = input.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] + X_mat = X_mat[:, idx] + +print(f"Running PCA", flush=True) +X_emb = sc.tl.pca(X_mat, n_comps=2, svd_solver="arpack")[:, :2] + +print("Create output AnnData", flush=True) +output = ad.AnnData( + obs=input.obs[[]], + obsm={ + "X_emb": X_emb + }, + uns={ + "dataset_id": input.uns["dataset_id"], + "normalization_id": input.uns["normalization_id"], + "method_id": meta["functionality_name"] + } +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/methods/phate/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/phate/config.vsh.yaml new file mode 100644 index 0000000000..ff63659780 --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/phate/config.vsh.yaml @@ -0,0 +1,58 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "phate" + info: + label: PHATE + summary: Preserving trajectories in a 
dataset by using heat diffusion potential. + description: | + PHATE or "Potential of Heat-diffusion for Affinity-based Transition + Embedding" uses the potential of heat diffusion to preserve trajectories in a + dataset via a diffusion process. It is an affinity-based method that + creates an embedding by finding the dominant eigenvalues of a Markov + transition matrix. We evaluate several variants including using the + recommended square-root transformed CPM matrix as input, this input with + the gamma parameter set to zero and the normal logCPM transformed matrix with + and without HVG selection. + reference: "moon2019visualizing" + repository_url: "https://github.com/KrishnaswamyLab/PHATE" + documentation_url: "https://github.com/KrishnaswamyLab/PHATE#readme" + v1: + path: openproblems/tasks/dimensionality_reduction/methods/phate.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: sqrt_cp10k + variants: + phate_default: + phate_sqrt: + gamma: 0 + phate_logCP10k: + preferred_normalization: log_cp10k + phate_logCP10k_1kHVG: + n_hvg: 1000 + preferred_normalization: log_cp10k + arguments: + - name: '--n_pca_dims' + type: integer + default: 50 + description: Number of principal components of PCA to use. + - name: "--n_hvg" + type: integer + description: Number of highly variable genes to subset to. If not specified, the input matrix will not be subset. 
+ - name: '--gamma' + type: double + description: Gamma value + default: 1 + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: + - phate==1.0.* + - scprep + - "scikit-learn<1.2" + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/phate/script.py b/src/tasks/dimensionality_reduction/methods/phate/script.py new file mode 100644 index 0000000000..a21d9e0d87 --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/phate/script.py @@ -0,0 +1,45 @@ +import anndata as ad +from phate import PHATE + +## VIASH START +par = { + "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad", + "output": "reduced.h5ad", + "n_pca_dims": 50, + "n_hvg": 1000, + "gamma": 1 +} +meta = { + "functionality_name": "foo", +} +## VIASH END + +print("Load input data", flush=True) +input = ad.read_h5ad(par["input"]) + +X_mat = input.layers["normalized"] + +if par["n_hvg"]: + print(f"Subsetting to {par['n_hvg']} HVG", flush=True) + idx = input.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] + X_mat = X_mat[:, idx] + +print("Run PHATE", flush=True) +phate_op = PHATE(n_pca=par["n_pca_dims"], verbose=False, n_jobs=-1, gamma=par["gamma"]) +X_emb = phate_op.fit_transform(X_mat) + +print("Create output AnnData", flush=True) +output = ad.AnnData( + obs=input.obs[[]], + obsm={ + "X_emb": X_emb + }, + uns={ + "dataset_id": input.uns["dataset_id"], + "normalization_id": input.uns["normalization_id"], + "method_id": meta["functionality_name"] + } +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/methods/pymde/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/pymde/config.vsh.yaml new file mode 100644 index 0000000000..2f733bb714 --- /dev/null +++ 
b/src/tasks/dimensionality_reduction/methods/pymde/config.vsh.yaml @@ -0,0 +1,41 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: pymde + info: + label: PyMDE + summary: "A Python implementation of Minimum-Distortion Embedding" + description: | + PyMDE is a Python implementation of Minimum-Distortion Embedding. It is a non-linear + method that preserves distances between cells or neighbourhoods in the original space. + reference: agrawal2021mde + repository_url: https://github.com/cvxgrp/pymde + documentation_url: https://pymde.org + v1: + path: openproblems/tasks/dimensionality_reduction/methods/pymde.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + arguments: + - name: --embed_method + type: string + description: The method to use for embedding. Options are 'neighbors' and 'distances'. + default: neighbors + choices: [ neighbors, distances ] + - name: --n_hvg + type: integer + description: Number of highly variable genes to subset to. If not specified, the input matrix will not be subset. + - name: --n_pca_dims + type: integer + description: Number of principal components to use for the initial PCA step. 
+ default: 100 + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: pymde + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/pymde/script.py b/src/tasks/dimensionality_reduction/methods/pymde/script.py new file mode 100644 index 0000000000..612582d8c3 --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/pymde/script.py @@ -0,0 +1,59 @@ +import anndata as ad +import scanpy as sc +import pymde + +## VIASH START +par = { + "input": "resources_test/dimensionality_reduction/pancreas/dataset.h5ad", + "output": "reduced.h5ad", + "embed_method": "neighbors", + "n_hvg": 1000, + "n_pca_dims": 50, +} +meta = { + "functionality_name": "foo", +} +## VIASH END + +if par["embed_method"] == "neighbors": + mde_fn = pymde.preserve_neighbors +elif par["embed_method"] == "distances": + mde_fn = pymde.preserve_distances +else: + raise ValueError(f"Unknown embedding method: {par['embed_method']}") + +print("Load input data", flush=True) +input = ad.read_h5ad(par["input"]) +X_mat = input.layers["normalized"] + +if par["n_hvg"]: + print(f"Select top {par['n_hvg']} high variable genes", flush=True) + idx = input.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] + X_mat = X_mat[:, idx] + +print(f"Compute PCA", flush=True) +X_pca = sc.tl.pca(X_mat, n_comps=par["n_pca_dims"], svd_solver="arpack") + +print(f"Run MDE", flush=True) +X_emb = ( + mde_fn(X_pca, embedding_dim=2, verbose=True) + .embed(verbose=True) + .detach() + .numpy() +) + +print("Create output AnnData", flush=True) +output = ad.AnnData( + obs=input.obs[[]], + obsm={ + "X_emb": X_emb + }, + uns={ + "dataset_id": input.uns["dataset_id"], + "normalization_id": input.uns["normalization_id"], + "method_id": meta["functionality_name"] + } +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], 
compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/methods/simlr/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/simlr/config.vsh.yaml new file mode 100644 index 0000000000..ba4b7b3b84 --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/simlr/config.vsh.yaml @@ -0,0 +1,57 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: simlr + + info: + label: SIMLR + summary: Multikernel-based learning of distance metrics from gene expression data for dimension reduction, clustering and visualization. + description: | + Single-cell Interpretation via Multikernel LeaRning (SIMLR) learns cell-to-cell similarity measures from single-cell RNA-seq data using Gaussian kernels with various hyperparameters in order to perform dimension reduction, clustering and visualization. + SIMLR assumes that if C separable populations exist among the N cells, then the similarity matrix should have an approximate block-diagonal structure with C blocks whereby cells have larger similarities to other cells within the same subpopulations. Learned similarity between two cells should be small if the Euclidean distance between them is large. The cell-to-cell similarity is computed using an optimization framework over an N x N similarity matrix, a low-dimensional auxiliary matrix enforcing a low-rank constraint on the similarity matrix, and the kernel weights. + Dimension reduction is achieved by the stochastic neighbor embedding methodology with the learned similarities as input. + preferred_normalization: log_cp10k + reference: "wang2017visualization" + documentation_url: https://github.com/BatzoglouLabSU/SIMLR/blob/SIMLR/README.md + repository_url: https://github.com/BatzoglouLabSU/SIMLR + + arguments: + - name: "--n_dim" + type: integer + description: Number of dimensions. + - name: "--n_clusters" + type: integer + description: Number of clusters to be estimated over the input dataset. 
+ - name: "--tuning_param" + type: integer + default: 10 + description: Tuning parameter (passed to SIMLR as k). + - name: "--impute" + type: boolean + default: false + description: Should the input data be transposed? + - name: "--normalize" + type: boolean + default: false + description: Should the input data be normalized? + - name: "--cores_ratio" + type: integer + default: 1 + description: Ratio of the number of cores to be used when computing the multi-kernel. + + resources: + - type: r_script + path: script.R + +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + packages: [ grDevices ] + cran: [ Matrix, parallel, Rcpp, pracma, RcppAnnoy, RSpectra, igraph ] + bioc: [ SIMLR ] + - type: native + - type: nextflow + directives: + label: [midtime, highmem, midcpu] diff --git a/src/tasks/dimensionality_reduction/methods/simlr/script.R b/src/tasks/dimensionality_reduction/methods/simlr/script.R new file mode 100644 index 0000000000..0622076c08 --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/simlr/script.R @@ -0,0 +1,69 @@ +requireNamespace("anndata", quietly = TRUE) +requireNamespace("SIMLR", quietly = TRUE) + +## VIASH START +par <- list( + input = "resources_test/dimensionality_reduction/pancreas/dataset.h5ad", + output = "output.h5ad", + n_clusters = NULL, + n_dim = NA, + tuning_param = 10, + impute = FALSE, + normalize = FALSE, + cores_ratio = 1 +) +meta <- list( + functionality_name = "simlr" +) +## VIASH END + +cat("Reading input files\n") +input <- anndata::read_h5ad(par$input) + +X <- t(as.matrix(input$layers[["normalized"]])) + +if (is.null(par$n_clusters)) { + cat("Estimating the number of clusters\n") + set.seed(1) + NUMC = 2:5 + estimates <- SIMLR::SIMLR_Estimate_Number_of_Clusters( + X = X, + NUMC = NUMC, + cores.ratio = par$cores_ratio + ) + n_clusters <- NUMC[which.min(estimates$K2)] +} else { + n_clusters <- par$n_clusters +} + +if (is.null(par$n_dim)) { + n_dim <- NA +} else { + n_dim <- par$n_dim +} + 
+cat("Running 
SIMLR\n") +simlr_result <- SIMLR::SIMLR( + X = X, + c = n_clusters, + no.dim = n_dim, + k = par$tuning_param, + if.impute = par$impute, + normalize = par$normalize, + cores.ratio = par$cores_ratio +) +obsm_X_emb <- simlr_result$ydata + +cat("Write output AnnData to file\n") +output <- anndata::AnnData( + uns = list( + dataset_id = input$uns[["dataset_id"]], + method_id = meta$functionality_name, + normalization_id = input$uns[["normalization_id"]] + ), + obsm = list( + X_emb = obsm_X_emb + ), + shape = input$shape +) +output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/dimensionality_reduction/methods/tsne/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/tsne/config.vsh.yaml new file mode 100644 index 0000000000..cedaba0484 --- /dev/null +++ b/src/tasks/dimensionality_reduction/methods/tsne/config.vsh.yaml @@ -0,0 +1,49 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "tsne" + info: + label: t-SNE + summary: "Minimizing Kullback-Leibler divergence by converting similarities into joint probabilities between data points and the low/high dimensional embedding." + description: | + t-distributed Stochastic Neighbor Embedding converts similarities + between data points to joint probabilities and tries to minimize the + Kullback-Leibler divergence between the joint probabilities of the + low-dimensional embedding and the high-dimensional data. We use the + implementation in the scanpy package with the result of PCA on the logCPM + expression matrix (with and without HVG selection). 
import anndata as ad
import scanpy as sc

## VIASH START
par = {
    "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad",
    "output": "reduced.h5ad",
    "n_pca_dims": 50,
    "n_hvg": 1000
}
meta = {
    "functionality_name": "foo",
}
## VIASH END

# t-SNE component: optional HVG subsetting, then PCA, then scanpy's t-SNE on
# the PCA representation. The embedding is written to obsm["X_emb"].

print("Load input data", flush=True)
adata = ad.read_h5ad(par["input"])

expression = adata.layers["normalized"]

n_hvg = par["n_hvg"]
if n_hvg:
    print(f"Subsetting to {par['n_hvg']} HVG", flush=True)
    # Order gene indices by descending hvg_score and keep the first n_hvg.
    genes_by_score = adata.var["hvg_score"].to_numpy().argsort()[::-1]
    expression = expression[:, genes_by_score[:n_hvg]]

print("Computing PCA", flush=True)
n_pca_dims = par["n_pca_dims"]
adata.obsm["X_pca"] = sc.tl.pca(
    expression,
    n_comps=n_pca_dims,
    svd_solver="arpack",
)

print("Run t-SNE", flush=True)
sc.tl.tsne(adata, use_rep="X_pca", n_pcs=n_pca_dims)
embedding = adata.obsm["X_tsne"].copy()

print("Create output AnnData", flush=True)
output_uns = {
    "dataset_id": adata.uns["dataset_id"],
    "normalization_id": adata.uns["normalization_id"],
    "method_id": meta["functionality_name"],
}
# obs[[]] keeps the observation index but none of the obs columns.
output = ad.AnnData(
    obs=adata.obs[[]],
    obsm={"X_emb": embedding},
    uns=output_uns,
)

print("Write output to file", flush=True)
output.write_h5ad(par["output"], compression="gzip")
import anndata as ad
from umap import UMAP
import scanpy as sc

## VIASH START
par = {
    "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad",
    "output": "reduced.h5ad",
    "n_pca_dims": 50,
    "n_hvg": 1000
}
meta = {
    "functionality_name": "foo",
}
## VIASH END

# UMAP component: optional HVG subsetting and optional PCA, followed by UMAP
# with a fixed random_state. The embedding is written to obsm["X_emb"].

print("Load input data", flush=True)
adata = ad.read_h5ad(par["input"])
expression = adata.layers["normalized"]

n_hvg = par["n_hvg"]
if n_hvg:
    print(f"Select top {par['n_hvg']} high variable genes", flush=True)
    # Order gene indices by descending hvg_score and keep the first n_hvg.
    genes_by_score = adata.var["hvg_score"].to_numpy().argsort()[::-1]
    expression = expression[:, genes_by_score[:n_hvg]]

n_pca_dims = par["n_pca_dims"]
if not n_pca_dims:
    print("Use normalized data as input for UMAP", flush=True)
    umap_input = expression
else:
    print("Apply PCA to normalized data", flush=True)
    umap_input = sc.tl.pca(
        expression,
        n_comps=n_pca_dims,
        svd_solver="arpack",
    )

print("Run UMAP", flush=True)
reducer = UMAP(densmap=False, random_state=42)
embedding = reducer.fit_transform(umap_input)

print("Create output AnnData", flush=True)
output_uns = {
    "dataset_id": adata.uns["dataset_id"],
    "normalization_id": adata.uns["normalization_id"],
    "method_id": meta["functionality_name"],
}
# obs[[]] keeps the observation index but none of the obs columns.
output = ad.AnnData(
    obs=adata.obs[[]],
    obsm={"X_emb": embedding},
    uns=output_uns,
)

print("Write output to file", flush=True)
output.write_h5ad(par["output"], compression="gzip")
Assuming two label assignments X and Y, it is given by: + $MI(X,Y) = \sum_{x=1}^{X}\sum_{y=1}^{Y}p(x,y)log(\frac{P(x,y)}{P(x)P'(y)})$, + where P(x,y) is the joint probability mass function of X and Y, and P(x), P'(y) are the marginal probability mass functions of X and Y respectively. The mutual information is normalized by some generalized mean of H(X) and H(Y). Therefore, Normalized Mutual Information can be defined as: + $NMI(X,Y) = \frac{MI(X,Y)}{mean(H(X),H(Y))}$, + where H(X) and H(Y) are the entropies of X and Y respectively. Higher NMI score suggests that the method is effective in preserving relevant information. + reference: emmons2016analysis + documentation_url: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.normalized_mutual_info_score.html + repository_url: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.normalized_mutual_info_score.html + min: 0 + max: 1 + maximize: true + - name: adjusted_rand_index + label: ARI + summary: Adjusted Rand Index (ARI) is a measure of the similarities between two cluster assignments of the reduced-dimensional embeddings and the true cell types. + description: | + Adjusted Rand Index (ARI) is a measure of similarity between two clusterings by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted (from the reduced dimensional embeddings) and true clusterings (cell type labels). It is the Rand Index (RI) adjusted for chance. + Assuming the C as the cell type labels and K as the clustering of the reduced dimensional embedding, Rand Index can be defined as: + $RI = \frac{a + b}{{C}_{2}^{n_{samples}}}$, + where 'a' is the number of pairs of elements that are in the same set in C and in the same set in K, 'b' is the number of pairs of elements that are in different sets in C and in different sets in K, and ${C}_{2}^{n_{samples}}$ is the total number of possible pairs in the dataset. 
import anndata as ad
import scanpy as sc
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import adjusted_rand_score

## VIASH START
par = {
    'input_embedding': 'resources_test/dimensionality_reduction/pancreas/embedding.h5ad',
    'input_solution': 'resources_test/dimensionality_reduction/pancreas/solution.h5ad',
    'output': 'output.h5ad',
    'nmi_avg_method': 'arithmetic'
}
meta = {
    'functionality_name': 'clustering_performance'
}
## VIASH END

# Clustering-performance metric: Leiden-cluster the embedding at several
# resolutions, keep the clustering with the highest NMI against the
# "cell_type" labels, and report NMI and ARI for that clustering.

print('Reading input files', flush=True)
input_embedding = ad.read_h5ad(par['input_embedding'])
input_solution = ad.read_h5ad(par['input_solution'])

print('Compute metrics', flush=True)

# Perform Leiden clustering on the dimensionality reduction embedding at
# resolutions 0.1, 0.2, ..., 2.0.
n = 20
resolutions = [2 * x / n for x in range(1, n + 1)]

# Start below any attainable score so the first resolution is always
# recorded. Starting at 0 with a strict "<" left key_max as None whenever
# every NMI was exactly 0, which made the obs[key_max] lookup below fail.
score_max = float("-inf")
key_max = None

if "neighbors" not in input_embedding.uns:
    sc.pp.neighbors(input_embedding, use_rep="X_emb")

cell_types = input_solution.obs["cell_type"]

for res in resolutions:
    key_added = f"X_emb_leiden_{res}"
    sc.tl.leiden(input_embedding, resolution=res, key_added=key_added)
    score = normalized_mutual_info_score(
        cell_types,
        input_embedding.obs[key_added],
        average_method=par['nmi_avg_method'],
    )

    # Strict comparison: on ties the lowest resolution wins.
    if score_max < score:
        score_max = score
        key_max = key_added

# NMI of the selected clustering; this is the best score found in the loop,
# so it does not need to be recomputed.
nmi = score_max

# ARI of the same (NMI-selected) clustering.
ari = adjusted_rand_score(cell_types, input_embedding.obs[key_max])

print("Write output AnnData to file", flush=True)
output = ad.AnnData(
    uns={
        'dataset_id': input_embedding.uns['dataset_id'],
        'normalization_id': input_embedding.uns['normalization_id'],
        'method_id': input_embedding.uns['method_id'],
        'metric_ids': [ 'normalized_mutual_information', 'adjusted_rand_index' ],
        'metric_values': [ nmi, ari ]
    }
)
output.write_h5ad(par['output'], compression='gzip')
metric. + info: + metrics: + - name: continuity_at_k30 + label: Continuity at k=30 + reference: venna2006local + summary: "The continuity metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." + description: "The continuity metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." + repository_url: https://github.com/gdkrmr/coRanking/ + documentation_url: https://coranking.guido-kraemer.com/ + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 + note: | + The original v1 implementations consisted of a lot of helper functions which were + derived from the pyDRMetrics package. This version uses the coRanking package + to avoid reimplementing and potentially introducing a lot of bugs in how + the various metrics are computed. + + In addition, the references for each of the metrics were looked up to + properly attribute the original authors of each of the metrics. + - name: trustworthiness_at_k30 + label: Trustworthiness at k=30 + summary: "The trustworthiness metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." + description: "The trustworthiness metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." + repository_url: https://github.com/gdkrmr/coRanking/ + documentation_url: https://coranking.guido-kraemer.com/ + reference: venna2006local + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 + note: | + The original v1 implementations consisted of a lot of helper functions which were + derived from the pyDRMetrics package. This version uses the coRanking package + to avoid reimplementing and potentially introducing a lot of bugs in how + the various metrics are computed. 
+ + In addition, the references for each of the metrics were looked up to + properly attribute the original authors of each of the metrics. + - name: qnx_at_k30 + label: The value for QNX at k=30 + summary: "The QNX metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." + description: "The QNX metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." + repository_url: https://github.com/gdkrmr/coRanking/ + documentation_url: https://coranking.guido-kraemer.com/ + reference: lee2009quality + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 + note: | + The original v1 implementations consisted of a lot of helper functions which were + derived from the pyDRMetrics package. This version uses the coRanking package + to avoid reimplementing and potentially introducing a lot of bugs in how + the various metrics are computed. + + In addition, the references for each of the metrics were looked up to + properly attribute the original authors of each of the metrics. + - name: lcmc_at_k30 + label: The value for LCMC at k=30 + summary: "The LCMC metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." + description: "The LCMC metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." + repository_url: https://github.com/gdkrmr/coRanking/ + documentation_url: https://coranking.guido-kraemer.com/ + reference: chen2009local + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 + note: | + The original v1 implementations consisted of a lot of helper functions which were + derived from the pyDRMetrics package. 
This version uses the coRanking package + to avoid reimplementing and potentially introducing a lot of bugs in how + the various metrics are computed. + + In addition, the references for each of the metrics were looked up to + properly attribute the original authors of each of the metrics. + - name: qnx_auc + label: Area under the QNX curve + summary: "The area under the QNX curve (AU-QNX) computed on the co-ranking matrix between expression matrix and embedding." + description: "The area under the QNX curve (AU-QNX), i.e. the mean of QNX over all neighbourhood sizes, computed on the co-ranking matrix between expression matrix and embedding." + repository_url: https://github.com/gdkrmr/coRanking/ + documentation_url: https://coranking.guido-kraemer.com/ + reference: lueks2011evaluate + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 + note: | + The original v1 implementations consisted of a lot of helper functions which were + derived from the pyDRMetrics package. This version uses the coRanking package + to avoid reimplementing and potentially introducing a lot of bugs in how + the various metrics are computed. + + In addition, the references for each of the metrics were looked up to + properly attribute the original authors of each of the metrics. + - name: qlocal + label: Local quality measure + summary: "The local quality metric computed on the co-ranking matrix between expression matrix and embedding." + description: "The local quality metric computed on the co-ranking matrix between expression matrix and embedding."
+ repository_url: https://github.com/gdkrmr/coRanking/ + documentation_url: https://coranking.guido-kraemer.com/ + reference: lueks2011evaluate + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 + note: | + The original v1 implementations consisted of a lot of helper functions which were + derived from the pyDRMetrics package. This version uses the coRanking package + to avoid reimplementing and potentially introducing a lot of bugs in how + the various metrics are computed. + + In addition, the references for each of the metrics were looked up to + properly attribute the original authors of each of the metrics. + - name: qglobal + label: Global quality measure + summary: "The Global quality metric computed on the co-ranking matrix between expression matrix and embedding." + description: "The Global quality metric computed on the co-ranking matrix between expression matrix and embedding." + repository_url: https://github.com/gdkrmr/coRanking/ + documentation_url: https://coranking.guido-kraemer.com/ + reference: lueks2011evaluate + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 + note: | + The original v1 implementations consisted of a lot of helper functions which were + derived from the pyDRMetrics package. This version uses the coRanking package + to avoid reimplementing and potentially introducing a lot of bugs in how + the various metrics are computed. + + In addition, the references for each of the metrics were looked up to + properly attribute the original authors of each of the metrics. 
library(anndata)
library(coRanking)

## VIASH START
par <- list(
  "input_embedding" = "resources_test/dimensionality_reduction/pancreas/reduced.h5ad",
  "input_solution" = "resources_test/dimensionality_reduction/pancreas/test.h5ad",
  "output" = "score.h5ad"
)
## VIASH END

# Co-ranking metrics component: builds the co-ranking matrix between the
# normalized expression matrix and the embedding and derives seven quality
# scores from it. An embedding containing NA values scores 0 on all metrics.

# Trailing newline added so this message does not run into the next one.
cat("Read anndata objects\n")
input_solution <- anndata::read_h5ad(par[["input_solution"]])
input_embedding <- anndata::read_h5ad(par[["input_embedding"]])

# get datasets
high_dim <- input_solution$layers[["normalized"]]
X_emb <- input_embedding$obsm[["X_emb"]]

if (any(is.na(X_emb))) {
  continuity_at_k30 <-
    trustworthiness_at_k30 <-
    qnx_at_k30 <-
    lcmc_at_k30 <-
    qnx_auc <-
    qlocal <-
    qglobal <-
    0
} else {
  cat("Compute pairwise distances\n")
  # TODO: computing a square distance matrix is problematic for large datasets!
  # TODO: should we use a different distance metric for the high_dim?
  # TODO: or should we subset to the HVG?
  dist_highdim <- coRanking:::euclidean(as.matrix(high_dim))
  dist_emb <- coRanking:::euclidean(as.matrix(X_emb))

  cat("Compute ranking matrices\n")
  rmat_highdim <- rankmatrix(dist_highdim, input = "dist")
  rmat_emb <- rankmatrix(dist_emb, input = "dist")

  cat("Compute coranking matrix\n")
  corank <- coranking(rmat_highdim, rmat_emb, "rank")

  cat("Compute metrics\n")
  # Compute QNX. This is a curve indicating the percentage of points
  # that are mild in- and extrusions or keep their rank.
  qnx <- Q_NX(corank)

  # Calculate the local continuity meta-criterion from a co-ranking matrix.
  lcmc <- LCMC(corank)

  # the values of qnx are split into local and global values by kmax
  kmax <- which.max(lcmc)

  # check certain quality values at k=30
  k30 <- 30
  trustworthiness_at_k30 <- coRanking:::cm.M_T(corank, k30)
  continuity_at_k30 <- coRanking:::cm.M_C(corank, k30)
  qnx_at_k30 <- qnx[[k30]]
  lcmc_at_k30 <- lcmc[[k30]]

  # area under the QNX curve (mean over all neighbourhood sizes)
  qnx_auc <- mean(qnx)

  # local quality measure: mean of QNX up to and including kmax
  qlocal <- mean(qnx[seq_len(kmax)])

  # global quality measure: mean of QNX beyond kmax
  qglobal <- mean(qnx[-seq_len(kmax)])
}

cat("construct output AnnData\n")
output <- AnnData(
  shape = c(0L, 0L),
  uns = list(
    dataset_id = input_solution$uns[["dataset_id"]],
    normalization_id = input_solution$uns[["normalization_id"]],
    method_id = input_embedding$uns[["method_id"]],
    metric_ids = c(
      "continuity_at_k30",
      "trustworthiness_at_k30",
      "qnx_at_k30",
      "lcmc_at_k30",
      "qnx_auc",
      "qlocal",
      "qglobal"
    ),
    metric_values = c(
      continuity_at_k30,
      trustworthiness_at_k30,
      qnx_at_k30,
      lcmc_at_k30,
      qnx_auc,
      qlocal,
      qglobal
    )
  )
)

cat("Write to file\n")
# gzip compression, matching the other components of this task.
output$write_h5ad(par$output, compression = "gzip")
import anndata as ad
import numpy as np
from typing import Optional
from umap import UMAP
from scipy.stats import pearsonr

## VIASH START
par = {
    "input_embedding": "resources_test/dimensionality_reduction/pancreas/reduced.h5ad",
    "input_solution": "resources_test/dimensionality_reduction/pancreas/test.h5ad",
    "output": "score.h5ad",
    "n_neighbors": 30,
    "seed": 42,
}
## VIASH END

# Interpreted from:
# https://github.com/lmcinnes/umap/blob/317ce81dc64aec9e279aa1374ac809d9ced236f6/umap/umap_.py#L1190-L1243
#
# Author: Leland McInnes
#
# License: BSD 3 clause
def _calculate_radii(
    X: np.ndarray,
    n_neighbors: int = 30,
    random_state: Optional[int] = None
) -> np.ndarray:
    """Return the log of the graph-weighted mean neighbor distance per point.

    A euclidean nearest-neighbor search and UMAP's fuzzy simplicial set are
    computed on ``X``. For every vertex, the edge distances are averaged
    using the edge memberships as weights, and the log of that average
    (plus a small epsilon) is returned.
    """
    from umap.umap_ import fuzzy_simplicial_set
    from umap.umap_ import nearest_neighbors

    knn_indices, knn_dists, _ = nearest_neighbors(
        X,
        n_neighbors,
        "euclidean",
        {},
        False,
        random_state,
        verbose=False,
    )

    graph, _, _, graph_dists = fuzzy_simplicial_set(
        X,
        n_neighbors,
        random_state,
        "euclidean",
        {},
        knn_indices,
        knn_dists,
        verbose=False,
        return_dists=True,
    )

    # Work on a de-duplicated COO graph without explicit zero entries.
    graph = graph.tocoo()
    graph.sum_duplicates()
    graph.eliminate_zeros()

    n_vertices = graph.shape[1]

    weight_total = np.zeros(n_vertices, dtype=np.float32)
    weighted_dist = np.zeros(n_vertices, dtype=np.float32)

    # Each stored edge contributes to both of its endpoints.
    for src, dst, mu in zip(graph.row, graph.col, graph.data):
        contribution = mu * graph_dists[src, dst]
        weighted_dist[src] += contribution
        weighted_dist[dst] += contribution
        weight_total[src] += mu
        weight_total[dst] += mu

    epsilon = 1e-8
    return np.log(epsilon + (weighted_dist / weight_total))

def compute_density_preservation(
    X_emb: np.ndarray,
    high_dim: np.ndarray,
    n_neighbors: int = 30,
    random_state: Optional[int] = None
) -> float:
    """Return the Pearson correlation between original and embedded local radii.

    Returns 0.0 without further computation when ``X_emb`` contains NaN.
    """
    has_nan = np.any(np.isnan(X_emb))
    if has_nan:
        return 0.0

    radii_kwargs = dict(n_neighbors=n_neighbors, random_state=random_state)

    print("Compute local radii in original data", flush=True)
    radii_original = _calculate_radii(high_dim, **radii_kwargs)

    print("Compute local radii of embedding", flush=True)
    radii_embedding = _calculate_radii(X_emb, **radii_kwargs)

    print("Compute pearson correlation", flush=True)
    correlation = pearsonr(radii_original, radii_embedding)
    return correlation[0]


print("Load data", flush=True)
input_solution = ad.read_h5ad(par["input_solution"])
input_embedding = ad.read_h5ad(par["input_embedding"])

density_preservation = compute_density_preservation(
    X_emb=input_embedding.obsm["X_emb"],
    high_dim=input_solution.layers["normalized"],
    n_neighbors=par["n_neighbors"],
    random_state=par["seed"]
)

print("Create output AnnData object", flush=True)
score_uns = {
    "dataset_id": input_solution.uns["dataset_id"],
    "normalization_id": input_solution.uns["normalization_id"],
    "method_id": input_embedding.uns["method_id"],
    "metric_ids": [ "density_preservation" ],
    "metric_values": [ density_preservation ]
}
output = ad.AnnData(uns=score_uns)

print("Write data to file", flush=True)
output.write_h5ad(par["output"], compression="gzip")
import anndata as ad
import numpy as np
import sklearn.decomposition
import scipy.stats
import scipy.spatial
from sklearn.metrics import pairwise_distances
import umap
import umap.spectral

## VIASH START
par = {
    "input_embedding": "resources_test/dimensionality_reduction/pancreas/embedding.h5ad",
    "input_solution": "resources_test/dimensionality_reduction/pancreas/solution.h5ad",
    "output": "score.h5ad",
}
## VIASH END

def _distance_correlation(X, X_emb):
    """Spearman correlation between pairwise distances in ``X`` and ``X_emb``.

    Both inputs are reduced to condensed pairwise-distance vectors with
    ``scipy.spatial.distance.pdist``. The full ``scipy.stats.spearmanr``
    result is returned; callers read its ``.correlation`` attribute.
    """
    high_dimensional_distance_vector = scipy.spatial.distance.pdist(X)
    low_dimensional_distance_vector = scipy.spatial.distance.pdist(X_emb)
    return scipy.stats.spearmanr(
        low_dimensional_distance_vector, high_dimensional_distance_vector
    )

print("Load data", flush=True)
input_solution = ad.read_h5ad(par["input_solution"])
input_embedding = ad.read_h5ad(par["input_embedding"])

high_dim = input_solution.layers["normalized"]
X_emb = input_embedding.obsm["X_emb"]

# The previous log messages mentioned an "NNLS residual", which this script
# does not compute; they now describe the distance correlation.
print("Compute distance correlation after SVD", flush=True)
# Clamp the number of SVD components to what the data can support, mirroring
# the clamp used for the spectral embedding below. With a hard-coded 500,
# datasets with fewer cells or genes than that could make TruncatedSVD fail.
n_svd = min(500, min(input_solution.shape) - 1)
svd_emb = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(high_dim)
dist_corr = _distance_correlation(svd_emb, X_emb).correlation

#! Explicitly not changing it to use diffusion map method as this will have a positive effect on the diffusion map method for this specific metric.
print("Compute distance correlation after spectral embedding", flush=True)
n_comps = min(1000, min(input_solution.shape) - 2)
umap_graph = umap.UMAP(transform_mode="graph").fit_transform(high_dim)
# NOTE(review): the generator is unseeded, so this score may differ between
# runs on the same input; confirm whether that is intended.
spectral_emb = umap.spectral.spectral_layout(
    high_dim, umap_graph, n_comps, random_state=np.random.default_rng()
)
dist_corr_spectral = _distance_correlation(spectral_emb, X_emb).correlation

print("Create output AnnData object", flush=True)
output = ad.AnnData(
    uns={
        "dataset_id": input_solution.uns["dataset_id"],
        "normalization_id": input_solution.uns["normalization_id"],
        "method_id": input_embedding.uns["method_id"],
        "metric_ids": [ "distance_correlation", "distance_correlation_spectral" ],
        "metric_values": [ dist_corr, dist_corr_spectral ]
    }
)

print("Write data to file", flush=True)
output.write_h5ad(par["output"], compression="gzip")
+ reference: venna2006local + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + note: This metric is already included in the 'coranking' component and can be removed. + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: + - scikit-learn + - numpy + - type: nextflow + directives: + label: [midtime, highmem, lowcpu] diff --git a/src/tasks/dimensionality_reduction/metrics/trustworthiness/script.py b/src/tasks/dimensionality_reduction/metrics/trustworthiness/script.py new file mode 100644 index 0000000000..410a0b3263 --- /dev/null +++ b/src/tasks/dimensionality_reduction/metrics/trustworthiness/script.py @@ -0,0 +1,37 @@ +import anndata as ad +import numpy as np +from sklearn import manifold + +## VIASH START +par = { + "input_embedding": "resources_test/dimensionality_reduction/pancreas/embedding.h5ad", + "input_solution": "resources_test/dimensionality_reduction/pancreas/solution.h5ad", + "output": "score.h5ad", +} +## VIASH END + +print("Load data", flush=True) +input_solution = ad.read_h5ad(par["input_solution"]) +input_embedding = ad.read_h5ad(par["input_embedding"]) + +high_dim = input_solution.layers["normalized"] +X_emb = input_embedding.obsm["X_emb"] + +print("Compute trustworthiness", flush=True) +trustworthiness = manifold.trustworthiness( + high_dim, X_emb, n_neighbors=15, metric="euclidean" +) + +print("Create output AnnData object", flush=True) +output = ad.AnnData( + uns={ + "dataset_id": input_solution.uns["dataset_id"], + "normalization_id": input_solution.uns["normalization_id"], + "method_id": input_embedding.uns["method_id"], + "metric_ids": [ "trustworthiness" ], + "metric_values": [ trustworthiness ] + } +) + +print("Write data to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") \ No newline at
end of file diff --git a/src/tasks/dimensionality_reduction/process_dataset/config.vsh.yaml b/src/tasks/dimensionality_reduction/process_dataset/config.vsh.yaml new file mode 100644 index 0000000000..d6f62e0c7e --- /dev/null +++ b/src/tasks/dimensionality_reduction/process_dataset/config.vsh.yaml @@ -0,0 +1,13 @@ +__merge__: ../api/comp_process_dataset.yaml +functionality: + name: "process_dataset" + resources: + - type: python_script + path: script.py + - path: /src/common/helper_functions/subset_anndata.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/process_dataset/script.py b/src/tasks/dimensionality_reduction/process_dataset/script.py new file mode 100644 index 0000000000..9563ed56f0 --- /dev/null +++ b/src/tasks/dimensionality_reduction/process_dataset/script.py @@ -0,0 +1,35 @@ +import sys +import anndata as ad + +## VIASH START +par = { + "input": "resources_test/common/pancreas/dataset.h5ad", + "output_dataset": "train.h5ad", + "output_solution": "test.h5ad", +} +meta = { + "functionality_name": "split_data", + "resources_dir": "src/common/helper_functions",  # directory holding subset_anndata.py when run outside viash + "config": "src/tasks/dimensionality_reduction/process_dataset/.config.vsh.yaml" +} +## VIASH END + +# import helper functions +sys.path.append(meta['resources_dir']) +from subset_anndata import read_config_slots_info, subset_anndata + +print(">> Load Data", flush=True) +adata = ad.read_h5ad(par["input"]) + +print(">> Figuring out which data needs to be copied to which output file", flush=True) +slot_info = read_config_slots_info(meta["config"]) + +print(">> Creating train data", flush=True) +output_dataset = subset_anndata(adata, slot_info["output_dataset"]) + +print(">> Creating test data", flush=True) +output_solution = subset_anndata(adata, slot_info["output_solution"]) + +print(">> Writing", flush=True) +output_dataset.write_h5ad(par["output_dataset"]) +output_solution.write_h5ad(par["output_solution"])
diff --git a/src/tasks/dimensionality_reduction/resources_scripts/process_datasets.sh b/src/tasks/dimensionality_reduction/resources_scripts/process_datasets.sh new file mode 100755 index 0000000000..f83056dad6 --- /dev/null +++ b/src/tasks/dimensionality_reduction/resources_scripts/process_datasets.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +cat > /tmp/params.yaml << 'HERE' +id: dimensionality_reduction_process_datasets +input_states: s3://openproblems-data/resources/datasets/**/state.yaml +rename_keys: 'input:output_dataset' +settings: '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad"}' +output_state: "$id/state.yaml" +publish_dir: s3://openproblems-data/resources/dimensionality_reduction/datasets +HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' + withName:'.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } + withLabel:highmem { + memory = '350GB' + } +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/dimensionality_reduction/workflows/process_datasets/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config /tmp/nextflow.config \ + --labels dimensionality_reduction,process_datasets \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/resources_scripts/run_benchmark.sh b/src/tasks/dimensionality_reduction/resources_scripts/run_benchmark.sh new file mode 100755 index 0000000000..02c58d5cc5 --- /dev/null +++ b/src/tasks/dimensionality_reduction/resources_scripts/run_benchmark.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="s3://openproblems-data/resources/dimensionality_reduction/results/${RUN_ID}" + +cat > /tmp/params.yaml << HERE +input_states: s3://openproblems-data/resources/dimensionality_reduction/datasets/**/state.yaml +rename_keys: 
'input_dataset:output_dataset,input_solution:output_solution' +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/dimensionality_reduction/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config src/wf_utils/labels_tw.config \ + --labels dimensionality_reduction,full \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/resources_scripts/run_benchmark_test.sh b/src/tasks/dimensionality_reduction/resources_scripts/run_benchmark_test.sh new file mode 100755 index 0000000000..1c778d345c --- /dev/null +++ b/src/tasks/dimensionality_reduction/resources_scripts/run_benchmark_test.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cat > /tmp/params.yaml << 'HERE' +input_states: s3://openproblems-data/resources_test/dimensionality_reduction/**/state.yaml +rename_keys: 'input_dataset:output_dataset,input_solution:output_solution' +output_state: "state.yaml" +publish_dir: s3://openproblems-nextflow/temp/dimensionality-reduction/ +HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/dimensionality_reduction/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config /tmp/nextflow.config \ + --labels dimensionality_reduction,test \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh b/src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh new file mode 100755 index 0000000000..03ec1659b6 --- /dev/null +++ 
b/src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh @@ -0,0 +1,55 @@ +#!/bin/bash +#make sure the following command has been executed +#viash ns build -q 'dimensionality_reduction|common' + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +RAW_DATA=resources_test/common +DATASET_DIR=resources_test/dimensionality_reduction + +mkdir -p $DATASET_DIR + +# process dataset +echo Running process_dataset +nextflow run . \ + -main-script target/nextflow/dimensionality_reduction/workflows/process_datasets/main.nf \ + -profile docker \ + -entry auto \ + --input_states "$RAW_DATA/**/state.yaml" \ + --rename_keys 'input:output_dataset' \ + --settings '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad"}' \ + --publish_dir "$DATASET_DIR" \ + --output_state '$id/state.yaml' +# output_state should be moved to settings once workaround is solved + + +# run one method +viash run src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml -- \ + --input $DATASET_DIR/pancreas/dataset.h5ad \ + --output $DATASET_DIR/pancreas/embedding.h5ad + +# run one metric +viash run src/tasks/dimensionality_reduction/metrics/distance_correlation/config.vsh.yaml -- \ + --input_embedding $DATASET_DIR/pancreas/embedding.h5ad \ + --input_solution $DATASET_DIR/pancreas/solution.h5ad \ + --output $DATASET_DIR/pancreas/score.h5ad + +# # run benchmark +# export NXF_VER=22.04.5 + +# # after having added a split dataset component +# nextflow \ +# run . 
\ +# -main-script src/tasks/dimensionality_reduction/workflows/run/main.nf \ +# -profile docker \ +# --id pancreas \ +# --input_dataset $DATASET_DIR/dataset.h5ad \ +# --input_solution $DATASET_DIR/solution.h5ad \ +# --output scores.tsv \ +# --publish_dir $DATASET_DIR/ \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/workflows/process_datasets/config.vsh.yaml b/src/tasks/dimensionality_reduction/workflows/process_datasets/config.vsh.yaml new file mode 100644 index 0000000000..d6aa723b00 --- /dev/null +++ b/src/tasks/dimensionality_reduction/workflows/process_datasets/config.vsh.yaml @@ -0,0 +1,30 @@ +functionality: + name: "process_datasets" + namespace: "dimensionality_reduction/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--input" + __merge__: "/src/tasks/dimensionality_reduction/api/file_common_dataset.yaml" + required: true + direction: input + - name: Outputs + arguments: + - name: "--output_dataset" + __merge__: /src/tasks/dimensionality_reduction/api/file_dataset.yaml + required: true + direction: output + - name: "--output_solution" + __merge__: /src/tasks/dimensionality_reduction/api/file_solution.yaml + required: true + direction: output + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: common/check_dataset_schema + - name: dimensionality_reduction/process_dataset +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/workflows/process_datasets/main.nf b/src/tasks/dimensionality_reduction/workflows/process_datasets/main.nf new file mode 100644 index 0000000000..8d34f77e82 --- /dev/null +++ b/src/tasks/dimensionality_reduction/workflows/process_datasets/main.nf @@ -0,0 +1,54 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow 
run_wf { + take: + input_ch + + main: + output_ch = input_ch + + | check_dataset_schema.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "input") + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.input, + "schema": schemaYaml + ] + }, + toState: { id, output, state -> + // read the output to see if dataset passed the qc + def checks = readYaml(output.output) + state + [ + "dataset": checks["exit_code"] == 0 ? state.input : null, + ] + } + ) + + // remove datasets which didn't pass the schema check + | filter { id, state -> + state.dataset != null + } + + | process_dataset.run( + fromState: [input: "dataset"], + toState: [ + output_dataset: "output_dataset", + output_solution: "output_solution" + ] + ) + + // only output the files for which an output file was specified + | setState(["output_dataset", "output_solution"]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/workflows/process_datasets/run_test.sh b/src/tasks/dimensionality_reduction/workflows/process_datasets/run_test.sh new file mode 100644 index 0000000000..d16cd7736f --- /dev/null +++ b/src/tasks/dimensionality_reduction/workflows/process_datasets/run_test.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Run this prior to executing this script: +# bin/viash_build -q 'dimensionality_reduction' + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +export NXF_VER=22.04.5 + +nextflow run .
\ + -main-script target/nextflow/dimensionality_reduction/workflows/process_datasets/main.nf \ + -profile docker \ + -entry auto \ + -c src/wf_utils/labels_ci.config \ + --id run_test \ + --input_states "resources_test/common/**/state.yaml" \ + --rename_keys 'input:output_dataset' \ + --settings '{"output_dataset": "dataset.h5ad", "output_solution": "solution.h5ad"}' \ + --publish_dir "resources_test/dimensionality_reduction" \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/workflows/run_benchmark/config.vsh.yaml b/src/tasks/dimensionality_reduction/workflows/run_benchmark/config.vsh.yaml new file mode 100644 index 0000000000..aa751624d6 --- /dev/null +++ b/src/tasks/dimensionality_reduction/workflows/run_benchmark/config.vsh.yaml @@ -0,0 +1,82 @@ +functionality: + name: "run_benchmark" + namespace: "dimensionality_reduction/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--input_dataset" + __merge__: "/src/tasks/dimensionality_reduction/api/file_dataset.yaml" + required: true + direction: input + - name: "--input_solution" + __merge__: "/src/tasks/dimensionality_reduction/api/file_solution.yaml" + required: true + direction: input + - name: Outputs + arguments: + - name: "--output_scores" + type: file + required: true + direction: output + description: A yaml file containing the scores of each of the methods + default: score_uns.yaml + - name: "--output_method_configs" + type: file + required: true + direction: output + default: method_configs.yaml + - name: "--output_metric_configs" + type: file + required: true + direction: output + default: metric_configs.yaml + - name: "--output_dataset_info" + type: file + required: true + direction: output + default: dataset_uns.yaml + - name: "--output_task_info" + type: file + required: true + direction: output + default: task_info.yaml + - name: Methods + arguments: + - name: "--method_ids" + type: string + multiple: true + description: A list of method ids to run. 
If not specified, all methods will be run. + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: "../../api/task_info.yaml" + dependencies: + - name: common/check_dataset_schema + - name: common/extract_metadata + - name: dimensionality_reduction/control_methods/random_features + - name: dimensionality_reduction/control_methods/spectral_features + - name: dimensionality_reduction/control_methods/true_features + - name: dimensionality_reduction/methods/densmap + - name: dimensionality_reduction/methods/diffusion_map + - name: dimensionality_reduction/methods/ivis + - name: dimensionality_reduction/methods/lmds + - name: dimensionality_reduction/methods/neuralee + - name: dimensionality_reduction/methods/pca + - name: dimensionality_reduction/methods/phate + - name: dimensionality_reduction/methods/pymde + - name: dimensionality_reduction/methods/simlr + - name: dimensionality_reduction/methods/tsne + - name: dimensionality_reduction/methods/umap + - name: dimensionality_reduction/metrics/clustering_performance + - name: dimensionality_reduction/metrics/coranking + - name: dimensionality_reduction/metrics/density_preservation + - name: dimensionality_reduction/metrics/distance_correlation + - name: dimensionality_reduction/metrics/trustworthiness + # test_resources: + # - type: nextflow_script + # path: main.nf + # entrypoint: test_wf +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/workflows/run_benchmark/main.nf b/src/tasks/dimensionality_reduction/workflows/run_benchmark/main.nf new file mode 100644 index 0000000000..1ba9251f9f --- /dev/null +++ b/src/tasks/dimensionality_reduction/workflows/run_benchmark/main.nf @@ -0,0 +1,210 @@ +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // construct list of methods + methods = [ + // controls + 
random_features, + spectral_features, + true_features, + // methods + densmap, + diffusion_map, + ivis, + lmds, + neuralee, + pca, + phate, + pymde, + simlr, + tsne, + umap + ] + + // construct list of metrics + metrics = [ + clustering_performance, + coranking, + density_preservation, + distance_correlation, + trustworthiness + ] + + + /**************************** + * EXTRACT DATASET METADATA * + ****************************/ + dataset_ch = input_ch + // store join id + | map{ id, state -> + [id, state + ["_meta": [join_id: id]]] + } + + // extract the dataset metadata + | extract_metadata.run( + fromState: [input: "input_solution"], + toState: { id, output, state -> + state + [ + dataset_uns: readYaml(output.output).uns + ] + } + ) + + /*************************** + * RUN METHODS AND METRICS * + ***************************/ + score_ch = dataset_ch + + // run all methods + | runEach( + components: methods, + + // use the 'filter' argument to only run a method on the normalisation the component is asking for + filter: { id, state, comp -> + def norm = state.dataset_uns.normalization_id + def pref = comp.config.functionality.info.preferred_normalization + // if the preferred normalisation is none at all, + // we can pass whichever dataset we want + def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref + def method_check = !state.method_ids || state.method_ids.contains(comp.config.functionality.name) + + method_check && norm_check + }, + + // define a new 'id' by appending the method name to the dataset id + id: { id, state, comp -> + id + "." 
+ comp.config.functionality.name + }, + + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: { id, state, comp -> + def new_args = [ + input: state.input_dataset + ] + if (comp.config.functionality.info.type == "control_method") { + new_args.input_solution = state.input_solution + } + new_args + }, + + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + method_id: comp.config.functionality.name, + method_output: output.output + ] + } + ) + + // run all metrics + | runEach( + components: metrics, + id: { id, state, comp -> + id + "." + comp.config.functionality.name + }, + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: { id, state, comp -> + [ + input_solution: state.input_solution, + input_embedding: state.method_output + ] + }, + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + metric_id: comp.config.functionality.name, + metric_output: output.output + ] + } + ) + + /****************************** + * GENERATE OUTPUT YAML FILES * + ******************************/ + // TODO: can we store everything below in a separate helper function? 
+ + // extract the dataset metadata + dataset_meta_ch = dataset_ch + // only keep one of the normalization methods + | filter{ id, state -> + state.dataset_uns.normalization_id == "log_cp10k" + } + | joinStates { ids, states -> + // store the dataset metadata in a file + def dataset_uns = states.collect{state -> + def uns = state.dataset_uns.clone() + uns.remove("normalization_id") + uns + } + def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) + def dataset_uns_file = tempFile("dataset_uns.yaml") + dataset_uns_file.write(dataset_uns_yaml_blob) + + ["output", [output_dataset_info: dataset_uns_file]] + } + + output_ch = score_ch + + // extract the scores + | extract_metadata.run( + key: "extract_scores", + fromState: [input: "metric_output"], + toState: { id, output, state -> + state + [ + score_uns: readYaml(output.output).uns + ] + } + ) + + | joinStates { ids, states -> + // store the method configs in a file + def method_configs = methods.collect{it.config} + def method_configs_yaml_blob = toYamlBlob(method_configs) + def method_configs_file = tempFile("method_configs.yaml") + method_configs_file.write(method_configs_yaml_blob) + + // store the metric configs in a file + def metric_configs = metrics.collect{it.config} + def metric_configs_yaml_blob = toYamlBlob(metric_configs) + def metric_configs_file = tempFile("metric_configs.yaml") + metric_configs_file.write(metric_configs_yaml_blob) + + def task_info_file = meta.resources_dir.resolve("task_info.yaml") + + // store the scores in a file + def score_uns = states.collect{it.score_uns} + def score_uns_yaml_blob = toYamlBlob(score_uns) + def score_uns_file = tempFile("score_uns.yaml") + score_uns_file.write(score_uns_yaml_blob) + + def new_state = [ + output_method_configs: method_configs_file, + output_metric_configs: metric_configs_file, + output_task_info: task_info_file, + output_scores: score_uns_file, + _meta: states[0]._meta + ] + + ["output", new_state] + } + + // merge all of the output data + | 
mix(dataset_meta_ch) + | joinStates{ ids, states -> + def mergedStates = states.inject([:]) { acc, m -> acc + m } + [ids[0], mergedStates] + } + + emit: + output_ch +} diff --git a/src/tasks/dimensionality_reduction/workflows/run_benchmark/run_test.sh b/src/tasks/dimensionality_reduction/workflows/run_benchmark/run_test.sh new file mode 100755 index 0000000000..4bd2b01008 --- /dev/null +++ b/src/tasks/dimensionality_reduction/workflows/run_benchmark/run_test.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +DATASETS_DIR="resources_test/dimensionality_reduction" +OUTPUT_DIR="output/temp" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +export NXF_VER=22.04.5 +nextflow run . \ + -main-script target/nextflow/dimensionality_reduction/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -entry auto \ + -c src/wf_utils/labels_ci.config \ + --input_states "$DATASETS_DIR/**/state.yaml" \ + --rename_keys 'input_dataset:output_dataset,input_solution:output_solution' \ + --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \ + --publish_dir "$OUTPUT_DIR" \ + --output_state "state.yaml" \ No newline at end of file diff --git a/src/tasks/label_projection/README.md b/src/tasks/label_projection/README.md new file mode 100644 index 0000000000..8981c503be --- /dev/null +++ b/src/tasks/label_projection/README.md @@ -0,0 +1,370 @@ +# Label projection + + +Automated cell type annotation from rich, labeled reference data + +Path: +[`src/tasks/label_projection`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/tasks/label_projection) + +## Motivation + +A major challenge for integrating single cell datasets 
is creating +matching cell type annotations for each cell. One of the most common +strategies for annotating cell types is referred to as +[“cluster-then-annotate”](https://www.nature.com/articles/s41576-018-0088-9) +whereby cells are aggregated into clusters based on feature similarity +and then manually characterized based on differential gene expression or +previously identified marker genes. Recently, methods have emerged to +build on this strategy and annotate cells using [known marker +genes](https://www.nature.com/articles/s41592-019-0535-3). However, +these strategies pose a difficulty for integrating atlas-scale datasets +as the particular annotations may not match. + +## Description + +To ensure that the cell type labels in newly generated datasets match +existing reference datasets, some methods align cells to a previously +annotated [reference +dataset](https://academic.oup.com/bioinformatics/article/35/22/4688/54802990) +and then *project* labels from the reference to the new dataset. + +Here, we compare methods for annotation based on a reference dataset. +The datasets consist of two or more samples of single cell profiles that +have been manually annotated with matching labels. These datasets are +then split into training and test batches, and the task of each method +is to train a cell type classifier on the training set and project those +labels onto the test set.
+ +## Authors & contributors + +| name | roles | +|:------------------|:-------------------| +| Nikolay Markov | author, maintainer | +| Scott Gigante | author | +| Robrecht Cannoodt | author | + +## API + +``` mermaid +flowchart LR + file_common_dataset("Common Dataset") + comp_process_dataset[/"Data processor"/] + file_train("Training data") + file_test("Test data") + file_solution("Solution") + comp_control_method[/"Control method"/] + comp_method[/"Method"/] + comp_metric[/"Metric"/] + file_prediction("Prediction") + file_score("Score") + file_common_dataset---comp_process_dataset + comp_process_dataset-->file_train + comp_process_dataset-->file_test + comp_process_dataset-->file_solution + file_train---comp_control_method + file_train---comp_method + file_test---comp_control_method + file_test---comp_method + file_solution---comp_control_method + file_solution---comp_metric + comp_control_method-->file_prediction + comp_method-->file_prediction + comp_metric-->file_score + file_prediction---comp_metric +``` + +## File format: Common Dataset + +A subset of the common dataset. + +Example file: `resources_test/common/pancreas/dataset.h5ad` + +Format: + +
+ + AnnData object + obs: 'cell_type', 'batch' + var: 'hvg', 'hvg_score' + obsm: 'X_pca' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------------|:----------|:-------------------------------------------------------------------------------| +| `obs["cell_type"]` | `string` | Cell type information. | +| `obs["batch"]` | `string` | Batch information. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
+ +## Component type: Data processor + +Path: +[`src/label_projection`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/label_projection) + +A label projection dataset processor. + +Arguments: + +
+ +| Name | Type | Description | +|:--------------------|:-------|:-------------------------------------------| +| `--input` | `file` | A subset of the common dataset. | +| `--output_train` | `file` | (*Output*) The training data. | +| `--output_test` | `file` | (*Output*) The test data (without labels). | +| `--output_solution` | `file` | (*Output*) The solution for the test data. | + +
+ +## File format: Training data + +The training data + +Example file: `resources_test/label_projection/pancreas/train.h5ad` + +Format: + +
+ + AnnData object + obs: 'label', 'batch' + var: 'hvg', 'hvg_score' + obsm: 'X_pca' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'normalization_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------|:----------|:-------------------------------------------------------------------------| +| `obs["label"]` | `string` | Ground truth cell type labels. | +| `obs["batch"]` | `string` | Batch information. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
+ +## File format: Test data + +The test data (without labels) + +Example file: `resources_test/label_projection/pancreas/test.h5ad` + +Format: + +
+ + AnnData object + obs: 'batch' + var: 'hvg', 'hvg_score' + obsm: 'X_pca' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'normalization_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------|:----------|:-------------------------------------------------------------------------| +| `obs["batch"]` | `string` | Batch information. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
+ +## File format: Solution + +The solution for the test data + +Example file: `resources_test/label_projection/pancreas/solution.h5ad` + +Format: + +
+ + AnnData object + obs: 'label', 'batch' + var: 'hvg', 'hvg_score' + obsm: 'X_pca' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------------|:----------|:-------------------------------------------------------------------------------| +| `obs["label"]` | `string` | Ground truth cell type labels. | +| `obs["batch"]` | `string` | Batch information. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
+ +## Component type: Control method + +Path: +[`src/label_projection/control_methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/label_projection/control_methods) + +Quality control methods for verifying the pipeline. + +Arguments: + +
+ +| Name | Type | Description | +|:-------------------|:-------|:--------------------------------| +| `--input_train` | `file` | The training data. | +| `--input_test` | `file` | The test data (without labels). | +| `--input_solution` | `file` | The solution for the test data. | +| `--output` | `file` | (*Output*) The prediction file. | + +
+ +## Component type: Method + +Path: +[`src/label_projection/methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/label_projection/methods) + +A label projection method. + +Arguments: + +
+ +| Name | Type | Description | +|:----------------|:-------|:--------------------------------| +| `--input_train` | `file` | The training data. | +| `--input_test` | `file` | The test data (without labels). | +| `--output` | `file` | (*Output*) The prediction file. | + +
+ +## Component type: Metric + +Path: +[`src/label_projection/metrics`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/label_projection/metrics) + +A label projection metric. + +Arguments: + +
+ +| Name | Type | Description | +|:---------------------|:-------|:--------------------------------| +| `--input_solution` | `file` | The solution for the test data. | +| `--input_prediction` | `file` | The prediction file. | +| `--output` | `file` | (*Output*) Metric score file. | + +
+ +## File format: Prediction + +The prediction file + +Example file: `resources_test/label_projection/pancreas/prediction.h5ad` + +Format: + +
+ + AnnData object + obs: 'label_pred' + uns: 'dataset_id', 'normalization_id', 'method_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------|:---------|:-------------------------------------| +| `obs["label_pred"]` | `string` | Predicted labels for the test cells. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | + +
+ +## File format: Score + +Metric score file + +Example file: `resources_test/label_projection/pancreas/score.h5ad` + +Format: + +
+ + AnnData object + uns: 'dataset_id', 'normalization_id', 'method_id', 'metric_ids', 'metric_values' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------|:---------|:---------------------------------------------------------------------------------------------| +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | +| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | + +
+ diff --git a/src/tasks/label_projection/api/comp_control_method.yaml b/src/tasks/label_projection/api/comp_control_method.yaml new file mode 100644 index 0000000000..d32de4ab2c --- /dev/null +++ b/src/tasks/label_projection/api/comp_control_method.yaml @@ -0,0 +1,38 @@ +functionality: + namespace: "label_projection/control_methods" + info: + type: control_method + type_info: + label: Control method + summary: Quality control methods for verifying the pipeline. + description: | + This folder contains control components for the task. + These components have the same interface as the regular methods + but also receive the solution object as input. It serves as a + starting point to test the relative accuracy of new methods in + the task, and also as a quality control for the metrics defined + in the task. + arguments: + - name: "--input_train" + __merge__: file_train.yaml + direction: input + required: true + - name: "--input_test" + __merge__: file_test.yaml + direction: input + required: true + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true + - name: "--output" + __merge__: file_prediction.yaml + direction: output + required: true + test_resources: + - path: /resources_test/label_projection/pancreas + dest: resources_test/label_projection/pancreas + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py \ No newline at end of file diff --git a/src/tasks/label_projection/api/comp_method.yaml b/src/tasks/label_projection/api/comp_method.yaml new file mode 100644 index 0000000000..1b7cb0dabc --- /dev/null +++ b/src/tasks/label_projection/api/comp_method.yaml @@ -0,0 +1,31 @@ +functionality: + namespace: "label_projection/methods" + info: + type: method + type_info: + label: Method + summary: A label projection method. 
+ description: | + A label projection method to predict the labels of a new "test" + dataset based on an annotated "training" dataset. + arguments: + - name: "--input_train" + __merge__: file_train.yaml + direction: input + required: true + - name: "--input_test" + __merge__: file_test.yaml + direction: input + required: true + - name: "--output" + __merge__: file_prediction.yaml + direction: output + required: true + test_resources: + - path: /resources_test/label_projection/pancreas + dest: resources_test/label_projection/pancreas + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /src/common/library.bib diff --git a/src/tasks/label_projection/api/comp_metric.yaml b/src/tasks/label_projection/api/comp_metric.yaml new file mode 100644 index 0000000000..ce81b0f89f --- /dev/null +++ b/src/tasks/label_projection/api/comp_metric.yaml @@ -0,0 +1,31 @@ +functionality: + namespace: "label_projection/metrics" + info: + type: metric + type_info: + label: Metric + summary: A label projection metric. + description: | + A metric for evaluating predicted labels. 
+ arguments: + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true + - name: "--input_prediction" + __merge__: file_prediction.yaml + direction: input + required: true + - name: "--output" + __merge__: file_score.yaml + required: true + direction: output + test_resources: + - path: /resources_test/label_projection/pancreas + dest: resources_test/label_projection/pancreas + - type: python_script + path: /src/common/comp_tests/check_metric_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /src/common/library.bib + diff --git a/src/tasks/label_projection/api/comp_process_dataset.yaml b/src/tasks/label_projection/api/comp_process_dataset.yaml new file mode 100644 index 0000000000..03c2ea3726 --- /dev/null +++ b/src/tasks/label_projection/api/comp_process_dataset.yaml @@ -0,0 +1,32 @@ +functionality: + namespace: "label_projection" + info: + type: process_dataset + type_info: + label: Data processor + summary: A label projection dataset processor. + description: | + A component for processing a Common Dataset into a task-specific dataset. 
+ arguments: + - name: "--input" + __merge__: file_common_dataset.yaml + direction: input + required: true + - name: "--output_train" + __merge__: file_train.yaml + direction: output + required: true + - name: "--output_test" + __merge__: file_test.yaml + direction: output + required: true + - name: "--output_solution" + __merge__: file_solution.yaml + direction: output + required: true + test_resources: + - path: /resources_test/common/pancreas + dest: resources_test/common/pancreas + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + diff --git a/src/tasks/label_projection/api/file_common_dataset.yaml b/src/tasks/label_projection/api/file_common_dataset.yaml new file mode 100644 index 0000000000..eeb01ffd1e --- /dev/null +++ b/src/tasks/label_projection/api/file_common_dataset.yaml @@ -0,0 +1,72 @@ +type: file +example: "resources_test/common/pancreas/dataset.h5ad" +info: + label: "Common Dataset" + summary: A subset of the common dataset. + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: cell_type + description: Cell type information + required: true + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. 
+ required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + diff --git a/src/tasks/label_projection/api/file_prediction.yaml b/src/tasks/label_projection/api/file_prediction.yaml new file mode 100644 index 0000000000..36efa87af0 --- /dev/null +++ b/src/tasks/label_projection/api/file_prediction.yaml @@ -0,0 +1,24 @@ +type: file +example: "resources_test/label_projection/pancreas/prediction.h5ad" +info: + label: "Prediction" + summary: "The prediction file" + slots: + obs: + - type: string + name: label_pred + description: Predicted labels for the test cells. 
+ required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true diff --git a/src/tasks/label_projection/api/file_score.yaml b/src/tasks/label_projection/api/file_score.yaml new file mode 100644 index 0000000000..7ee5eaa8ee --- /dev/null +++ b/src/tasks/label_projection/api/file_score.yaml @@ -0,0 +1,29 @@ +type: file +example: "resources_test/label_projection/pancreas/score.h5ad" +info: + label: "Score" + summary: "Metric score file" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." 
+ multiple: true + required: true diff --git a/src/tasks/label_projection/api/file_solution.yaml b/src/tasks/label_projection/api/file_solution.yaml new file mode 100644 index 0000000000..c7591678e0 --- /dev/null +++ b/src/tasks/label_projection/api/file_solution.yaml @@ -0,0 +1,71 @@ +type: file +example: "resources_test/label_projection/pancreas/solution.h5ad" +info: + label: "Solution" + summary: "The solution for the test data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obs: + - type: string + name: label + description: Ground truth cell type labels + required: true + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. 
+ required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/tasks/label_projection/api/file_test.yaml b/src/tasks/label_projection/api/file_test.yaml new file mode 100644 index 0000000000..9cb2177da5 --- /dev/null +++ b/src/tasks/label_projection/api/file_test.yaml @@ -0,0 +1,43 @@ +type: file +example: "resources_test/label_projection/pancreas/test.h5ad" +info: + label: "Test data" + summary: "The test data (without labels)" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. 
+ required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/tasks/label_projection/api/file_train.yaml b/src/tasks/label_projection/api/file_train.yaml new file mode 100644 index 0000000000..d615fc5693 --- /dev/null +++ b/src/tasks/label_projection/api/file_train.yaml @@ -0,0 +1,47 @@ +type: file +example: "resources_test/label_projection/pancreas/train.h5ad" +info: + label: "Training data" + summary: "The training data" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obs: + - type: string + name: label + description: Ground truth cell type labels + required: true + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. 
+ required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/tasks/label_projection/api/task_info.yaml b/src/tasks/label_projection/api/task_info.yaml new file mode 100644 index 0000000000..07b6b0120d --- /dev/null +++ b/src/tasks/label_projection/api/task_info.yaml @@ -0,0 +1,46 @@ +name: label_projection +label: Label projection +v1: + path: openproblems/tasks/label_projection/README.md + commit: 817ea64a526c7251f74c9a7a6dba98e8602b94a8 +summary: Automated cell type annotation from rich, labeled reference data +image: "thumbnail.svg" +motivation: | + A major challenge for integrating single cell datasets is creating matching + cell type annotations for each cell. One of the most common strategies for + annotating cell types is referred to as + ["cluster-then-annotate"](https://www.nature.com/articles/s41576-018-0088-9) + whereby cells are aggregated into clusters based on feature similarity and + then manually characterized based on differential gene expression or previously + identified marker genes. Recently, methods have emerged to build on this + strategy and annotate cells using + [known marker genes](https://www.nature.com/articles/s41592-019-0535-3). + However, these strategies pose a difficulty for integrating atlas-scale + datasets as the particular annotations may not match. +description: | + To ensure that the cell type labels in newly generated datasets match + existing reference datasets, some methods align cells to a previously + annotated [reference dataset](https://academic.oup.com/bioinformatics/article/35/22/4688/54802990) + and then _project_ labels from the reference to the new dataset. + + Here, we compare methods for annotation based on a reference dataset. 
+ The datasets consist of two or more samples of single cell profiles that + have been manually annotated with matching labels. These datasets are then + split into training and test batches, and the task of each method is to + train a cell type classifier on the training set and project those labels + onto the test set. +authors: + - name: "Nikolay Markov" + roles: [ author, maintainer ] + info: + github: mxposed + - name: "Scott Gigante" + roles: [ author ] + info: + github: scottgigante + orcid: "0000-0002-4544-2764" + - name: Robrecht Cannoodt + roles: [ author ] + info: + github: rcannood + orcid: "0000-0003-3641-729X" \ No newline at end of file diff --git a/src/tasks/label_projection/api/thumbnail.svg b/src/tasks/label_projection/api/thumbnail.svg new file mode 100644 index 0000000000..3a0c47b5c2 --- /dev/null +++ b/src/tasks/label_projection/api/thumbnail.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/src/tasks/label_projection/control_methods/majority_vote/config.vsh.yaml b/src/tasks/label_projection/control_methods/majority_vote/config.vsh.yaml new file mode 100644 index 0000000000..8f0915a1dd --- /dev/null +++ b/src/tasks/label_projection/control_methods/majority_vote/config.vsh.yaml @@ -0,0 +1,22 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: "majority_vote" + info: + label: Majority Vote + summary: "A control-type method that predicts all cells to belong to the most abundant cell type in the dataset" + description: "A control-type method that predicts all cells to belong to the most abundant cell type in the dataset" + v1: + path: openproblems/tasks/label_projection/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + variants: + majority_vote: + preferred_normalization: counts + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] diff --git 
a/src/tasks/label_projection/control_methods/majority_vote/script.py b/src/tasks/label_projection/control_methods/majority_vote/script.py new file mode 100644 index 0000000000..0fc6446f0d --- /dev/null +++ b/src/tasks/label_projection/control_methods/majority_vote/script.py @@ -0,0 +1,26 @@ +import anndata as ad + +## VIASH START +par = { + 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', + 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'foo' +} +## VIASH END + +print("Load data", flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print("Compute majority vote", flush=True) +majority = input_train.obs.label.value_counts().index[0] + +print("Create prediction object", flush=True) +input_test.obs["label_pred"] = majority + +print("Write output to file", flush=True) +input_test.uns["method_id"] = meta["functionality_name"] +input_test.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/label_projection/control_methods/random_labels/config.vsh.yaml b/src/tasks/label_projection/control_methods/random_labels/config.vsh.yaml new file mode 100644 index 0000000000..728157a644 --- /dev/null +++ b/src/tasks/label_projection/control_methods/random_labels/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: "random_labels" + info: + label: Random Labels + summary: "A negative control, where the labels are randomly predicted." + description: "A negative control, where the labels are randomly predicted without training on the data." 
+ v1: + path: openproblems/tasks/label_projection/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: counts + variants: + random_labels: + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: scanpy + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/label_projection/control_methods/random_labels/script.py b/src/tasks/label_projection/control_methods/random_labels/script.py new file mode 100644 index 0000000000..a57a9d37f2 --- /dev/null +++ b/src/tasks/label_projection/control_methods/random_labels/script.py @@ -0,0 +1,33 @@ +import anndata as ad +import numpy as np + +## VIASH START +par = { + 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', + 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'foo' +} +## VIASH END + +print("Load data", flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print("Compute label distribution", flush=True) +label_distribution = input_train.obs.label.value_counts() +label_distribution = label_distribution / label_distribution.sum() + +print("Create prediction object", flush=True) +input_test.obs["label_pred"] = np.random.choice( + label_distribution.index, + size=input_test.n_obs, + replace=True, + p=label_distribution +) + +print("Write output to file", flush=True) +input_test.uns["method_id"] = meta["functionality_name"] +input_test.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/label_projection/control_methods/true_labels/config.vsh.yaml b/src/tasks/label_projection/control_methods/true_labels/config.vsh.yaml new file mode 100644 index 0000000000..ec536fcc7d --- /dev/null +++ b/src/tasks/label_projection/control_methods/true_labels/config.vsh.yaml @@ -0,0 +1,22 @@ 
+__merge__: ../../api/comp_control_method.yaml +functionality: + name: "true_labels" + info: + label: True labels + summary: "A positive control, where the solution labels are copied 1 to 1 to the predicted data." + description: "A positive control, where the solution labels are copied 1 to 1 to the predicted data." + v1: + path: openproblems/tasks/label_projection/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: counts + variants: + true_labels: + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/label_projection/control_methods/true_labels/script.py b/src/tasks/label_projection/control_methods/true_labels/script.py new file mode 100644 index 0000000000..dc9354c290 --- /dev/null +++ b/src/tasks/label_projection/control_methods/true_labels/script.py @@ -0,0 +1,25 @@ +import anndata as ad + +## VIASH START +par = { + 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', + 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', + 'input_solution': 'resources_test/label_projection/pancreas/test.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'foo' +} +## VIASH END + +print("Load data", flush=True) +# input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) +input_solution = ad.read_h5ad(par['input_solution']) + +print("Create prediction object", flush=True) +input_test.obs["label_pred"] = input_solution.obs["label"] + +print("Write output to file", flush=True) +input_test.uns["method_id"] = meta["functionality_name"] +input_test.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/label_projection/methods/knn/config.vsh.yaml b/src/tasks/label_projection/methods/knn/config.vsh.yaml new file mode 100644 index 0000000000..499fa69e81 --- /dev/null +++ 
b/src/tasks/label_projection/methods/knn/config.vsh.yaml @@ -0,0 +1,37 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "knn" + info: + label: KNN + summary: "Assumes cells with similar gene expression belong to the same cell type, and assigns an unlabelled cell the most common cell type among its k nearest neighbors in PCA space." + description: | + Uses the "k-nearest neighbours" approach, which is a + popular machine learning algorithm for classification and regression tasks. + The assumption underlying KNN in this context is that cells with similar gene + expression profiles tend to belong to the same cell type. For each unlabelled + cell, this method computes the $k$ labelled cells (in this case, 5) with the + smallest distance in PCA space, and assigns that cell the most common cell + type among its $k$ nearest neighbors. + reference : "cover1967nearest" + repository_url: https://github.com/scikit-learn/scikit-learn + documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html" + v1: + path: openproblems/tasks/label_projection/methods/knn_classifier.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + variants: + knn_classifier_log_cp10k: + knn_classifier_scran: + preferred_normalization: log_scran_pooling + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: [scikit-learn, jsonschema] + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/label_projection/methods/knn/script.py b/src/tasks/label_projection/methods/knn/script.py new file mode 100644 index 0000000000..44b8b6f4de --- /dev/null +++ b/src/tasks/label_projection/methods/knn/script.py @@ -0,0 +1,28 @@ +import anndata as ad +import sklearn.neighbors + +## VIASH START +par = { + 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', + 
'input_test': 'resources_test/label_projection/pancreas/test.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print("Fit to train data", flush=True) +classifier = sklearn.neighbors.KNeighborsClassifier() +classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) + +print("Predict on test data", flush=True) +input_test.obs["label_pred"] = classifier.predict(input_test.obsm["X_pca"]) + +print("Write output to file", flush=True) +input_test.uns["method_id"] = meta["functionality_name"] +input_test.write_h5ad(par['output'], compression="gzip") diff --git a/src/tasks/label_projection/methods/logistic_regression/config.vsh.yaml b/src/tasks/label_projection/methods/logistic_regression/config.vsh.yaml new file mode 100644 index 0000000000..88f4c2d5af --- /dev/null +++ b/src/tasks/label_projection/methods/logistic_regression/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "logistic_regression" + info: + label: Logistic Regression + summary: "Logistic Regression with 100-dimensional PCA coordinates estimates parameters for multivariate classification by minimizing cross entropy loss over cell type classes." + description: | + Logistic Regression estimates parameters of a logistic function for + multivariate classification tasks. Here, we use 100-dimensional whitened PCA + coordinates as independent variables, and the model minimises the cross + entropy loss over all cell type classes. 
+ reference: "hosmer2013applied" + repository_url: https://github.com/scikit-learn/scikit-learn + documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" + v1: + path: openproblems/tasks/label_projection/methods/logistic_regression.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + variants: + logistic_regression_log_cp10k: + logistic_regression_scran: + preferred_normalization: log_scran_pooling + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: scikit-learn + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/label_projection/methods/logistic_regression/script.py b/src/tasks/label_projection/methods/logistic_regression/script.py new file mode 100644 index 0000000000..e8796c1b75 --- /dev/null +++ b/src/tasks/label_projection/methods/logistic_regression/script.py @@ -0,0 +1,28 @@ +import anndata as ad +import sklearn.linear_model + +## VIASH START +par = { + 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', + 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print("Fit to train data", flush=True) +classifier = sklearn.linear_model.LogisticRegression() +classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) + +print("Predict on test data", flush=True) +input_test.obs["label_pred"] = classifier.predict(input_test.obsm["X_pca"]) + +print("Write output to file", flush=True) +input_test.uns["method_id"] = meta["functionality_name"] +input_test.write_h5ad(par['output'], compression="gzip") \ No newline at end of file diff --git 
a/src/tasks/label_projection/methods/mlp/config.vsh.yaml b/src/tasks/label_projection/methods/mlp/config.vsh.yaml new file mode 100644 index 0000000000..9c7e92fc68 --- /dev/null +++ b/src/tasks/label_projection/methods/mlp/config.vsh.yaml @@ -0,0 +1,47 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "mlp" + info: + label: Multilayer perceptron + summary: "A neural network with 100-dimensional PCA input, two hidden layers, and gradient descent weight updates to minimize cross entropy loss." + description: | + Multi-Layer Perceptron is a type of artificial neural network that + consists of multiple layers of interconnected neurons. Each neuron computes a + weighted sum of all neurons in the previous layer and transforms it with + nonlinear activation function. The output layer provides the final + prediction, and network weights are updated by gradient descent to minimize + the cross entropy loss. Here, the input data is 100-dimensional whitened PCA + coordinates for each cell, and we use two hidden layers of 100 neurons each. + reference: "hinton1989connectionist" + repository_url: https://github.com/scikit-learn/scikit-learn + documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html" + v1: + path: openproblems/tasks/label_projection/methods/mlp.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + variants: + mlp_log_cp10k: + mlp_scran: + preferred_normalization: log_scran_pooling + arguments: + - name: "--hidden_layer_sizes" + type: "integer" + multiple: true + description: "The ith element represents the number of neurons in the ith hidden layer." 
+ default: [100, 100] + - name: "--max_iter" + type: "integer" + default: 1000 + description: "Maximum number of iterations" + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: scikit-learn + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/label_projection/methods/mlp/script.py b/src/tasks/label_projection/methods/mlp/script.py new file mode 100644 index 0000000000..c98fba3954 --- /dev/null +++ b/src/tasks/label_projection/methods/mlp/script.py @@ -0,0 +1,31 @@ +import anndata as ad +from sklearn.neural_network import MLPClassifier + +## VIASH START +par = { + 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', + 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print("Fit to train data", flush=True) +classifier = MLPClassifier( + max_iter=par["max_iter"], + hidden_layer_sizes=tuple(par["hidden_layer_sizes"]) +) +classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) + +print("Predict on test data", flush=True) +input_test.obs["label_pred"] = classifier.predict(input_test.obsm["X_pca"]) + +print("Write output to file", flush=True) +input_test.uns["method_id"] = meta["functionality_name"] +input_test.write_h5ad(par['output'], compression="gzip") \ No newline at end of file diff --git a/src/tasks/label_projection/methods/naive_bayes/config.vsh.yaml b/src/tasks/label_projection/methods/naive_bayes/config.vsh.yaml new file mode 100644 index 0000000000..90f6e72a52 --- /dev/null +++ b/src/tasks/label_projection/methods/naive_bayes/config.vsh.yaml @@ -0,0 +1,33 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "naive_bayes" + 
info: + label: Naive Bayesian Classifier + summary: "Naive Bayes classification using feature probabilities to project cell type labels from a reference dataset." + description: | + Naive Bayes classification leverages probabilistic models based on Bayes' theorem + to classify cells into different types. In the context of single-cell datasets, this method + utilizes the probabilities of features to project cell type labels from a reference dataset + to new datasets. The algorithm assumes independence between features, making it computationally + efficient and well-suited for high-dimensional data. It is particularly useful for annotating + cells in atlas-scale datasets, ensuring consistency and alignment with existing reference annotations. + reference: "hosmer2013applied" + repository_url: https://github.com/scikit-learn/scikit-learn + documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html" + preferred_normalization: log_cp10k + variants: + naive_bayes_log_cp10k: + naive_bayes_scran: + preferred_normalization: log_scran_pooling + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: scikit-learn + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/tasks/label_projection/methods/naive_bayes/script.py b/src/tasks/label_projection/methods/naive_bayes/script.py new file mode 100644 index 0000000000..542c088dca --- /dev/null +++ b/src/tasks/label_projection/methods/naive_bayes/script.py @@ -0,0 +1,28 @@ +import anndata as ad +import sklearn.naive_bayes + +## VIASH START +par = { + 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', + 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'foo', +} +## VIASH END + +print("Load input data", flush=True) +input_train = 
ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +print("Fit to train data", flush=True) +classifier = sklearn.naive_bayes.GaussianNB() +classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) + +print("Predict on test data", flush=True) +input_test.obs["label_pred"] = classifier.predict(input_test.obsm["X_pca"]) + +print("Write output to file", flush=True) +input_test.uns["method_id"] = meta["functionality_name"] +input_test.write_h5ad(par['output'], compression="gzip") \ No newline at end of file diff --git a/src/tasks/label_projection/methods/scanvi/config.vsh.yaml b/src/tasks/label_projection/methods/scanvi/config.vsh.yaml new file mode 100644 index 0000000000..6c36ead072 --- /dev/null +++ b/src/tasks/label_projection/methods/scanvi/config.vsh.yaml @@ -0,0 +1,46 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "scanvi" + info: + label: scANVI + summary: "scANVI predicts cell type labels for unlabelled test data by leveraging cell type labels, modelling uncertainty and using deep neural networks with stochastic optimization." + description: | + single-cell ANnotation using Variational Inference is a + semi-supervised variant of the scVI(Lopez et al. 2018) algorithm. Like scVI, + scANVI uses deep neural networks and stochastic optimization to model + uncertainty caused by technical noise and bias in single - cell + transcriptomics measurements. However, scANVI also leverages cell type labels + in the generative modelling. In this approach, scANVI is used to predict the + cell type labels of the unlabelled test data. 
+ reference: "lotfollahi2020query" + repository_url: "https://github.com/scverse/scvi-tools" + documentation_url: https://scarches.readthedocs.io/en/latest/scanvi_surgery_pipeline.html + v1: + path: openproblems/tasks/label_projection/methods/scvi_tools.py + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 + preferred_normalization: counts + variants: + scanvi_all_genes: + scanvi_hvg: + num_hvg: 2000 + arguments: + - name: "--num_hvg" + type: integer + description: "The number of HVG genes to subset to." + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_pytorch_nvidia:1.0.0 + setup: + - type: python + packages: + - scarches + - scvi-tools>=1.1.0 + - type: docker + run: | + pip install -U "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + - type: nextflow + directives: + label: [midtime, midmem, highcpu, gpu] diff --git a/src/tasks/label_projection/methods/scanvi/script.py b/src/tasks/label_projection/methods/scanvi/script.py new file mode 100644 index 0000000000..d34fccd932 --- /dev/null +++ b/src/tasks/label_projection/methods/scanvi/script.py @@ -0,0 +1,78 @@ +import anndata as ad +import scarches as sca +import pandas as pd + +# followed procedure from here: +# https://scarches.readthedocs.io/en/latest/scanvi_surgery_pipeline.html + +## VIASH START +par = { + 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', + 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', + 'output': 'output.h5ad', + 'num_hvg': 2000 +} +meta = { + 'functionality_name': 'scanvi' +} +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) + +if par["num_hvg"]: + print("Subsetting to HVG", flush=True) + hvg_idx = input_train.var['hvg_score'].to_numpy().argsort()[::-1][:par["num_hvg"]] # argsort is ascending; reverse to keep the top-scoring genes (assumes higher hvg_score = more variable) + input_train = input_train[:,hvg_idx] + input_test = input_test[:,hvg_idx] +
+print("Concatenating train and test data", flush=True) +input_train.obs['is_test'] = False +input_test.obs['is_test'] = True +input_test.obs['label'] = "Unknown" +adata = ad.concat([input_train, input_test], merge = "same") +del input_train + +print("Create SCANVI model and train it on fully labelled reference dataset", flush=True) +sca.models.SCVI.setup_anndata( + adata, + batch_key="batch", + labels_key="label", + layer="counts" +) + +vae = sca.models.SCVI( + adata, + n_layers=2, + encode_covariates=True, + deeply_inject_covariates=False, + use_layer_norm="both", + use_batch_norm="none", +) + +print("Create the SCANVI model instance with ZINB loss", flush=True) +scanvae = sca.models.SCANVI.from_scvi_model(vae, unlabeled_category = "Unknown") + +print("Train SCANVI model", flush=True) +scanvae.train() + +print("Make predictions", flush=True) +preds = scanvae.predict(adata) + +print("Store outputs", flush=True) +output = ad.AnnData( + obs=pd.DataFrame( + {"label_pred": preds[adata.obs['is_test'].values]}, + index=input_test.obs.index, + ), + var=input_test.var[[]], + uns={ + "dataset_id": input_test.uns["dataset_id"], + "normalization_id": input_test.uns["normalization_id"], + "method_id": meta["functionality_name"], + }, +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/label_projection/methods/scanvi_scarches/config.vsh.yaml b/src/tasks/label_projection/methods/scanvi_scarches/config.vsh.yaml new file mode 100644 index 0000000000..ccf2f449b4 --- /dev/null +++ b/src/tasks/label_projection/methods/scanvi_scarches/config.vsh.yaml @@ -0,0 +1,53 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: scanvi_scarches + info: + label: scANVI+scArches + summary: 'Query to reference single-cell integration with transfer learning with scANVI and scArches' + description: 'scArches+scANVI or "Single-cell architecture surgery" is a deep learning method for mapping new datasets onto a 
pre-existing reference model, using transfer learning and parameter optimization. It first uses scANVI to build a reference model from the training data, and then apply scArches to map the test data onto the reference model and make predictions.' + reference: lotfollahi2020query + documentation_url: https://docs.scvi-tools.org + repository_url: https://github.com/scverse/scvi-tools + preferred_normalization: counts + v1: + path: openproblems/tasks/label_projection/methods/scvi_tools.py + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 + variants: + scanvi_scarches: + arguments: + - name: "--n_latent" + type: "integer" + default: 30 + description: "Number of units in the latent layer" + - name: "--n_layers" + type: "integer" + default: 2 + description: "Number of hidden layers" + - name: "--n_hidden" + type: "integer" + default: 128 + description: "Number of units in the hidden layers" + - name: "--dropout_rate" + type: "double" + default: 0.2 + description: "Rate of dropout applied in training" + - name: "--max_epochs" + type: "integer" + default: 2 + description: "Maximum number of training epochs" + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_pytorch_nvidia:1.0.0 + setup: + - type: python + pypi: scvi-tools>=1.1.0 + - type: docker + run: | + pip install -U "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + - type: nextflow + directives: + label: [midtime, midmem, midcpu, gpu] diff --git a/src/tasks/label_projection/methods/scanvi_scarches/script.py b/src/tasks/label_projection/methods/scanvi_scarches/script.py new file mode 100644 index 0000000000..73c9c0f1fa --- /dev/null +++ b/src/tasks/label_projection/methods/scanvi_scarches/script.py @@ -0,0 +1,61 @@ +import anndata as ad +import numpy as np +import scvi + +## VIASH START +par = { + "input_train": "resources_test/label_projection/pancreas/train.h5ad", + "input_test": 
"resources_test/label_projection/pancreas/test.h5ad", + "output": "output.h5ad", + "n_latent": 30, + "n_layers": 2, + "n_hidden": 128, + "dropout_rate": 0.2, + "max_epochs": 200, +} +meta = {"functionality_name": "scanvi_scarches"} +## VIASH END + +print("Reading input files", flush=True) +input_train = ad.read_h5ad(par["input_train"]) +input_test = ad.read_h5ad(par["input_test"]) +input_train.X = input_train.layers["counts"] +input_test.X = input_test.layers["counts"] + +print("Train model", flush=True) +unlabeled_category = "Unknown" + +scvi.model.SCVI.setup_anndata(input_train, batch_key="batch", labels_key="label") + +# specific scArches parameters +arches_params = dict( + use_layer_norm="both", + use_batch_norm="none", + encode_covariates=True, + dropout_rate=par["dropout_rate"], + n_hidden=par["n_hidden"], + n_layers=par["n_layers"], + n_latent=par["n_latent"], +) +scvi_model = scvi.model.SCVI(input_train, **arches_params) +train_kwargs = dict( + train_size=0.9, + early_stopping=True, +) +scvi_model.train(**train_kwargs) +model = scvi.model.SCANVI.from_scvi_model( + scvi_model, unlabeled_category=unlabeled_category +) +model.train(**train_kwargs) + +query_model = scvi.model.SCANVI.load_query_data(input_test, model) +train_kwargs = dict(max_epochs=par["max_epochs"], early_stopping=True) +query_model.train(plan_kwargs=dict(weight_decay=0.0), **train_kwargs) + +print("Generate predictions", flush=True) +input_test.obs["label"] = "Unknown" +input_test.obs["label_pred"] = query_model.predict(input_test) + +print("Write output AnnData to file", flush=True) +input_test.uns["method_id"] = meta["functionality_name"] +input_test.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/label_projection/methods/seurat_transferdata/config.vsh.yaml b/src/tasks/label_projection/methods/seurat_transferdata/config.vsh.yaml new file mode 100644 index 0000000000..d51b532917 --- /dev/null +++ b/src/tasks/label_projection/methods/seurat_transferdata/config.vsh.yaml @@
-0,0 +1,36 @@ +__merge__: ../../api/comp_method.yaml +functionality: + status: disabled + name: "seurat_transferdata" + info: + label: Seurat TransferData + summary: "Seurat reference mapping predicts cell types for unlabelled cells using PCA distances, labelled anchors, and transfer anchors from Seurat, with SCTransform normalization." + description: | + Seurat reference mapping is a cell type label transfer method provided by the + Seurat package. Gene expression counts are first normalised by SCTransform + before computing PCA. Then it finds mutual nearest neighbours, known as + transfer anchors, between the labelled and unlabelled part of the data in PCA + space, and computes each cell's distance to each of the anchor pairs. + Finally, it uses the labelled anchors to predict cell types for unlabelled + cells based on these distances. + reference: "hao2021integrated" + repository_url: "https://github.com/satijalab/seurat" + documentation_url: "https://satijalab.org/seurat/articles/integration_mapping.html" + v1: + path: openproblems/tasks/label_projection/methods/seurat.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + preferred_normalization: log_cp10k + variants: + seurat: + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ Matrix>=1.5.3, Seurat, rlang ] + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/label_projection/methods/seurat_transferdata/script.R b/src/tasks/label_projection/methods/seurat_transferdata/script.R new file mode 100644 index 0000000000..999eb769ce --- /dev/null +++ b/src/tasks/label_projection/methods/seurat_transferdata/script.R @@ -0,0 +1,81 @@ +cat(">> Loading dependencies\n") +library(Matrix, warn.conflicts = FALSE) +library(anndata, warn.conflicts = FALSE) +requireNamespace("Seurat", quietly = TRUE) +library(magrittr, warn.conflicts = FALSE) + +## VIASH START +par <- list( + input_train = 
"resources_test/label_projection/pancreas/train.h5ad", + input_test = "resources_test/label_projection/pancreas/test.h5ad", + output = "output.h5ad" +) +## VIASH END + +packageVersion("Matrix") + +cat(">> Load input data\n") +input_train <- read_h5ad(par$input_train) +input_test <- read_h5ad(par$input_test) + +# sce_train <- zellkonverter::readH5AD(par$input_train) +# obj_train <- Seurat::as.Seurat(sce_train, data = "normalized") +# sce_test <- zellkonverter::readH5AD(par$input_test) +# obj_test <- Seurat::as.Seurat(sce_test, data = "normalized") + +cat(">> Converting AnnData to Seurat\n") +anndataToSeurat <- function(adata) { + # interpreted from https://github.com/satijalab/seurat/blob/v3.1.0/R/objects.R + obj <- + SeuratObject::CreateSeuratObject( + counts = as(Matrix::t(adata$layers[["counts"]]), "CsparseMatrix") + ) %>% + SeuratObject::SetAssayData( + slot = "data", + new.data = as(Matrix::t(adata$layers[["normalized"]]), "CsparseMatrix") + ) %>% + SeuratObject::AddMetaData( + adata$obs + ) + + # set hvg + SeuratObject::VariableFeatures(obj) <- adata$var_names[adata$var[["hvg"]]] + + # set embedding + # could add loadings and stdev + embed <- SeuratObject::CreateDimReducObject( + embeddings = adata$obsm[["X_pca"]], + key = "PC_" + ) + obj[["pca"]] <- embed + + # return + obj +} + +obj_train <- anndataToSeurat(input_train) +obj_test <- anndataToSeurat(input_test) + +cat(">> Find transfer anchors\n") +npcs <- ncol(obj_train[["pca"]]) +anchors <- Seurat::FindTransferAnchors( + reference = obj_train, + query = obj_test, + npcs = npcs, + dims = seq_len(npcs), + verbose = FALSE +) + +cat(">> Predict on test data\n") +query <- Seurat::TransferData( + anchorset = anchors, + reference = obj_train, + query = obj_test, + refdata = list(labels = "label"), + verbose = FALSE +) +input_test$obs[["label_pred"]] <- query$predicted.labels[input_test$obs_names] + +cat(">> Write output to file\n") +input_test$uns[["method_id"]] <- meta[["functionality_name"]] 
+input_test$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/label_projection/methods/xgboost/config.vsh.yaml b/src/tasks/label_projection/methods/xgboost/config.vsh.yaml new file mode 100644 index 0000000000..516308fbdd --- /dev/null +++ b/src/tasks/label_projection/methods/xgboost/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "xgboost" + info: + label: XGBoost + summary: "XGBoost is a decision tree model that averages multiple trees with gradient boosting." + description: | + XGBoost is a gradient boosting decision tree model that learns multiple tree + structures in the form of a series of input features and their values, + leading to a prediction decision, and averages predictions from all its + trees. Here, input features are normalised gene expression values. + reference: "chen2016xgboost" + repository_url: "https://github.com/dmlc/xgboost" + documentation_url: "https://xgboost.readthedocs.io/en/stable/index.html" + v1: + path: openproblems/tasks/label_projection/methods/xgboost.py + commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 + preferred_normalization: log_cp10k + variants: + xgboost_log_cp10k: + xgboost_scran: + preferred_normalization: log_scran_pooling + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: xgboost + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/label_projection/methods/xgboost/script.py b/src/tasks/label_projection/methods/xgboost/script.py new file mode 100644 index 0000000000..c56eae59d5 --- /dev/null +++ b/src/tasks/label_projection/methods/xgboost/script.py @@ -0,0 +1,39 @@ +import anndata as ad +import xgboost as xgb + +## VIASH START +par = { + 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', + 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', + 'output': 'output.h5ad' +} 
+meta = { + 'functionality_name': 'foo', +} +## VIASH END + +print("Load input data", flush=True) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) +input_layer = "normalized" + +print("Transform into integers", flush=True) +input_train.obs["label_int"] = input_train.obs["label"].cat.codes +categories = input_train.obs["label"].cat.categories + +print("Convert AnnDatas into datasets", flush=True) +xg_train = xgb.DMatrix(input_train.layers[input_layer], label=input_train.obs["label_int"]) +xg_test = xgb.DMatrix(input_test.layers[input_layer]) + +print("Fit on train data", flush=True) +param = {'objective': 'multi:softmax', 'num_class': len(categories)} +watchlist = [(xg_train, "train")] +xgb_op = xgb.train(param, xg_train, evals=watchlist) + +print("Predict on test data", flush=True) +pred = xgb_op.predict(xg_test).astype(int) +input_test.obs["label_pred"] = categories[pred] + +print("Write output to file", flush=True) +input_test.uns["method_id"] = meta["functionality_name"] +input_test.write_h5ad(par['output'], compression="gzip") \ No newline at end of file diff --git a/src/tasks/label_projection/metrics/accuracy/config.vsh.yaml b/src/tasks/label_projection/metrics/accuracy/config.vsh.yaml new file mode 100644 index 0000000000..8fc7021ffa --- /dev/null +++ b/src/tasks/label_projection/metrics/accuracy/config.vsh.yaml @@ -0,0 +1,28 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: "accuracy" + info: + metrics: + - name: accuracy + label: Accuracy + summary: "The percentage of correctly predicted labels." + description: "The percentage of correctly predicted labels." 
+ min: 0 + max: 1 + maximize: true + reference: grandini2020metrics + v1: + path: openproblems/tasks/label_projection/metrics/accuracy.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: scikit-learn + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/label_projection/metrics/accuracy/script.py b/src/tasks/label_projection/metrics/accuracy/script.py new file mode 100644 index 0000000000..80795111d5 --- /dev/null +++ b/src/tasks/label_projection/metrics/accuracy/script.py @@ -0,0 +1,36 @@ +import numpy as np +import sklearn.preprocessing +import anndata as ad + +## VIASH START +par = { + 'input_prediction': 'resources_test/label_projection/pancreas/knn.h5ad', + 'input_solution': 'resources_test/label_projection/pancreas/solution.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'accuracy' +} +## VIASH END + +print("Load data", flush=True) +input_prediction = ad.read_h5ad(par['input_prediction']) +input_solution = ad.read_h5ad(par['input_solution']) + +assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" + +print("Encode labels", flush=True) +cats = list(input_solution.obs["label"].dtype.categories) + list(input_prediction.obs["label_pred"].dtype.categories) +encoder = sklearn.preprocessing.LabelEncoder().fit(cats) +input_solution.obs["label"] = encoder.transform(input_solution.obs["label"]) +input_prediction.obs["label_pred"] = encoder.transform(input_prediction.obs["label_pred"]) + +print("Compute prediction accuracy", flush=True) +accuracy = np.mean(input_solution.obs["label"] == input_prediction.obs["label_pred"]) + +print("Store metric value", flush=True) +input_prediction.uns["metric_ids"] = "accuracy" +input_prediction.uns["metric_values"] = accuracy + 
+print("Writing adata to file", flush=True) +input_prediction.write_h5ad(par['output'], compression="gzip") diff --git a/src/tasks/label_projection/metrics/f1/config.vsh.yaml b/src/tasks/label_projection/metrics/f1/config.vsh.yaml new file mode 100644 index 0000000000..f5abc0caa6 --- /dev/null +++ b/src/tasks/label_projection/metrics/f1/config.vsh.yaml @@ -0,0 +1,50 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: "f1" + info: + metrics: + - name: f1_weighted + label: F1 weighted + summary: "Support-weighted average of each label's F1 score" + description: "Calculates the F1 score for each label, and finds their average weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall." + reference: grandini2020metrics + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/label_projection/metrics/f1.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + - name: f1_macro + label: F1 macro + summary: "Unweighted mean of each label's F1 score" + description: "Calculates the F1 score for each label, and finds their unweighted mean. This does not take label imbalance into account." + reference: grandini2020metrics + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/label_projection/metrics/f1.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + - name: f1_micro + label: F1 micro + summary: "Calculation of TP, FN and FP." + description: "Calculates the F1 score globally by counting the total true positives, false negatives and false positives."
+ reference: grandini2020metrics + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/label_projection/metrics/f1.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: scikit-learn + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/label_projection/metrics/f1/script.py b/src/tasks/label_projection/metrics/f1/script.py new file mode 100644 index 0000000000..4d4b1a2395 --- /dev/null +++ b/src/tasks/label_projection/metrics/f1/script.py @@ -0,0 +1,43 @@ +from sklearn.metrics import f1_score +import sklearn.preprocessing +import anndata as ad + +## VIASH START +par = { + 'input_prediction': 'resources_test/label_projection/pancreas/knn.h5ad', + 'input_solution': 'resources_test/label_projection/pancreas/solution.h5ad', + 'average': 'weighted', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'f1' +} +## VIASH END + +print("Load data", flush=True) +input_prediction = ad.read_h5ad(par['input_prediction']) +input_solution = ad.read_h5ad(par['input_solution']) + +assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" + +print("Encode labels", flush=True) +cats = list(input_solution.obs["label"].dtype.categories) + list(input_prediction.obs["label_pred"].dtype.categories) +encoder = sklearn.preprocessing.LabelEncoder().fit(cats) +input_solution.obs["label"] = encoder.transform(input_solution.obs["label"]) +input_prediction.obs["label_pred"] = encoder.transform(input_prediction.obs["label_pred"]) + +print("Compute F1 score", flush=True) +metric_type = [ "macro", "micro", "weighted" ] +metric_id = [ "f1_" + x for x in metric_type] +metric_value = [ f1_score( + input_solution.obs["label"], + input_prediction.obs["label_pred"], + average=x + ) for x in metric_type ] + 
+print("Store metric value", flush=True) +input_prediction.uns["metric_ids"] = metric_id +input_prediction.uns["metric_values"] = metric_value + +print("Writing adata to file", flush=True) +input_prediction.write_h5ad(par['output'], compression="gzip") diff --git a/src/tasks/label_projection/process_dataset/config.vsh.yaml b/src/tasks/label_projection/process_dataset/config.vsh.yaml new file mode 100644 index 0000000000..aa010876cb --- /dev/null +++ b/src/tasks/label_projection/process_dataset/config.vsh.yaml @@ -0,0 +1,31 @@ +__merge__: ../api/comp_process_dataset.yaml +functionality: + name: "process_dataset" + arguments: + - name: "--method" + type: "string" + description: "The process method to assign train/test." + choices: ["batch", "random"] + default: "batch" + - name: "--obs_label" + type: "string" + description: "Which .obs slot to use as label." + default: "cell_type" + - name: "--obs_batch" + type: "string" + description: "Which .obs slot to use as batch covariate." + default: "batch" + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." 
+ example: 123 + resources: + - type: python_script + path: script.py + - path: /src/common/helper_functions/subset_anndata.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [highmem, midcpu , midtime] diff --git a/src/tasks/label_projection/process_dataset/script.py b/src/tasks/label_projection/process_dataset/script.py new file mode 100644 index 0000000000..0f2c5482b6 --- /dev/null +++ b/src/tasks/label_projection/process_dataset/script.py @@ -0,0 +1,78 @@ +import sys +import random +import numpy as np +import anndata as ad + +## VIASH START +par = { + 'input': 'resources_test/common/pancreas/dataset.h5ad', + 'method': 'batch', + 'seed': None, + 'obs_batch': 'batch', + 'obs_label': 'cell_type', + 'output_train': 'train.h5ad', + 'output_test': 'test.h5ad', + 'output_solution': 'solution.h5ad' +} +meta = { + 'resources_dir': 'src/tasks/label_projection/process_dataset', + 'config': 'src/tasks/label_projection/process_dataset/.config.vsh.yaml' +} +## VIASH END + +# import helper functions +sys.path.append(meta['resources_dir']) +from subset_anndata import read_config_slots_info, subset_anndata + +# set seed if one was given (0 is a valid seed, so compare against None) +if par["seed"] is not None: + print(f">> Setting seed to {par['seed']}") + random.seed(par["seed"]) + +print(">> Load data", flush=True) +adata = ad.read_h5ad(par["input"]) +print("input:", adata) + +print(f">> Process data using {par['method']} method") +if par["method"] == "batch": + batch_info = adata.obs[par["obs_batch"]] + batch_categories = batch_info.dtype.categories + test_batches = random.sample(list(batch_categories), 1) + is_test = [ x in test_batches for x in batch_info ] +elif par["method"] == "random": + train_ix = set(random.sample(range(adata.n_obs), round(adata.n_obs * 0.8))) # draw with the seeded stdlib RNG so --seed applies here too; a set keeps the lookup below O(1) + is_test = [ x not in train_ix for x in range(adata.n_obs) ] + +# subset the different adatas +print(">> Figuring which data needs to be copied to which output file", flush=True) +# use par arguments
to look for label and batch value in different slots +slot_mapping = { + "obs": { + "label": par["obs_label"], + "batch": par["obs_batch"], + } +} +slot_info = read_config_slots_info(meta["config"], slot_mapping) + +print(">> Creating train data", flush=True) +output_train = subset_anndata( + adata[[not x for x in is_test]], + slot_info["output_train"] +) + +print(">> Creating test data", flush=True) +output_test = subset_anndata( + adata[is_test], + slot_info["output_test"] +) + +print(">> Creating solution data", flush=True) +output_solution = subset_anndata( + adata[is_test], + slot_info['output_solution'] +) + +print(">> Writing data", flush=True) +output_train.write_h5ad(par["output_train"]) +output_test.write_h5ad(par["output_test"]) +output_solution.write_h5ad(par["output_solution"]) diff --git a/src/tasks/label_projection/resources_scripts/process_datasets.sh b/src/tasks/label_projection/resources_scripts/process_datasets.sh new file mode 100755 index 0000000000..dbd284d237 --- /dev/null +++ b/src/tasks/label_projection/resources_scripts/process_datasets.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +cat > /tmp/params.yaml << 'HERE' +id: label_projection_process_datasets +input_states: s3://openproblems-data/resources/datasets/**/state.yaml +rename_keys: 'input:output_dataset' +settings: '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad", "output_solution": "$id/solution.h5ad"}' +output_state: "$id/state.yaml" +publish_dir: s3://openproblems-data/resources/label_projection/datasets +HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' + withName:'.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } + withLabel:highmem { + memory = '350GB' + } +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/label_projection/workflows/process_datasets/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ 
+ --params-file /tmp/params.yaml \ + --entry-name auto \ + --config /tmp/nextflow.config \ + --labels label_projection,process_datasets \ No newline at end of file diff --git a/src/tasks/label_projection/resources_scripts/run_benchmark.sh b/src/tasks/label_projection/resources_scripts/run_benchmark.sh new file mode 100755 index 0000000000..58a16c38d3 --- /dev/null +++ b/src/tasks/label_projection/resources_scripts/run_benchmark.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="s3://openproblems-data/resources/label_projection/results/${RUN_ID}" + +cat > /tmp/params.yaml << HERE +input_states: s3://openproblems-data/resources/label_projection/datasets/**/state.yaml +rename_keys: 'input_train:output_train,input_test:output_test,input_solution:output_solution' +output_state: "state.yaml" +settings: '{"method_ids": "scanvi_scarches"}' +publish_dir: "$publish_dir" +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/label_projection/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config src/wf_utils/labels_tw.config \ + --labels label_projection,full \ No newline at end of file diff --git a/src/tasks/label_projection/resources_scripts/run_benchmark_test.sh b/src/tasks/label_projection/resources_scripts/run_benchmark_test.sh new file mode 100755 index 0000000000..5baf56f4e4 --- /dev/null +++ b/src/tasks/label_projection/resources_scripts/run_benchmark_test.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cat > /tmp/params.yaml << 'HERE' +input_states: s3://openproblems-data/resources_test/label_projection/**/state.yaml +rename_keys: 'input_train:output_train,input_test:output_test,input_solution:output_solution' +output_state: "state.yaml" +publish_dir: s3://openproblems-nextflow/temp/label_projection/ +HERE + +cat > 
/tmp/nextflow.config << HERE +process { + executor = 'awsbatch' +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/label_projection/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config /tmp/nextflow.config \ + --labels label_projection,test \ No newline at end of file diff --git a/src/tasks/label_projection/resources_test_scripts/pancreas.sh b/src/tasks/label_projection/resources_test_scripts/pancreas.sh new file mode 100755 index 0000000000..5a69340510 --- /dev/null +++ b/src/tasks/label_projection/resources_test_scripts/pancreas.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +RAW_DATA=resources_test/common +DATASET_DIR=resources_test/label_projection + +mkdir -p $DATASET_DIR + +# process dataset +echo Running process_dataset +nextflow run . 
\ + -main-script target/nextflow/label_projection/workflows/process_datasets/main.nf \ + -profile docker \ + -entry auto \ + --input_states "$RAW_DATA/**/state.yaml" \ + --rename_keys 'input:output_dataset' \ + --settings '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad", "output_solution": "$id/solution.h5ad"}' \ + --publish_dir "$DATASET_DIR" \ + --output_state '$id/state.yaml' +# output_state should be moved to settings once workaround is solved + +# run one method +viash run src/tasks/label_projection/methods/knn/config.vsh.yaml -- \ + --input_train $DATASET_DIR/pancreas/train.h5ad \ + --input_test $DATASET_DIR/pancreas/test.h5ad \ + --output $DATASET_DIR/pancreas/prediction.h5ad + +# run one metric +viash run src/tasks/label_projection/metrics/accuracy/config.vsh.yaml -- \ + --input_prediction $DATASET_DIR/pancreas/prediction.h5ad \ + --input_solution $DATASET_DIR/pancreas/solution.h5ad \ + --output $DATASET_DIR/pancreas/score.h5ad diff --git a/src/tasks/label_projection/workflows/process_datasets/config.vsh.yaml b/src/tasks/label_projection/workflows/process_datasets/config.vsh.yaml new file mode 100644 index 0000000000..09b2e9a829 --- /dev/null +++ b/src/tasks/label_projection/workflows/process_datasets/config.vsh.yaml @@ -0,0 +1,34 @@ +functionality: + name: "process_datasets" + namespace: "label_projection/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--input" + __merge__: "/src/tasks/label_projection/api/file_common_dataset.yaml" + required: true + direction: input + - name: Outputs + arguments: + - name: "--output_train" + __merge__: /src/tasks/label_projection/api/file_train.yaml + required: true + direction: output + - name: "--output_test" + __merge__: /src/tasks/label_projection/api/file_test.yaml + required: true + direction: output + - name: "--output_solution" + __merge__: /src/tasks/label_projection/api/file_solution.yaml + required: true + direction: output + resources: + - type: nextflow_script + path: 
main.nf + entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: common/check_dataset_schema + - name: label_projection/process_dataset +platforms: + - type: nextflow diff --git a/src/tasks/label_projection/workflows/process_datasets/main.nf b/src/tasks/label_projection/workflows/process_datasets/main.nf new file mode 100644 index 0000000000..88cf24935c --- /dev/null +++ b/src/tasks/label_projection/workflows/process_datasets/main.nf @@ -0,0 +1,55 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + | check_dataset_schema.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "input") + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.input, + "schema": schemaYaml + ] + }, + toState: { id, output, state -> + // read the output to see if dataset passed the qc + def checks = readYaml(output.output) + state + [ + "dataset": checks["exit_code"] == 0 ? 
state.input : null, + ] + } + ) + + // remove datasets which didn't pass the schema check + | filter { id, state -> + state.dataset != null + } + + | process_dataset.run( + fromState: [ input: "dataset" ], + toState: [ + output_train: "output_train", + output_test: "output_test", + output_solution: "output_solution" + ] + ) + + // only output the files for which an output file was specified + | setState(["output_train", "output_test", "output_solution"]) + + emit: + output_ch +} diff --git a/src/tasks/label_projection/workflows/run_benchmark/config.vsh.yaml b/src/tasks/label_projection/workflows/run_benchmark/config.vsh.yaml new file mode 100644 index 0000000000..083bb47a5a --- /dev/null +++ b/src/tasks/label_projection/workflows/run_benchmark/config.vsh.yaml @@ -0,0 +1,77 @@ +functionality: + name: "run_benchmark" + namespace: "label_projection/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--input_train" + __merge__: /src/tasks/label_projection/api/file_train.yaml + type: file + direction: input + required: true + - name: "--input_test" + __merge__: /src/tasks/label_projection/api/file_test.yaml + type: file + direction: input + required: true + - name: "--input_solution" + __merge__: /src/tasks/label_projection/api/file_solution.yaml + type: file + direction: input + required: true + - name: Outputs + arguments: + - name: "--output_scores" + type: file + required: true + direction: output + description: A yaml file containing the scores of each of the methods + default: score_uns.yaml + - name: "--output_method_configs" + type: file + required: true + direction: output + default: method_configs.yaml + - name: "--output_metric_configs" + type: file + required: true + direction: output + default: metric_configs.yaml + - name: "--output_dataset_info" + type: file + required: true + direction: output + default: dataset_uns.yaml + - name: "--output_task_info" + type: file + required: true + direction: output + default: task_info.yaml + - name: 
Methods + arguments: + - name: "--method_ids" + type: string + multiple: true + description: A list of method ids to run. If not specified, all methods will be run. + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: "../../api/task_info.yaml" + dependencies: + - name: common/check_dataset_schema + - name: common/extract_metadata + - name: label_projection/control_methods/true_labels + - name: label_projection/control_methods/majority_vote + - name: label_projection/control_methods/random_labels + - name: label_projection/methods/knn + - name: label_projection/methods/logistic_regression + - name: label_projection/methods/mlp + - name: label_projection/methods/scanvi + - name: label_projection/methods/scanvi_scarches + - name: label_projection/methods/xgboost + - name: label_projection/metrics/accuracy + - name: label_projection/metrics/f1 +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/tasks/label_projection/workflows/run_benchmark/main.nf b/src/tasks/label_projection/workflows/run_benchmark/main.nf new file mode 100644 index 0000000000..5dafc98d1e --- /dev/null +++ b/src/tasks/label_projection/workflows/run_benchmark/main.nf @@ -0,0 +1,200 @@ +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // construct list of methods + methods = [ + true_labels, + majority_vote, + random_labels, + knn, + logistic_regression, + mlp, + scanvi, + scanvi_scarches, + // seurat_transferdata, + xgboost + ] + + // construct list of metrics + metrics = [ + accuracy, + f1 + ] + + /**************************** + * EXTRACT DATASET METADATA * + ****************************/ + dataset_ch = input_ch + // store join id + | map{ id, state -> + [id, state + ["_meta": [join_id: id]]] + } + + // extract the dataset metadata + | extract_metadata.run( + fromState: [input: "input_solution"], + toState: { id, output, 
state -> + state + [ + dataset_uns: readYaml(output.output).uns + ] + } + ) + + /*************************** + * RUN METHODS AND METRICS * + ***************************/ + score_ch = dataset_ch + + // run all methods + | runEach( + components: methods, + + // use the 'filter' argument to only run a method on the normalisation the component is asking for + filter: { id, state, comp -> + def norm = state.dataset_uns.normalization_id + def pref = comp.config.functionality.info.preferred_normalization + // if the preferred normalisation is none at all, + // we can pass whichever dataset we want + def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref + def method_check = !state.method_ids || state.method_ids.contains(comp.config.functionality.name) + + method_check && norm_check + }, + + // define a new 'id' by appending the method name to the dataset id + id: { id, state, comp -> + id + "." + comp.config.functionality.name + }, + + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: { id, state, comp -> + def new_args = [ + input_train: state.input_train, + input_test: state.input_test + ] + if (comp.config.functionality.info.type == "control_method") { + new_args.input_solution = state.input_solution + } + new_args + }, + + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + method_id: comp.config.functionality.name, + method_output: output.output + ] + } + ) + + // run all metrics + | runEach( + components: metrics, + id: { id, state, comp -> + id + "." 
+ comp.config.functionality.name + }, + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: [ + input_solution: "input_solution", + input_prediction: "method_output" + ], + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + metric_id: comp.config.functionality.name, + metric_output: output.output + ] + } + ) + + + /****************************** + * GENERATE OUTPUT YAML FILES * + ******************************/ + // TODO: can we store everything below in a separate helper function? + + // extract the dataset metadata + dataset_meta_ch = dataset_ch + // only keep one of the normalization methods + | filter{ id, state -> + state.dataset_uns.normalization_id == "log_cp10k" + } + | joinStates { ids, states -> + // store the dataset metadata in a file + def dataset_uns = states.collect{state -> + def uns = state.dataset_uns.clone() + uns.remove("normalization_id") + uns + } + def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) + def dataset_uns_file = tempFile("dataset_uns.yaml") + dataset_uns_file.write(dataset_uns_yaml_blob) + + ["output", [output_dataset_info: dataset_uns_file]] + } + + output_ch = score_ch + + // extract the scores + | extract_metadata.run( + key: "extract_scores", + fromState: [input: "metric_output"], + toState: { id, output, state -> + state + [ + score_uns: readYaml(output.output).uns + ] + } + ) + + | joinStates { ids, states -> + // store the method configs in a file + def method_configs = methods.collect{it.config} + def method_configs_yaml_blob = toYamlBlob(method_configs) + def method_configs_file = tempFile("method_configs.yaml") + method_configs_file.write(method_configs_yaml_blob) + + // store the metric configs in a file + def metric_configs = metrics.collect{it.config} + def metric_configs_yaml_blob = toYamlBlob(metric_configs) + def metric_configs_file = tempFile("metric_configs.yaml") + 
metric_configs_file.write(metric_configs_yaml_blob) + + def task_info_file = meta.resources_dir.resolve("task_info.yaml") + + // store the scores in a file + def score_uns = states.collect{it.score_uns} + def score_uns_yaml_blob = toYamlBlob(score_uns) + def score_uns_file = tempFile("score_uns.yaml") + score_uns_file.write(score_uns_yaml_blob) + + def new_state = [ + output_method_configs: method_configs_file, + output_metric_configs: metric_configs_file, + output_task_info: task_info_file, + output_scores: score_uns_file, + _meta: states[0]._meta + ] + + ["output", new_state] + } + + // merge all of the output data + | mix(dataset_meta_ch) + | joinStates{ ids, states -> + def mergedStates = states.inject([:]) { acc, m -> acc + m } + [ids[0], mergedStates] + } + + emit: + output_ch +} \ No newline at end of file diff --git a/src/tasks/label_projection/workflows/run_benchmark/run_test.sh b/src/tasks/label_projection/workflows/run_benchmark/run_test.sh new file mode 100755 index 0000000000..e9c712af48 --- /dev/null +++ b/src/tasks/label_projection/workflows/run_benchmark/run_test.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +# export TOWER_WORKSPACE_ID=53907369739130 + +DATASETS_DIR="resources_test/label_projection" +OUTPUT_DIR="output/temp" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +export NXF_VER=22.04.5 +nextflow run . 
\ + -main-script target/nextflow/label_projection/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -entry auto \ + -c src/wf_utils/labels_ci.config \ + --input_states "$DATASETS_DIR/**/state.yaml" \ + --rename_keys 'input_train:output_train,input_test:output_test,input_solution:output_solution' \ + --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \ + --publish_dir "$OUTPUT_DIR" \ + --output_state "state.yaml" diff --git a/src/tasks/match_modalities/README.md b/src/tasks/match_modalities/README.md new file mode 100644 index 0000000000..399c31ee92 --- /dev/null +++ b/src/tasks/match_modalities/README.md @@ -0,0 +1,499 @@ +# Match Modalities + + +Match cells across datasets of the same set of samples on different +technologies / modalities. + +Path: +[`src/tasks/match_modalities`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/tasks/match_modalities) + +## Motivation + +Cellular function is regulated by the complex interplay of different +types of biological molecules (DNA, RNA, proteins, etc.), which +determine the state of a cell. Several recently described technologies +allow for simultaneous measurement of different aspects of cellular +state. For example, sci-CAR \[@cao2018joint\] jointly profiles RNA +expression and chromatin accessibility on the same cell and CITE-seq +\[@stoeckius2017simultaneous\] measures surface protein abundance and +RNA expression from each cell. These technologies enable us to better +understand cellular function; however, datasets are still rare and there +are tradeoffs that these measurements make in order to profile multiple +modalities. + +Joint methods can be more expensive or lower throughput or more noisy +than measuring a single modality at a time. 
Therefore it is useful to +develop methods that are capable of integrating measurements of the same +biological system but obtained using different technologies on different +cells. + +## Description + +In this task, the goal is to learn a latent space where cells profiled +by different technologies in different modalities are matched if they +have the same state. We use jointly profiled data as ground truth so +that we can evaluate when the observations from the same cell acquired +using different modalities are similar. A perfect result has each of the +paired observations sharing the same coordinates in the latent space. A +method that can achieve this would be able to match datasets across +modalities to enable multimodal cellular analysis from separately +measured profiles. + +## Authors & contributors + +| name | roles | +|:------------------|:-------------------| +| Scott Gigante | author, maintainer | +| Alex Tong | author | +| Robrecht Cannoodt | author | +| Kai Waldrant | contributor | + +## API + +``` mermaid +flowchart LR + file_common_dataset_mod1("Common dataset mod1") + comp_process_dataset[/"Data processor"/] + file_dataset_mod1("Modality 1") + file_dataset_mod2("Modality 2") + file_solution_mod1("Solution mod1") + file_solution_mod2("Solution mod2") + comp_control_method[/"Control method"/] + comp_method[/"Method"/] + comp_metric[/"Metric"/] + file_integrated_mod1("Integrated mod1") + file_integrated_mod2("Integrated mod2") + file_score("Score") + file_common_dataset_mod2("Common dataset mod2") + file_common_dataset_mod1---comp_process_dataset + comp_process_dataset-->file_dataset_mod1 + comp_process_dataset-->file_dataset_mod2 + comp_process_dataset-->file_solution_mod1 + comp_process_dataset-->file_solution_mod2 + file_dataset_mod1---comp_control_method + file_dataset_mod1---comp_method + file_dataset_mod2---comp_control_method + file_dataset_mod2---comp_method + file_solution_mod1---comp_control_method + file_solution_mod1---comp_metric + 
file_solution_mod2---comp_control_method + file_solution_mod2---comp_metric + comp_control_method-->file_integrated_mod1 + comp_control_method-->file_integrated_mod2 + comp_method-->file_integrated_mod1 + comp_method-->file_integrated_mod2 + comp_metric-->file_score + file_integrated_mod1---comp_metric + file_integrated_mod2---comp_metric + file_common_dataset_mod2---comp_process_dataset +``` + +## File format: Common dataset mod1 + +The first modality (RNA) of a dataset processed by the common multimodal +dataset processing pipeline. + +Example file: +`resources_test/common/scicar_cell_lines/dataset_mod1.h5ad` + +Description: + +This dataset contains both raw counts and normalized data matrices, as +well as a PCA embedding, HVG selection and a kNN graph. + +Format: + +
+ + AnnData object + obsm: 'X_svd' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------------|:----------|:-------------------------------------------------------------------------------| +| `obsm["X_svd"]` | `double` | The resulting SVD PCA embedding. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
+ +## Component type: Data processor + +Path: +[`src/match_modalities`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/match_modalities) + +A match modalities dataset processor. + +Arguments: + +
+ +| Name | Type | Description | +|:-------------------------|:-------|:---------------------------------------------------------------------------------------------------------------| +| `--input_mod1` | `file` | The first modality (RNA) of a dataset processed by the common multimodal dataset processing pipeline. | +| `--input_mod2` | `file` | The second modality (ADT or ATAC) of a dataset processed by the common multimodal dataset processing pipeline. | +| `--output_mod1` | `file` | (*Output*) The first modality of a multimodal dataset. The cells of this dataset are randomly permuted. | +| `--output_mod2` | `file` | (*Output*) The second modality of a multimodal dataset. The cells of this dataset are randomly permuted. | +| `--output_solution_mod1` | `file` | (*Output*) The ground truth information for the first modality. | +| `--output_solution_mod2` | `file` | (*Output*) The ground truth information for the second modality. | + +
+ +## File format: Modality 1 + +The first modality of a multimodal dataset. The cells of this dataset +are randomly permuted. + +Example file: +`resources_test/match_modalities/scicar_cell_lines/dataset_mod1.h5ad` + +Format: + +
+ + AnnData object + obsm: 'X_svd' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'normalization_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------|:----------|:-------------------------------------| +| `obsm["X_svd"]` | `double` | The resulting SVD PCA embedding. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
+ +## File format: Modality 2 + +The second modality of a multimodal dataset. The cells of this dataset +are randomly permuted. + +Example file: +`resources_test/match_modalities/scicar_cell_lines/dataset_mod2.h5ad` + +Format: + +
+ + AnnData object + obsm: 'X_svd' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'normalization_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------|:----------|:-------------------------------------| +| `obsm["X_svd"]` | `double` | The resulting SVD PCA embedding. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
+ +## File format: Solution mod1 + +The ground truth information for the first modality + +Example file: +`resources_test/match_modalities/scicar_cell_lines/solution_mod1.h5ad` + +Format: + +
+ + AnnData object + obs: 'permutation_indices' + obsm: 'X_svd' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------------|:----------|:-------------------------------------------------------------------------------| +| `obs["permutation_indices"]` | `integer` | Indices with which to revert the permutation of the cells. | +| `obsm["X_svd"]` | `double` | The resulting SVD PCA embedding. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
+ +## File format: Solution mod2 + +The ground truth information for the second modality + +Example file: +`resources_test/match_modalities/scicar_cell_lines/solution_mod2.h5ad` + +Format: + +
+ + AnnData object + obs: 'permutation_indices' + obsm: 'X_svd' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------------|:----------|:-------------------------------------------------------------------------------| +| `obs["permutation_indices"]` | `integer` | Indices with which to revert the permutation of the cells. | +| `obsm["X_svd"]` | `double` | The resulting SVD PCA embedding. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
+ +## Component type: Control method + +Path: +[`src/match_modalities/control_methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/match_modalities/control_methods) + +A multimodal data integration control method. + +Arguments: + +
+ +| Name | Type | Description | +|:------------------------|:-------|:----------------------------------------------------------------------------------------------| +| `--input_mod1` | `file` | The first modality of a multimodal dataset. The cells of this dataset are randomly permuted. | +| `--input_mod2` | `file` | The second modality of a multimodal dataset. The cells of this dataset are randomly permuted. | +| `--input_solution_mod1` | `file` | The ground truth information for the first modality. | +| `--input_solution_mod2` | `file` | The ground truth information for the second modality. | +| `--output_mod1` | `file` | (*Output*) The integrated embedding for the first modality. | +| `--output_mod2` | `file` | (*Output*) The integrated embedding for the second modality. | + +
+ +## Component type: Method + +Path: +[`src/match_modalities/methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/match_modalities/methods) + +A multimodal data integration method. + +Arguments: + +
+ +| Name | Type | Description | +|:----------------|:-------|:----------------------------------------------------------------------------------------------| +| `--input_mod1` | `file` | The first modality of a multimodal dataset. The cells of this dataset are randomly permuted. | +| `--input_mod2` | `file` | The second modality of a multimodal dataset. The cells of this dataset are randomly permuted. | +| `--output_mod1` | `file` | (*Output*) The integrated embedding for the first modality. | +| `--output_mod2` | `file` | (*Output*) The integrated embedding for the second modality. | + +
+ +## Component type: Metric + +Path: +[`src/match_modalities/metrics`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/match_modalities/metrics) + +A multimodal data integration metric. + +Arguments: + +
+ +| Name | Type | Description | +|:--------------------------|:-------|:------------------------------------------------------| +| `--input_integrated_mod1` | `file` | The integrated embedding for the first modality. | +| `--input_integrated_mod2` | `file` | The integrated embedding for the second modality. | +| `--input_solution_mod1` | `file` | The ground truth information for the first modality. | +| `--input_solution_mod2` | `file` | The ground truth information for the second modality. | +| `--output` | `file` | (*Output*) Metric score file. | + +
+ +## File format: Integrated mod1 + +The integrated embedding for the first modality + +Example file: +`resources_test/match_modalities/scicar_cell_lines/integrated_mod1.h5ad` + +Format: + +
+ + AnnData object + obsm: 'integrated' + uns: 'dataset_id', 'normalization_id', 'method_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------|:---------|:-------------------------------------| +| `obsm["integrated"]` | `double` | An integrated embedding. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["method_id"]` | `string` | Which method was used. | + +
+ +## File format: Integrated mod2 + +The integrated embedding for the second modality + +Example file: +`resources_test/match_modalities/scicar_cell_lines/integrated_mod2.h5ad` + +Format: + +
+ + AnnData object + obsm: 'integrated' + uns: 'dataset_id', 'normalization_id', 'method_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------|:---------|:-------------------------------------| +| `obsm["integrated"]` | `double` | An integrated embedding. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["method_id"]` | `string` | Which method was used. | + +
+ +## File format: Score + +Metric score file + +Example file: +`resources_test/match_modalities/scicar_cell_lines/score.h5ad` + +Format: + +
+ + AnnData object + uns: 'dataset_id', 'normalization_id', 'method_id', 'metric_ids', 'metric_values' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------|:---------|:---------------------------------------------------------------------------------------------| +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | +| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | + +
+ +## File format: Common dataset mod2 + +The second modality (ADT or ATAC) of a dataset processed by the common +multimodal dataset processing pipeline. + +Example file: +`resources_test/common/scicar_cell_lines/dataset_mod2.h5ad` + +Description: + +This dataset contains both raw counts and normalized data matrices, as +well as a PCA embedding, HVG selection and a kNN graph. + +Format: + +
+ + AnnData object + obsm: 'X_svd' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------------|:----------|:-------------------------------------------------------------------------------| +| `obsm["X_svd"]` | `double` | The resulting SVD PCA embedding. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
+ diff --git a/src/tasks/match_modalities/api/comp_control_method.yaml b/src/tasks/match_modalities/api/comp_control_method.yaml new file mode 100644 index 0000000000..446ee8a41a --- /dev/null +++ b/src/tasks/match_modalities/api/comp_control_method.yaml @@ -0,0 +1,47 @@ +functionality: + namespace: "match_modalities/control_methods" + info: + type: control_method + type_info: + label: Control method + summary: A multimodal data integration control method. + description: | + This folder contains control components for the task. + These components have the same interface as the regular methods + but also receive the solution object as input. It serves as a + starting point to test the relative accuracy of new methods in + the task, and also as a quality control for the metrics defined + in the task. + arguments: + - name: "--input_mod1" + __merge__: file_dataset_mod1.yaml + direction: input + required: true + - name: "--input_mod2" + __merge__: file_dataset_mod2.yaml + direction: input + required: true + - name: "--input_solution_mod1" + __merge__: file_solution_mod1.yaml + direction: input + required: true + - name: "--input_solution_mod2" + __merge__: file_solution_mod2.yaml + direction: input + required: true + - name: "--output_mod1" + __merge__: file_integrated_mod1.yaml + direction: output + required: true + - name: "--output_mod2" + __merge__: file_integrated_mod2.yaml + direction: output + required: true + test_resources: + - path: /resources_test/match_modalities/scicar_cell_lines + dest: resources_test/match_modalities/scicar_cell_lines + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /src/common/library.bib \ No newline at end of file diff --git a/src/tasks/match_modalities/api/comp_method.yaml b/src/tasks/match_modalities/api/comp_method.yaml new file mode 100644 index 0000000000..37a5e90b0e --- /dev/null +++ 
b/src/tasks/match_modalities/api/comp_method.yaml @@ -0,0 +1,34 @@ +functionality: + namespace: "match_modalities/methods" + info: + type: method + type_info: + label: Method + summary: A multimodal data integration method. + description: | + A multimodal method to integrate data. + arguments: + - name: "--input_mod1" + __merge__: file_dataset_mod1.yaml + direction: input + required: true + - name: "--input_mod2" + __merge__: file_dataset_mod2.yaml + direction: input + required: true + - name: "--output_mod1" + __merge__: file_integrated_mod1.yaml + direction: output + required: true + - name: "--output_mod2" + __merge__: file_integrated_mod2.yaml + direction: output + required: true + test_resources: + - path: /resources_test/match_modalities/scicar_cell_lines + dest: resources_test/match_modalities/scicar_cell_lines + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /src/common/library.bib diff --git a/src/tasks/match_modalities/api/comp_metric.yaml b/src/tasks/match_modalities/api/comp_metric.yaml new file mode 100644 index 0000000000..220598bbbf --- /dev/null +++ b/src/tasks/match_modalities/api/comp_metric.yaml @@ -0,0 +1,39 @@ +functionality: + namespace: "match_modalities/metrics" + info: + type: metric + type_info: + label: Metric + summary: A multimodal data integration metric. + description: | + A metric for evaluating integrated data. 
+ arguments: + - name: "--input_integrated_mod1" + __merge__: file_integrated_mod1.yaml + direction: input + required: true + - name: "--input_integrated_mod2" + __merge__: file_integrated_mod2.yaml + direction: input + required: true + - name: "--input_solution_mod1" + __merge__: file_solution_mod1.yaml + direction: input + required: true + - name: "--input_solution_mod2" + __merge__: file_solution_mod2.yaml + direction: input + required: true + - name: "--output" + __merge__: file_score.yaml + required: true + direction: output + test_resources: + - path: /resources_test/match_modalities/scicar_cell_lines + dest: resources_test/match_modalities/scicar_cell_lines + - type: python_script + path: /src/common/comp_tests/check_metric_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /src/common/library.bib + diff --git a/src/tasks/match_modalities/api/comp_process_dataset.yaml b/src/tasks/match_modalities/api/comp_process_dataset.yaml new file mode 100644 index 0000000000..a48a0957b1 --- /dev/null +++ b/src/tasks/match_modalities/api/comp_process_dataset.yaml @@ -0,0 +1,40 @@ +functionality: + namespace: "match_modalities" + info: + type: process_dataset + type_info: + label: Data processor + summary: A match modalities dataset processor. + description: | + A component for processing a Common Dataset into a task-specific dataset. 
+ arguments: + - name: "--input_mod1" + __merge__: file_common_dataset_mod1.yaml + direction: input + required: true + - name: "--input_mod2" + __merge__: file_common_dataset_mod2.yaml + direction: input + required: true + - name: "--output_mod1" + __merge__: file_dataset_mod1.yaml + direction: output + required: true + - name: "--output_mod2" + __merge__: file_dataset_mod2.yaml + direction: output + required: true + - name: "--output_solution_mod1" + __merge__: file_solution_mod1.yaml + direction: output + required: true + - name: "--output_solution_mod2" + __merge__: file_solution_mod2.yaml + direction: output + required: true + test_resources: + - path: /resources_test/common/scicar_cell_lines + dest: resources_test/common/scicar_cell_lines + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + diff --git a/src/tasks/match_modalities/api/file_common_dataset_mod1.yaml b/src/tasks/match_modalities/api/file_common_dataset_mod1.yaml new file mode 100644 index 0000000000..cfb98e04ea --- /dev/null +++ b/src/tasks/match_modalities/api/file_common_dataset_mod1.yaml @@ -0,0 +1,56 @@ +type: file +example: "resources_test/common/scicar_cell_lines/dataset_mod1.h5ad" +info: + label: "Common dataset mod1" + summary: The first modality (RNA) of a dataset processed by the common multimodal dataset processing pipeline. + description: | + This dataset contains both raw counts and normalized data matrices, + as well as a PCA embedding, HVG selection and a kNN graph. + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obsm: + - type: double + name: X_svd + description: The resulting SVD PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. 
+ required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/tasks/match_modalities/api/file_common_dataset_mod2.yaml b/src/tasks/match_modalities/api/file_common_dataset_mod2.yaml new file mode 100644 index 0000000000..c42fbf525c --- /dev/null +++ b/src/tasks/match_modalities/api/file_common_dataset_mod2.yaml @@ -0,0 +1,56 @@ +type: file +example: "resources_test/common/scicar_cell_lines/dataset_mod2.h5ad" +info: + label: "Common dataset mod2" + summary: The second modality (ADT or ATAC) of a dataset processed by the common multimodal dataset processing pipeline. + description: | + This dataset contains both raw counts and normalized data matrices, + as well as a PCA embedding, HVG selection and a kNN graph. + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obsm: + - type: double + name: X_svd + description: The resulting SVD PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. 
+ required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/tasks/match_modalities/api/file_dataset_mod1.yaml b/src/tasks/match_modalities/api/file_dataset_mod1.yaml new file mode 100644 index 0000000000..aece4dc975 --- /dev/null +++ b/src/tasks/match_modalities/api/file_dataset_mod1.yaml @@ -0,0 +1,29 @@ +type: file +example: "resources_test/match_modalities/scicar_cell_lines/dataset_mod1.h5ad" +info: + label: "Modality 1" + summary: "The first modality of a multimodal dataset. The cells of this dataset are randomly permuted." + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obsm: + - type: double + name: X_svd + description: The resulting SVD PCA embedding. 
+ required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/tasks/match_modalities/api/file_dataset_mod2.yaml b/src/tasks/match_modalities/api/file_dataset_mod2.yaml new file mode 100644 index 0000000000..9c140e3de8 --- /dev/null +++ b/src/tasks/match_modalities/api/file_dataset_mod2.yaml @@ -0,0 +1,29 @@ +type: file +example: "resources_test/match_modalities/scicar_cell_lines/dataset_mod2.h5ad" +info: + label: "Modality 2" + summary: "The second modality of a multimodal dataset. The cells of this dataset are randomly permuted." + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obsm: + - type: double + name: X_svd + description: The resulting SVD PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/tasks/match_modalities/api/file_integrated_mod1.yaml b/src/tasks/match_modalities/api/file_integrated_mod1.yaml new file mode 100644 index 0000000000..72f363de1f --- /dev/null +++ b/src/tasks/match_modalities/api/file_integrated_mod1.yaml @@ -0,0 +1,24 @@ +type: file +example: "resources_test/match_modalities/scicar_cell_lines/integrated_mod1.h5ad" +info: + label: "Integrated mod1" + summary: "The integrated embedding for the first modality" + slots: + obsm: + - type: double + name: integrated + description: An integrated embedding. 
+ required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - type: string + name: method_id + description: "Which method was used" + required: true diff --git a/src/tasks/match_modalities/api/file_integrated_mod2.yaml b/src/tasks/match_modalities/api/file_integrated_mod2.yaml new file mode 100644 index 0000000000..644bf052d4 --- /dev/null +++ b/src/tasks/match_modalities/api/file_integrated_mod2.yaml @@ -0,0 +1,24 @@ +type: file +example: "resources_test/match_modalities/scicar_cell_lines/integrated_mod2.h5ad" +info: + label: "Integrated mod2" + summary: "The integrated embedding for the second modality" + slots: + obsm: + - type: double + name: integrated + description: An integrated embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - type: string + name: method_id + description: "Which method was used" + required: true diff --git a/src/tasks/match_modalities/api/file_score.yaml b/src/tasks/match_modalities/api/file_score.yaml new file mode 100644 index 0000000000..7d66bde3c3 --- /dev/null +++ b/src/tasks/match_modalities/api/file_score.yaml @@ -0,0 +1,29 @@ +type: file +example: "resources_test/match_modalities/scicar_cell_lines/score.h5ad" +info: + label: "Score" + summary: "Metric score file" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique 
metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + multiple: true + required: true diff --git a/src/tasks/match_modalities/api/file_solution_mod1.yaml b/src/tasks/match_modalities/api/file_solution_mod1.yaml new file mode 100644 index 0000000000..490e005e0a --- /dev/null +++ b/src/tasks/match_modalities/api/file_solution_mod1.yaml @@ -0,0 +1,58 @@ +type: file +example: "resources_test/match_modalities/scicar_cell_lines/solution_mod1.h5ad" +info: + label: "Solution mod1" + summary: "The ground truth information for the first modality" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obs: + - type: integer + name: permutation_indices + description: "Indices with which to revert the permutation of the cells" + required: true + obsm: + - type: double + name: X_svd + description: The resulting SVD PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. 
+ required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/tasks/match_modalities/api/file_solution_mod2.yaml b/src/tasks/match_modalities/api/file_solution_mod2.yaml new file mode 100644 index 0000000000..7cb21fef8e --- /dev/null +++ b/src/tasks/match_modalities/api/file_solution_mod2.yaml @@ -0,0 +1,58 @@ +type: file +example: "resources_test/match_modalities/scicar_cell_lines/solution_mod2.h5ad" +info: + label: "Solution mod2" + summary: "The ground truth information for the second modality" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized counts + required: true + obs: + - type: integer + name: permutation_indices + description: "Indices with which to revert the permutation of the cells" + required: true + obsm: + - type: double + name: X_svd + description: The resulting SVD PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset.
+ required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/tasks/match_modalities/api/task_info.yaml b/src/tasks/match_modalities/api/task_info.yaml new file mode 100644 index 0000000000..bc5550df16 --- /dev/null +++ b/src/tasks/match_modalities/api/task_info.yaml @@ -0,0 +1,47 @@ +name: match_modalities +label: Match Modalities +summary: | + Match cells across datasets of the same set of samples on different technologies / modalities. +image: "thumbnail.svg" +motivation: | + Cellular function is regulated by the complex interplay of different types of biological + molecules (DNA, RNA, proteins, etc.), which determine the state of a cell. Several + recently described technologies allow for simultaneous measurement of different aspects + of cellular state. For example, sci-CAR [@cao2018joint] + jointly profiles RNA expression and chromatin accessibility on the same cell and + CITE-seq [@stoeckius2017simultaneous] measures + surface protein abundance and RNA expression from each cell. These technologies enable + us to better understand cellular function; however, datasets are still rare and these + measurements make tradeoffs in order to profile multiple modalities. + + Joint methods can be more expensive or lower throughput or more noisy than measuring a + single modality at a time. Therefore it is useful to develop methods that are capable + of integrating measurements of the same biological system but obtained using different + technologies on different cells. +description: | + In this task, the goal is to learn a latent space where cells profiled by different + technologies in different modalities are matched if they have the same state. We use + jointly profiled data as ground truth so that we can evaluate when the observations + from the same cell acquired using different modalities are similar.
A perfect result + has each of the paired observations sharing the same coordinates in the latent space. + A method that can achieve this would be able to match datasets across modalities to + enable multimodal cellular analysis from separately measured profiles. +authors: + - name: "Scott Gigante" + roles: [ author, maintainer ] + info: + github: scottgigante + orcid: "0000-0002-4544-2764" + - name: Alex Tong + roles: [ author ] + info: + github: atong01 + - name: Robrecht Cannoodt + roles: [ author ] + info: + github: rcannood + orcid: "0000-0003-3641-729X" + - name: Kai Waldrant + roles: [ contributor ] + info: + github: KaiWaldrant \ No newline at end of file diff --git a/src/tasks/match_modalities/api/thumbnail.svg b/src/tasks/match_modalities/api/thumbnail.svg new file mode 100644 index 0000000000..07e326bc4a --- /dev/null +++ b/src/tasks/match_modalities/api/thumbnail.svg @@ -0,0 +1 @@ +RNAATACdim-2dim-1dim-2dim-1 \ No newline at end of file diff --git a/src/tasks/match_modalities/control_methods/random_features/config.vsh.yaml b/src/tasks/match_modalities/control_methods/random_features/config.vsh.yaml new file mode 100644 index 0000000000..8c021c3bdf --- /dev/null +++ b/src/tasks/match_modalities/control_methods/random_features/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: "random_features" + info: + label: Random Features + summary: "Randomly permuted features" + description: | + The SVD embedding of the first modality is randomly permuted twice, once for use as the output for each modality, producing random features with no correlation between modalities.
+ preferred_normalization: log_cp10k + v1: + path: openproblems/tasks/matching_modalities/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: + - numpy + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] \ No newline at end of file diff --git a/src/tasks/match_modalities/control_methods/random_features/script.py b/src/tasks/match_modalities/control_methods/random_features/script.py new file mode 100644 index 0000000000..d10bb72b27 --- /dev/null +++ b/src/tasks/match_modalities/control_methods/random_features/script.py @@ -0,0 +1,32 @@ +import anndata as ad +import numpy as np + +## VIASH START + +par = { + "input_mod1": "resources_test/common/scicar_cell_lines/dataset_mod1.h5ad", + "input_mod2": "resources_test/common/scicar_cell_lines/dataset_mod2.h5ad", + "output_mod1": "output.mod1.h5ad", + "output_mod2": "output.mod2.h5ad", +} + +meta = { + "functionality_name": "random_features" +} + +## VIASH END + +print("Reading input h5ad file", flush=True) +adata_mod1 = ad.read_h5ad(par["input_mod1"]) +adata_mod2 = ad.read_h5ad(par["input_mod2"]) + +print("Generating random features", flush=True) +# Both outputs are independent random row permutations of mod1's X_svd (mod2's own embedding is unused). TODO: is the extra permutation of mod1 actually needed? +adata_mod1.obsm["integrated"] = adata_mod1.obsm["X_svd"][np.random.permutation(np.arange(adata_mod1.shape[0]))] +adata_mod2.obsm["integrated"] = adata_mod1.obsm["X_svd"][np.random.permutation(np.arange(adata_mod1.shape[0]))] + +print("Write output to file", flush=True) +adata_mod1.uns["method_id"] = meta["functionality_name"] +adata_mod2.uns["method_id"] = meta["functionality_name"] +adata_mod1.write_h5ad(par["output_mod1"], compression="gzip") +adata_mod2.write_h5ad(par["output_mod2"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/match_modalities/control_methods/true_features/config.vsh.yaml
b/src/tasks/match_modalities/control_methods/true_features/config.vsh.yaml new file mode 100644 index 0000000000..bc897dd821 --- /dev/null +++ b/src/tasks/match_modalities/control_methods/true_features/config.vsh.yaml @@ -0,0 +1,21 @@ +__merge__: ../../api/comp_control_method.yaml +functionality: + name: "true_features" + info: + label: True Features + summary: "A 1 to 1 mapping of features between modalities" + description: | + "use the same features for both modalities" + preferred_normalization: log_cp10k + v1: + path: openproblems/tasks/matching_modalities/methods/baseline.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] \ No newline at end of file diff --git a/src/tasks/match_modalities/control_methods/true_features/script.py b/src/tasks/match_modalities/control_methods/true_features/script.py new file mode 100644 index 0000000000..cf7abac8e5 --- /dev/null +++ b/src/tasks/match_modalities/control_methods/true_features/script.py @@ -0,0 +1,59 @@ +import anndata as ad +import numpy as np + +## VIASH START +par = { + "input_mod1": "resources_test/match_modalities/scicar_cell_lines/dataset_mod1.h5ad", + "input_mod2": "resources_test/match_modalities/scicar_cell_lines/dataset_mod2.h5ad", + "input_solution_mod1": "resources_test/match_modalities/scicar_cell_lines/solution_mod1.h5ad", + "input_solution_mod2": "resources_test/match_modalities/scicar_cell_lines/solution_mod2.h5ad", + "output_mod1": "output.mod1.h5ad", + "output_mod2": "output.mod2.h5ad", +} +meta = { + "functionality_name": "true_features" +} +## VIASH END + +print("Reading input h5ad file", flush=True) +adata_mod1 = ad.read_h5ad(par["input_mod1"]) +adata_mod2 = ad.read_h5ad(par["input_mod2"]) + +solution_mod1 = ad.read_h5ad(par["input_solution_mod1"]) +solution_mod2 = ad.read_h5ad(par["input_solution_mod2"])
+ +print("Storing true features", flush=True) +output_mod1 = ad.AnnData( + obs=adata_mod1.obs[[]], + var=adata_mod1.var[[]], + obsm={ + "integrated": adata_mod1.obsm["X_svd"] + }, + uns={ + "dataset_id": adata_mod1.uns["dataset_id"], + "normalization_id": adata_mod1.uns["normalization_id"], + "method_id": meta["functionality_name"] + } +) + +# Reorder mod1's embedding into mod2's cell order: undo mod1's shuffle via its permutation_indices, then re-apply mod2's shuffle via the argsort (inverse) of its permutation_indices (assumes permutation_indices revert the shuffle when used as an index - TODO confirm) +mod2_obsm = adata_mod1.obsm["X_svd"][solution_mod1.obs["permutation_indices"]] +reverse_indices_mod2 = np.argsort(solution_mod2.obs["permutation_indices"]) +mod2_obsm = mod2_obsm[reverse_indices_mod2] + +output_mod2 = ad.AnnData( + obs=adata_mod2.obs[[]], + var=adata_mod2.var[[]], + obsm={ + "integrated": mod2_obsm + }, + uns={ + "dataset_id": adata_mod2.uns["dataset_id"], + "normalization_id": adata_mod2.uns["normalization_id"], + "method_id": meta["functionality_name"] + } +) + +print("Write output to file", flush=True) +output_mod1.write_h5ad(par["output_mod1"], compression="gzip") +output_mod2.write_h5ad(par["output_mod2"], compression="gzip") diff --git a/src/tasks/match_modalities/methods/fastmnn/config.vsh.yaml b/src/tasks/match_modalities/methods/fastmnn/config.vsh.yaml new file mode 100644 index 0000000000..4e143ec67b --- /dev/null +++ b/src/tasks/match_modalities/methods/fastmnn/config.vsh.yaml @@ -0,0 +1,29 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "fastmnn" + info: + label: "fastMNN" + summary: "A simpler version of the original mnnCorrect algorithm." + description: | + FastMNN is a simplified version of the mnnCorrect algorithm. Both use Mutual Nearest Neighbors to integrate multimodal single-cell data.
+ preferred_normalization: "log_cp10k" + variants: + mnn_log_cp10k: + mnn_log_scran_pooling: + # "The normalization only changes for the first modality dataset, the second still uses log_cp10k" + preferred_normalization: "log_scran_pooling" + reference: "haghverdi2018batch" + repository_url: "https://github.com/LTLA/batchelor" + documentation_url: "https://github.com/LTLA/batchelor#readme" + resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + bioc: batchelor + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/match_modalities/methods/fastmnn/script.R b/src/tasks/match_modalities/methods/fastmnn/script.R new file mode 100644 index 0000000000..129f134e16 --- /dev/null +++ b/src/tasks/match_modalities/methods/fastmnn/script.R @@ -0,0 +1,40 @@ +library(anndata, warn.conflicts = FALSE) +library(Matrix, warn.conflicts = FALSE) +requireNamespace("batchelor", quietly = TRUE) + +## VIASH START +# Debug defaults; `par` and `meta` are both read further down in this script. +par <- list( + input_mod1 = "resources_test/common/scicar_cell_lines/dataset_mod1.h5ad", + input_mod2 = "resources_test/common/scicar_cell_lines/dataset_mod2.h5ad", + output_mod1 = "output_mod1.h5ad", + output_mod2 = "output_mod2.h5ad" +) +meta <- list(functionality_name = "fastmnn") +## VIASH END + +cat("Reading input h5ad file\n") +adata_mod1 <- read_h5ad(par$input_mod1) +adata_mod2 <- read_h5ad(par$input_mod2) + +cat("Running MNN\n") +sce_mnn <- batchelor::fastMNN( + t(adata_mod1$obsm[["X_svd"]]), + t(adata_mod2$obsm[["X_svd"]]) +) + +cat("Storing output\n") +combined_recons <- t(SummarizedExperiment::assay(sce_mnn, "reconstructed")) +# Rows are split by position: the first nrow(mod1) rows go to mod1, the remaining rows to mod2. +mode1_recons <- combined_recons[seq_len(nrow(adata_mod1$obsm[["X_svd"]])), , drop = FALSE] +mode2_recons <- combined_recons[-seq_len(nrow(adata_mod1$obsm[["X_svd"]])), , drop = FALSE] + +adata_mod1$obsm[["integrated"]] <- as.matrix(mode1_recons) +adata_mod2$obsm[["integrated"]] <- as.matrix(mode2_recons) + +cat("Writing to file\n") +adata_mod1$uns["method_id"] <-
meta$functionality_name +adata_mod2$uns["method_id"] <- meta$functionality_name + +yyy <- adata_mod1$write_h5ad(par$output_mod1, compression = "gzip") +zzz <- adata_mod2$write_h5ad(par$output_mod2, compression = "gzip") diff --git a/src/tasks/match_modalities/methods/harmonic_alignment/config.vsh.yaml b/src/tasks/match_modalities/methods/harmonic_alignment/config.vsh.yaml new file mode 100644 index 0000000000..3146db56e0 --- /dev/null +++ b/src/tasks/match_modalities/methods/harmonic_alignment/config.vsh.yaml @@ -0,0 +1,38 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "harmonic_alignment" + info: + label: "Harmonic Alignment" + summary: "Harmonic Alignment" + description: | + Harmonic Alignment is a method for integrating multimodal single-cell data. It is based on the idea of aligning the eigenvectors of the Laplacian matrices of the two modalities. The alignment is achieved by solving a generalized eigenvalue problem. The method is described in the following paper: https://doi.org/10.1137/1.9781611976236.36 + preferred_normalization: "log_cp10k" + v1: + path: openproblems/tasks/matching_modalities/methods/harmonic_alignment.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + reference: "stanley2020harmonic" + documentation_url: "https://github.com/KrishnaswamyLab/harmonic-alignment#readme" + repository_url: "https://github.com/KrishnaswamyLab/harmonic-alignment" + arguments: + - name: "--n_pca_XY" + type: "integer" + default: 100 + description: "Default number of principal components on which to build graph." + - name: "--n_eigenvectors" + type: "integer" + default: 100 + description: "Number of eigenvectors of the normalized Laplacian on which to perform alignment."
+ resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + github: + - KrishnaswamyLab/harmonic-alignment#subdirectory=python + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] +
diff --git a/src/tasks/match_modalities/methods/harmonic_alignment/script.py b/src/tasks/match_modalities/methods/harmonic_alignment/script.py
new file mode 100644
index 0000000000..abe2eece7c
--- /dev/null
+++ b/src/tasks/match_modalities/methods/harmonic_alignment/script.py
@@ -0,0 +1,54 @@
+import anndata as ad
+import harmonicalignment
+
+## VIASH START
+# Debug defaults; the keys have to match the par[...] lookups further down.
+par = {
+    "input_mod1": "resources_test/common/scicar_cell_lines/dataset_mod1.h5ad",
+    "input_mod2": "resources_test/common/scicar_cell_lines/dataset_mod2.h5ad",
+    "output_mod1": "output_mod1.h5ad",
+    "output_mod2": "output_mod2.h5ad",
+    "n_pca_XY": 100,
+    "n_eigenvectors": 100,
+}
+meta = {
+    "functionality_name": "harmonic_alignment",
+}
+## VIASH END
+
+
+print("Reading input h5ad file", flush=True)
+adata_mod1 = ad.read_h5ad(par["input_mod1"])
+adata_mod2 = ad.read_h5ad(par["input_mod2"])
+
+print("Check parameters", flush=True)
+n_eigenvectors = par["n_eigenvectors"]
+n_pca_XY = par["n_pca_XY"]
+
+# Pass None instead of the requested size whenever the mod1 dataset does not
+# have more rows than that size.
+n_rows = adata_mod1.layers["normalized"].shape[0]
+if n_rows <= n_eigenvectors:
+    n_eigenvectors = None
+if n_rows <= n_pca_XY:
+    n_pca_XY = None
+
+
+print("Running Harmonic Alignment", flush=True)
+ha_op = harmonicalignment.HarmonicAlignment(
+    n_filters=8, n_pca_XY=n_pca_XY, n_eigenvectors=n_eigenvectors
+)
+ha_op.align(adata_mod1.obsm["X_svd"], adata_mod2.obsm["X_svd"])
+XY_aligned = ha_op.diffusion_map(n_eigenvectors=n_eigenvectors)
+
+print("Storing output data structures", flush=True)
+
+# XY_aligned is split by position: leading rows for mod1, trailing rows for mod2.
+adata_mod1.obsm["integrated"] = XY_aligned[: adata_mod1.obsm["X_svd"].shape[0]]
+adata_mod2.obsm["integrated"] = XY_aligned[-adata_mod2.obsm["X_svd"].shape[0] :]
+
+print("Write output to file", flush=True)
+adata_mod1.uns["method_id"] = meta["functionality_name"]
+adata_mod2.uns["method_id"] = meta["functionality_name"]
+adata_mod1.write_h5ad(par["output_mod1"], compression="gzip")
+adata_mod2.write_h5ad(par["output_mod2"], compression="gzip")
diff --git a/src/tasks/match_modalities/methods/procrustes/config.vsh.yaml b/src/tasks/match_modalities/methods/procrustes/config.vsh.yaml new file mode 100644 index 0000000000..db7b49383b --- /dev/null +++ b/src/tasks/match_modalities/methods/procrustes/config.vsh.yaml @@ -0,0 +1,29 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "procrustes" + info: + label: Procrustes + summary: | + "Procrustes superimposition embeds cellular data from each modality into a common space." + description: | + "Procrustes superimposition embeds cellular data from each modality into a common space by aligning the 100-dimensional SVD embeddings to one another by using an isomorphic transformation that minimizes the root mean squared distance between points. The unmodified SVD embedding and the transformed second modality are used as output for the task."
+ v1: + path: openproblems/tasks/matching_modalities/methods/procrustes.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + reference: gower1975generalized + documentation_url: https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.procrustes.html + repository_url: https://github.com/scipy/scipy + preferred_normalization: "log_cp10k" + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scipy + - type: nextflow + directives: + label: [midtime, midmem, midcpu] \ No newline at end of file
diff --git a/src/tasks/match_modalities/methods/procrustes/script.py b/src/tasks/match_modalities/methods/procrustes/script.py
new file mode 100644
index 0000000000..fad63fa658
--- /dev/null
+++ b/src/tasks/match_modalities/methods/procrustes/script.py
@@ -0,0 +1,38 @@
+import anndata as ad
+import scipy.spatial
+
+## VIASH START
+
+par = {
+    "input_mod1": "resources_test/common/scicar_cell_lines/dataset_mod1.h5ad",
+    "input_mod2": "resources_test/common/scicar_cell_lines/dataset_mod2.h5ad",
+    "output_mod1": "output.mod1.h5ad",
+    "output_mod2": "output.mod2.h5ad",
+}
+
+meta = {
+    "functionality_name": "procrustes"
+}
+
+## VIASH END
+
+# Load the two modality datasets.
+print("Reading input h5ad file", flush=True)
+adata_mod1 = ad.read_h5ad(par["input_mod1"])
+adata_mod2 = ad.read_h5ad(par["input_mod2"])
+
+# Align the X_svd embeddings; the third value returned by procrustes is not used.
+print("procrustes alignment", flush=True)
+svd_mod1 = adata_mod1.obsm["X_svd"]
+svd_mod2 = adata_mod2.obsm["X_svd"]
+aligned_mod1, aligned_mod2, _ = scipy.spatial.procrustes(svd_mod1, svd_mod2)
+
+print("Storing output data", flush=True)
+adata_mod1.obsm["integrated"] = aligned_mod1
+adata_mod2.obsm["integrated"] = aligned_mod2
+
+# Tag each output with the method id, then write it to its own output path.
+print("Write output to file", flush=True)
+for adata, output_key in ((adata_mod1, "output_mod1"), (adata_mod2, "output_mod2")):
+    adata.uns["method_id"] = meta["functionality_name"]
+    adata.write_h5ad(par[output_key], compression="gzip")
diff --git
a/src/tasks/match_modalities/methods/scot/config.vsh.yaml b/src/tasks/match_modalities/methods/scot/config.vsh.yaml new file mode 100644 index 0000000000..e86fe4438a --- /dev/null +++ b/src/tasks/match_modalities/methods/scot/config.vsh.yaml @@ -0,0 +1,30 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: "scot" + info: + label: "Single Cell Optimal Transport" + description: | + Single Cell Optimal Transport (SCOT) is a method for integrating multimodal single-cell data. It is based on the idea of aligning the distributions of the two modalities using optimal transport. + summary: "Run Single Cell Optimal Transport" + preferred_normalization: "log_cp10k" + reference: Demetci2020scot + documentation_url: "https://github.com/rsinghlab/SCOT#readme" + repository_url: "https://github.com/rsinghlab/SCOT" + arguments: + - name: "--balanced" + type: "boolean_true" + description: "Determines whether balanced or unbalanced optimal transport is used. In the balanced case, the target and source distributions are assumed to have equal mass."
+ resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: apt + packages: git + - type: docker + run: "cd /opt && git clone --depth 1 https://github.com/rsinghlab/SCOT.git && cd SCOT && pip install -r requirements.txt" + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu]
diff --git a/src/tasks/match_modalities/methods/scot/script.py b/src/tasks/match_modalities/methods/scot/script.py
new file mode 100644
index 0000000000..d6e629c565
--- /dev/null
+++ b/src/tasks/match_modalities/methods/scot/script.py
@@ -0,0 +1,50 @@
+import anndata as ad
+import sys
+# scotv1 is imported from the SCOT source directory, so that directory is added to the path first.
+sys.path.append("/opt/SCOT/src/")
+import scotv1
+import pandas as pd
+
+
+## VIASH START
+# Debug defaults. `meta` is included because the script reads
+# meta["functionality_name"] when it tags the outputs.
+par = {
+    "input_mod1": "resources_test/common/scicar_cell_lines/dataset_mod1.h5ad",
+    "input_mod2": "resources_test/common/scicar_cell_lines/dataset_mod2.h5ad",
+    "output_mod1": "integrated_mod1.h5ad",
+    "output_mod2": "integrated_mod2.h5ad",
+    "balanced": False,
+}
+meta = {
+    "functionality_name": "scot",
+}
+## VIASH END
+
+
+print("Reading input h5ad file", flush=True)
+adata_mod1 = ad.read_h5ad(par["input_mod1"])
+adata_mod2 = ad.read_h5ad(par["input_mod2"])
+
+
+print("Initialize SCOT", flush=True)
+scot = scotv1.SCOT(adata_mod1.obsm["X_svd"], adata_mod2.obsm["X_svd"])
+
+# NOTE(review): par["balanced"] is never read, so the same alignment call runs
+# whatever value is passed - confirm how the flag is meant to be applied.
+print("Call the unbalanced alignment", flush=True)
+# From https://github.com/rsinghlab/SCOT/blob/master/examples/unbalanced_GW_SNAREseq.ipynb # noqa: 501
+X_new_unbal, y_new_unbal = scot.align(
+    k=50, e=1e-3, normalize=True
+)
+
+
+print("store output", flush=True)
+adata_mod1.obsm["integrated"] = X_new_unbal
+adata_mod2.obsm["integrated"] = y_new_unbal
+
+print("Write output to file", flush=True)
+adata_mod1.uns["method_id"] = meta["functionality_name"]
+adata_mod2.uns["method_id"] = meta["functionality_name"]
+adata_mod1.write_h5ad(par["output_mod1"], compression="gzip")
+adata_mod2.write_h5ad(par["output_mod2"], compression="gzip")
diff --git a/src/tasks/match_modalities/metrics/knn_auc/config.vsh.yaml b/src/tasks/match_modalities/metrics/knn_auc/config.vsh.yaml new file mode 100644 index 0000000000..e7067a20b5 --- /dev/null +++ b/src/tasks/match_modalities/metrics/knn_auc/config.vsh.yaml @@ -0,0 +1,36 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: "knn_auc" + info: + metrics: + - label: kNN Area Under the Curve + name: knn_auc + summary: "Compute the kNN Area Under the Curve" + description: | + Let $f(i) \in F$ be the scRNA-seq measurement of cell $i$, and $g(i) \in G$ be the scATAC- seq measurement of cell $i$. kNN-AUC calculates the average percentage overlap of neighborhoods of $f(i)$ in $F$ with neighborhoods of $g(i)$ in $G$. Higher is better. + reference: "lance2022multimodal" + min: 0 + max: 1 + maximize: true + v1: + path: openproblems/tasks/matching_modalities/metrics/knn_auc.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + arguments: + - name: "--proportion_neighbors" + type: "double" + default: 0.1 + description: The proportion of neighbours to use in computing the KNN.
+ resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: + - numpy + - scikit-learn + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu]
diff --git a/src/tasks/match_modalities/metrics/knn_auc/script.py b/src/tasks/match_modalities/metrics/knn_auc/script.py
new file mode 100644
index 0000000000..cf5c14b473
--- /dev/null
+++ b/src/tasks/match_modalities/metrics/knn_auc/script.py
@@ -0,0 +1,77 @@
+import anndata as ad
+import numpy as np
+import sklearn.decomposition
+import sklearn.neighbors
+
+## VIASH START
+par = {
+    "input_integrated_mod1": "resources_test/match_modalities/scicar_cell_lines/integrated_mod1.h5ad",
+    "input_integrated_mod2": "resources_test/match_modalities/scicar_cell_lines/integrated_mod2.h5ad",
+    "input_solution_mod1": "resources_test/match_modalities/scicar_cell_lines/solution_mod1.h5ad",
+    "input_solution_mod2": "resources_test/match_modalities/scicar_cell_lines/solution_mod2.h5ad",
+    "output": "resources_test/multimodal/score.h5ad",
+    "proportion_neighbors": 0.1,
+}
+meta = {
+    "functionality_name": "knn_auc"
+}
+## VIASH END
+
+print("Reading adata file", flush=True)
+input_solution_mod1 = ad.read_h5ad(par["input_solution_mod1"])
+input_solution_mod2 = ad.read_h5ad(par["input_solution_mod2"])
+
+# Re-order the rows of each integrated dataset with the permutation indices
+# stored in the matching solution file.
+order_mod1 = input_solution_mod1.obs["permutation_indices"]
+order_mod2 = input_solution_mod2.obs["permutation_indices"]
+input_integrated_mod1 = ad.read_h5ad(par["input_integrated_mod1"])[order_mod1]
+input_integrated_mod2 = ad.read_h5ad(par["input_integrated_mod2"])[order_mod2]
+
+print("Checking parameters", flush=True)
+n_obs = input_solution_mod1.n_obs
+n_neighbors = int(np.ceil(par["proportion_neighbors"] * n_obs))
+
+print("Compute KNN on PCA", flush=True)
+# Reference neighbours: X_svd of the mod1 solution queried against itself.
+svd_true = input_solution_mod1.obsm["X_svd"]
+knn_true = sklearn.neighbors.NearestNeighbors(n_neighbors=n_neighbors)
+knn_true.fit(svd_true)
+_, indices_true = knn_true.kneighbors(svd_true)
+
+# Predicted neighbours: integrated mod2 rows queried against integrated mod1.
+knn_pred = sklearn.neighbors.NearestNeighbors(n_neighbors=n_neighbors)
+knn_pred.fit(input_integrated_mod1.obsm["integrated"])
+_, indices_pred = knn_pred.kneighbors(input_integrated_mod2.obsm["integrated"])
+
+print("Check which neighbours match", flush=True)
+ranks = np.arange(n_neighbors)
+neighbors_match = np.zeros(n_neighbors, dtype=int)
+for cell in range(n_obs):
+    # positions, within each neighbour list, of the neighbours both lists share
+    _, pos_pred, pos_true = np.intersect1d(
+        indices_pred[cell], indices_true[cell], return_indices=True
+    )
+    # a shared neighbour is counted at every rank from the later of its two positions onwards
+    first_rank = np.maximum(pos_pred, pos_true)
+    shared_at_rank = ranks >= first_rank[:, None]
+    neighbors_match += shared_at_rank.sum(axis=0)
+
+print("Compute area under neighbours match curve", flush=True)
+# normalise each rank by (rank + 1) neighbours times the number of cells
+max_matches = np.arange(1, n_neighbors + 1) * n_obs
+neighbors_match_curve = neighbors_match / max_matches
+area_under_curve = np.mean(neighbors_match_curve)
+
+print("Store metric value", flush=True)
+uns = {
+    "dataset_id": input_solution_mod1.uns["dataset_id"],
+    "normalization_id": input_solution_mod1.uns["normalization_id"],
+    "method_id": input_integrated_mod1.uns["method_id"],
+    "metric_ids": "knn_auc",
+    "metric_values": area_under_curve
+}
+output_metric = ad.AnnData(shape=(0, 0), uns=uns)
+
+print("Writing adata to file", flush=True)
+output_metric.write_h5ad(par["output"], compression="gzip")
diff --git a/src/tasks/match_modalities/metrics/mse/config.vsh.yaml b/src/tasks/match_modalities/metrics/mse/config.vsh.yaml new file mode 100644 index 0000000000..b1dfc15746 --- /dev/null +++ b/src/tasks/match_modalities/metrics/mse/config.vsh.yaml @@ -0,0 +1,32 @@ +__merge__: ../../api/comp_metric.yaml +functionality: + name: "mse" + info: + metrics: + - label: "Mean Squared Error" + name: "mse" + summary: Compute the mean squared error. + description: | + Mean squared error (MSE) is the average distance between each pair of matched observations of the same cell in the learned latent space. Lower is better.
+ reference: "lance2022multimodal" + maximize: false + min: 0 + max: "+.inf" + v1: + path: openproblems/tasks/matching_modalities/metrics/mse.py + commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 + resources: + - type: python_script + path: script.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + packages: + - numpy<2 + - scipy + - scprep + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu]
diff --git a/src/tasks/match_modalities/metrics/mse/script.py b/src/tasks/match_modalities/metrics/mse/script.py
new file mode 100644
index 0000000000..b03487c6eb
--- /dev/null
+++ b/src/tasks/match_modalities/metrics/mse/script.py
@@ -0,0 +1,69 @@
+import anndata as ad
+import numpy as np
+from scipy import sparse
+
+## VIASH START
+par = {
+    "input_integrated_mod1": "resources_test/match_modalities/scicar_cell_lines/integrated_mod1.h5ad",
+    "input_integrated_mod2": "resources_test/match_modalities/scicar_cell_lines/integrated_mod2.h5ad",
+    "input_solution_mod1": "resources_test/match_modalities/scicar_cell_lines/solution_mod1.h5ad",
+    "input_solution_mod2": "resources_test/match_modalities/scicar_cell_lines/solution_mod2.h5ad",
+    "output": "resources_test/multimodal/score.h5ad",
+}
+meta = {
+    "functionality_name": "mse"
+}
+## VIASH END
+
+print("Reading adata file", flush=True)
+input_solution_mod1 = ad.read_h5ad(par["input_solution_mod1"])
+input_solution_mod2 = ad.read_h5ad(par["input_solution_mod2"])
+
+# Re-order the integrated rows with the permutation indices stored in the solutions.
+input_integrated_mod1 = ad.read_h5ad(par["input_integrated_mod1"])[input_solution_mod1.obs["permutation_indices"]]
+input_integrated_mod2 = ad.read_h5ad(par["input_integrated_mod2"])[input_solution_mod2.obs["permutation_indices"]]
+
+print("Computing MSE", flush=True)
+def _to_dense(X):
+    """Return X as a dense array, whether it is stored sparse or dense."""
+    if sparse.issparse(X):
+        return X.toarray()
+    return np.asarray(X)
+
+
+def _square(X):
+    """Square X element-wise; a sparse X is modified in place and returned."""
+    if sparse.issparse(X):
+        X.data = X.data ** 2
+        return X
+    else:
+        return X ** 2
+
+
+# The embeddings may be stored sparse or dense; calling .toarray() on a dense
+# numpy array raises AttributeError, so densify only when needed.
+X = _to_dense(input_integrated_mod1.obsm["integrated"])
+Y = _to_dense(input_integrated_mod2.obsm["integrated"])
+
+# Baseline: the error against a random row order of X (unseeded, so it varies between runs).
+X_shuffled = X[np.random.permutation(np.arange(X.shape[0])), :]
+error_random = np.mean(np.sum(_square(X_shuffled - Y)))
+error_abs = np.mean(np.sum(_square(X - Y)))
+# The reported value is the matched error relative to the shuffled baseline.
+metric_value = (error_abs / error_random).item()
+
+print("Store metric value", flush=True)
+uns = {
+    "dataset_id": input_solution_mod1.uns["dataset_id"],
+    "normalization_id": input_solution_mod1.uns["normalization_id"],
+    "method_id": input_integrated_mod1.uns["method_id"],
+    "metric_ids": "mse",
+    "metric_values": metric_value
+}
+output_metric = ad.AnnData(
+    shape=(0,0),
+    uns=uns
+)
+
+print("Writing adata to file", flush=True)
+output_metric.write_h5ad(par["output"], compression="gzip")
diff --git a/src/tasks/match_modalities/process_dataset/config.vsh.yaml b/src/tasks/match_modalities/process_dataset/config.vsh.yaml new file mode 100644 index 0000000000..35dc757809 --- /dev/null +++ b/src/tasks/match_modalities/process_dataset/config.vsh.yaml @@ -0,0 +1,18 @@ +__merge__: ../api/comp_process_dataset.yaml +functionality: + name: "process_dataset" + arguments: + - name: "--seed" + type: "integer" + description: "A seed for the subsampling."
+ example: 123 + resources: + - type: python_script + path: script.py + - path: /src/common/helper_functions/subset_anndata.py +platforms: + - type: docker + image: openproblems/base_python:1.0.0 + - type: nextflow + directives: + label: [highmem, midcpu , midtime]
diff --git a/src/tasks/match_modalities/process_dataset/script.py b/src/tasks/match_modalities/process_dataset/script.py
new file mode 100644
index 0000000000..d90d5e3965
--- /dev/null
+++ b/src/tasks/match_modalities/process_dataset/script.py
@@ -0,0 +1,70 @@
+import sys
+import random
+import numpy as np
+import anndata as ad
+
+## VIASH START
+par = {
+    "input_mod1": "resources_test/common/scicar_cell_lines/dataset_mod1.h5ad",
+    "input_mod2": "resources_test/common/scicar_cell_lines/dataset_mod2.h5ad",
+    "output_mod1": "output_mod1.h5ad",
+    "output_mod2": "output_mod2.h5ad",
+    "output_solution_mod1": "output_solution_mod1.h5ad",
+    "output_solution_mod2": "output_solution_mod2.h5ad",
+    "seed": 123
+}
+meta = {
+    "resources_dir": "src/common/helper_functions/",
+    "config": "src/tasks/match_modalities/process_dataset/.config.vsh.yaml"
+}
+## VIASH END
+
+# import helper functions
+sys.path.append(meta["resources_dir"])
+from subset_anndata import read_config_slots_info, subset_anndata
+
+# Seed both generators: the permutations below are drawn with np.random, so
+# seeding only `random` would leave them different on every run.
+# `is not None` keeps a seed of 0 from being skipped.
+if par["seed"] is not None:
+    print(f">> Setting seed to {par['seed']}")
+    random.seed(par["seed"])
+    np.random.seed(par["seed"])
+
+print(">> Load data", flush=True)
+input_mod1 = ad.read_h5ad(par["input_mod1"])
+input_mod2 = ad.read_h5ad(par["input_mod2"])
+
+print(">> Permute input data")
+# Each modality gets its own, independently drawn row permutation.
+mod1_perm = np.random.permutation(np.arange(input_mod1.n_obs))
+mod2_perm = np.random.permutation(np.arange(input_mod2.n_obs))
+
+output_mod1 = input_mod1[mod1_perm]
+output_mod1.obs_names = [f"cell_mod1_{i}" for i in range(output_mod1.n_obs)]
+output_mod2 = input_mod2[mod2_perm]
+output_mod2.obs_names = [f"cell_mod2_{i}" for i in range(output_mod2.n_obs)]
+
+print(">> Create solution objects")
+# The solutions keep the input row order; argsort(perm) gives, for each input
+# row, its position in the permuted output.
+output_solution_mod1 = input_mod1.copy()
+output_solution_mod1.obs["permutation_indices"] = np.argsort(mod1_perm)
+output_solution_mod2 = input_mod2.copy()
+output_solution_mod2.obs["permutation_indices"] = np.argsort(mod2_perm)
+
+# subset the different adatas
+print(">> Read slot info from config file", flush=True)
+slot_info = read_config_slots_info(meta["config"])
+
+print(">> Subset anndatas", flush=True)
+output_mod1 = subset_anndata(output_mod1, slot_info["output_mod1"])
+output_mod2 = subset_anndata(output_mod2, slot_info["output_mod2"])
+output_solution_mod1 = subset_anndata(output_solution_mod1, slot_info["output_solution_mod1"])
+output_solution_mod2 = subset_anndata(output_solution_mod2, slot_info["output_solution_mod2"])
+
+print(">> Writing data", flush=True)
+output_mod1.write_h5ad(par["output_mod1"])
+output_mod2.write_h5ad(par["output_mod2"])
+output_solution_mod1.write_h5ad(par["output_solution_mod1"])
+output_solution_mod2.write_h5ad(par["output_solution_mod2"])
diff --git a/src/tasks/match_modalities/resources_scripts/process_datasets.sh b/src/tasks/match_modalities/resources_scripts/process_datasets.sh new file mode 100755 index 0000000000..149130d0cf --- /dev/null +++ b/src/tasks/match_modalities/resources_scripts/process_datasets.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +cat > /tmp/params.yaml << 'HERE' +id: match_modalities_process_datasets +input_states: s3://openproblems-data/resources/datasets/openproblems_v1_multimodal/**/state.yaml +rename_keys: 'input_mod1:output_mod1,input_mod2:output_mod2' +settings: '{"output_mod1": "$id/output_mod1.h5ad", "output_mod2": "$id/output_mod2.h5ad", "output_solution_mod1": "$id/output_solution_mod1.h5ad", "output_solution_mod2": "$id/output_solution_mod2.h5ad"}' +output_state: "$id/state.yaml" +publish_dir: s3://openproblems-data/resources/match_modalities/datasets/openproblems_v1_multimodal +HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' + withName:'.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } +
withLabel:highmem { + memory = '350GB' + } +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/match_modalities/workflows/process_datasets/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config /tmp/nextflow.config \ + --labels match_modalities,process_datasets diff --git a/src/tasks/match_modalities/resources_scripts/run_benchmark.sh b/src/tasks/match_modalities/resources_scripts/run_benchmark.sh new file mode 100755 index 0000000000..001ba3437b --- /dev/null +++ b/src/tasks/match_modalities/resources_scripts/run_benchmark.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="s3://openproblems-data/resources/match_modalities/results/${RUN_ID}" + +cat > /tmp/params.yaml << HERE +id: match_modalities +input_states: s3://openproblems-data/resources/match_modalities/datasets/**/state.yaml +rename_keys: 'input_mod1:output_mod1,input_mod2:output_mod2,input_solution_mod1:output_solution_mod1,input_solution_mod2:output_solution_mod2' +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/match_modalities/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config src/wf_utils/labels_tw.config \ + --labels match_modalities,full \ No newline at end of file diff --git a/src/tasks/match_modalities/resources_test_scripts/scicar_cell_lines.sh b/src/tasks/match_modalities/resources_test_scripts/scicar_cell_lines.sh new file mode 100755 index 0000000000..6a35138815 --- /dev/null +++ b/src/tasks/match_modalities/resources_test_scripts/scicar_cell_lines.sh @@ -0,0 +1,34 @@ +#!/bin/bash + 
+# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +RAW_DATA=resources_test/common +DATASET_DIR=resources_test/match_modalities + +mkdir -p $DATASET_DIR + +# process dataset +echo Running process_dataset +nextflow run . \ + -main-script target/nextflow/match_modalities/workflows/process_datasets/main.nf \ + -profile docker \ + -entry auto \ + --input_states "$RAW_DATA/**/state.yaml" \ + --rename_keys 'input_mod1:output_mod1,input_mod2:output_mod2' \ + --settings '{"output_mod1": "$id/dataset_mod1.h5ad", "output_mod2": "$id/dataset_mod2.h5ad", "output_solution_mod1": "$id/solution_mod1.h5ad", "output_solution_mod2": "$id/solution_mod2.h5ad"}' \ + --publish_dir "$DATASET_DIR" \ + --output_state '$id/state.yaml' +# output_state should be moved to settings once workaround is solved + +# run one method +viash run src/tasks/match_modalities/methods/fastmnn/config.vsh.yaml -- \ + --input_mod1 $DATASET_DIR/scicar_cell_lines/dataset_mod1.h5ad \ + --input_mod2 $DATASET_DIR/scicar_cell_lines/dataset_mod2.h5ad \ + --output_mod1 $DATASET_DIR/scicar_cell_lines/integrated_mod1.h5ad \ + --output_mod2 $DATASET_DIR/scicar_cell_lines/integrated_mod2.h5ad diff --git a/src/tasks/match_modalities/workflows/process_datasets/config.vsh.yaml b/src/tasks/match_modalities/workflows/process_datasets/config.vsh.yaml new file mode 100644 index 0000000000..5427343f9f --- /dev/null +++ b/src/tasks/match_modalities/workflows/process_datasets/config.vsh.yaml @@ -0,0 +1,42 @@ +functionality: + name: "process_datasets" + namespace: "match_modalities/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--input_mod1" + __merge__: "/src/tasks/match_modalities/api/file_common_dataset_mod1.yaml" + required: true + direction: input + - name: "--input_mod2" + __merge__: "/src/tasks/match_modalities/api/file_common_dataset_mod2.yaml" + required: true + direction: 
input + - name: Outputs + arguments: + - name: "--output_mod1" + __merge__: /src/tasks/match_modalities/api/file_dataset_mod1.yaml + required: true + direction: output + - name: "--output_mod2" + __merge__: /src/tasks/match_modalities/api/file_dataset_mod2.yaml + required: true + direction: output + - name: "--output_solution_mod1" + __merge__: /src/tasks/match_modalities/api/file_solution_mod1.yaml + required: true + direction: output + - name: "--output_solution_mod2" + __merge__: /src/tasks/match_modalities/api/file_solution_mod2.yaml + required: true + direction: output + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: common/check_dataset_schema + - name: match_modalities/process_dataset +platforms: + - type: nextflow diff --git a/src/tasks/match_modalities/workflows/process_datasets/main.nf b/src/tasks/match_modalities/workflows/process_datasets/main.nf new file mode 100644 index 0000000000..ab5e9a83b0 --- /dev/null +++ b/src/tasks/match_modalities/workflows/process_datasets/main.nf @@ -0,0 +1,82 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + | check_dataset_schema.run( + key: "check_dataset_schema_mod1", + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "input_mod1") + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.input_mod1, + "schema": schemaYaml + ] + }, + toState: { id, output, state -> + // read the output to see if dataset passed the qc + def checks = readYaml(output.output) + state + [ + "dataset_mod1": checks["exit_code"] == 0 ? 
state.input_mod1 : null, + ] + } + ) + + | check_dataset_schema.run( + key: "check_dataset_schema_mod2", + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "input_mod2") + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.input_mod2, + "schema": schemaYaml + ] + }, + toState: { id, output, state -> + // read the output to see if dataset passed the qc + def checks = readYaml(output.output) + state + [ + "dataset_mod2": checks["exit_code"] == 0 ? state.input_mod2 : null, + ] + } + ) + + // remove datasets which didn't pass the schema check + | filter { id, state -> + state.dataset_mod1 != null && state.dataset_mod2 != null + } + + | process_dataset.run( + fromState: [ input_mod1: "dataset_mod1", input_mod2: "dataset_mod2" ], + toState: [ + "output_mod1", + "output_mod2", + "output_solution_mod1", + "output_solution_mod2" + ] + ) + + // only output the files for which an output file was specified + | setState([ + "output_mod1", + "output_mod2", + "output_solution_mod1", + "output_solution_mod2" + ]) + + emit: + output_ch +} diff --git a/src/tasks/match_modalities/workflows/run_benchmark/config.vsh.yaml b/src/tasks/match_modalities/workflows/run_benchmark/config.vsh.yaml new file mode 100644 index 0000000000..89da796600 --- /dev/null +++ b/src/tasks/match_modalities/workflows/run_benchmark/config.vsh.yaml @@ -0,0 +1,75 @@ +functionality: + name: "run_benchmark" + namespace: "match_modalities/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--input_mod1" + __merge__: /src/tasks/match_modalities/api/file_dataset_mod1.yaml + direction: input + required: true + - name: "--input_mod2" + __merge__: /src/tasks/match_modalities/api/file_dataset_mod2.yaml + direction: input + required: true + - name: "--input_solution_mod1" + __merge__: /src/tasks/match_modalities/api/file_solution_mod1.yaml + direction: input + required: true + - name: "--input_solution_mod2" + __merge__: 
/src/tasks/match_modalities/api/file_solution_mod2.yaml + direction: input + required: true + - name: Outputs + arguments: + - name: "--output_scores" + type: file + required: true + direction: output + description: A yaml file containing the scores of each of the methods + default: score_uns.yaml + - name: "--output_method_configs" + type: file + required: true + direction: output + default: method_configs.yaml + - name: "--output_metric_configs" + type: file + required: true + direction: output + default: metric_configs.yaml + - name: "--output_dataset_info" + type: file + required: true + direction: output + default: dataset_uns.yaml + - name: "--output_task_info" + type: file + required: true + direction: output + default: task_info.yaml + - name: Methods + arguments: + - name: "--method_ids" + type: string + multiple: true + description: A list of method ids to run. If not specified, all methods will be run. + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: "/src/tasks/match_modalities/api/task_info.yaml" + dependencies: + - name: common/check_dataset_schema + - name: common/extract_metadata + - name: match_modalities/control_methods/random_features + - name: match_modalities/control_methods/true_features + - name: match_modalities/methods/fastmnn + - name: match_modalities/methods/scot + - name: match_modalities/methods/harmonic_alignment + - name: match_modalities/methods/procrustes + - name: match_modalities/metrics/knn_auc + - name: match_modalities/metrics/mse +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/tasks/match_modalities/workflows/run_benchmark/main.nf b/src/tasks/match_modalities/workflows/run_benchmark/main.nf new file mode 100644 index 0000000000..53753f3981 --- /dev/null +++ b/src/tasks/match_modalities/workflows/run_benchmark/main.nf @@ -0,0 +1,202 @@ +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + 
+workflow run_wf { + take: + input_ch + + main: + + // construct list of methods + methods = [ + random_features, + true_features, + scot, + harmonic_alignment, + fastmnn, + procrustes + ] + + // construct list of metrics + metrics = [ + knn_auc, + mse + ] + + /**************************** + * EXTRACT DATASET METADATA * + ****************************/ + dataset_ch = input_ch + // store join id + | map{ id, state -> + [id, state + ["_meta": [join_id: id]]] + } + + // extract the dataset metadata + | extract_metadata.run( + fromState: [input: "input_solution_mod1"], + toState: { id, output, state -> + state + [ + dataset_uns: readYaml(output.output).uns + ] + } + ) + + /*************************** + * RUN METHODS AND METRICS * + ***************************/ + score_ch = dataset_ch + + // run all methods + | runEach( + components: methods, + + // use the 'filter' argument to only run a method on the normalisation it prefers (and, when 'method_ids' is set in the state, only the listed methods) + filter: { id, state, comp -> + def norm = state.dataset_uns.normalization_id + def pref = comp.config.functionality.info.preferred_normalization + // methods that prefer raw counts are only run on the 'log_cp10k' variant of a dataset; + // every other method only runs when the dataset's normalisation equals its preference + def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref + def method_check = !state.method_ids || state.method_ids.contains(comp.config.functionality.name) + + method_check && norm_check + }, + + // define a new 'id' by appending the method name to the dataset id + id: { id, state, comp -> + id + "."
+ comp.config.functionality.name + }, + + // use 'fromState' to fetch the arguments the component requires from the overall state; control methods additionally receive both solution files + fromState: { id, state, comp -> + def new_args = [ + input_mod1: state.input_mod1, + input_mod2: state.input_mod2 + ] + if (comp.config.functionality.info.type == "control_method") { + new_args.input_solution_mod1 = state.input_solution_mod1 + new_args.input_solution_mod2 = state.input_solution_mod2 + } + new_args + }, + + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + method_id: comp.config.functionality.name, + method_output_mod1: output.output_mod1, + method_output_mod2: output.output_mod2 + ] + } + ) + + // run all metrics + | runEach( + components: metrics, + id: { id, state, comp -> + id + "." + comp.config.functionality.name + }, + // use 'fromState' to map the method outputs and the solutions in the overall state onto the metric's arguments + fromState: [ + input_integrated_mod1: "method_output_mod1", + input_integrated_mod2: "method_output_mod2", + input_solution_mod1: "input_solution_mod1", + input_solution_mod2: "input_solution_mod2" + ], + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + metric_id: comp.config.functionality.name, + metric_output: output.output + ] + } + ) + + /****************************** + * GENERATE OUTPUT YAML FILES * + ******************************/ + // TODO: can we store everything below in a separate helper function?
+ + // gather the metadata of all datasets into a single file + dataset_meta_ch = dataset_ch + // only keep the 'log_cp10k' normalization of each dataset + | filter{ id, state -> + state.dataset_uns.normalization_id == "log_cp10k" + } + + | joinStates { ids, states -> + // store the dataset metadata (without 'normalization_id') in a file + def dataset_uns = states.collect{state -> + def uns = state.dataset_uns.clone() + uns.remove("normalization_id") + uns + } + def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) + def dataset_uns_file = tempFile("dataset_uns.yaml") + dataset_uns_file.write(dataset_uns_yaml_blob) + + ["output", [output_dataset_info: dataset_uns_file]] + } + + output_ch = score_ch + + // extract the scores + | extract_metadata.run( + key: "extract_scores", + fromState: [input: "metric_output"], + toState: { id, output, state -> + state + [ + score_uns: readYaml(output.output).uns + ] + } + ) + + | joinStates { ids, states -> + + // store the method configs in a file + def method_configs = methods.collect{it.config} + def method_configs_yaml_blob = toYamlBlob(method_configs) + def method_configs_file = tempFile("method_configs.yaml") + method_configs_file.write(method_configs_yaml_blob) + + // store the metric configs in a file + def metric_configs = metrics.collect{it.config} + def metric_configs_yaml_blob = toYamlBlob(metric_configs) + def metric_configs_file = tempFile("metric_configs.yaml") + metric_configs_file.write(metric_configs_yaml_blob) + + def task_info_file = meta.resources_dir.resolve("task_info.yaml") + + // store the scores in a file + def score_uns = states.collect{it.score_uns} + def score_uns_yaml_blob = toYamlBlob(score_uns) + def score_uns_file = tempFile("score_uns.yaml") + score_uns_file.write(score_uns_yaml_blob) + + def new_state = [ + output_method_configs: method_configs_file, + output_metric_configs: metric_configs_file, + output_task_info: task_info_file, + output_scores: score_uns_file, + _meta: states[0]._meta + ] + + ["output", new_state] + } + + // merge the scores/configs state with the dataset info state into a single state + | 
mix(dataset_meta_ch) + | joinStates{ ids, states -> + def mergedStates = states.inject([:]) { acc, m -> acc + m } + [ids[0], mergedStates] + } + + emit: + output_ch + +} diff --git a/src/tasks/match_modalities/workflows/run_benchmark/run_test.sh b/src/tasks/match_modalities/workflows/run_benchmark/run_test.sh new file mode 100644 index 0000000000..ee7c4c9909 --- /dev/null +++ b/src/tasks/match_modalities/workflows/run_benchmark/run_test.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# get the root of the repository +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +# export TOWER_WORKSPACE_ID=53907369739130 + +DATASETS_DIR="resources_test/match_modalities" +OUTPUT_DIR="resources_test/match_modalities/benchmarks/openproblems_v1" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +export NXF_VER=22.04.5 +nextflow run . \ + -main-script target/nextflow/match_modalities/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -entry auto \ + -c src/wf_utils/labels_ci.config \ + --id resources_test \ + --input_states "$DATASETS_DIR/**/state.yaml" \ + --rename_keys 'input_mod1:output_mod1,input_mod2:output_mod2,input_solution_mod1:output_solution_mod1,input_solution_mod2:output_solution_mod2' \ + --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \ + --publish_dir "$OUTPUT_DIR" \ No newline at end of file diff --git a/src/tasks/predict_modality/README.md b/src/tasks/predict_modality/README.md new file mode 100644 index 0000000000..add96684ce --- /dev/null +++ b/src/tasks/predict_modality/README.md @@ -0,0 +1,486 @@ +# Predict Modality + + +Predicting the profiles of one modality (e.g. protein abundance) from +another (e.g. mRNA expression).
+ +Path: +[`src/tasks/predict_modality`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/tasks/predict_modality) + +## Motivation + +Experimental techniques to measure multiple modalities within the same +single cell are increasingly becoming available. The demand for these +measurements is driven by the promise to provide a deeper insight into +the state of a cell. Yet, the modalities are also intrinsically linked. +We know that DNA must be accessible (ATAC data) to produce mRNA +(expression data), and mRNA in turn is used as a template to produce +protein (protein abundance). These processes are often regulated by the +same molecules that they produce: for example, a protein may bind DNA to +prevent the production of more mRNA. Understanding these regulatory +processes would be transformative for synthetic biology and drug target +discovery. Any method that can predict a modality from another must have +accounted for these regulatory processes, but the demand for multi-modal +data shows that this is not trivial. + +## Description + +In this task, the goal is to take one modality and predict the other +modality for all features in each cell. This task requires translating +information between multiple layers of gene regulation. In some ways, +this is similar to the task of machine translation. In machine +translation, the same sentiment is expressed in multiple languages and +the goal is to train a model to represent the same meaning in a +different language. In this context, the same cellular state is measured +in two different feature sets and the goal of this task is to translate +the information about cellular state from one modality to the other.
+ +## Authors & contributors + +| name | roles | +|:-------------------|:-------------------| +| Robrecht Cannoodt | author, maintainer | +| Kai Waldrant | contributor | +| Louise Deconinck | author | +| Alex Tong | author | +| Bastian Rieck | author | +| Daniel Burkhardt | author | +| Alejandro Granados | author | + +## API + +``` mermaid +flowchart LR + file_common_dataset_mod1("Raw dataset RNA") + comp_process_dataset[/"Data processor"/] + file_train_mod1("Train mod1") + file_train_mod2("Train mod2") + file_test_mod1("Test mod1") + file_test_mod2("Test mod2") + comp_control_method[/"Control method"/] + comp_method[/"Method"/] + comp_metric[/"Metric"/] + file_prediction("Prediction") + file_score("Score") + file_common_dataset_mod2("Raw dataset mod2") + file_common_dataset_mod1---comp_process_dataset + comp_process_dataset-->file_train_mod1 + comp_process_dataset-->file_train_mod2 + comp_process_dataset-->file_test_mod1 + comp_process_dataset-->file_test_mod2 + file_train_mod1---comp_control_method + file_train_mod1---comp_method + file_train_mod2---comp_control_method + file_train_mod2---comp_method + file_test_mod1---comp_control_method + file_test_mod1---comp_method + file_test_mod2---comp_control_method + file_test_mod2---comp_metric + comp_control_method-->file_prediction + comp_method-->file_prediction + comp_metric-->file_score + file_prediction---comp_metric + file_common_dataset_mod2---comp_process_dataset +``` + +## File format: Raw dataset RNA + +The RNA modality of the raw dataset. + +Example file: +`resources_test/common/openproblems_neurips2021/bmmc_cite/dataset_mod1.h5ad` + +Format: + +
+ + AnnData object + obs: 'batch', 'size_factors' + var: 'feature_id', 'feature_name' + obsm: 'gene_activity' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id', 'gene_activity_var_names' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:---------------------------------|:----------|:-------------------------------------------------------------------------------| +| `obs["batch"]` | `string` | Batch information. | +| `obs["size_factors"]` | `double` | (*Optional*) The size factors of the cells prior to normalization. | +| `var["feature_id"]` | `string` | Unique identifier for the feature, usually an ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `obsm["gene_activity"]` | `double` | (*Optional*) ATAC gene activity. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | The unique identifier of the normalization method used. | +| `uns["gene_activity_var_names"]` | `string` | (*Optional*) Names of the gene activity matrix. | + +
+ +## Component type: Data processor + +Path: +[`src/tasks/predict_modality`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/tasks/predict_modality) + +A predict modality dataset processor. + +Arguments: + +
+ +| Name | Type | Description | +|:----------------------|:----------|:---------------------------------------------------------------------------| +| `--input_mod1` | `file` | The RNA modality of the raw dataset. | +| `--input_mod2` | `file` | The second modality of the raw dataset. Must be an ADT or an ATAC dataset. | +| `--output_train_mod1` | `file` | (*Output*) The mod1 expression values of the train cells. | +| `--output_train_mod2` | `file` | (*Output*) The mod2 expression values of the train cells. | +| `--output_test_mod1` | `file` | (*Output*) The mod1 expression values of the test cells. | +| `--output_test_mod2` | `file` | (*Output*) The mod2 expression values of the test cells. | +| `--seed` | `integer` | (*Optional*) The seed for determining the train/test split. Default: `1`. | + +
+ +## File format: Train mod1 + +The mod1 expression values of the train cells. + +Example file: +`resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod1.h5ad` + +Format: + +
+ + AnnData object + obs: 'batch', 'size_factors' + var: 'gene_ids' + obsm: 'gene_activity' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'common_dataset_id', 'dataset_organism', 'normalization_id', 'gene_activity_var_names' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:---------------------------------|:----------|:-------------------------------------------------------------------| +| `obs["batch"]` | `string` | Batch information. | +| `obs["size_factors"]` | `double` | (*Optional*) The size factors of the cells prior to normalization. | +| `var["gene_ids"]` | `string` | (*Optional*) The gene identifiers (if available). | +| `obsm["gene_activity"]` | `double` | (*Optional*) ATAC gene activity. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["common_dataset_id"]` | `string` | (*Optional*) A common identifier for the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | The unique identifier of the normalization method used. | +| `uns["gene_activity_var_names"]` | `string` | (*Optional*) Names of the gene activity matrix. | + +
+ +## File format: Train mod2 + +The mod2 expression values of the train cells. + +Example file: +`resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod2.h5ad` + +Format: + +
+ + AnnData object + obs: 'batch', 'size_factors' + var: 'gene_ids' + obsm: 'gene_activity' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'common_dataset_id', 'dataset_organism', 'normalization_id', 'gene_activity_var_names' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:---------------------------------|:----------|:-------------------------------------------------------------------| +| `obs["batch"]` | `string` | Batch information. | +| `obs["size_factors"]` | `double` | (*Optional*) The size factors of the cells prior to normalization. | +| `var["gene_ids"]` | `string` | (*Optional*) The gene identifiers (if available). | +| `obsm["gene_activity"]` | `double` | (*Optional*) ATAC gene activity. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["common_dataset_id"]` | `string` | (*Optional*) A common identifier for the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | The unique identifier of the normalization method used. | +| `uns["gene_activity_var_names"]` | `string` | (*Optional*) Names of the gene activity matrix. | + +
+ +## File format: Test mod1 + +The mod1 expression values of the test cells. + +Example file: +`resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/test_mod1.h5ad` + +Format: + +
+ + AnnData object + obs: 'batch', 'size_factors' + var: 'gene_ids' + obsm: 'gene_activity' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'common_dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id', 'gene_activity_var_names' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:---------------------------------|:----------|:-------------------------------------------------------------------------------| +| `obs["batch"]` | `string` | Batch information. | +| `obs["size_factors"]` | `double` | (*Optional*) The size factors of the cells prior to normalization. | +| `var["gene_ids"]` | `string` | (*Optional*) The gene identifiers (if available). | +| `obsm["gene_activity"]` | `double` | (*Optional*) ATAC gene activity. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["common_dataset_id"]` | `string` | (*Optional*) A common identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | The unique identifier of the normalization method used. | +| `uns["gene_activity_var_names"]` | `string` | (*Optional*) Names of the gene activity matrix. | + +
+ +## File format: Test mod2 + +The mod2 expression values of the test cells. + +Example file: +`resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/test_mod2.h5ad` + +Format: + +
+ + AnnData object + obs: 'batch', 'size_factors' + var: 'gene_ids' + obsm: 'gene_activity' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'common_dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'gene_activity_var_names' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:---------------------------------|:----------|:-------------------------------------------------------------------------------| +| `obs["batch"]` | `string` | Batch information. | +| `obs["size_factors"]` | `double` | (*Optional*) The size factors of the cells prior to normalization. | +| `var["gene_ids"]` | `string` | (*Optional*) The gene identifiers (if available). | +| `obsm["gene_activity"]` | `double` | (*Optional*) ATAC gene activity. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["common_dataset_id"]` | `string` | (*Optional*) A common identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["gene_activity_var_names"]` | `string` | (*Optional*) Names of the gene activity matrix. | + +
+ +## Component type: Control method + +Path: +[`src/tasks/predict_modality/control_methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/tasks/predict_modality/control_methods) + +Quality control methods for verifying the pipeline. + +Arguments: + +
+ +| Name | Type | Description | +|:---------------------|:-------|:-------------------------------------------------------------------------| +| `--input_train_mod1` | `file` | The mod1 expression values of the train cells. | +| `--input_train_mod2` | `file` | The mod2 expression values of the train cells. | +| `--input_test_mod1` | `file` | The mod1 expression values of the test cells. | +| `--input_test_mod2` | `file` | The mod2 expression values of the test cells. | +| `--output` | `file` | (*Output*) A prediction of the mod2 expression values of the test cells. | + +
+ +## Component type: Method + +Path: +[`src/tasks/predict_modality/methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/tasks/predict_modality/methods) + +A regression method. + +Arguments: + +
+ +| Name | Type | Description | +|:---------------------|:-------|:-------------------------------------------------------------------------| +| `--input_train_mod1` | `file` | The mod1 expression values of the train cells. | +| `--input_train_mod2` | `file` | The mod2 expression values of the train cells. | +| `--input_test_mod1` | `file` | The mod1 expression values of the test cells. | +| `--output` | `file` | (*Output*) A prediction of the mod2 expression values of the test cells. | + +
+ +## Component type: Metric + +Path: +[`src/tasks/predict_modality/metrics`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/tasks/predict_modality/metrics) + +A predict modality metric. + +Arguments: + +
+ +| Name | Type | Description | +|:---------------------|:-------|:--------------------------------------------------------------| +| `--input_prediction` | `file` | A prediction of the mod2 expression values of the test cells. | +| `--input_test_mod2` | `file` | The mod2 expression values of the test cells. | +| `--output` | `file` | (*Output*) Metric score file. | + +
+ +## File format: Prediction + +A prediction of the mod2 expression values of the test cells. + +Example file: +`resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/prediction.h5ad` + +Format: + +
+ + AnnData object + layers: 'normalized' + uns: 'dataset_id', 'method_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------|:---------|:----------------------------------------| +| `layers["normalized"]` | `double` | Predicted normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | + +
+ +## File format: Score + +Metric score file. + +Example file: +`resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/score.h5ad` + +Format: + +
+ + AnnData object + uns: 'dataset_id', 'method_id', 'metric_ids', 'metric_values' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------|:---------|:---------------------------------------------------------------------------------------------| +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | +| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | + +
+ +## File format: Raw dataset mod2 + +The second modality of the raw dataset. Must be an ADT or an ATAC +dataset. + +Example file: +`resources_test/common/openproblems_neurips2021/bmmc_cite/dataset_mod2.h5ad` + +Format: + +
+ + AnnData object + obs: 'batch', 'size_factors' + var: 'feature_id', 'feature_name' + obsm: 'gene_activity' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id', 'gene_activity_var_names' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:---------------------------------|:----------|:-------------------------------------------------------------------------------| +| `obs["batch"]` | `string` | Batch information. | +| `obs["size_factors"]` | `double` | (*Optional*) The size factors of the cells prior to normalization. | +| `var["feature_id"]` | `string` | Unique identifier for the feature, usually an ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `obsm["gene_activity"]` | `double` | (*Optional*) ATAC gene activity. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | The unique identifier of the normalization method used. | +| `uns["gene_activity_var_names"]` | `string` | (*Optional*) Names of the gene activity matrix. | + +
+ diff --git a/src/tasks/predict_modality/api/comp_control_method.yaml b/src/tasks/predict_modality/api/comp_control_method.yaml new file mode 100644 index 0000000000..82ab6e441f --- /dev/null +++ b/src/tasks/predict_modality/api/comp_control_method.yaml @@ -0,0 +1,42 @@ +functionality: + namespace: "predict_modality/control_methods" + info: + type: control_method + preferred_normalization: counts # there is currently only one type of normalization + type_info: + label: Control method + summary: Quality control methods for verifying the pipeline. + description: | + These components have the same interface as the regular methods + but also receive the solution object as input. It serves as a + starting point to test the relative accuracy of new methods in + the task, and also as a quality control for the metrics defined + in the task. + arguments: + - name: "--input_train_mod1" + __merge__: file_train_mod1.yaml + direction: input + required: true + - name: "--input_train_mod2" + __merge__: file_train_mod2.yaml + direction: input + required: true + - name: "--input_test_mod1" + __merge__: file_test_mod1.yaml + direction: input + required: true + - name: "--input_test_mod2" + __merge__: file_test_mod2.yaml + direction: input + required: true + - name: "--output" + __merge__: file_prediction.yaml + direction: output + required: true + test_resources: + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap + dest: resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap \ No newline at end of file diff --git a/src/tasks/predict_modality/api/comp_method.yaml b/src/tasks/predict_modality/api/comp_method.yaml new file mode 100644 index 0000000000..49ccc1e27b --- /dev/null +++ b/src/tasks/predict_modality/api/comp_method.yaml @@ -0,0 +1,34 @@ +functionality: + namespace: 
"predict_modality/methods" + info: + type: method + type_info: + label: Method + summary: A regression method. + description: | + A regression method to predict the expression of one modality from another. + arguments: + - name: "--input_train_mod1" + __merge__: file_train_mod1.yaml + direction: input + required: true + - name: "--input_train_mod2" + __merge__: file_train_mod2.yaml + direction: input + required: true + - name: "--input_test_mod1" + __merge__: file_test_mod1.yaml + direction: input + required: true + - name: "--output" + __merge__: file_prediction.yaml + direction: output + required: true + test_resources: + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap + dest: resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap + - path: /src/common/library.bib \ No newline at end of file diff --git a/src/tasks/predict_modality/api/comp_method_predict.yaml b/src/tasks/predict_modality/api/comp_method_predict.yaml new file mode 100644 index 0000000000..a43cd1e5c5 --- /dev/null +++ b/src/tasks/predict_modality/api/comp_method_predict.yaml @@ -0,0 +1,30 @@ +functionality: + namespace: "predict_modality/methods" + info: + type: method_predict + type_info: + label: Predict + summary: Make predictions using a trained model. + description: | + This method makes predictions using a trained model. 
+ arguments: + - name: "--input_train_mod1" + __merge__: file_train_mod1.yaml + direction: input + required: false + - name: "--input_train_mod2" + __merge__: file_train_mod2.yaml + direction: input + required: false + - name: "--input_test_mod1" + __merge__: file_test_mod1.yaml + direction: input + required: true + - name: "--input_model" + __merge__: file_pretrained_model.yaml + direction: input + required: true + - name: "--output" + __merge__: file_prediction.yaml + direction: output + required: true \ No newline at end of file diff --git a/src/tasks/predict_modality/api/comp_method_train.yaml b/src/tasks/predict_modality/api/comp_method_train.yaml new file mode 100644 index 0000000000..3f07c1efcf --- /dev/null +++ b/src/tasks/predict_modality/api/comp_method_train.yaml @@ -0,0 +1,26 @@ +functionality: + namespace: "predict_modality/methods" + info: + type: method_train + type_info: + label: Train + summary: Train a model to predict the expression of one modality from another. + description: | + This method trains a model to predict the expression of one modality from another. + arguments: + - name: "--input_train_mod1" + __merge__: file_train_mod1.yaml + direction: input + required: true + - name: "--input_train_mod2" + __merge__: file_train_mod2.yaml + direction: input + required: true + - name: "--input_test_mod1" + __merge__: file_test_mod1.yaml + direction: input + required: false + - name: "--output" + __merge__: file_pretrained_model.yaml + direction: output + required: true \ No newline at end of file diff --git a/src/tasks/predict_modality/api/comp_metric.yaml b/src/tasks/predict_modality/api/comp_metric.yaml new file mode 100644 index 0000000000..c85f900e46 --- /dev/null +++ b/src/tasks/predict_modality/api/comp_metric.yaml @@ -0,0 +1,30 @@ +functionality: + namespace: "predict_modality/metrics" + info: + type: metric + type_info: + label: Metric + summary: A predict modality metric. + description: | + A metric for evaluating predicted expression. 
+ arguments: + - name: --input_prediction + __merge__: file_prediction.yaml + direction: input + required: true + - name: --input_test_mod2 + __merge__: file_test_mod2.yaml + direction: input + required: true + - name: --output + __merge__: file_score.yaml + direction: output + required: true + test_resources: + - type: python_script + path: /src/common/comp_tests/check_metric_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap + dest: resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap + - path: /src/common/library.bib \ No newline at end of file diff --git a/src/tasks/predict_modality/api/comp_process_dataset.yaml b/src/tasks/predict_modality/api/comp_process_dataset.yaml new file mode 100644 index 0000000000..c2c5feb2eb --- /dev/null +++ b/src/tasks/predict_modality/api/comp_process_dataset.yaml @@ -0,0 +1,43 @@ +functionality: + namespace: "predict_modality" + info: + type: process_dataset + type_info: + label: Data processor + summary: A predict modality dataset processor. + description: | + A component for processing a Common Dataset into a task-specific dataset. + arguments: + - name: "--input_mod1" + __merge__: file_common_dataset_mod1.yaml + direction: input + required: true + - name: "--input_mod2" + __merge__: file_common_dataset_mod2.yaml + direction: input + required: true + - name: "--output_train_mod1" + __merge__: file_train_mod1.yaml + direction: output + required: true + - name: "--output_train_mod2" + __merge__: file_train_mod2.yaml + direction: output + required: true + - name: "--output_test_mod1" + __merge__: file_test_mod1.yaml + direction: "output" + required: true + - name: "--output_test_mod2" + __merge__: file_test_mod2.yaml + direction: output + required: true + - name: "--seed" + type: integer + default: 1 + description: "The seed for determining the train/test split." 
# API schema for the first-modality (RNA) AnnData file of a common dataset,
# as consumed by the predict_modality dataset processor (`--input_mod1`).
# Each entry under `slots` describes one field of the h5ad file.
type: file
example: "resources_test/common/openproblems_neurips2021/bmmc_cite/dataset_mod1.h5ad"
info:
  label: "Raw dataset RNA"
  summary: "The RNA modality of the raw dataset."
  slots:
    layers:
      - type: integer
        name: counts
        description: Raw counts
        required: true
      - type: double
        name: normalized
        description: Normalized expression values
        required: true
    obs:
      - type: string
        name: batch
        description: Batch information
        required: true
      - type: double
        name: size_factors
        description: The size factors of the cells prior to normalization.
        required: false
    var:
      # `feature_id` is already required; the former "make this required" TODO
      # was stale and has been dropped.
      - type: string
        name: feature_id
        description: Unique identifier for the feature, usually a ENSEMBL gene id.
        required: true

      - type: string
        name: feature_name
        description: A human-readable name for the feature, usually a gene symbol.
        # TODO: make this required once the dataloader supports it
        required: false

      # `hvg` and `hvg_score` were previously listed twice in this section
      # (the second `hvg_score` copy was described as "A ranking of the
      # features by hvg."); each slot is now declared exactly once.
      - type: boolean
        name: hvg
        description: Whether or not the feature is considered to be a 'highly variable gene'
        required: true

      - type: double
        name: hvg_score
        description: A score for the feature indicating how highly variable it is.
        required: true
    uns:
      - type: string
        name: dataset_id
        description: "A unique identifier for the dataset"
        required: true
      - name: dataset_name
        type: string
        description: Nicely formatted name.
        required: true
      - type: string
        name: dataset_url
        description: Link to the original source of the dataset.
        required: false
      - name: dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
      - name: dataset_summary
        type: string
        description: Short description of the dataset.
        required: true
      - name: dataset_description
        type: string
        description: Long description of the dataset.
        required: true
      - name: dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
      - name: normalization_id
        type: string
        description: The unique identifier of the normalization method used.
        required: true
      - type: string
        name: gene_activity_var_names
        description: "Names of the gene activity matrix"
        required: false
    obsm:
      - type: double
        name: gene_activity
        description: ATAC gene activity
        required: false
Must be an ADT or an ATAC dataset" + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: The size factors of the cells prior to normalization. + required: false + var: + - type: string + name: feature_id + description: Unique identifier for the feature, usually a ENSEMBL gene id. + # TODO: make this required once openproblems_v1 dataloader supports it + required: true + + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + # TODO: make this required once the dataloader supports it + required: false + + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + + - type: double + name: hvg_score + description: A score for the feature indicating how highly variable it is. + required: true + + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. 
+ required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - name: normalization_id + type: string + description: The unique identifier of the normalization method used. + required: true + - type: string + name: gene_activity_var_names + description: "Names of the gene activity matrix" + required: false + obsm: + - type: double + name: gene_activity + description: ATAC gene activity + required: false \ No newline at end of file diff --git a/src/tasks/predict_modality/api/file_prediction.yaml b/src/tasks/predict_modality/api/file_prediction.yaml new file mode 100644 index 0000000000..0464b323d1 --- /dev/null +++ b/src/tasks/predict_modality/api/file_prediction.yaml @@ -0,0 +1,20 @@ +type: file +example: "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/prediction.h5ad" +info: + label: "Prediction" + summary: "A prediction of the mod2 expression values of the test cells" + slots: + layers: + - type: double + name: normalized + description: Predicted normalized expression values + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true \ No newline at end of file diff --git a/src/tasks/predict_modality/api/file_pretrained_model.yaml b/src/tasks/predict_modality/api/file_pretrained_model.yaml new file mode 100644 index 0000000000..f8c4a717ac --- /dev/null +++ b/src/tasks/predict_modality/api/file_pretrained_model.yaml @@ -0,0 +1,4 @@ +type: file +info: + label: "Pretrained model" + summary: "A pretrained model for predicting the expression of one modality from another." 
diff --git a/src/tasks/predict_modality/api/file_score.yaml b/src/tasks/predict_modality/api/file_score.yaml new file mode 100644 index 0000000000..928e18eebf --- /dev/null +++ b/src/tasks/predict_modality/api/file_score.yaml @@ -0,0 +1,25 @@ +type: file +example: "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/score.h5ad" +info: + label: "Score" + summary: "Metric score file" + slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + multiple: true + required: true diff --git a/src/tasks/predict_modality/api/file_test_mod1.yaml b/src/tasks/predict_modality/api/file_test_mod1.yaml new file mode 100644 index 0000000000..fa67672104 --- /dev/null +++ b/src/tasks/predict_modality/api/file_test_mod1.yaml @@ -0,0 +1,85 @@ +type: file +example: "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/test_mod1.h5ad" +info: + label: "Test mod1" + summary: "The mod1 expression values of the test cells." + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: The size factors of the cells prior to normalization. 
+ required: false + var: + - type: string + name: gene_ids + description: The gene identifiers (if available) + required: false + + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + + - type: double + name: hvg_score + description: A score for the feature indicating how highly variable it is. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: common_dataset_id + description: "A common identifier for the dataset" + required: false + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - name: normalization_id + type: string + description: The unique identifier of the normalization method used. 
+ required: true + - type: string + name: gene_activity_var_names + description: "Names of the gene activity matrix" + required: false + obsm: + - type: double + name: gene_activity + description: ATAC gene activity + required: false diff --git a/src/tasks/predict_modality/api/file_test_mod2.yaml b/src/tasks/predict_modality/api/file_test_mod2.yaml new file mode 100644 index 0000000000..417edf6162 --- /dev/null +++ b/src/tasks/predict_modality/api/file_test_mod2.yaml @@ -0,0 +1,81 @@ +type: file +example: "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/test_mod2.h5ad" +info: + label: "Test mod2" + summary: "The mod2 expression values of the test cells." + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: The size factors of the cells prior to normalization. + required: false + var: + - type: string + name: gene_ids + description: The gene identifiers (if available) + required: false + + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + + - type: double + name: hvg_score + description: A score for the feature indicating how highly variable it is. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: common_dataset_id + description: "A common identifier for the dataset" + required: false + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. 
+ required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: string + name: gene_activity_var_names + description: "Names of the gene activity matrix" + required: false + obsm: + - type: double + name: gene_activity + description: ATAC gene activity + required: false \ No newline at end of file diff --git a/src/tasks/predict_modality/api/file_train_mod1.yaml b/src/tasks/predict_modality/api/file_train_mod1.yaml new file mode 100644 index 0000000000..a4919ee7bd --- /dev/null +++ b/src/tasks/predict_modality/api/file_train_mod1.yaml @@ -0,0 +1,65 @@ +type: file +example: "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod1.h5ad" +info: + label: "Train mod1" + summary: "The mod1 expression values of the train cells." + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: The size factors of the cells prior to normalization. + required: false + var: + - type: string + name: gene_ids + description: The gene identifiers (if available) + required: false + + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + + - type: double + name: hvg_score + description: A score for the feature indicating how highly variable it is. 
+ required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: common_dataset_id + description: "A common identifier for the dataset" + required: false + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - name: normalization_id + type: string + description: The unique identifier of the normalization method used. + required: true + - type: string + name: gene_activity_var_names + description: "Names of the gene activity matrix" + required: false + obsm: + - type: double + name: gene_activity + description: ATAC gene activity + required: false \ No newline at end of file diff --git a/src/tasks/predict_modality/api/file_train_mod2.yaml b/src/tasks/predict_modality/api/file_train_mod2.yaml new file mode 100644 index 0000000000..dcbfae45de --- /dev/null +++ b/src/tasks/predict_modality/api/file_train_mod2.yaml @@ -0,0 +1,65 @@ +type: file +example: "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod2.h5ad" +info: + label: "Train mod2" + summary: "The mod2 expression values of the train cells." + slots: + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + - type: double + name: size_factors + description: The size factors of the cells prior to normalization. + required: false + var: + - type: string + name: gene_ids + description: The gene identifiers (if available) + required: false + + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + + - type: double + name: hvg_score + description: A score for the feature indicating how highly variable it is. 
+ required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: common_dataset_id + description: "A common identifier for the dataset" + required: false + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - name: normalization_id + type: string + description: The unique identifier of the normalization method used. + required: true + - type: string + name: gene_activity_var_names + description: "Names of the gene activity matrix" + required: false + obsm: + - type: double + name: gene_activity + description: ATAC gene activity + required: false \ No newline at end of file diff --git a/src/tasks/predict_modality/api/task_info.yaml b/src/tasks/predict_modality/api/task_info.yaml new file mode 100644 index 0000000000..e0d1ed9da7 --- /dev/null +++ b/src/tasks/predict_modality/api/task_info.yaml @@ -0,0 +1,67 @@ +name: predict_modality +label: Predict Modality +summary: "Predicting the profiles of one modality (e.g. protein abundance) from another (e.g. mRNA expression)." +image: "thumbnail.svg" +motivation: | + Experimental techniques to measure multiple modalities within the same single cell are increasingly becoming available. + The demand for these measurements is driven by the promise to provide a deeper insight into the state of a cell. + Yet, the modalities are also intrinsically linked. We know that DNA must be accessible (ATAC data) to produce mRNA + (expression data), and mRNA in turn is used as a template to produce protein (protein abundance). These processes + are regulated often by the same molecules that they produce: for example, a protein may bind DNA to prevent the production + of more mRNA. Understanding these regulatory processes would be transformative for synthetic biology and drug target discovery. 
+ Any method that can predict a modality from another must have accounted for these regulatory processes, but the demand for + multi-modal data shows that this is not trivial. +description: | + In this task, the goal is to take one modality and predict the other modality for all + features in each cell. This task requires translating information between multiple layers of + gene regulation. In some ways, this is similar to the task of machine translation. In machine translation, the same + sentiment is expressed in multiple languages and the goal is to train a model to represent the same meaning in a different + language. In this context, the same cellular state is measured in two different feature sets and the goal of this task + is to translate the information about cellular state from one modality to the other. +authors: + - name: Robrecht Cannoodt + roles: [ author, maintainer ] + info: + github: rcannood + orcid: "0000-0003-3641-729X" + - name: Kai Waldrant + roles: [ contributor ] + info: + github: KaiWaldrant + orcid: "0009-0003-8555-1361" + - name: Louise Deconinck + roles: [ author ] + info: + github: LouiseDck + - name: Alex Tong + roles: [ author ] + info: + github: atong01 + - name: Bastian Rieck + roles: [ author ] + info: + github: Pseudomanifold + - name: Daniel Burkhardt + roles: [ author ] + info: + github: dburkhardt + - name: Alejandro Granados + roles: [ author ] + info: + github: agranado + - name: Kaiwen Deng + roles: [ contributor ] + info: + email: dengkw@umich.edu + github: nonztalk + - name: Xueer Chen + roles: [ contributor ] + info: + github: xuerchen + email: xc2579@columbia.edu + - name: Jiwei Liu + roles: [ contributor ] + info: + github: daxiongshu + email: jiweil@nvidia.com + orcid: "0000-0002-8799-9763" diff --git a/src/tasks/predict_modality/api/thumbnail.svg b/src/tasks/predict_modality/api/thumbnail.svg new file mode 100644 index 0000000000..59436e6187 --- /dev/null +++ b/src/tasks/predict_modality/api/thumbnail.svg @@ -0,0 
"""Control method: predict, for every test cell, the per-feature training mean of mod2."""
import anndata as ad
import numpy as np
from scipy.sparse import csc_matrix

# VIASH START
par = {
    "input_train_mod1": "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/train_mod1.h5ad",
    "input_test_mod1": "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/test_mod1.h5ad",
    "input_train_mod2": "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/train_mod2.h5ad",
    "output": "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/prediction.h5ad",
}

meta = {
    "functionality_name": "foo"
}
# VIASH END

# Only two inputs are read: the mod1 test cells (they define the output rows)
# and the mod2 training profiles (they define the output columns and values).
test_mod1 = ad.read_h5ad(par["input_test_mod1"])
train_mod2 = ad.read_h5ad(par["input_train_mod2"])

# Column-wise mean of the normalized mod2 training layer, flattened to 1-D so
# it can be repeated once per test cell.
feature_means = np.asarray(train_mod2.layers["normalized"].mean(axis=0)).ravel()
n_test_cells = test_mod1.shape[0]
repeated_means = np.tile(feature_means, (n_test_cells, 1))
prediction = csc_matrix(repeated_means)

# Assemble the prediction object: rows follow the test cells, columns follow
# the mod2 features.
output_uns = {
    "dataset_id": test_mod1.uns["dataset_id"],
    "method_id": meta["functionality_name"],
}
out = ad.AnnData(
    layers={"normalized": prediction},
    shape=prediction.shape,
    obs=test_mod1.obs,
    var=train_mod2.var,
    uns=output_uns,
)
out.write_h5ad(par["output"], compression="gzip")
# Control method: for every test cell, return the mod2 profile of a randomly
# drawn training cell (sampling with replacement).
cat("Loading dependencies\n")
requireNamespace("anndata", quietly = TRUE)
library(Matrix, warn.conflicts = FALSE, quietly = TRUE)

## VIASH START
par <- list(
  input_train_mod1 = "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/train_mod1.h5ad",
  input_test_mod1 = "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/test_mod1.h5ad",
  input_train_mod2 = "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/train_mod2.h5ad",
  output = "output.h5ad"
)
meta <- list(functionality_name = "foo")
## VIASH END

cat("Reading h5ad files\n")
input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2)
input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1)

cat("Creating outputs object\n")
# One random training row per test cell; `drop = FALSE` keeps a matrix even
# when there is a single test cell.
sample_ix <- sample.int(nrow(input_train_mod2), nrow(input_test_mod1), replace = TRUE)
prediction <- input_train_mod2$layers[["normalized"]][sample_ix, , drop = FALSE]
rownames(prediction) <- rownames(input_test_mod1)

# Pass the test-cell `obs` and the mod2 `var` explicitly, as the Python control
# methods (mean_per_gene, zeros) do, so the prediction carries the cell and
# feature annotations of the data it is compared against.
# NOTE(review): previously only `layers`, `shape` and `uns` were supplied;
# confirm no downstream check depends on `obs`/`var` being empty.
out <- anndata::AnnData(
  layers = list(normalized = prediction),
  shape = dim(prediction),
  obs = input_test_mod1$obs,
  var = input_train_mod2$var,
  uns = list(
    dataset_id = input_train_mod2$uns[["dataset_id"]],
    method_id = meta[["functionality_name"]]
  )
)

cat("Writing predictions to file\n")
# Assign the return value so it is not auto-printed when run via Rscript.
zzz <- out$write_h5ad(par$output, compression = "gzip")
# Positive control: hand back the ground-truth mod2 test data as the
# "prediction", tagged with this component's method id.
cat("Loading dependencies\n")
requireNamespace("anndata", quietly = TRUE)

## VIASH START
par <- list(
  input_test_mod2 = "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/test_mod2.h5ad",
  output = "output.h5ad"
)

meta <- list(
  functionality_name = "foo"
)
## VIASH END

cat("Reading h5ad files\n")
solution <- anndata::read_h5ad(par$input_test_mod2)

# The object is written out unchanged apart from the added method id.
solution$uns[["method_id"]] <- meta[["functionality_name"]]

cat("Writing predictions to file\n")
# invisible() keeps the return value from being printed when run via Rscript.
invisible(solution$write_h5ad(par$output, compression = "gzip"))
"""Control method: predict zero for every mod2 feature of every test cell."""
import anndata
import numpy as np
from scipy.sparse import csc_matrix

# VIASH START
par = {
    "input_train_mod1": "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/train_mod1.h5ad",
    "input_test_mod1": "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/test_mod1.h5ad",
    "input_train_mod2": "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/train_mod2.h5ad",
    "output": "output.h5ad",
}

meta = {
    "functionality_name": "foo"
}
# VIASH END

print("Reading h5ad files", flush=True)
test_mod1 = anndata.read_h5ad(par["input_test_mod1"])
train_mod2 = anndata.read_h5ad(par["input_train_mod2"])

print("create output objects", flush=True)
# A sparse matrix built from a shape alone stores no entries, i.e. it is all
# zeros: one row per test cell, one column per mod2 feature.
n_cells, n_features = test_mod1.n_obs, train_mod2.n_vars
all_zeros = csc_matrix((n_cells, n_features), dtype=np.float32)

output_uns = {
    "dataset_id": train_mod2.uns["dataset_id"],
    "method_id": meta["functionality_name"],
}
out = anndata.AnnData(
    layers={"normalized": all_zeros},
    shape=all_zeros.shape,
    obs=test_mod1.obs,
    var=train_mod2.var,
    uns=output_uns,
)

print("write predictions to file", flush=True)
out.write_h5ad(par["output"], compression="gzip")
label: Guanlab-dengkw + summary: A kernel ridge regression method with RBF kernel. + description: | + This is a solution developed by Team Guanlab - dengkw in the Neurips 2021 competition to predict one modality + from another using kernel ridge regression (KRR) with RBF kernel. Truncated SVD is applied on the combined + training and test data from modality 1 followed by row-wise z-score normalization on the reduced matrix. The + truncated SVD of modality 2 is predicted by training a KRR model on the normalized training matrix of modality 1. + Predictions on the normalized test matrix are then re-mapped to the modality 2 feature space via the right + singular vectors. + preferred_normalization: log_cp10k + reference: lance2022multimodal + documentation_url: https://github.com/openproblems-bio/neurips2021_multimodal_topmethods/tree/main/src/predict_modality/methods/Guanlab-dengkw + repository_url: https://github.com/openproblems-bio/neurips2021_multimodal_topmethods/tree/main/src/predict_modality/methods/Guanlab-dengkw + competition_submission_id: 170636 + arguments: + - name: "--distance_method" + type: "string" + default: "minkowski" + description: The distance metric to use. Possible values include `euclidean` and `minkowski`. + choices: [euclidean, minkowski] + - name: "--n_pcs" + type: "integer" + default: 50 + description: Number of components to use for dimensionality reduction. 
"""Guanlab-dengkw: kernel ridge regression (RBF kernel) on truncated-SVD embeddings.

Steps performed by this script:

1. Reduce mod1 (train and test jointly) and mod2 (train only) with
   TruncatedSVD, whenever a component count is configured for the modality
   pair and it is smaller than the number of features.
2. Z-score every cell (row) of the mod1 train and test matrices.
3. Fit one kernel ridge regressor per batch subset, predict the test cells and
   map the predictions back to mod2 feature space.
4. Clip negative values, average over the batch subsets and write the result.
"""
import anndata as ad
import numpy as np
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge

## VIASH START
par = {
    'input_train_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/normal/train_mod1.h5ad',
    'input_train_mod2': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/normal/train_mod2.h5ad',
    'input_test_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/normal/test_mod1.h5ad',
    'output': 'output.h5ad',
    'distance_method': 'minkowski',
    'n_pcs': 50
}
meta = {
    'functionality_name': 'guanlab_dengkw_pm'
}
## VIASH END

# NOTE: par['distance_method'] and par['n_pcs'] are not read anywhere in this
# script; the number of components is taken from `n_comp_dict` below.

## The input layers are already normalized, so no extra normalization of the
## raw data is done here; dimensionality reduction still happens below.
print('Reading input files', flush=True)
input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])

batches = input_train_mod1.obs.batch.unique().tolist()
batch_len = len(batches)

# combine the train and test data so both are embedded by the same SVD
input_train = ad.concat(
    {"train": input_train_mod1, "test": input_test_mod1},
    axis=0,
    join="outer",
    label="group",
    fill_value=0,
    index_unique="-"
)

print('Determine parameters by the modalities', flush=True)
mod1_type = input_train_mod1.uns["modality"].upper()
mod2_type = input_train_mod2.uns["modality"].upper()
# (mod1, mod2) -> (n SVD components mod1, n SVD components mod2,
#                  RBF length scale, ridge alpha)
n_comp_dict = {
    ("GEX", "ADT"): (300, 70, 10, 0.2),
    ("ADT", "GEX"): (None, 50, 10, 0.2),
    ("GEX", "ATAC"): (1000, 50, 10, 0.1),
    ("ATAC", "GEX"): (100, 70, 10, 0.1)
}
print(f"{mod1_type}, {mod2_type}", flush=True)
try:
    n_mod1, n_mod2, scale, alpha = n_comp_dict[(mod1_type, mod2_type)]
except KeyError:
    # Fail with an actionable message instead of a bare KeyError.
    supported = ", ".join(f"{a}->{b}" for a, b in n_comp_dict)
    raise ValueError(
        f"Unsupported modality pair {mod1_type}->{mod2_type}; "
        f"supported pairs: {supported}"
    ) from None
print(f"{n_mod1}, {n_mod2}, {scale}, {alpha}", flush=True)

# Reduce the dimension of the input data
print('Models using the Truncated SVD to reduce the dimension', flush=True)

if n_mod1 is not None and n_mod1 < input_train.n_vars:
    embedder_mod1 = TruncatedSVD(n_components=n_mod1)
    mod1_pca = embedder_mod1.fit_transform(input_train.layers["normalized"]).astype(np.float32)
    group = input_train.obs['group'].to_numpy()
    train_matrix = mod1_pca[group == 'train']
    test_matrix = mod1_pca[group == 'test']
else:
    # No reduction configured (or too few features): use the features as-is.
    train_matrix = input_train_mod1.to_df(layer="normalized").values.astype(np.float32)
    test_matrix = input_test_mod1.to_df(layer="normalized").values.astype(np.float32)

if n_mod2 is not None and n_mod2 < input_train_mod2.n_vars:
    embedder_mod2 = TruncatedSVD(n_components=n_mod2)
    train_gs = embedder_mod2.fit_transform(input_train_mod2.layers["normalized"]).astype(np.float32)
else:
    # mod2 is regressed directly in feature space. The embedder must still be
    # defined, because the prediction loop checks it before projecting back.
    embedder_mod2 = None
    train_gs = input_train_mod2.to_df(layer="normalized").values.astype(np.float32)

del input_train

print('Running normalization ...', flush=True)
# Row-wise z-score; rows with zero standard deviation are left unscaled.
train_sd = np.std(train_matrix, axis=1).reshape(-1, 1)
train_sd[train_sd == 0] = 1
train_norm = (train_matrix - np.mean(train_matrix, axis=1).reshape(-1, 1)) / train_sd
train_norm = train_norm.astype(np.float32)
del train_matrix

test_sd = np.std(test_matrix, axis=1).reshape(-1, 1)
test_sd[test_sd == 0] = 1
test_norm = (test_matrix - np.mean(test_matrix, axis=1).reshape(-1, 1)) / test_sd
test_norm = test_norm.astype(np.float32)
del test_matrix

print('Running KRR model ...', flush=True)
if batch_len == 1:
    # just in case there is only one batch
    batch_subsets = [batches]
elif mod1_type == "ADT" or mod2_type == "ADT":
    # two fold consensus predictions
    batch_subsets = [
        batches[:batch_len//2],
        batches[batch_len//2:]
    ]
else:
    # leave-one-batch-out consensus predictions
    batch_subsets = [
        batches[:i] + batches[i+1:]
        for i in range(batch_len)
    ]

y_pred = np.zeros((input_test_mod1.n_obs, input_train_mod2.n_vars), dtype=np.float32)
for batch in batch_subsets:
    print(batch, flush=True)
    kernel = RBF(length_scale = scale)
    krr = KernelRidge(alpha=alpha, kernel=kernel)
    print('Fitting KRR ... ', flush=True)
    krr.fit(
        train_norm[input_train_mod1.obs.batch.isin(batch).to_numpy()],
        train_gs[input_train_mod2.obs.batch.isin(batch).to_numpy()]
    )
    batch_pred = krr.predict(test_norm)
    if embedder_mod2 is not None:
        # Map from SVD space back to mod2 features via the right singular vectors.
        batch_pred = batch_pred @ embedder_mod2.components_
    y_pred += batch_pred

# Negative values are clipped to zero, then the sum is averaged over subsets.
np.clip(y_pred, a_min=0, a_max=None, out=y_pred)
y_pred /= len(batch_subsets)

# Store as sparse matrix to be efficient.
# Note that this might require different classifiers/embedders before-hand.
# Not every class is able to support such data structures.
## Changed from csr to csc matrix as this is more supported.
y_pred = csc_matrix(y_pred)

print("Write output AnnData to file", flush=True)
output = ad.AnnData(
    layers = { 'normalized': y_pred },
    # `[[]]` keeps only the index (cell / feature names), no annotation columns.
    obs = input_test_mod1.obs[[]],
    var = input_train_mod2.var[[]],
    uns = {
        'dataset_id': input_train_mod1.uns['dataset_id'],
        'method_id': meta['functionality_name']
    }
)
output.write_h5ad(par['output'], compression='gzip')
"""K-nearest-neighbour regression baseline for the predict_modality task.

Modality 1 (train + test) is embedded jointly with a truncated SVD; a KNN
regressor fitted on the training embedding predicts the normalized modality 2
values of the test cells.
"""
import anndata as ad
from scipy.sparse import csc_matrix, issparse
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsRegressor

## VIASH START
par = {
    'input_train_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/train_mod1.h5ad',
    'input_train_mod2': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/train_mod2.h5ad',
    'input_test_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/test_mod1.h5ad',
    'distance_method': 'minkowski',
    'output': 'output.h5ad',
    'n_pcs': 4,
    'n_neighbors': 5,
}
meta = { 'functionality_name': 'foo' }
## VIASH END

print('Reading `h5ad` files...', flush=True)
input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])

# stack train and test so both are projected into the same reduced space
input_train = ad.concat(
    {"train": input_train_mod1, "test": input_test_mod1},
    axis=0,
    join="outer",
    label="group",
    fill_value=0,
    index_unique="-"
)

print('Performing dimensionality reduction on modality 1 values...', flush=True)
# TruncatedSVD cannot extract more components than there are features
# (e.g. small ADT panels), so cap the requested number.
n_pcs = min(par['n_pcs'], input_train.n_vars)
embedder = TruncatedSVD(n_components=n_pcs)
X = embedder.fit_transform(input_train.layers["normalized"])

# split dimred back up
is_train = (input_train.obs['group'] == 'train').to_numpy()
is_test = (input_train.obs['group'] == 'test').to_numpy()
X_train = X[is_train]
X_test = X[is_test]

# the layer may be stored sparse or dense; the regressor needs a dense target
y_train = input_train_mod2.layers["normalized"]
if issparse(y_train):
    y_train = y_train.toarray()

assert len(X_train) + len(X_test) == len(X)

print('Running KNN regression...', flush=True)

# asking for more neighbours than training cells makes `predict` raise
n_neighbors = min(par['n_neighbors'], X_train.shape[0])
reg = KNeighborsRegressor(
    n_neighbors=n_neighbors,
    metric=par['distance_method']
)

reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

y_pred = csc_matrix(y_pred)

adata = ad.AnnData(
    layers={"normalized": y_pred},
    obs=input_test_mod1.obs,
    var=input_train_mod2.var,
    uns={
        'dataset_id': input_train_mod1.uns['dataset_id'],
        'method_id': meta["functionality_name"],
    },
)

print('Storing annotated data...', flush=True)
adata.write_h5ad(par['output'], compression = "gzip")
# K-nearest-neighbour regression baseline (R) for the predict_modality task.
#
# Modality 1 is reduced to its most variable features, embedded with landmark
# MDS, and the normalized modality 2 profile of each test cell is predicted as
# the mean profile of its nearest training cells in that embedding.
cat("Loading dependencies\n")
requireNamespace("anndata", quietly = TRUE)
library(Matrix, warn.conflicts = FALSE, quietly = TRUE)

## VIASH START
path <- "output/datasets/predict_modality/openproblems_bmmc_multiome_phase1_mod1/openproblems_bmmc_multiome_phase1_mod1.censor_dataset.output_"
par <- list(
  input_train_mod1 = paste0(path, "train_mod1.h5ad"),
  input_test_mod1 = paste0(path, "test_mod1.h5ad"),
  input_train_mod2 = paste0(path, "train_mod2.h5ad"),
  output = "output.h5ad",
  n_pcs = 4L,
  n_neighbors = 3,
  distance_method = "pearson"
)
# `meta` is read when writing the output, so the stub must define it
# for the script to run outside of viash.
meta <- list(functionality_name = "foo")
## VIASH END

cat("Reading mod1 h5ad files\n")
input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1)
dataset_id <- input_train_mod1$uns[["dataset_id"]]

# subset to HVG to reduce memory consumption
train_mod1_sd <- proxyC::colSds(input_train_mod1$layers[["normalized"]])
ix <- order(train_mod1_sd, decreasing = TRUE)[seq_len(min(1000, length(train_mod1_sd)))]
input_train_mod1 <- input_train_mod1[,ix]$copy()
gc()

# subset to HVG to reduce memory consumption
input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1)
input_test_mod1 <- input_test_mod1[,ix]$copy()
gc()

cat("Performing DR on the mod1 values\n")
# LMDS is more efficient than regular MDS because
# it does not compute a square distance matrix.
dr_mod1 <- lmds::lmds(
  rbind(input_train_mod1$layers[["normalized"]], input_test_mod1$layers[["normalized"]]),
  ndim = par$n_pcs,
  distance_method = par$distance_method
)

# the first rows of the embedding are the training cells, the rest the test cells
ix <- seq_len(nrow(input_train_mod1))
dr_mod1_train <- dr_mod1[ix, , drop = FALSE]
dr_mod1_test <- dr_mod1[-ix, , drop = FALSE]

# remove previous objects to save memory
rm(input_train_mod1, input_test_mod1)
gc()

cat("Reading mod2 h5ad files\n")
input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2)

cat("Predicting for each column in modality 2\n")
# There cannot be more neighbours than training cells. The same effective k
# must be used for the search, the accumulation and the averaging.
n_neighbors <- min(nrow(dr_mod1_train), par$n_neighbors)

# precompute knn indices
knn_ix <- FNN::get.knnx(
  dr_mod1_train,
  dr_mod1_test,
  k = n_neighbors
)$nn.index

# perform knn regression: average the mod2 profiles of the k nearest training cells.
train_mod2 <- input_train_mod2$layers[["normalized"]]
pred <- train_mod2[knn_ix[, 1], , drop = FALSE]
for (k in seq_len(n_neighbors)[-1]) {
  pred <- pred + train_mod2[knn_ix[, k], , drop = FALSE]
}
pred <- pred / n_neighbors
rownames(pred) <- rownames(dr_mod1_test)

out <- anndata::AnnData(
  layers = list(normalized = pred),
  shape = dim(pred),
  uns = list(
    dataset_id = dataset_id,
    method_id = meta$functionality_name
  )
)

cat("Writing predictions to file\n")
zzz <- out$write_h5ad(par$output, compression = "gzip")
# Linear-model baseline for the predict_modality task.
#
# Modality 1 (train + test) is embedded with landmark MDS, and every modality 2
# feature is regressed on that embedding with an ordinary least-squares fit.
cat("Loading dependencies\n")
requireNamespace("anndata", quietly = TRUE)
requireNamespace("pbapply", quietly = TRUE)
library(Matrix, warn.conflicts = FALSE, quietly = TRUE)

## VIASH START
path <- "output/datasets/predict_modality/openproblems_bmmc_multiome_phase1_mod1/openproblems_bmmc_multiome_phase1_mod1.censor_dataset.output_"
par <- list(
  input_train_mod1 = paste0(path, "train_mod1.h5ad"),
  input_test_mod1 = paste0(path, "test_mod1.h5ad"),
  input_train_mod2 = paste0(path, "train_mod2.h5ad"),
  output = "output.h5ad",
  n_pcs = 4L,
  # read by the LMDS call below; same value as the component's default
  distance_method = "spearman"
)
meta <- list(functionality_name = "foo")
## VIASH END

cat("Reading mod1 files\n")
input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1)
input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1)

# keep the only metadata needed later so the mod1 objects can be freed
dataset_id <- input_train_mod1$uns[["dataset_id"]]

cat("Performing DR on the mod1 values\n")
dr <- lmds::lmds(
  rbind(input_train_mod1$layers[["normalized"]], input_test_mod1$layers[["normalized"]]),
  ndim = par$n_pcs,
  distance_method = par$distance_method
)

# the first rows of the embedding are the training cells, the rest the test cells
ix <- seq_len(nrow(input_train_mod1))
dr_train <- dr[ix, , drop = FALSE]
dr_test <- dr[-ix, , drop = FALSE]

rm(input_train_mod1, input_test_mod1)
gc()

# The matrix interface of fastLm() uses the design matrix exactly as given and
# does not add an intercept. Without an explicit column of ones the fit is
# forced through the origin, unlike the `lm(y ~ ., dr_train)` it replaces.
design_train <- cbind(intercept = 1, dr_train)
design_test <- cbind(intercept = 1, dr_test)

cat("Reading mod2 files\n")
X_mod2 <- anndata::read_h5ad(par$input_train_mod2)$layers[["normalized"]]

cat("Predicting for each column in modality 2\n")
preds <- pbapply::pblapply(
  seq_len(ncol(X_mod2)),
  function(i) {
    y <- X_mod2[, i]
    uy <- unique(y)
    if (length(uy) > 1) {
      fit <- RcppArmadillo::fastLm(design_train, y)
      stats::predict(fit, design_test)
    } else {
      # constant feature: nothing to fit, predict the constant
      rep(uy, nrow(dr_test))
    }
  }
)

cat("Creating outputs object\n")
prediction <- Matrix::Matrix(do.call(cbind, preds), sparse = TRUE)
rownames(prediction) <- rownames(dr_test)
colnames(prediction) <- colnames(X_mod2)

out <- anndata::AnnData(
  layers = list(normalized = prediction),
  shape = dim(prediction),
  uns = list(
    dataset_id = dataset_id,
    method_id = meta$functionality_name
  )
)

cat("Writing predictions to file\n")
zzz <- out$write_h5ad(par$output, compression = "gzip")
# LMDS + IRLBA + random forest predictor for the predict_modality task.
#
# Modality 1 (train + test) is embedded with landmark MDS and modality 2 with a
# truncated PCA. One random forest per mod2 component predicts that component
# from the mod1 embedding, and the predicted components are projected back to
# the original mod2 feature space.
cat("Loading dependencies\n")
requireNamespace("anndata", quietly = TRUE)
requireNamespace("pbapply", quietly = TRUE)
library(Matrix, warn.conflicts = FALSE, quietly = TRUE)

## VIASH START
path <- "resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/normal/"
par <- list(
  input_train_mod1 = paste0(path, "train_mod1.h5ad"),
  input_test_mod1 = paste0(path, "test_mod1.h5ad"),
  input_train_mod2 = paste0(path, "train_mod2.h5ad"),
  output = "output.h5ad",
  n_pcs = 20L,
  n_trees = 50L,
  # read by the LMDS call below; same value as the component's default
  distance_method = "pearson"
)
meta <- list(functionality_name = "foo")
## VIASH END

n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE)

cat("Reading mod1 files\n")
input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1)
input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1)

dataset_id <- input_train_mod1$uns[["dataset_id"]]

cat("Performing DR on the mod1 values\n")
dr <- lmds::lmds(
  rbind(input_train_mod1$layers[["normalized"]], input_test_mod1$layers[["normalized"]]),
  ndim = par$n_pcs,
  distance_method = par$distance_method
)
# alternative:
# pr_out <- irlba::prcomp_irlba(
#   rbind(input_train_mod1$layers[["normalized"]], input_test_mod1$layers[["normalized"]]),
#   n = par$n_pcs
# )
# dr <- pr_out$x

# split up dr data: the first rows are the training cells, the rest the test cells
ix <- seq_len(nrow(input_train_mod1))
dr_train <- dr[ix, , drop = FALSE]
dr_test <- dr[-ix, , drop = FALSE]

rm(input_train_mod1, input_test_mod1)
gc()


cat("Reading mod2 files\n")
X_mod2 <- anndata::read_h5ad(par$input_train_mod2)$layers[["normalized"]]
# irlba can only extract fewer components than the smallest matrix dimension
n_pcs_mod2 <- min(par$n_pcs, min(dim(X_mod2)) - 1L)
prcomp_mod2 <- irlba::prcomp_irlba(X_mod2, n = n_pcs_mod2)
dr_mod2 <- prcomp_mod2$x

cat("Predicting for each column in modality 2\n")
pred_drs <- pbapply::pblapply(
  seq_len(ncol(dr_mod2)),
  function(i) {
    y <- dr_mod2[, i]
    uy <- unique(y)
    if (length(uy) > 1) {
      rf <- ranger::ranger(
        x = dr_train,
        y = y,
        num.trees = par$n_trees,
        num.threads = n_cores
      )
      stats::predict(rf, dr_test)$prediction
    } else {
      # constant component: nothing to fit, predict the constant
      rep(uy, nrow(dr_test))
    }
  }
)

cat("Creating outputs object\n")
pred_dr <- do.call(cbind, pred_drs)
# Reverse the PCA: x = (X - center) %*% rotation, hence
# X ~ x %*% t(rotation) + center. prcomp_irlba() centers by default, so the
# column means have to be added back; otherwise every prediction is shifted
# by the feature mean.
prediction <- as.matrix(pred_dr %*% t(prcomp_mod2$rotation))
if (is.numeric(prcomp_mod2$center)) {
  prediction <- sweep(prediction, 2, prcomp_mod2$center, "+")
}
rownames(prediction) <- rownames(dr_test)
colnames(prediction) <- colnames(X_mod2)

out <- anndata::AnnData(
  layers = list(normalized = as(prediction, "CsparseMatrix")),
  shape = dim(prediction),
  uns = list(
    dataset_id = dataset_id,
    method_id = meta$functionality_name
  )
)


cat("Writing predictions to file\n")
zzz <- out$write_h5ad(par$output, compression = "gzip")
b/src/tasks/predict_modality/methods/newwave_knnr/config.vsh.yaml new file mode 100644 index 0000000000..385f1234bb --- /dev/null +++ b/src/tasks/predict_modality/methods/newwave_knnr/config.vsh.yaml @@ -0,0 +1,42 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: newwave_knnr + status: disabled # disabled due to poor performance and long execution times + info: + label: NewWave+KNNR + summary: Perform DR with NewWave, predict modality with KNN regression. + description: Perform DR with NewWave, predict modality with KNN regression. + reference: agostinis2022newwave + repository_url: https://github.com/fedeago/NewWave + documentation_url: https://bioconductor.org/packages/release/bioc/html/NewWave.html + preferred_normalization: log_cp10k + arguments: + - name: "--newwave_maxiter" + type: "integer" + default: 40 + description: Maximum number of NewWave iterations. + - name: "--newwave_ngene" + type: "integer" + default: 200 + description: Setting of the n_gene_par NewWave parameter. + - name: "--newwave_ncell" + type: "integer" + default: 200 + description: Setting of the n_cell_par NewWave parameter. + - name: "--n_neighbors" + type: "integer" + default: 20 + description: Number of neighbors to use in the knn regression. 
+ resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ lmds, FNN, proxy, proxyC ] + bioc: [ SingleCellExperiment, NewWave ] + - type: nextflow + directives: + label: [hightime, highmem, highcpu, highsharedmem] diff --git a/src/tasks/predict_modality/methods/newwave_knnr/script.R b/src/tasks/predict_modality/methods/newwave_knnr/script.R new file mode 100644 index 0000000000..84f8a0b469 --- /dev/null +++ b/src/tasks/predict_modality/methods/newwave_knnr/script.R @@ -0,0 +1,107 @@ +cat("Loading dependencies\n") +requireNamespace("anndata", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) +requireNamespace("NewWave", quietly = TRUE) +requireNamespace("FNN", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) + +## VIASH START +path <- "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + output = "output.h5ad", + newwave_maxiter = 40L, + newwave_ngene = 200L, + newwave_ncell = 200L, + n_neighbors = 20L +) +meta <- list(functionality_name = "foo") +## VIASH END + +print(par) + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +method_id <- meta$functionality_name + +cat("Reading h5ad files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +# fetch batch labels +batch1 <- c(as.character(input_train_mod1$obs$batch), as.character(input_test_mod1$obs$batch)) +batch2 <- as.character(input_train_mod1$obs$batch) + +# create SummarizedExperiment object +data1 <- SummarizedExperiment::SummarizedExperiment( + assays = list( + counts = as( + cbind( + t(input_train_mod1$layers[["counts"]]), + t(input_test_mod1$layers[["counts"]]) + ), + "CsparseMatrix" + ) + ), + 
colData = data.frame(batch = factor(batch1)) +) +data1 <- data1[Matrix::rowSums(SummarizedExperiment::assay(data1)) > 0, ] +rm(input_train_mod1, input_test_mod1) +gc() + +cat("Running NewWave on mod1\n") +res1 <- NewWave::newWave( + data1, + X = "~batch", + verbose = TRUE, + K = 10, + maxiter_optimize = par$newwave_maxiter, + n_gene_par = min(par$newwave_ngene, nrow(data1)), + n_cell_par = min(par$newwave_ncell, ncol(data1)), + commondispersion = FALSE +) +dr_mod1 <- SingleCellExperiment::reducedDim(res1) +colnames(dr_mod1) <- paste0("comp_", seq_len(ncol(dr_mod1))) +rm(data1) +gc() + +# split DR matrices +train_ix <- seq_along(batch2) +dr_mod1_train <- dr_mod1[train_ix, , drop = FALSE] +dr_mod1_test <- dr_mod1[-train_ix, , drop = FALSE] + + +cat("Predicting for each column in modality 2\n") +input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) + +# precompute knn indices +knn_ix <- FNN::get.knnx( + dr_mod1_train, + dr_mod1_test, + k = min(nrow(dr_mod1_train), par$n_neighbors) +)$nn.index + +# perform knn regression. 
class tfidfTransformer():
    """TF-IDF transform for dense or scipy-sparse count matrices.

    Rows are observations and columns are features. ``fit`` learns one inverse
    document frequency per feature (``n_rows / column_sum``); ``transform``
    scales every row to sum to one and multiplies by the learned weights.
    """

    def __init__(self):
        # 1-D array of per-feature weights, set by `fit`
        self.idf = None
        self.fitted = False

    def fit(self, X):
        """Learn the per-feature weights from ``X`` and return ``self``."""
        # np.asarray(...).ravel() gives a plain 1-D array for both dense input
        # and sparse input (whose `sum` returns a 2-D np.matrix).
        feature_totals = np.asarray(X.sum(axis=0), dtype=np.float64).ravel()
        # Features that never occur get weight 0 instead of inf, so they
        # cannot inject inf/NaN into data transformed later on.
        idf = np.zeros_like(feature_totals)
        np.divide(X.shape[0], feature_totals, out=idf, where=feature_totals != 0)
        self.idf = idf
        self.fitted = True
        return self

    def transform(self, X):
        """Return the TF-IDF weighted version of ``X``.

        Sparse input yields a sparse result, dense input a dense array.

        Raises:
            RuntimeError: if ``fit`` has not been called.
        """
        if not self.fitted:
            raise RuntimeError('Transformer was not fitted on any data')
        if scipy.sparse.issparse(X):
            row_totals = np.asarray(X.sum(axis=1), dtype=np.float64).ravel()
            # empty rows stay all-zero instead of triggering a division by zero
            row_scale = np.zeros_like(row_totals)
            np.divide(1.0, row_totals, out=row_scale, where=row_totals != 0)
            # diagonal products scale rows / columns without densifying
            tf = scipy.sparse.diags(row_scale) @ X
            return tf @ scipy.sparse.diags(self.idf)
        else:
            X = np.asarray(X, dtype=np.float64)
            row_totals = X.sum(axis=1, keepdims=True)
            tf = np.zeros_like(X)
            np.divide(X, row_totals, out=tf, where=row_totals != 0)
            return tf * self.idf

    def fit_transform(self, X):
        """Fit on ``X`` and return its transformed version."""
        self.fit(X)
        return self.transform(X)
self.use_highly_variable = use_highly_variable + self.tfidfTransformer = tfidfTransformer() + self.normalizer = sklearn.preprocessing.Normalizer(norm="l1") + self.pcaTransformer = sklearn.decomposition.TruncatedSVD(n_components = self.n_components, random_state=777) + # self.lsi_mean = None + # self.lsi_std = None + self.fitted = None + + def fit(self, adata: anndata.AnnData): + if self.use_highly_variable is None: + self.use_highly_variable = "hvg" in adata.var + adata_use = adata[:, adata.var["hvg"]] if self.use_highly_variable else adata + X = self.tfidfTransformer.fit_transform(adata_use.X) + X_norm = self.normalizer.fit_transform(X) + X_norm = np.log1p(X_norm * 1e4) + X_lsi = self.pcaTransformer.fit_transform(X_norm) + # self.lsi_mean = X_lsi.mean(axis=1, keepdims=True) + # self.lsi_std = X_lsi.std(axis=1, ddof=1, keepdims=True) + self.fitted = True + + def transform(self, adata): + if not self.fitted: + raise RuntimeError('Transformer was not fitted on any data') + adata_use = adata[:, adata.var["hvg"]] if self.use_highly_variable else adata + X = self.tfidfTransformer.transform(adata_use.X) + X_norm = self.normalizer.transform(X) + X_norm = np.log1p(X_norm * 1e4) + X_lsi = self.pcaTransformer.transform(X_norm) + X_lsi -= X_lsi.mean(axis=1, keepdims=True) + X_lsi /= X_lsi.std(axis=1, ddof=1, keepdims=True) + lsi_df = pd.DataFrame(X_lsi, index = adata_use.obs_names) + return lsi_df + + def fit_transform(self, adata): + self.fit(adata) + return self.transform(adata) + +class ModalityMatchingDataset(Dataset): + def __init__( + self, df_modality1, df_modality2, is_train=True + ): + super().__init__() + self.df_modality1 = df_modality1 + self.df_modality2 = df_modality2 + self.is_train = is_train + def __len__(self): + return self.df_modality1.shape[0] + + def __getitem__(self, index: int): + if self.is_train == True: + x = self.df_modality1.iloc[index].values + y = self.df_modality2.iloc[index].values + return x, y + else: + x = 
class Swish(torch.autograd.Function):
    """Swish activation ``x * sigmoid(x)`` with a hand-written gradient."""

    @staticmethod
    def forward(ctx, i):
        """Return ``i * sigmoid(i)`` and keep ``i`` for the backward pass."""
        # torch.sigmoid keeps the class independent of module-level helpers
        result = i * torch.sigmoid(i)
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        """Chain ``grad_output`` with d/dx [x * s(x)] = s(x) * (1 + x * (1 - s(x)))."""
        # `saved_tensors` is the supported accessor; `saved_variables` is deprecated
        i, = ctx.saved_tensors
        sigmoid_i = torch.sigmoid(i)
        return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))
def train_and_valid(model, optimizer, loss_fn, dataloader_train, dataloader_test, name_model, device, n_epochs=100):
    """Train ``model`` and checkpoint the weights with the best validation RMSE.

    After every epoch the model is evaluated on ``dataloader_test``; negative
    predictions are clipped to zero before the RMSE is computed. Whenever the
    RMSE improves, the state dict is written to ``name_model``.

    Args:
        model: network to train; it is called on float batches moved to ``device``.
        optimizer: optimizer that updates the parameters of ``model``.
        loss_fn: loss whose square root is back-propagated.
        dataloader_train: iterable of ``(x, y)`` training batches.
        dataloader_test: iterable of ``(x, y)`` validation batches.
        name_model: path the best state dict is saved to.
        device: device the batches are moved to.
        n_epochs: number of passes over the training data.

    Returns:
        The best validation RMSE that was reached.
    """
    # start at infinity so the first epoch always produces a checkpoint
    best_score = float("inf")
    for _ in range(n_epochs):
        model.train()
        for x, y in dataloader_train:
            optimizer.zero_grad()
            output = model(x.float().to(device))
            loss = torch.sqrt(loss_fn(output, y.float().to(device)))
            loss.backward()
            optimizer.step()

        # a single validation pass per epoch
        outputs = []
        targets = []
        model.eval()
        with torch.no_grad():
            for x, y in dataloader_test:
                output = model(x.float().to(device))
                outputs.append(output.detach().cpu().numpy())
                targets.append(y.float().detach().cpu().numpy())
        cat_outputs = np.concatenate(outputs)
        cat_targets = np.concatenate(targets)
        cat_outputs[cat_outputs < 0.0] = 0

        # root mean squared error, computed once per epoch
        score = float(np.sqrt(np.mean(np.square(cat_targets - cat_outputs))))
        if best_score > score:
            torch.save(model.state_dict(), name_model)
            best_score = score
    print("best rmse: ", best_score)
    return best_score
a/src/tasks/predict_modality/methods/novel/predict/script.py b/src/tasks/predict_modality/methods/novel/predict/script.py new file mode 100644 index 0000000000..5f336ce7b0 --- /dev/null +++ b/src/tasks/predict_modality/methods/novel/predict/script.py @@ -0,0 +1,119 @@ +import sys +import torch +from torch.utils.data import DataLoader + +import anndata as ad +import pickle +import numpy as np +from scipy.sparse import csc_matrix + +#check gpu available +if (torch.cuda.is_available()): + device = 'cuda:0' #switch to current device + print('current device: gpu', flush=True) +else: + device = 'cpu' + print('current device: cpu', flush=True) + + +## VIASH START + +par = { + 'input_train_mod2': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad', + 'input_test_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/test_mod1.h5ad', + 'input_model': 'resources_test/predict_modality/neurips2021_bmmc_cite/model.pt', + 'input_transform': 'transformer.pickle' +} +meta = { + 'resources_dir': 'src/tasks/predict_modality/methods/novel', + 'functionality_name': '171129' +} +## VIASH END + +sys.path.append(meta['resources_dir']) +from helper_functions import ModelRegressionAtac2Gex, ModelRegressionAdt2Gex, ModelRegressionGex2Adt, ModelRegressionGex2Atac, ModalityMatchingDataset + +print("Load data", flush=True) + +input_test_mod1 = ad.read_h5ad(par['input_test_mod1']) +input_train_mod2 = ad.read_h5ad(par['input_train_mod2']) + +mod1 = input_test_mod1.uns['modality'] +mod2 = input_train_mod2.uns['modality'] + +n_vars_mod1 = input_train_mod2.uns["model_dim"]["mod1"] +n_vars_mod2 = input_train_mod2.uns["model_dim"]["mod2"] + +input_test_mod1.X = input_test_mod1.layers['normalized'].tocsr() + +# Remove vars that were removed from the training set (mostly only applicable for testing); `.uns` values may be read back as arrays, so test for None/emptiness explicitly instead of relying on truthiness. 
+rem_var = input_train_mod2.uns.get("removed_vars") +if rem_var is not None and len(rem_var) > 0: + input_test_mod1 = input_test_mod1[:, ~input_test_mod1.var_names.isin(rem_var)] + +del input_train_mod2 + + +model_fp = par['input_model'] + +print("Start predict", flush=True) + +if mod1 == 'GEX' and mod2 == 'ADT': + model = ModelRegressionGex2Adt(n_vars_mod1,n_vars_mod2) + weight = torch.load(model_fp, map_location='cpu') + with open(par['input_transform'], 'rb') as f: + lsi_transformer_gex = pickle.load(f) + + model.load_state_dict(weight) + input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1) + +elif mod1 == 'GEX' and mod2 == 'ATAC': + model = ModelRegressionGex2Atac(n_vars_mod1,n_vars_mod2) + weight = torch.load(model_fp, map_location='cpu') + with open(par['input_transform'], 'rb') as f: + lsi_transformer_gex = pickle.load(f) + + model.load_state_dict(weight) + input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1) + +elif mod1 == 'ATAC' and mod2 == 'GEX': + model = ModelRegressionAtac2Gex(n_vars_mod1,n_vars_mod2) + weight = torch.load(model_fp, map_location='cpu') + with open(par['input_transform'], 'rb') as f: + lsi_transformer_gex = pickle.load(f) + + model.load_state_dict(weight) + input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1) + +elif mod1 == 'ADT' and mod2 == 'GEX': + model = ModelRegressionAdt2Gex(n_vars_mod1,n_vars_mod2) + weight = torch.load(model_fp, map_location='cpu') + + model.load_state_dict(weight) + input_test_mod1_ = input_test_mod1.to_df() + +dataset_test = ModalityMatchingDataset(input_test_mod1_, None, is_train=False) +dataloader_test = DataLoader(dataset_test, 32, shuffle = False, num_workers = 4) + +outputs = [] +model.eval() +with torch.no_grad(): + for x in dataloader_test: + output = model(x.float()) + outputs.append(output.detach().cpu().numpy()) + +outputs = np.concatenate(outputs) +outputs[outputs<0] = 0 +outputs = csc_matrix(outputs) + +adata = ad.AnnData( + layers={"normalized": outputs}, 
shape=outputs.shape, + uns={ + 'dataset_id': input_test_mod1.uns['dataset_id'], + 'method_id': meta['functionality_name'], + }, +) +adata.write_h5ad(par['output'], compression = "gzip") + + diff --git a/src/tasks/predict_modality/methods/novel/run/config.vsh.yaml b/src/tasks/predict_modality/methods/novel/run/config.vsh.yaml new file mode 100644 index 0000000000..682782e059 --- /dev/null +++ b/src/tasks/predict_modality/methods/novel/run/config.vsh.yaml @@ -0,0 +1,21 @@ +__merge__: ../../../api/comp_method.yaml +functionality: + name: novel + info: + label: Novel + summary: A method using encoder-decoder MLP model + description: This method trains an encoder-decoder MLP model with one output neuron per component in the target. As an input, the encoders use representations obtained from ATAC and GEX data via LSI transform and raw ADT data. The hyperparameters of the models were found via broad hyperparameter search using the Optuna framework. + documentation_url: https://github.com/openproblems-bio/neurips2021_multimodal_topmethods/tree/main/src/predict_modality/methods/novel#readme + repository_url: https://github.com/openproblems-bio/neurips2021_multimodal_topmethods/tree/main/src/predict_modality/methods/novel + reference: pmlr-v176-lance2022multimodal + submission_id: "169769" + preferred_normalization: log_cp10k + resources: + - path: main.nf + type: nextflow_script + entrypoint: run_wf + dependencies: + - name: predict_modality/methods/novel_train + - name: predict_modality/methods/novel_predict +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/novel/run/main.nf b/src/tasks/predict_modality/methods/novel/run/main.nf new file mode 100644 index 0000000000..59111194cb --- /dev/null +++ b/src/tasks/predict_modality/methods/novel/run/main.nf @@ -0,0 +1,25 @@ +workflow run_wf { + take: input_ch + main: + output_ch = input_ch + | novel_train.run( + fromState: ["input_train_mod1", "input_train_mod2"], + toState: 
["input_model": "output", "input_transform": "output_transform", "output_train_mod2": "output_train_mod2"] + ) + | novel_predict.run( + fromState: { id, state -> + [ + "input_train_mod2": state.output_train_mod2, + "input_test_mod1": state.input_test_mod1, + "input_model": state.input_model, + "input_transform": state.input_transform, + "output": state.output]}, + toState: ["output": "output"] + ) + + | map { tup -> + [tup[0], [output: tup[1].output]] + } + + emit: output_ch +} \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/novel/run/run_test.sh b/src/tasks/predict_modality/methods/novel/run/run_test.sh new file mode 100644 index 0000000000..f6da6b0863 --- /dev/null +++ b/src/tasks/predict_modality/methods/novel/run/run_test.sh @@ -0,0 +1,15 @@ +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +nextflow run . \ + -main-script target/nextflow/predict_modality/methods/novel/main.nf \ + -profile docker \ + -c src/wf_utils/labels_ci.config \ + --input_train_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod1.h5ad \ + --input_train_mod2 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad \ + --input_test_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/test_mod1.h5ad \ + --publish_dir output/novel/nextflow diff --git a/src/tasks/predict_modality/methods/novel/train/config.vsh.yaml b/src/tasks/predict_modality/methods/novel/train/config.vsh.yaml new file mode 100644 index 0000000000..87ea471301 --- /dev/null +++ b/src/tasks/predict_modality/methods/novel/train/config.vsh.yaml @@ -0,0 +1,31 @@ +__merge__: ../../../api/comp_method_train.yaml +functionality: + name: novel_train + arguments: + - name: --output_transform + type: file + description: "The output transform file" + required: false + default: "lsi_transformer.pickle" + direction: output 
+ - name: --output_train_mod2 + type: file + description: copy of the input with model dim in `.uns` + direction: output + default: "train_mod2.h5ad" + required: false + resources: + - path: script.py + type: python_script + - path: ../helper_functions.py +platforms: + - type: docker + image: openproblems/base_pytorch_nvidia:1.0.0 + setup: + - type: python + packages: + - scikit-learn + - networkx + - type: nextflow + directives: + label: [highmem, hightime, midcpu, highsharedmem, gpu] \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/novel/train/run_test.sh b/src/tasks/predict_modality/methods/novel/train/run_test.sh new file mode 100644 index 0000000000..08630b1ac0 --- /dev/null +++ b/src/tasks/predict_modality/methods/novel/train/run_test.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Run script for all test resources + +echo "GEX2ADT" +viash run src/tasks/predict_modality/methods/novel/train/config.vsh.yaml -- \ + --input_train_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod1.h5ad \ + --input_train_mod2 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad \ + --output output/model.pt + +# echo "ADT2GEX" +# viash run src/tasks/predict_modality/methods/novel/train/config.vsh.yaml -- \ +# --input_train_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod1.h5ad \ +# --input_train_mod2 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod2.h5ad \ +# --output output/model.pt + +# echo "GEX2ATAC" +# viash run src/tasks/predict_modality/methods/novel/train/config.vsh.yaml -- \ +# --input_train_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/normal/train_mod1.h5ad \ +# --input_train_mod2 resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/normal/train_mod2.h5ad \ +# --output output/model.pt + +# echo "ATAC2GEX" +# viash run 
src/tasks/predict_modality/methods/novel/train/config.vsh.yaml -- \ +# --input_train_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod1.h5ad \ +# --input_train_mod2 resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad \ +# --output output/model.pt + + diff --git a/src/tasks/predict_modality/methods/novel/train/script.py b/src/tasks/predict_modality/methods/novel/train/script.py new file mode 100644 index 0000000000..39ea8b4778 --- /dev/null +++ b/src/tasks/predict_modality/methods/novel/train/script.py @@ -0,0 +1,148 @@ +import sys + +import torch +from torch.utils.data import DataLoader +# from sklearn.model_selection import train_test_split + +import anndata as ad +import pickle + +#check gpu available +if (torch.cuda.is_available()): + device = 'cuda:0' #switch to current device + print('current device: gpu', flush=True) +else: + device = 'cpu' + print('current device: cpu', flush=True) + + +## VIASH START + +par = { + 'input_train_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod1.h5ad', + 'input_train_mod2': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad', + 'output_train_mod2': 'train_mod2.h5ad', + 'output': 'model.pt' +} + +meta = { + 'resources_dir': 'src/tasks/predict_modality/methods/novel', +} +## VIASH END + + +sys.path.append(meta['resources_dir']) +from helper_functions import train_and_valid, lsiTransformer, ModalityMatchingDataset +from helper_functions import ModelRegressionAtac2Gex, ModelRegressionAdt2Gex, ModelRegressionGex2Adt, ModelRegressionGex2Atac + +print('Load data', flush=True) + +input_train_mod1 = ad.read_h5ad(par['input_train_mod1']) +input_train_mod2 = ad.read_h5ad(par['input_train_mod2']) + +adata = input_train_mod2.copy() + +mod1 = input_train_mod1.uns['modality'] +mod2 = input_train_mod2.uns['modality'] + +input_train_mod1.X = input_train_mod1.layers['normalized'] 
+input_train_mod2.X = input_train_mod2.layers['normalized'] + +input_train_mod2_df = input_train_mod2.to_df() + +del input_train_mod2 + +print('Start train', flush=True) + + +# Check for zero divide +zero_row = input_train_mod1.X.sum(axis=0) == 0 + +rem_var = None +if True in zero_row: + rem_var = input_train_mod1[:, zero_row].var_names + input_train_mod1 = input_train_mod1[:, ~zero_row] + + +# select number of variables for LSI +n_comp = input_train_mod1.n_vars -1 if input_train_mod1.n_vars < 256 else 256 + +if mod1 != 'ADT': + lsi_transformer_gex = lsiTransformer(n_components=n_comp) + input_train_mod1_df = lsi_transformer_gex.fit_transform(input_train_mod1) +else: + input_train_mod1_df = input_train_mod1.to_df() + +# reproduce train/test split from phase 1 +batch = input_train_mod1.obs["batch"] +train_ix = [ k for k,v in enumerate(batch) if v not in {'s1d2', 's3d7'} ] +test_ix = [ k for k,v in enumerate(batch) if v in {'s1d2', 's3d7'} ] + +train_mod1 = input_train_mod1_df.iloc[train_ix, :] +train_mod2 = input_train_mod2_df.iloc[train_ix, :] +test_mod1 = input_train_mod1_df.iloc[test_ix, :] +test_mod2 = input_train_mod2_df.iloc[test_ix, :] + +n_vars_train_mod1 = train_mod1.shape[1] +n_vars_train_mod2 = train_mod2.shape[1] +n_vars_test_mod1 = test_mod1.shape[1] +n_vars_test_mod2 = test_mod2.shape[1] + +n_vars_mod1 = input_train_mod1_df.shape[1] +n_vars_mod2 = input_train_mod2_df.shape[1] + +if mod1 == 'ATAC' and mod2 == 'GEX': + dataset_train = ModalityMatchingDataset(train_mod1, train_mod2) + dataloader_train = DataLoader(dataset_train, 256, shuffle = True, num_workers = 8) + + dataset_test = ModalityMatchingDataset(test_mod1, test_mod2) + dataloader_test = DataLoader(dataset_test, 64, shuffle = False, num_workers = 8) + + model = ModelRegressionAtac2Gex(n_vars_mod1,n_vars_mod2).to(device) + optimizer = torch.optim.AdamW(model.parameters(), lr=0.00008386597445284492,weight_decay=0.000684887347727808) + +elif mod1 == 'ADT' and mod2 == 'GEX': + dataset_train = 
ModalityMatchingDataset(train_mod1, train_mod2) + dataloader_train = DataLoader(dataset_train, 64, shuffle = True, num_workers = 4) + + dataset_test = ModalityMatchingDataset(test_mod1, test_mod2) + dataloader_test = DataLoader(dataset_test, 32, shuffle = False, num_workers = 4) + + model = ModelRegressionAdt2Gex(n_vars_mod1,n_vars_mod2).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=0.00041, weight_decay=0.0000139) + + +elif mod1 == 'GEX' and mod2 == 'ADT': + dataset_train = ModalityMatchingDataset(train_mod1, train_mod2) + dataloader_train = DataLoader(dataset_train, 32, shuffle = True, num_workers = 8) + + dataset_test = ModalityMatchingDataset(test_mod1, test_mod2) + dataloader_test = DataLoader(dataset_test, 64, shuffle = False, num_workers = 8) + + model = ModelRegressionGex2Adt(n_vars_mod1,n_vars_mod2).to(device) + optimizer = torch.optim.AdamW(model.parameters(), lr=0.000034609210829678734, weight_decay=0.0009965881574697426) + + +elif mod1 == 'GEX' and mod2 == 'ATAC': + dataset_train = ModalityMatchingDataset(train_mod1, train_mod2) + dataloader_train = DataLoader(dataset_train, 64, shuffle = True, num_workers = 8) + + dataset_test = ModalityMatchingDataset(test_mod1, test_mod2) + dataloader_test = DataLoader(dataset_test, 64, shuffle = False, num_workers = 8) + + model = ModelRegressionGex2Atac(n_vars_mod1,n_vars_mod2).to(device) + optimizer = torch.optim.AdamW(model.parameters(), lr=0.00001806762345275399, weight_decay=0.0004084171379280058) + +loss_fn = torch.nn.MSELoss() +train_and_valid(model, optimizer, loss_fn, dataloader_train, dataloader_test, par['output'], device) + +# Add model dim (and the names of all dropped all-zero mod1 vars) for use in predict part; rem_var is a pandas Index, whose truth value is ambiguous, so compare against None +adata.uns["model_dim"] = {"mod1": n_vars_mod1, "mod2": n_vars_mod2} +if rem_var is not None: + adata.uns["removed_vars"] = list(rem_var) +adata.write_h5ad(par['output_train_mod2'], compression="gzip") 
+ +if mod1 != 'ADT': + with open(par['output_transform'], 'wb') as f: + pickle.dump(lsi_transformer_gex, f) + diff --git 
a/src/tasks/predict_modality/methods/random_forest/config.vsh.yaml b/src/tasks/predict_modality/methods/random_forest/config.vsh.yaml new file mode 100644 index 0000000000..a1ee69041d --- /dev/null +++ b/src/tasks/predict_modality/methods/random_forest/config.vsh.yaml @@ -0,0 +1,37 @@ +__merge__: ../../api/comp_method.yaml +functionality: + name: random_forest + status: disabled # disabled due to long execution times + info: + label: Random Forests + summary: Random forest regression. + description: A random forest regression method. + reference: breiman2001random + documentation_url: https://www.stat.berkeley.edu/~breiman/RandomForests/reg_home.htm + repository_url: https://github.com/cran/randomForest + preferred_normalization: log_cp10k + arguments: + - name: "--distance_method" + type: "string" + default: "pearson" + description: The distance method to use. Possible values are euclidean, pearson, spearman and others. + - name: "--n_pcs" + type: "integer" + default: 20 + description: Number of principal components to use. + - name: "--n_trees" + type: "integer" + default: 50 + description: Number of trees to use. 
+ resources: + - type: r_script + path: script.R +platforms: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + cran: [ lmds, ranger, pbapply] + - type: nextflow + directives: + label: [hightime, highmem, highcpu] \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/random_forest/script.R b/src/tasks/predict_modality/methods/random_forest/script.R new file mode 100644 index 0000000000..e148eefbf7 --- /dev/null +++ b/src/tasks/predict_modality/methods/random_forest/script.R @@ -0,0 +1,83 @@ +cat("Loading dependencies\n") +requireNamespace("anndata", quietly = TRUE) +requireNamespace("pbapply", quietly = TRUE) +library(Matrix, warn.conflicts = FALSE, quietly = TRUE) + +## VIASH START +path <- "output/datasets/predict_modality/openproblems_bmmc_multiome_phase1_mod1/openproblems_bmmc_multiome_phase1_mod1.censor_dataset.output_" +par <- list( + input_train_mod1 = paste0(path, "train_mod1.h5ad"), + input_test_mod1 = paste0(path, "test_mod1.h5ad"), + input_train_mod2 = paste0(path, "train_mod2.h5ad"), + output = "output.h5ad", + n_pcs = 20L, + n_trees = 50L +) +meta <- list(functionality_name = "foo") +## VIASH END + +n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) + +cat("Reading mod1 files\n") +input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) +input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) + +dataset_id <- input_train_mod1$uns[["dataset_id"]] + +cat("Performing DR on the mod1 values\n") +dr <- lmds::lmds( + rbind(input_train_mod1$layers[["normalized"]], input_test_mod1$layers[["normalized"]]), + ndim = par$n_pcs, + distance_method = par$distance_method +) + +ix <- seq_len(nrow(input_train_mod1)) +dr_train <- as.data.frame(dr[ix, , drop = FALSE]) +dr_test <- as.data.frame(dr[-ix, , drop = FALSE]) +dr_train <- dr[ix, , drop = FALSE] +dr_test <- dr[-ix, , drop = FALSE] + +rm(input_train_mod1, input_test_mod1) +gc() + + +cat("Reading mod2 files\n") +X_mod2 <- 
anndata::read_h5ad(par$input_train_mod2)$layers[["normalized"]] + +cat("Predicting for each column in modality 2\n") +preds <- pbapply::pblapply( + seq_len(ncol(X_mod2)), + cl = n_cores, + function(i) { + y <- X_mod2[, i] + uy <- unique(y) + if (length(uy) > 1) { + rf <- ranger::ranger( + x = dr_train, + y = y, + num.trees = par$n_trees + ) + stats::predict(rf, dr_test)$prediction + } else { + rep(uy, nrow(dr_test)) + } + } +) + +cat("Creating outputs object\n") +prediction <- Matrix::Matrix(do.call(cbind, preds), sparse = TRUE) +rownames(prediction) <- rownames(dr_test) +colnames(prediction) <- colnames(X_mod2) + +out <- anndata::AnnData( + layers = list(normalized = prediction), + shape = dim(prediction), + uns = list( + dataset_id = dataset_id, + method_id = meta$functionality_name + ) +) + + +cat("Writing predictions to file\n") +zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/predict_modality/methods/simple_mlp/predict/config.vsh.yaml b/src/tasks/predict_modality/methods/simple_mlp/predict/config.vsh.yaml new file mode 100644 index 0000000000..ef972e416f --- /dev/null +++ b/src/tasks/predict_modality/methods/simple_mlp/predict/config.vsh.yaml @@ -0,0 +1,21 @@ +__merge__: ../../../api/comp_method_predict.yaml +functionality: + name: simplemlp_predict + resources: + - type: python_script + path: script.py + - path: ../resources/ +platforms: + - type: docker + # image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime + image: openproblems/base_pytorch_nvidia:1.0.0 + # run_args: ["--gpus all --ipc=host"] + setup: + - type: python + pypi: + - scikit-learn + - scanpy + - pytorch-lightning + - type: nextflow + directives: + label: [highmem, hightime, midcpu, gpu, highsharedmem] \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/simple_mlp/predict/script.py b/src/tasks/predict_modality/methods/simple_mlp/predict/script.py new file mode 100644 index 0000000000..b67284e348 --- /dev/null +++ 
b/src/tasks/predict_modality/methods/simple_mlp/predict/script.py @@ -0,0 +1,104 @@ +from glob import glob +import sys +import numpy as np +from scipy.sparse import csc_matrix +import anndata as ad +import torch +from torch.utils.data import TensorDataset,DataLoader + +## VIASH START +par = { + 'input_train_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod1.h5ad', + 'input_train_mod2': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad', + 'input_test_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/swap/test_mod1.h5ad', + 'input_model': 'output/model', + 'output': 'output/prediction' +} +meta = { + 'resources_dir': 'src/tasks/predict_modality/methods/simple_mlp', + 'cpus': 10 +} +## VIASH END + +resources_dir = f"{meta['resources_dir']}/resources" +sys.path.append(resources_dir) +from models import MLP +import utils + +def _predict(model,dl): + model = model.cuda() + model.eval() + yps = [] + for x in dl: + with torch.no_grad(): + yp = model(x[0].cuda()) + yps.append(yp.detach().cpu().numpy()) + yp = np.vstack(yps) + return yp + + +print('Load data', flush=True) +input_train_mod2 = ad.read_h5ad(par['input_train_mod2']) +input_test_mod1 = ad.read_h5ad(par['input_test_mod1']) + +# determine modalities; predictions get one row per test cell and one column per train_mod2 variable +mod_1 = input_test_mod1.uns['modality'] +mod_2 = input_train_mod2.uns['modality'] + +task = f'{mod_1}2{mod_2}' + +print('Load ymean', flush=True) +ymean_path = f"{par['input_model']}/{task}_ymean.npy" +ymean = np.load(ymean_path) + +print('Start predict', flush=True) +if task == 'GEX2ATAC': + y_pred = ymean*np.ones([input_test_mod1.n_obs, input_train_mod2.n_vars]) +else: + folds = [0, 1, 2] + + ymean = torch.from_numpy(ymean).float() + yaml_path=f"{resources_dir}/yaml/mlp_{task}.yaml" + config = utils.load_yaml(yaml_path) + X = input_test_mod1.layers["normalized"].toarray() + X = 
torch.from_numpy(X).float() + + te_ds = TensorDataset(X) + + yp = 0 + for 
fold in folds: + # load_path = f"{par['input_model']}/{task}_fold_{fold}/version_0/checkpoints/*" + load_path = f"{par['input_model']}/{task}_fold_{fold}/**.ckpt" + print(load_path) + ckpt = glob(load_path)[0] + model_inf = MLP.load_from_checkpoint( + ckpt, + in_dim=X.shape[1], + out_dim=input_train_mod2.n_vars, + ymean=ymean, + config=config + ) + te_loader = DataLoader( + te_ds, + batch_size=config.batch_size, + num_workers=0, + shuffle=False, + drop_last=False + ) + yp = yp + _predict(model_inf, te_loader) + + y_pred = yp/len(folds) + +y_pred = csc_matrix(y_pred) + +adata = ad.AnnData( + layers={"normalized": y_pred}, + shape=y_pred.shape, + uns={ + 'dataset_id': input_test_mod1.uns['dataset_id'], + 'method_id': meta['functionality_name'], + }, +) + +print('Write data', flush=True) +adata.write_h5ad(par['output'], compression = "gzip") \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/simple_mlp/resources/models.py b/src/tasks/predict_modality/methods/simple_mlp/resources/models.py new file mode 100644 index 0000000000..25ce9b2995 --- /dev/null +++ b/src/tasks/predict_modality/methods/simple_mlp/resources/models.py @@ -0,0 +1,68 @@ +import torch +import pytorch_lightning as pl +import torch.nn as nn +import torch.nn.functional as F + +class MLP(pl.LightningModule): + def __init__(self,in_dim,out_dim,ymean,config): + super(MLP, self).__init__() + self.ymean = ymean.cuda() + H1 = config.H1 + H2 = config.H2 + p = config.dropout + self.config = config + self.fc1 = nn.Linear(in_dim, H1) + self.fc2 = nn.Linear(H1,H2) + self.fc3 = nn.Linear(H1+H2, out_dim) + self.dp2 = nn.Dropout(p=p) + + def forward(self, x): + x0 = x + x1 = F.relu(self.fc1(x)) + x1 = self.dp2(x1) + x = F.relu(self.fc2(x1)) + x = torch.cat([x,x1],dim=1) + x = self.fc3(x) + x = self.apply_mask(x) + return x + + def apply_mask(self,yp): + tmp = torch.ones_like(yp).float()*self.ymean + mask = tmp