Merge branch 'master' into decouple-ensemble
ntalluri authored Sep 10, 2024
2 parents 4f78d80 + cd1efd7 commit 5ff27bf
Showing 19 changed files with 462 additions and 59 deletions.
33 changes: 31 additions & 2 deletions .github/workflows/test-spras.yml
@@ -94,6 +94,12 @@ jobs:
tags: latest
cache_froms: reedcompbio/omics-integrator-1:latest
push: false
- name: Remove Omics Integrator 1 Docker image
# Remove the image to prevent the cache from being used. Here we use
# `|| true` to prevent the job from failing if the image doesn't exist or
# can't be removed for some reason
run: docker rmi reedcompbio/omics-integrator-1:latest || true

- name: Build Omics Integrator 2 Docker image
uses: docker/build-push-action@v1
with:
@@ -103,6 +109,9 @@ jobs:
tags: v2
cache_froms: reedcompbio/omics-integrator-2:latest
push: false
- name: Remove Omics Integrator 2 Docker image
run: docker rmi reedcompbio/omics-integrator-2:latest || true

- name: Build PathLinker Docker image
uses: docker/build-push-action@v1
with:
@@ -112,6 +121,9 @@ jobs:
tags: v2
cache_froms: reedcompbio/pathlinker:latest
push: false
- name: Remove PathLinker Docker image
run: docker rmi reedcompbio/pathlinker:latest || true

- name: Build Maximum Edge Orientation Docker image
uses: docker/build-push-action@v1
with:
@@ -121,6 +133,9 @@ jobs:
tags: latest
cache_froms: reedcompbio/meo:latest
push: false
- name: Remove MEO Docker image
run: docker rmi reedcompbio/meo:latest || true

- name: Build MinCostFlow Docker image
uses: docker/build-push-action@v1
with:
@@ -130,6 +145,9 @@ jobs:
tags: latest
cache_froms: reedcompbio/mincostflow:latest
push: false
- name: Remove MinCostFlow Docker image
run: docker rmi reedcompbio/mincostflow:latest || true

- name: Build All Pairs Shortest Paths Docker image
uses: docker/build-push-action@v1
with:
@@ -139,6 +157,9 @@ jobs:
tags: v2
cache_froms: reedcompbio/allpairs:latest
push: false
- name: Remove All Pairs Shortest Paths Docker image
run: docker rmi reedcompbio/allpairs:latest || true

- name: Build DOMINO Docker image
uses: docker/build-push-action@v1
with:
@@ -148,6 +169,9 @@ jobs:
tags: latest
cache_froms: reedcompbio/domino:latest
push: false
- name: Remove DOMINO Docker image
run: docker rmi reedcompbio/domino:latest || true

- name: Build Cytoscape Docker image
uses: docker/build-push-action@v1
with:
@@ -157,15 +181,20 @@ jobs:
tags: v3
cache_froms: reedcompbio/py4cytoscape:v3
push: false
- name: Remove Cytoscape Docker image
run: docker rmi reedcompbio/py4cytoscape:v3 || true

- name: Build SPRAS Docker image
uses: docker/build-push-action@v1
with:
path: .
dockerfile: docker-wrappers/SPRAS/Dockerfile
repository: reedcompbio/spras
tags: v0.1.0
cache_froms: reedcompbio/spras:v0.1.0
tags: v0.2.0
cache_froms: reedcompbio/spras:v0.2.0
push: false
- name: Remove SPRAS Docker image
run: docker rmi reedcompbio/spras:v0.2.0 || true

# Run pre-commit checks on source files
pre-commit:
3 changes: 3 additions & 0 deletions .gitignore
@@ -141,3 +141,6 @@ TempMat.mat

# OSX-specific stuff
**/.DS_Store

# SPRAS singularity container
spras.sif
60 changes: 50 additions & 10 deletions CONTRIBUTING.md
@@ -104,7 +104,7 @@ docker push <username>/local-neighborhood
Pushing an image requires being logged in, so run `docker login` first if needed using your Docker Hub username and password.

### Step 3: Write the Local Neighborhood wrapper functions
Add a new Python file `src/local_neighborhood.py` to implement the wrapper functions for the Local Neighborhood algorithm.
Add a new Python file `spras/local_neighborhood.py` to implement the wrapper functions for the Local Neighborhood algorithm.
Use `pathlinker.py` as an example.

Call the new class within `local_neighborhood.py` `LocalNeighborhood` and set `__all__` so the class can be [imported](https://docs.python.org/3/tutorial/modules.html#importing-from-a-package).
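
For orientation, a skeleton of the new file might look like the sketch below. The `PRM` base class, the `required_inputs` attribute, and the exact method signatures are assumptions based on the `pathlinker.py` pattern, not a definitive interface.
```python
# spras/local_neighborhood.py -- minimal sketch with assumed names
from spras.prm import PRM

__all__ = ['LocalNeighborhood']

class LocalNeighborhood(PRM):
    # Input files Snakemake must generate before the algorithm runs
    required_inputs = ['network', 'nodes']

    @staticmethod
    def generate_inputs(data, filename_map):
        raise NotImplementedError

    @staticmethod
    def run(nodes=None, network=None, output_file=None, container_framework='docker'):
        raise NotImplementedError

    @staticmethod
    def parse_output(raw_pathway_file, standardized_pathway_file):
        raise NotImplementedError
```
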
@@ -114,7 +114,7 @@ These entries are used to tell Snakemake what input files should be present before
Before implementing the `generate_inputs` function, explore the structure of the `Dataset` class interactively.
In an interactive Python session, run the following commands to load the `data0` dataset and explore the nodes and interactome.
```python
> from src.dataset import Dataset
> from spras.dataset import Dataset
> dataset_dict = {'label': 'data0', 'node_files': ['node-prizes.txt', 'sources.txt', 'targets.txt'], 'edge_files': ['network.txt'], 'other_files': [], 'data_dir': 'input'}
> data = Dataset(dataset_dict)
> data.node_table.head()
@@ -136,12 +136,13 @@ Also test the functions available in the `Dataset` class.
Note the behaviors of the `request_node_columns` function when there are missing values in that column of the node table and when multiple columns are requested.
`request_node_columns` always returns the `NODEID` column in addition to the requested columns.
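
For example, continuing the interactive session above (the exact rows returned depend on the dataset files):
```python
> # NODEID is returned alongside the requested columns; note how nodes with
> # missing values in the requested columns are handled
> node_df = data.request_node_columns(['sources', 'targets'])
> node_df.head()
```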

Now implement the `generate_inputs` function, following the `omicsintegrator1.py` example.
Now implement the `generate_inputs` function.
Start by inspecting the `omicsintegrator1.py` example, but note the differences in the expected file formats generated for the two algorithms with respect to the header rows and node prize column.
The selected nodes should be any node in the dataset that has a prize set, any node that is active, any node that is a source, or any node that is a target.
As shown in the example dataset above, "active", "sources", and "targets" are Boolean attributes.
A "prize" is a term for a numeric score on a node in a network, so nodes that have non-empty prizes are considered relevant nodes for the Local Neighborhood algorithm along with active nodes, sources, and targets.
The network should be all of the edges written in the format `<vertex1>|<vertex2>`.
`src/dataset.py` provides functions that provide access to node information and the interactome (edge list).
The network should be all of the edges written in the format `<vertex1>|<vertex2>`, which also differs from the `omicsintegrator1.py` example.
`spras/dataset.py` provides functions that give access to node information and the interactome (edge list).
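
A rough sketch of `generate_inputs` under these requirements follows. The column names (`prize`, `active`, `sources`, `targets`) and the `get_interactome` helper are assumptions based on the description above; check `omicsintegrator1.py` and `spras/dataset.py` for the actual interfaces.
```python
# Sketch of generate_inputs for LocalNeighborhood, not a definitive implementation.
# In practice this is a static method on the LocalNeighborhood class.
def generate_inputs(data, filename_map):
    # Relevant nodes: any node with a prize or flagged active, source, or target
    node_df = data.request_node_columns(['prize', 'active', 'sources', 'targets'])
    relevant = node_df['prize'].notna() | \
               node_df[['active', 'sources', 'targets']].fillna(False).any(axis=1)
    node_df.loc[relevant, 'NODEID'].to_csv(filename_map['nodes'], index=False, header=False)

    # Write every interactome edge in the <vertex1>|<vertex2> format
    edges = data.get_interactome()
    (edges['Interactor1'] + '|' + edges['Interactor2']).to_csv(
        filename_map['network'], index=False, header=False)
```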

Implement the `run` function, following the PathLinker example.
The `prepare_volume` utility function is needed to prepare the network and nodes input files to be mounted and used inside the container.
@@ -155,22 +156,29 @@ Use the `run_container` utility function to run the command in the container `<username>/local-neighborhood`
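
A condensed sketch of `run` following that pattern appears below. The `prepare_volume`/`run_container` signatures, the `spras.containers` module, and the container's command-line flags are assumptions here; mirror `pathlinker.py` for the authoritative usage.
```python
from spras.containers import prepare_volume, run_container

# Sketch of run; in practice a static method on the LocalNeighborhood class
def run(nodes=None, network=None, output_file=None, container_framework='docker'):
    if not nodes or not network or not output_file:
        raise ValueError('Required Local Neighborhood arguments are missing')

    work_dir = '/spras'
    volumes = list()

    # Map each host path to a path that is visible inside the container
    bind_path, node_file = prepare_volume(nodes, work_dir)
    volumes.append(bind_path)
    bind_path, network_file = prepare_volume(network, work_dir)
    volumes.append(bind_path)
    bind_path, mapped_output = prepare_volume(output_file, work_dir)
    volumes.append(bind_path)

    command = ['python', 'local_neighborhood.py',
               '--network', network_file,
               '--nodes', node_file,
               '--output', mapped_output]

    run_container(container_framework, 'local-neighborhood', command, volumes, work_dir)
```
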
Implement the `parse_output` function.
The edges in the Local Neighborhood output have the same format as the input, `<vertex1>|<vertex2>`.
Convert these to tab-separated vertex pairs, with a tab-separated `1` and `U` appended to the end of every line to indicate that all edges have the same rank and are undirected.
See the `add_rank_column` and `raw_pathway_df` function in `src.util.py` and `reinsert_direction_col_undirected` function in `src.interactome.py`.
See the `add_rank_column` and `raw_pathway_df` functions in `spras.util.py` and the `reinsert_direction_col_undirected` function in `spras.interactome.py`.
Make sure the file is created with a header row (`header=True`) and the column names `['Node1', 'Node2', 'Rank', 'Direction']`.
The output should have the format `<vertex1> <vertex2> 1 U`.
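
Putting those pieces together, `parse_output` might look roughly like the sketch below, assuming the utility signatures named above; check `spras/util.py` and `spras/interactome.py` for the real ones.
```python
from spras.interactome import reinsert_direction_col_undirected
from spras.util import add_rank_column, raw_pathway_df

# Sketch of parse_output; in practice a static method on the LocalNeighborhood class
def parse_output(raw_pathway_file, standardized_pathway_file):
    # Read the <vertex1>|<vertex2> edges into a two-column DataFrame
    df = raw_pathway_df(raw_pathway_file, sep='|', header=None)
    if not df.empty:
        df = add_rank_column(df)                    # constant rank of 1 for every edge
        df = reinsert_direction_col_undirected(df)  # append the 'U' direction column
        df.columns = ['Node1', 'Node2', 'Rank', 'Direction']
    df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')
```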

### Step 4: Make the Local Neighborhood wrapper accessible through SPRAS
Import the new class `LocalNeighborhood` in `src/runner.py` so the wrapper functions can be accessed.
Import the new class `LocalNeighborhood` in `spras/runner.py` so the wrapper functions can be accessed.
Add an entry for Local Neighborhood to the configuration file `config/config.yaml` and set `include: true`.
As a convention, algorithm names are written in all lowercase without special characters.
Local Neighborhood has no other parameters.
Optionally set `include: false` for the other pathway reconstruction algorithms to make testing faster.
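
For example, the import might look like this (a sketch; match the style of the existing entries in `spras/runner.py`):
```python
# In spras/runner.py, alongside the other algorithm imports
from spras.local_neighborhood import LocalNeighborhood
```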

The config file has an option `owner` under the `container_registry` settings that controls which Docker Hub account will be used when pulling Docker images.
The same Docker Hub account will be used for all images and cannot currently be set differently for each algorithm.
Set the `owner` to match your Docker Hub username from Step 2.

After completing this step, try running the Local Neighborhood algorithm through SPRAS with
```bash
snakemake --cores 1 --configfile config/config.yaml
```
Make sure to run the command inside the `spras` conda environment. If installing via `pip` instead of using conda, install with the `-e .[dev]` options (the full command to run from the repo root is `python -m pip install -e .[dev]`) so that Python picks up any changes you make and installs all optional development packages. Omitting the `-e` flag will prevent your changes from being reflected unless you force re-install, and omitting `.[dev]` will prevent pip from installing `pre-commit` and `pytest`.
Make sure to run the command inside the `spras` conda environment.

If installing via `pip` instead of using conda, install with the `-e .[dev]` options (the full command to run from the repo root is `python -m pip install -e .[dev]`) so that Python picks up any changes you make and installs all optional development packages.
Omitting the `-e` flag will prevent your changes from being reflected unless you force re-install, and omitting `.[dev]` will prevent pip from installing `pre-commit` and `pytest`.

As a workflow manager, Snakemake will consider the work described in the configuration file to be completed once the necessary output files have been written to the relevant output directory (`output` in the `config/config.yaml` configuration).
That means that if you change your code and rerun the Snakemake command above, nothing may happen if the output files already exist.
@@ -211,8 +219,8 @@ The pull request will be closed so that the `master` branch of the fork stays sy
1. Open a [GitHub issue](https://github.com/Reed-CompBio/spras/issues/new/choose) to propose adding a new algorithm and discuss it with the SPRAS maintainers
1. Add a new subdirectory to `docker-wrappers` with the name `<algorithm>`, write a `Dockerfile` to build an image for `<algorithm>`, and include any other files required to build that image in the subdirectory
1. Build and push the Docker image to the [reedcompbio](https://hub.docker.com/orgs/reedcompbio) Docker organization (SPRAS maintainer required)
1. Add a new Python file `src/<algorithm>.py` to implement the wrapper functions for `<algorithm>`: specify the list of `required_input` files and the `generate_inputs`, `run`, and `parse_output` functions
1. Import the new class in `src/runner.py` so the wrapper functions can be accessed
1. Add a new Python file `spras/<algorithm>.py` to implement the wrapper functions for `<algorithm>`: specify the list of `required_input` files and the `generate_inputs`, `run`, and `parse_output` functions
1. Import the new class in `spras/runner.py` so the wrapper functions can be accessed
1. Document the usage of the Docker wrapper and the assumptions made when implementing the wrapper
1. Add example usage for the new algorithm and its parameters to the template config file
Write test functions and provide example input data in a new test subdirectory `test/<algorithm>`. Add the example data and the algorithm and expected file names to the lists or dicts in `test/generate-inputs` and `test/parse-outputs`. Use the full path with the names of the test files.
@@ -243,3 +251,35 @@ Additional hooks are [available](https://github.com/pre-commit/pre-commit-hooks#
These are configured in `.pre-commit-config.yaml`.
SPRAS also runs [`ruff`](https://github.com/charliermarsh/ruff) as part of the pre-commit hooks to perform the Python code analysis, which supports many more [rules](https://beta.ruff.rs/docs/rules/).
These are configured in `pyproject.toml`.

## Reviewing pull requests
Contributors may help review pull requests from other contributors.
Part of the review process includes running the updated code locally.
This requires checking out a branch from the other contributor's fork.

We'll use pull request [170](https://github.com/Reed-CompBio/spras/pull/170) as an example from the `ntalluri` fork with branch `implement-eval`.
First, you need to add the `ntalluri` fork as a git remote from the command line so that you can pull branches from it.
```
git remote add ntalluri https://github.com/ntalluri/spras.git
```
The first `ntalluri` is the name we give to the new remote.
It doesn't have to match the GitHub user name, but that is a convenient convention.

Then, confirm the new remote was added
```
git remote -v
```
You should see the new remote along with your `origin` remote and any others you added previously.
Now you can pull and fetch branches from any of these remotes and push to any remotes where you have permissions.

To check out the branch in the pull request locally, run
```
git fetch ntalluri
git checkout implement-eval
```
Optionally run
```
git log
```
to confirm that the most recent commit matches the most recent commit in the pull request.
Now your local version of SPRAS matches the code in the pull request and you can test the code to confirm it runs as expected.
3 changes: 3 additions & 0 deletions README.md
@@ -56,6 +56,9 @@ Output files will be written to the `output` directory.
You do not need to manually download Docker images from DockerHub before running SPRAS.
The workflow will automatically download any missing images as long as Docker is running.

### Running SPRAS with HTCondor
Large SPRAS workflows may benefit from execution with HTCondor, a scheduler/manager for distributed high-throughput computing workflows that allows many Snakemake steps to be run in parallel. For instructions on running SPRAS in this setting, see `docker-wrappers/SPRAS/README.md`.

## Components
**Configuration file**: Specifies which pathway reconstruction algorithms to run, which hyperparameter combinations to use, and which datasets to run them on.

44 changes: 43 additions & 1 deletion Snakefile
@@ -3,6 +3,7 @@ from spras import runner
import shutil
import yaml
from spras.dataset import Dataset
from spras.evaluation import Evaluation
from spras.analysis import ml, summary, graphspace, cytoscape
import spras.config as _config

@@ -27,13 +28,14 @@ hac_params = _config.config.hac_params
FRAMEWORK = _config.config.container_framework
print(f"Running {FRAMEWORK} containers")

# Return the dataset dictionary from the config file given the label
# Return the dataset or gold_standard dictionary from the config file given the label
def get_dataset(_datasets, label):
return _datasets[label]

algorithms = list(algorithm_params)
algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]
dataset_labels = list(_config.config.datasets.keys())
dataset_gold_standard_pairs = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standards.values() for dataset in gs_values['dataset_labels']]

# Get algorithms that are running multiple parameter combinations
def algo_has_mult_param_combos(algo):
Expand Down Expand Up @@ -102,6 +104,9 @@ def make_final_input(wildcards):
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms))

if _config.config.analysis_include_evaluation:
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))

if len(final_input) == 0:
# No analysis added yet, so add reconstruction output files if they exist.
# (if analysis is specified, these should be implicitly run).
@@ -153,6 +158,21 @@ rule merge_input:
dataset_dict = get_dataset(_config.config.datasets, wildcards.dataset)
runner.merge_input(dataset_dict, output.dataset_file)

# Return all files used in the gold standard
def get_gold_standard_dependencies(wildcards):
gs = _config.config.gold_standards[wildcards.gold_standard]
all_files = gs["node_files"]
all_files = [gs["data_dir"] + SEP + data_file for data_file in all_files]
return all_files

# Merge all node files for a gold_standard into a single node table
rule merge_gs_input:
input: get_gold_standard_dependencies
output: gold_standard_file = SEP.join([out_dir, '{gold_standard}-merged.pickle'])
run:
gold_standard_dict = get_dataset(_config.config.gold_standards, wildcards.gold_standard)
Evaluation.merge_gold_standard_input(gold_standard_dict, output.gold_standard_file)

# The checkpoint is like a rule but can be used in dynamic workflows
# The workflow directed acyclic graph is re-evaluated after the checkpoint job runs
# If the checkpoint has not executed for the provided wildcard values, it will be run and then the rest of the
@@ -340,6 +360,28 @@ rule ensemble_per_algo:
summary_df = ml.summarize_networks(input.pathways)
ml.ensemble_network(summary_df, output.ensemble_network_file)

# Return the gold standard pickle file for a specific gold standard
def get_gold_standard_pickle_file(wildcards):
parts = wildcards.dataset_gold_standard_pairs.split('-')
gs = parts[1]
return SEP.join([out_dir, f'{gs}-merged.pickle'])

# Returns the dataset corresponding to the gold standard pair
def get_dataset_label(wildcards):
parts = wildcards.dataset_gold_standard_pairs.split('-')
dataset = parts[0]
return dataset

# Run evaluation code for a specific dataset's pathway outputs against its paired gold standard
rule evaluation:
input:
gold_standard_file = get_gold_standard_pickle_file,
pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label),
output: eval_file = SEP.join([out_dir, "{dataset_gold_standard_pairs}-evaluation.txt"])
run:
node_table = Evaluation.from_file(input.gold_standard_file).node_table
Evaluation.precision(input.pathways, node_table, output.eval_file)

# Remove the output directory
rule clean:
shell: f'rm -rf {out_dir}'
17 changes: 17 additions & 0 deletions config/config.yaml
@@ -117,6 +117,21 @@ datasets:
# Relative path from the spras directory
data_dir: "input"

gold_standards:
-
# Labels can only contain letters, numbers, or underscores
label: gs0
node_files: ["gs_nodes0.txt"]
# edge_files: [] TODO: later iteration
data_dir: "input"
# List of dataset labels to compare with the specific gold standard dataset
dataset_labels: ["data0"]
-
label: gs1
node_files: ["gs_nodes1.txt"]
data_dir: "input"
dataset_labels: ["data1", "data0"]

# If we want to reconstruct then we should set run to true.
# TODO: if include is true above but run is false here, algs are not run.
# is this the behavior we want?
@@ -157,3 +172,5 @@ analysis:
linkage: 'ward'
# 'euclidean', 'manhattan', 'cosine'
metric: 'euclidean'
evaluation:
include: true
2 changes: 2 additions & 0 deletions config/egfr.yaml
@@ -89,3 +89,5 @@ analysis:
include: true
ml:
include: false
evaluation:
include: false
6 changes: 3 additions & 3 deletions docker-wrappers/SPRAS/Dockerfile
@@ -1,12 +1,12 @@
FROM almalinux:9

RUN dnf install -y epel-release

# gcc/g++ are required for building several of the packages if you're using apple silicon
RUN dnf update -y && \
dnf install -y epel-release && \
dnf install -y gcc gcc-c++ \
python3.11 python3.11-pip python3.11-devel \
docker apptainer
docker apptainer && \
dnf clean all

COPY / /spras/
RUN chmod -R 777 /spras