Update summary.py to include parameter combinations #194

Open
wants to merge 13 commits into master
2 changes: 1 addition & 1 deletion Snakefile
@@ -296,7 +296,7 @@ rule summary_table:
run:
# Load the node table from the pickled dataset file
node_table = Dataset.from_file(input.dataset_file).node_table
summary_df = summary.summarize_networks(input.pathways, node_table)
summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params)
summary_df.to_csv(output.summary_table, sep='\t', index=False)

# Cluster the output pathways for each dataset
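The two new arguments mirror structures built elsewhere in the workflow; a hedged sketch of their expected shapes, with hypothetical values:

# algorithm_params maps each algorithm to its parameter combinations, keyed by hashcode
algorithm_params = {"pathlinker": {"7ab9651": {"k": 100}, "c9d2e8f": {"k": 200}}}
# algorithms_with_params lists one "algo-params-hashcode" label per combination
algorithms_with_params = ["pathlinker-params-7ab9651", "pathlinker-params-c9d2e8f"]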
36 changes: 27 additions & 9 deletions spras/analysis/summary.py
@@ -1,3 +1,4 @@
import json
import os
import sys
from pathlib import Path
@@ -7,7 +8,7 @@
import pandas as pd


def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame) -> pd.DataFrame:
def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params, algo_with_params) -> pd.DataFrame:
"""
Generate a table that aggregates summary information about networks in file_paths,
including which nodes are present in node_table columns.
@@ -31,6 +32,8 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame) ->
# Initialize list to store network summary data
nw_info = []

# Position in the sorted algo_with_params labels, kept in step with the sorted file_paths below
index = 0

# Iterate through each network file path
for file_path in sorted(file_paths):

@@ -44,28 +47,43 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame) ->
number_nodes = nw.number_of_nodes()
number_edges = nw.number_of_edges()
ncc = nx.number_connected_components(nw)

# Initialize list to store current network information
cur_nw_info = [nw_name, number_nodes, number_edges, ncc]

# Iterate through each node property and save the intersection with the current network
for node_list in nodes_by_col:
num_nodes = len(set(nw).intersection(node_list))
cur_nw_info.append(num_nodes)

# Split the combination label to recover the algorithm and hashcode
# Labels in algo_with_params follow the format "algo-params-hashcode", matching the
# pathway file path "output/.../data#-algo-params-hashcode/pathway.txt"
# algo_params has the format { algo : { hashcode : { parameter combination } } }
label_parts = sorted(algo_with_params)[index].split("-")
algo = label_parts[0]
hashcode = label_parts[2]
index += 1

param_combo = algo_params[algo][hashcode]
params = json.dumps(param_combo)
params = params.replace("\"", "")  # remove the extra double quotes from the JSON string
cur_nw_info.append(params)

# Prepare column names
col_names = ["Name", "Number of nodes", "Number of undirected edges", "Number of connected components"]
col_names.extend(nodes_by_col_labs)
col_names.append("Parameter combination")

# Save the current network information to the network summary list
nw_info.append(cur_nw_info)

# Convert the network summary data to pandas dataframe
# Could refactor to create the dataframe line by line instead of storing data as lists and then converting
nw_info = pd.DataFrame(
nw_info,
columns=[
"Name",
"Number of nodes",
"Number of undirected edges",
"Number of connected components"
]
+
nodes_by_col_labs
columns=col_names
)

return nw_info


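To make the new parsing step concrete, here is a standalone sketch (all values hypothetical) of how one combination label is split and its parameters are formatted for the table:

import json

algo_params = {"pathlinker": {"7ab9651": {"k": 100}}}
label = "pathlinker-params-7ab9651"  # "algo-params-hashcode"

algo, _, hashcode = label.split("-")
params = json.dumps(algo_params[algo][hashcode]).replace('"', "")
print(params)  # {k: 100}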
176 changes: 176 additions & 0 deletions test/analysis/input/config.yaml
@@ -0,0 +1,176 @@
# Global workflow control

# The length of the hash used to identify a parameter combination
hash_length: 7
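# (Illustrative only, not necessarily the exact scheme SPRAS uses: a short identifier
# like this can be derived by hashing the serialized parameter dictionary and
# truncating to hash_length characters, e.g. in Python:
#   hashlib.sha1(json.dumps(params, sort_keys=True).encode()).hexdigest()[:7]
# )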

# Specify the container framework. Current supported versions include 'docker' and
# 'singularity'. If container_framework is not specified, SPRAS will default to docker.
container_framework: docker

# Only used if container_framework is set to singularity, this will unpack the singularity containers
# to the local filesystem. This is useful when PRM containers need to run inside another container,
# such as would be the case in an HTCondor/OSPool environment.
# NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way
# that persists after the workflow is complete. To clean up the unpacked containers, the user must
# manually delete them.
unpack_singularity: false

# Allow the user to configure which container registry containers should be pulled from
# Note that this assumes container names are consistent across registries, and that the
# registry being passed doesn't require authentication for pull actions
container_registry:
base_url: docker.io
# The owner or project of the registry
# For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs
owner: reedcompbio

# This list of algorithms should be generated by a script which checks the filesystem for installs.
# It shouldn't be changed by mere mortals. (alternatively, we could add a path to executable for each algorithm
# in the list to reduce the number of assumptions of the program at the cost of making the config a little more involved)
# Each algorithm has an 'include' parameter. By toggling 'include' to true/false the user can change
# which algorithms are run in a given experiment.
#
# algorithm-specific parameters are embedded in lists so that users can specify multiple. If multiple
# parameters are specified then the algorithm will be run as many times as needed to cover all parameter
# combinations. For instance if we have the following:
# - name: "myAlg"
# params:
# include: true
# a: [1,2]
# b: [0.5,0.75]
#
# then myAlg will be run on (a=1,b=0.5), (a=1,b=0.75), (a=2,b=0.5), and (a=2,b=0.75). Pretty neat, but be
# careful: too many parameters might make your runs take a long time.
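#
# (Hedged sketch, not SPRAS's internal code: the expansion behaves like a Cartesian
# product over the parameter lists, e.g. in Python:
#   from itertools import product
#   combos = [dict(zip(("a", "b"), vals)) for vals in product([1, 2], [0.5, 0.75])]
#   # [{'a': 1, 'b': 0.5}, {'a': 1, 'b': 0.75}, {'a': 2, 'b': 0.5}, {'a': 2, 'b': 0.75}]
# )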

algorithms:
- name: "pathlinker"
params:
include: true
run1:
k: range(100,201,100)

- name: "omicsintegrator1"
params:
include: true
run1:
b: [5, 6]
w: np.linspace(0,5,2)
d: [10]

- name: "omicsintegrator2"
params:
include: true
run1:
b: [4]
g: [0]
run2:
b: [2]
g: [3]

- name: "meo"
params:
include: true
run1:
max_path_length: [3]
local_search: ["Yes"]
rand_restarts: [10]

- name: "mincostflow"
params:
include: true
run1:
flow: [1] # The flow must be an int
capacity: [1]

- name: "allpairs"
params:
include: true

- name: "domino"
params:
include: true
run1:
slice_threshold: [0.3]
module_threshold: [0.05]


# Here we specify which pathways to run and other file location information.
# DataLoader.py can currently only load a single dataset
# Assume that if a dataset label does not change, the lists of associated input files do not change
datasets:
-
# Labels can only contain letters, numbers, or underscores
label: data0
node_files: ["node-prizes.txt", "sources.txt", "targets.txt"]
# DataLoader.py can currently only load a single edge file, which is the primary network
edge_files: ["network.txt"]
# Placeholder
other_files: []
# Relative path from the spras directory
data_dir: "input"
-
#label: data1
# Reuse some of the same sources file as 'data0' but different network and targets
# node_files: ["node-prizes.txt", "sources.txt", "alternative-targets.txt"]
# edge_files: ["alternative-network.txt"]
# other_files: []
# Relative path from the spras directory
# data_dir: "input"

gold_standards:
-
# Labels can only contain letters, numbers, or underscores
label: gs0
node_files: ["gs_nodes0.txt"]
# edge_files: [] TODO: later iteration
data_dir: "input"
# List of dataset labels to compare with the specific gold standard dataset
dataset_labels: ["data0"]
-
#label: gs1
# node_files: ["gs_nodes1.txt"]
# data_dir: "input"
# dataset_labels: ["data1", "data0"]

# If we want to reconstruct then we should set run to true.
# TODO: if include is true above but run is false here, algs are not run.
# is this the behavior we want?
reconstruction_settings:

# Set where everything is saved
locations:

# Place the save path here
# TODO move to global
reconstruction_dir: "output"

run: true

analysis:
# Create one summary per pathway file and a single summary table for all pathways for each dataset
summary:
include: true
# Create output files for each pathway that can be visualized with GraphSpace
graphspace:
include: true
# Create Cytoscape session file with all pathway graphs for each dataset
cytoscape:
include: true
# Machine learning analysis (e.g. clustering) of the pathway output files for each dataset
ml:
# ml analysis per dataset
include: true
# adds ml analysis per algorithm output
# only runs for algorithms with multiple parameter combinations chosen
aggregate_per_algorithm: true
# specify how many principal components to calculate
components: 2
# boolean to show the labels on the pca graph
labels: true
# 'ward', 'complete', 'average', 'single'
# if linkage: ward, must use metric: euclidean
linkage: 'ward'
# 'euclidean', 'manhattan', 'cosine'
metric: 'euclidean'
evaluation:
include: true
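The ml options above map onto standard PCA and agglomerative-clustering calls; as a hedged sketch (the scikit-learn mapping is an assumption, not SPRAS's own ml module, and the metric parameter requires scikit-learn >= 1.2):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA

X = np.random.rand(10, 5)  # stand-in for the pathway feature matrix

# components: 2 -> compute two principal components for the PCA plot
coords = PCA(n_components=2).fit_transform(X)

# linkage: 'ward' requires metric: 'euclidean', as the comment above notes
labels = AgglomerativeClustering(linkage="ward", metric="euclidean").fit_predict(X)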
93 changes: 93 additions & 0 deletions test/analysis/input/egfr.yaml
@@ -0,0 +1,93 @@
# The length of the hash used to identify a parameter combination
hash_length: 7

# If true, use Singularity instead of Docker
# Singularity support is only available on Unix
singularity: false

algorithms:
-
name: pathlinker
params:
include: true
run1:
k:
- 10
- 20
-
name: omicsintegrator1
params:
include: true
run1:
b:
- 0.55
- 2
- 10
d:
- 10
g:
- 1e-3
r:
- 0.01
w:
- 0.1
mu:
- 0.008
-
name: omicsintegrator2
params:
include: true
run1:
b:
- 4
g:
- 0
run2:
b:
- 2
g:
- 3
-
name: meo
params:
include: true
run1:
local_search:
- "Yes"
max_path_length:
- 3
rand_restarts:
- 10
-
name: domino
params:
include: true
run1:
slice_threshold:
- 0.3
module_threshold:
- 0.05
datasets:
-
data_dir: input
edge_files:
- phosphosite-irefindex13.0-uniprot.txt
label: tps_egfr
node_files:
- tps-egfr-prizes.txt
other_files: []
reconstruction_settings:
locations:
reconstruction_dir: output/egfr
run: true
analysis:
graphspace:
include: false
cytoscape:
include: true
summary:
include: true
ml:
include: false
evaluation:
include: false