Update summary.py to include parameter combinations #194

Open
wants to merge 13 commits into master
2 changes: 1 addition & 1 deletion Snakefile
@@ -296,7 +296,7 @@ rule summary_table:
run:
# Load the node table from the pickled dataset file
node_table = Dataset.from_file(input.dataset_file).node_table
summary_df = summary.summarize_networks(input.pathways, node_table)
summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params)
summary_df.to_csv(output.summary_table, sep='\t', index=False)

# Cluster the output pathways for each dataset
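The two new arguments mirror structures built elsewhere in the workflow; a hedged sketch of their expected shapes, with hypothetical values:

# algorithm_params maps each algorithm to its parameter combinations, keyed by hashcode
algorithm_params = {"pathlinker": {"7ab9651": {"k": 100}, "c9d2e8f": {"k": 200}}}
# algorithms_with_params lists one "algo-params-hashcode" label per combination
algorithms_with_params = ["pathlinker-params-7ab9651", "pathlinker-params-c9d2e8f"]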
36 changes: 27 additions & 9 deletions spras/analysis/summary.py
@@ -1,3 +1,4 @@
import json
import os
import sys
from pathlib import Path
@@ -7,7 +8,7 @@
import pandas as pd


def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame) -> pd.DataFrame:
def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params, algo_with_params) -> pd.DataFrame:
"""
Generate a table that aggregates summary information about networks in file_paths,
including which nodes are present in node_table columns.
@@ -31,6 +32,8 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame) ->
# Initialize list to store network summary data
nw_info = []

# Position in the sorted algo_with_params labels, kept in step with the sorted file_paths below
index = 0

# Iterate through each network file path
for file_path in sorted(file_paths):

@@ -44,28 +47,43 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame) ->
number_nodes = nw.number_of_nodes()
number_edges = nw.number_of_edges()
ncc = nx.number_connected_components(nw)

# Initialize list to store current network information
cur_nw_info = [nw_name, number_nodes, number_edges, ncc]

# Iterate through each node property and save the intersection with the current network
for node_list in nodes_by_col:
num_nodes = len(set(nw).intersection(node_list))
cur_nw_info.append(num_nodes)

# Split the combination label to recover the algorithm and hashcode
# Labels in algo_with_params follow the format "algo-params-hashcode", matching the
# pathway file path "output/.../data#-algo-params-hashcode/pathway.txt"
# algo_params has the format { algo : { hashcode : { parameter combination } } }
label_parts = sorted(algo_with_params)[index].split("-")
algo = label_parts[0]
hashcode = label_parts[2]
index += 1

param_combo = algo_params[algo][hashcode]
params = json.dumps(param_combo)
params = params.replace("\"", "")  # remove the extra double quotes from the JSON string
cur_nw_info.append(params)

# Prepare column names
col_names = ["Name", "Number of nodes", "Number of undirected edges", "Number of connected components"]
col_names.extend(nodes_by_col_labs)
col_names.append("Parameter combination")

# Save the current network information to the network summary list
nw_info.append(cur_nw_info)

# Convert the network summary data to pandas dataframe
# Could refactor to create the dataframe line by line instead of storing data as lists and then converting
nw_info = pd.DataFrame(
nw_info,
columns=[
"Name",
"Number of nodes",
"Number of undirected edges",
"Number of connected components"
]
+
nodes_by_col_labs
columns=col_names
)

return nw_info


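To make the new parsing step concrete, here is a standalone sketch (all values hypothetical) of how one combination label is split and its parameters are formatted for the table:

import json

algo_params = {"pathlinker": {"7ab9651": {"k": 100}}}
label = "pathlinker-params-7ab9651"  # "algo-params-hashcode"

algo, _, hashcode = label.split("-")
params = json.dumps(algo_params[algo][hashcode]).replace('"', "")
print(params)  # {k: 100}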
176 changes: 176 additions & 0 deletions test/analysis/input/config.yaml
@@ -0,0 +1,176 @@
# Global workflow control

# The length of the hash used to identify a parameter combination
hash_length: 7
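# (Illustrative only, not necessarily the exact scheme SPRAS uses: a short identifier
# like this can be derived by hashing the serialized parameter dictionary and
# truncating to hash_length characters, e.g. in Python:
#   hashlib.sha1(json.dumps(params, sort_keys=True).encode()).hexdigest()[:7]
# )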

# Specify the container framework. Current supported versions include 'docker' and
# 'singularity'. If container_framework is not specified, SPRAS will default to docker.
container_framework: docker

# Only used if container_framework is set to singularity, this will unpack the singularity containers
# to the local filesystem. This is useful when PRM containers need to run inside another container,
# such as would be the case in an HTCondor/OSPool environment.
# NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way
# that persists after the workflow is complete. To clean up the unpacked containers, the user must
# manually delete them.
unpack_singularity: false

# Allow the user to configure which container registry containers should be pulled from
# Note that this assumes container names are consistent across registries, and that the
# registry being passed doesn't require authentication for pull actions
container_registry:
base_url: docker.io
# The owner or project of the registry
# For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs
owner: reedcompbio

# This list of algorithms should be generated by a script which checks the filesystem for installs.
# It shouldn't be changed by mere mortals. (alternatively, we could add a path to executable for each algorithm
# in the list to reduce the number of assumptions of the program at the cost of making the config a little more involved)
# Each algorithm has an 'include' parameter. By toggling 'include' to true/false the user can change
# which algorithms are run in a given experiment.
#
# algorithm-specific parameters are embedded in lists so that users can specify multiple. If multiple
# parameters are specified then the algorithm will be run as many times as needed to cover all parameter
# combinations. For instance if we have the following:
# - name: "myAlg"
# params:
# include: true
# a: [1,2]
# b: [0.5,0.75]
#
# then myAlg will be run on (a=1,b=0.5), (a=1,b=0.75), (a=2,b=0.5), and (a=2,b=0.75). Pretty neat, but be
# careful: too many parameters might make your runs take a long time.
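#
# (Hedged sketch, not SPRAS's internal code: the expansion behaves like a Cartesian
# product over the parameter lists, e.g. in Python:
#   from itertools import product
#   combos = [dict(zip(("a", "b"), vals)) for vals in product([1, 2], [0.5, 0.75])]
#   # [{'a': 1, 'b': 0.5}, {'a': 1, 'b': 0.75}, {'a': 2, 'b': 0.5}, {'a': 2, 'b': 0.75}]
# )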

algorithms:
- name: "pathlinker"
params:
include: true
run1:
k: range(100,201,100)

- name: "omicsintegrator1"
params:
include: true
run1:
b: [5, 6]
w: np.linspace(0,5,2)
d: [10]

- name: "omicsintegrator2"
params:
include: true
run1:
b: [4]
g: [0]
run2:
b: [2]
g: [3]

- name: "meo"
params:
include: true
run1:
max_path_length: [3]
local_search: ["Yes"]
rand_restarts: [10]

- name: "mincostflow"
params:
include: true
run1:
flow: [1] # The flow must be an int
capacity: [1]

- name: "allpairs"
params:
include: true

- name: "domino"
params:
include: true
run1:
slice_threshold: [0.3]
module_threshold: [0.05]


# Here we specify which pathways to run and other file location information.
# DataLoader.py can currently only load a single dataset
# Assume that if a dataset label does not change, the lists of associated input files do not change
datasets:
-
# Labels can only contain letters, numbers, or underscores
label: data0
node_files: ["node-prizes.txt", "sources.txt", "targets.txt"]
# DataLoader.py can currently only load a single edge file, which is the primary network
edge_files: ["network.txt"]
# Placeholder
other_files: []
# Relative path from the spras directory
data_dir: "input"
-
#label: data1
# Reuse some of the same sources file as 'data0' but different network and targets
# node_files: ["node-prizes.txt", "sources.txt", "alternative-targets.txt"]
# edge_files: ["alternative-network.txt"]
# other_files: []
# Relative path from the spras directory
# data_dir: "input"

gold_standards:
-
# Labels can only contain letters, numbers, or underscores
label: gs0
node_files: ["gs_nodes0.txt"]
# edge_files: [] TODO: later iteration
data_dir: "input"
# List of dataset labels to compare with the specific gold standard dataset
dataset_labels: ["data0"]
-
#label: gs1
# node_files: ["gs_nodes1.txt"]
# data_dir: "input"
# dataset_labels: ["data1", "data0"]

# If we want to reconstruct then we should set run to true.
# TODO: if include is true above but run is false here, algs are not run.
# is this the behavior we want?
reconstruction_settings:

# Set where everything is saved
locations:

# Place the save path here
# TODO move to global
reconstruction_dir: "output"

run: true

analysis:
# Create one summary per pathway file and a single summary table for all pathways for each dataset
summary:
include: true
# Create output files for each pathway that can be visualized with GraphSpace
graphspace:
include: true
# Create Cytoscape session file with all pathway graphs for each dataset
cytoscape:
include: true
# Machine learning analysis (e.g. clustering) of the pathway output files for each dataset
ml:
# ml analysis per dataset
include: true
# adds ml analysis per algorithm output
# only runs for algorithms with multiple parameter combinations chosen
aggregate_per_algorithm: true
# specify how many principal components to calculate
components: 2
# boolean to show the labels on the pca graph
labels: true
# 'ward', 'complete', 'average', 'single'
# if linkage: ward, must use metric: euclidean
linkage: 'ward'
# 'euclidean', 'manhattan', 'cosine'
metric: 'euclidean'
evaluation:
include: true
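The ml options above map onto standard PCA and agglomerative-clustering calls; as a hedged sketch (the scikit-learn mapping is an assumption, not SPRAS's own ml module, and the metric parameter requires scikit-learn >= 1.2):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA

X = np.random.rand(10, 5)  # stand-in for the pathway feature matrix

# components: 2 -> compute two principal components for the PCA plot
coords = PCA(n_components=2).fit_transform(X)

# linkage: 'ward' requires metric: 'euclidean', as the comment above notes
labels = AgglomerativeClustering(linkage="ward", metric="euclidean").fit_predict(X)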
93 changes: 93 additions & 0 deletions test/analysis/input/egfr.yaml
@@ -0,0 +1,93 @@
# The length of the hash used to identify a parameter combination
hash_length: 7

# If true, use Singularity instead of Docker
# Singularity support is only available on Unix
singularity: false

algorithms:
-
name: pathlinker
params:
include: true
run1:
k:
- 10
- 20
-
name: omicsintegrator1
params:
include: true
run1:
b:
- 0.55
- 2
- 10
d:
- 10
g:
- 1e-3
r:
- 0.01
w:
- 0.1
mu:
- 0.008
-
name: omicsintegrator2
params:
include: true
run1:
b:
- 4
g:
- 0
run2:
b:
- 2
g:
- 3
-
name: meo
params:
include: true
run1:
local_search:
- "Yes"
max_path_length:
- 3
rand_restarts:
- 10
-
name: domino
params:
include: true
run1:
slice_threshold:
- 0.3
module_threshold:
- 0.05
datasets:
-
data_dir: input
edge_files:
- phosphosite-irefindex13.0-uniprot.txt
label: tps_egfr
node_files:
- tps-egfr-prizes.txt
other_files: []
reconstruction_settings:
locations:
reconstruction_dir: output/egfr
run: true
analysis:
graphspace:
include: false
cytoscape:
include: true
summary:
include: true
ml:
include: false
evaluation:
include: false