Skip to content

Commit

Permalink
Merge pull request #160 from ntalluri/ml-aggregate
Browse files Browse the repository at this point in the history
Post processing / ML Code  - Aggregate per algorithm
  • Loading branch information
agitter authored Jun 29, 2024
2 parents 55979e8 + 122fb66 commit ad4da94
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 22 deletions.
75 changes: 57 additions & 18 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ import spras.config as _config
SEP = '/'

wildcard_constraints:
params="params-\w+"
params="params-\w+",
dataset="\w+"

# Elsewhere we import this as config, but in the Snakefile, the variable config is already populated
# with the parsed config.yaml. This is done by Snakemake, which magically pipes config into this file
Expand All @@ -23,7 +24,6 @@ algorithm_params = _config.config.algorithm_params
algorithm_directed = _config.config.algorithm_directed
pca_params = _config.config.pca_params
hac_params = _config.config.hac_params

FRAMEWORK = _config.config.container_framework
print(f"Running {FRAMEWORK} containers")

Expand All @@ -35,6 +35,12 @@ algorithms = list(algorithm_params)
algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]
dataset_labels = list(_config.config.datasets.keys())

# Get algorithms that are running multiple parameter combinations
def algo_has_mult_param_combos(algo):
    """Tell whether ``algo`` is configured with more than one parameter combination.

    Looks ``algo`` up in the module-level ``algorithm_params`` mapping; an
    algorithm that is not present is treated as having no combinations.

    :param algo: algorithm name used as a key of ``algorithm_params``
    :return: True if more than one parameter combination is registered, else False
    """
    param_combos = algorithm_params.get(algo, {})
    return len(param_combos) > 1

algorithms_mult_param_combos = [algo for algo in algorithms if algo_has_mult_param_combos(algo)]

# Get the parameter dictionary for the specified
# algorithm and parameter combination hash
def reconstruction_params(algorithm, params_hash):
Expand Down Expand Up @@ -77,14 +83,24 @@ def make_final_input(wildcards):
final_input.extend(expand('{out_dir}{sep}{dataset}-cytoscape.cys',out_dir=out_dir,sep=SEP,dataset=dataset_labels))

if _config.config.analysis_include_ml:
final_input.extend(expand('{out_dir}{sep}{dataset}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))

if _config.config.analysis_include_ml_aggregate_algo:
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))

if len(final_input) == 0:
# No analysis added yet, so add reconstruction output files if they exist.
Expand Down Expand Up @@ -268,21 +284,44 @@ rule ml_analysis:
input:
pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params)
output:
pca_image = SEP.join([out_dir, '{dataset}-pca.png']),
pca_variance= SEP.join([out_dir, '{dataset}-pca-variance.txt']),
pca_coordinates = SEP.join([out_dir, '{dataset}-pca-coordinates.txt']),
hac_image_vertical = SEP.join([out_dir, '{dataset}-hac-vertical.png']),
hac_clusters_vertical = SEP.join([out_dir, '{dataset}-hac-clusters-vertical.txt']),
hac_image_horizontal = SEP.join([out_dir, '{dataset}-hac-horizontal.png']),
hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-hac-clusters-horizontal.txt']),
ensemble_network_file = SEP.join([out_dir,'{dataset}-ensemble-pathway.txt'])
pca_image = SEP.join([out_dir, '{dataset}-ml', 'pca.png']),
pca_variance= SEP.join([out_dir, '{dataset}-ml', 'pca-variance.txt']),
pca_coordinates = SEP.join([out_dir, '{dataset}-ml', 'pca-coordinates.txt']),
hac_image_vertical = SEP.join([out_dir, '{dataset}-ml', 'hac-vertical.png']),
hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-vertical.txt']),
hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-horizontal.png']),
hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-horizontal.txt']),
ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt'])
run:
summary_df = ml.summarize_networks(input.pathways)
ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params)
ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params)
ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)
ml.ensemble_network(summary_df, output.ensemble_network_file)

def collect_pathways_per_algo(wildcards):
    """Build the pathway file patterns that belong to a single algorithm.

    Used as the input function of the per-algorithm ML rule. Selects every entry of
    ``algorithms_with_params`` that was generated for ``wildcards.algorithm`` and
    expands it into a pathway file pattern under ``out_dir``. The ``{dataset}``
    placeholder is escaped in the pattern and therefore left unexpanded in the
    returned strings (presumably resolved by Snakemake from the rule's wildcards).

    :param wildcards: Snakemake wildcards object; only ``wildcards.algorithm`` is read
    :return: list of pathway file patterns, one per parameter combination of the algorithm
    """
    # Entries are built as '<algorithm>-params-<hash>', so match that exact prefix.
    # A plain substring test would also select other algorithms whose name (or
    # params hash) merely contains this algorithm's name.
    algo_prefix = f'{wildcards.algorithm}-params-'
    filtered_algo_params = [algo_param for algo_param in algorithms_with_params if algo_param.startswith(algo_prefix)]
    return expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params)

# Per-algorithm ML analysis: runs the same steps as rule ml_analysis (PCA, vertical and
# horizontal HAC, ensemble network) but only on the pathways of one {algorithm} for one
# {dataset}. Outputs are written to '<out_dir>/<dataset>-ml/' with the algorithm name
# as a file name prefix.
rule ml_analysis_aggregate_algo:
input:
# Input function; returns the pathway files for every parameter combination of {algorithm}
pathways = collect_pathways_per_algo
output:
pca_image = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-pca.png']),
pca_variance= SEP.join([out_dir, '{dataset}-ml', '{algorithm}-pca-variance.txt']),
pca_coordinates = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-pca-coordinates.txt']),
hac_image_vertical = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-vertical.png']),
hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-vertical.txt']),
hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-horizontal.png']),
hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-horizontal.txt']),
ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', '{algorithm}-ensemble-pathway.txt'])
run:
# Summarize the selected pathway files once and reuse the summary for every analysis
summary_df = ml.summarize_networks(input.pathways)
# pca_params and hac_params come from the parsed config and are passed as keyword arguments
ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params)
ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params)
ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)
ml.ensemble_network(summary_df, output.ensemble_network_file)

# Remove the output directory and everything inside it.
# The command is a Python f-string, so out_dir is interpolated into the shell command
# text before the shell sees it.
# NOTE(review): the path is not quoted; an out_dir containing spaces or shell
# metacharacters would not be handled correctly - confirm this cannot occur.
rule clean:
shell: f'rm -rf {out_dir}'
6 changes: 5 additions & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ container_registry:
algorithms:
- name: "pathlinker"
params:
include: false
include: true
run1:
k: range(100,201,100)

Expand Down Expand Up @@ -134,7 +134,11 @@ analysis:
include: true
# Machine learning analysis (e.g. clustering) of the pathway output files for each dataset
ml:
# ml analysis per dataset
include: true
# also run the ml analysis separately on the outputs of each algorithm
# only applies to algorithms that are run with multiple parameter combinations
aggregate_per_algorithm: true
# specify how many principal components to calculate
components: 2
# boolean to show the labels on the pca graph
Expand Down
5 changes: 5 additions & 0 deletions spras/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,8 @@ def process_config(self, raw_config):
self.analysis_include_graphspace = raw_config["analysis"]["graphspace"]["include"]
self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"]
self.analysis_include_ml = raw_config["analysis"]["ml"]["include"]

if 'aggregate_per_algorithm' not in self.ml_params:
self.analysis_include_ml_aggregate_algo = False
else:
self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"]
3 changes: 0 additions & 3 deletions test/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,3 @@ def test_config_container_registry(self):
test_config["container_registry"]["owner"] = ""
config.init_global(test_config)
assert (config.config.container_prefix == config.DEFAULT_CONTAINER_PREFIX)



0 comments on commit ad4da94

Please sign in to comment.