Skip to content

Commit

Permalink
second pass of implementing evaluation code, added dataset-goldstanda…
Browse files Browse the repository at this point in the history
…rd pairs
  • Loading branch information
ntalluri committed Jul 17, 2024
1 parent 99691a9 commit 2c6d1a5
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 13 deletions.
25 changes: 18 additions & 7 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@ algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, par
dataset_labels = list(_config.config.datasets.keys())
gold_standard_labels = list(_config.config.gold_standard.keys())

# TODO: create something that will be gs to dataset pairing
dataset_gs_pairs_tuples = [(gs_values['label'], dataset) for gs_values in _config.config.gold_standard.values() for dataset in gs_values['datasets']]
# am I able to send tuples around?
dataset_gs_pairs_formatted = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standard.values() for dataset in gs_values['datasets']]
# preformatting makes the pairs easier to pass around, but requires extra helper functions to parse them back apart

# Get algorithms that are running multiple parameter combinations
def algo_has_mult_param_combos(algo):
Expand Down Expand Up @@ -107,8 +110,7 @@ def make_final_input(wildcards):
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))

if _config.config.analysis_include_evalution:
# TODO: update to using gs to specific dataset pairing
final_input.extend(expand('{out_dir}{sep}{dataset}-{gold_standard}-evaluation.txt',out_dir=out_dir, sep=SEP,dataset=dataset_labels,gold_standard=gold_standard_labels, algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gs_pairs_formatted,algorithm_params=algorithms_with_params))

if len(final_input) == 0:
# No analysis added yet, so add reconstruction output files if they exist.
Expand Down Expand Up @@ -343,12 +345,21 @@ rule ml_analysis_aggregate_algo:
ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)
ml.ensemble_network(summary_df, output.ensemble_network_file)

# Helper input functions that map a formatted '{dataset}-{gold_standard}' pair back to its components
def get_gs_pickle_file(wildcards):
    """Snakemake input function: map a '{dataset}-{gold_standard}' wildcard to the
    merged gold-standard pickle path for that pair.

    Assumes the dataset label contains no '-' (the first '-' is the separator);
    splitting with maxsplit=1 keeps gold-standard labels that themselves contain
    hyphens (e.g. 'gs-1') intact, unlike a full split that would truncate them.
    """
    # Split only on the first '-' so everything after it is the gold-standard label.
    parts = wildcards.dataset_gs_pairs_formatted.split('-', 1)
    gs = parts[1]
    return SEP.join([out_dir, f'{gs}-merged.pickle'])

def get_dataset_label(wildcards):
    """Snakemake input function: extract the dataset label from a
    '{dataset}-{gold_standard}' formatted wildcard.

    Assumes the dataset label contains no '-' — TODO confirm against the
    dataset labels allowed in the config.
    """
    # partition stops at the first '-', which delimits dataset from gold standard
    dataset_label, _sep, _gs = wildcards.dataset_gs_pairs_formatted.partition('-')
    return dataset_label

rule evaluation:
input:
gs_file = SEP.join([out_dir,'{gold_standard}-merged.pickle']),
pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params)
output: eval_file = SEP.join([out_dir, "{dataset}-{gold_standard}-evaluation.txt"])
gs_file = get_gs_pickle_file,
pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label),
output: eval_file = SEP.join([out_dir, "{dataset_gs_pairs_formatted}-evaluation.txt"])
run:
node_table = Evaluation.from_file(input.gs_file).node_table
Evaluation.precision(input.pathways, node_table, output.eval_file)
Expand Down
11 changes: 8 additions & 3 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,16 @@ datasets:

gold_standard:
-
label: gs1
node_files: ["gs_nodes.txt"]
label: gs0
node_files: ["gs_nodes0.txt"]
# edge_files: [] TODO: later iteration
data_dir: "input"
# TODO: dataset: []
datasets: ["data0"]
-
label: gs1
node_files: ["gs_nodes1.txt"]
data_dir: "input"
datasets: ["data1", "data0"]

# If we want to reconstruct then we should set run to true.
# TODO: if include is true above but run is false here, algs are not run.
Expand Down
File renamed without changes.
1 change: 1 addition & 0 deletions input/gs_nodes1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
C
8 changes: 5 additions & 3 deletions spras/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def __init__(self, gold_standard_dict):
self.node_table = None
# self.edge_table = None TODO: later iteration
self.load_files_from_dict(gold_standard_dict)
# TODO add a self.dataset_something = None
self.datasets = None
return

def to_file(self, file_name):
Expand All @@ -38,8 +38,8 @@ def from_file(cls, file_name):
def load_files_from_dict(self, gold_standard_dict):

self.label = gold_standard_dict["label"]
# TODO: set self.datasets
self.datasets = gold_standard_dict["datasets"]

node_data_files = gold_standard_dict["node_files"][0] # TODO: single file for now
data_loc = gold_standard_dict["data_dir"]

Expand All @@ -50,6 +50,8 @@ def load_files_from_dict(self, gold_standard_dict):
# TODO: are we allowing multiple node files or single in node_files for gs
# if yes, a for loop is needed

# TODO: later iteration - chose between node and edge file, or allow both

def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str):

y_true = node_table['NODEID'].tolist()
Expand Down

0 comments on commit 2c6d1a5

Please sign in to comment.