Skip to content

Commit

Permalink
second pass of implementing evaluation code, added dataset-goldstanda…
Browse files Browse the repository at this point in the history
…rd pairs
  • Loading branch information
ntalluri committed Jul 17, 2024
1 parent 99691a9 commit 2c6d1a5
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 13 deletions.
25 changes: 18 additions & 7 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@ algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, par
dataset_labels = list(_config.config.datasets.keys())
gold_standard_labels = list(_config.config.gold_standard.keys())

# TODO: create something that will be gs to dataset pairing
dataset_gs_pairs_tuples = [(gs_values['label'], dataset) for gs_values in _config.config.gold_standard.values() for dataset in gs_values['datasets']]
# am I able to send tuples around?
dataset_gs_pairs_formatted = [f"{dataset}-{gs_values['label']}" for gs_values in _config.config.gold_standard.values() for dataset in gs_values['datasets']]
# preformatting makes the pairs easier to pass around, but requires extra helper functions to parse them back apart

# Get algorithms that are running multiple parameter combinations
def algo_has_mult_param_combos(algo):
Expand Down Expand Up @@ -107,8 +110,7 @@ def make_final_input(wildcards):
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))

if _config.config.analysis_include_evalution:
# TODO: update to using gs to specific dataset pairing
final_input.extend(expand('{out_dir}{sep}{dataset}-{gold_standard}-evaluation.txt',out_dir=out_dir, sep=SEP,dataset=dataset_labels,gold_standard=gold_standard_labels, algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gs_pairs_formatted,algorithm_params=algorithms_with_params))

if len(final_input) == 0:
# No analysis added yet, so add reconstruction output files if they exist.
Expand Down Expand Up @@ -343,12 +345,21 @@ rule ml_analysis_aggregate_algo:
ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)
ml.ensemble_network(summary_df, output.ensemble_network_file)

# Helper input functions that map a formatted '{dataset}-{gold_standard}' pair back to its components
def get_gs_pickle_file(wildcards):
    """Snakemake input function: map a '{dataset}-{gold_standard}' wildcard to the
    merged gold-standard pickle path for that pair.

    Assumes the dataset label contains no '-' (the first '-' is the separator);
    splitting with maxsplit=1 keeps gold-standard labels that themselves contain
    hyphens (e.g. 'gs-1') intact, unlike a full split that would truncate them.
    """
    # Split only on the first '-' so everything after it is the gold-standard label.
    parts = wildcards.dataset_gs_pairs_formatted.split('-', 1)
    gs = parts[1]
    return SEP.join([out_dir, f'{gs}-merged.pickle'])

def get_dataset_label(wildcards):
    """Snakemake input function: extract the dataset label from a
    '{dataset}-{gold_standard}' formatted wildcard.

    Assumes the dataset label contains no '-' — TODO confirm against the
    dataset labels allowed in the config.
    """
    # partition stops at the first '-', which delimits dataset from gold standard
    dataset_label, _sep, _gs = wildcards.dataset_gs_pairs_formatted.partition('-')
    return dataset_label

rule evaluation:
input:
gs_file = SEP.join([out_dir,'{gold_standard}-merged.pickle']),
pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params)
output: eval_file = SEP.join([out_dir, "{dataset}-{gold_standard}-evaluation.txt"])
gs_file = get_gs_pickle_file,
pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label),
output: eval_file = SEP.join([out_dir, "{dataset_gs_pairs_formatted}-evaluation.txt"])
run:
node_table = Evaluation.from_file(input.gs_file).node_table
Evaluation.precision(input.pathways, node_table, output.eval_file)
Expand Down
11 changes: 8 additions & 3 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,16 @@ datasets:

gold_standard:
-
label: gs1
node_files: ["gs_nodes.txt"]
label: gs0
node_files: ["gs_nodes0.txt"]
# edge_files: [] TODO: later iteration
data_dir: "input"
# TODO: dataset: []
datasets: ["data0"]
-
label: gs1
node_files: ["gs_nodes1.txt"]
data_dir: "input"
datasets: ["data1", "data0"]

# If we want to reconstruct then we should set run to true.
# TODO: if include is true above but run is false here, algs are not run.
Expand Down
File renamed without changes.
1 change: 1 addition & 0 deletions input/gs_nodes1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
C
8 changes: 5 additions & 3 deletions spras/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def __init__(self, gold_standard_dict):
self.node_table = None
# self.edge_table = None TODO: later iteration
self.load_files_from_dict(gold_standard_dict)
# TODO add a self.dataset_something = None
self.datasets = None
return

def to_file(self, file_name):
Expand All @@ -38,8 +38,8 @@ def from_file(cls, file_name):
def load_files_from_dict(self, gold_standard_dict):

self.label = gold_standard_dict["label"]
# TODO: set self.datasets
self.datasets = gold_standard_dict["datasets"]

node_data_files = gold_standard_dict["node_files"][0] # TODO: single file for now
data_loc = gold_standard_dict["data_dir"]

Expand All @@ -50,6 +50,8 @@ def load_files_from_dict(self, gold_standard_dict):
# TODO: are we allowing multiple node files or single in node_files for gs
# if yes, a for loop is needed

# TODO: later iteration - chose between node and edge file, or allow both

def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str):

y_true = node_table['NODEID'].tolist()
Expand Down

0 comments on commit 2c6d1a5

Please sign in to comment.