From 1872c4835ca9b1e68985249ae4316acb1b5febc9 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 28 Oct 2024 13:37:32 -0500
Subject: [PATCH 01/22] ideas for parameter tuning

---
 Snakefile            |  14 +++
 config/config.yaml   |   5 +
 parameter-tuning.py  | 288 +++++++++++++++++++++++++++++++++++++++++++
 spras/analysis/ml.py |   6 +-
 spras/evaluation.py  |   8 +-
 5 files changed, 319 insertions(+), 2 deletions(-)
 create mode 100644 parameter-tuning.py

diff --git a/Snakefile b/Snakefile
index 9d2debe3..65143cf3 100644
--- a/Snakefile
+++ b/Snakefile
@@ -381,6 +381,20 @@ rule evaluation:
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
         Evaluation.precision(input.pathways, node_table, output.eval_file)
+        # add recall
+        # Run "PR" curves for output pathays precision and recall 
+        # Run PR curves for ensemble files only 
+        # Run PCA "tuning" idea
+
+# parameter tuning section? 
+# does there need to be a seperate section for parameter tuning if evaluation will deal with it
+# PCA
+# - only one that isn't taken care of by the evaluation code directly, but can be added as something to look at in evaluation
+# no parameter tuning
+# - will use the outputs that can be put into evaluation
+# ensembling 
+# - will use the outputs that can be put into evaluation
+
 
 # Remove the output directory
 rule clean:
diff --git a/config/config.yaml b/config/config.yaml
index b87bcd45..4b473050 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -174,3 +174,8 @@ analysis:
         metric: 'euclidean'
       evaluation:
         include: true
+        # update to decouple the evaluation parts? 
+        # - ensemble vs all pathways vs pca chosen pathway
+        # pr curves from ensemble files 
+        # "pr" curves from all pathways
+        # p and r from pca chosen pathway
diff --git a/parameter-tuning.py b/parameter-tuning.py
new file mode 100644
index 00000000..0094e3c1
--- /dev/null
+++ b/parameter-tuning.py
@@ -0,0 +1,288 @@
+import glob
+import os
+import pickle as pkl
+from pathlib import Path
+from typing import Dict, Iterable
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from sklearn.metrics import (
+    PrecisionRecallDisplay,
+    average_precision_score,
+    precision_recall_curve,
+    precision_score,
+    recall_score,
+)
+
+from spras.analysis.ml import summarize_networks
+from spras.evaluation import Evaluation
+
+# make directories
+directories = ["parameter-tuning","parameter-tuning/ensembling-parameter-tuning", "parameter-tuning/no-parameter-tuning", "parameter-tuning/pca-parameter-tuning"]
+
+for directory in directories:
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+        print(f"Directory {directory} was created.")
+    else:
+        print(f"Directory {directory} already exists.")
+
+
+# #################################################################################################################################################
+# Parameter Tuning with Ensemble networks
+
+def select_max_freq_and_node(row):
+        max_freq = 0
+        node = ""
+        if pd.isna(row['Node2']) and pd.isna(row['Freq2']):
+            max_freq = row['Freq1']
+            node = row['Node1']
+        elif pd.isna(row['Node1']) and pd.isna(row['Freq1']):
+            max_freq = row['Freq2']
+            node = row['Node2']
+        else:
+            max_freq = max(row['Freq1'], row['Freq2'])
+            node = row['Node1']
+        return node, max_freq
+
+def precision_recall(file, node_table, node_freq_filename, output_file):
+    gold_standard_nodes = set(node_table['NODEID'])
+
+    df = pd.read_table(file, sep="\t", header=0)
+
+    node1_freq = df.drop(columns = ['Node2', 'Direction'])
+    node2_freq = df.drop(columns = ['Node1', 'Direction'])
+    max_node1_freq = node1_freq.groupby(['Node1']).max().reset_index()
+    max_node1_freq.rename(columns = {'Frequency': 'Freq1'}, inplace = True)
+    max_node2_freq = node2_freq.groupby(['Node2']).max().reset_index()
+    max_node2_freq.rename(columns = {'Frequency': 'Freq2'}, inplace = True)
+    node_df_merged = max_node1_freq.merge(max_node2_freq, left_on='Node1', right_on='Node2', how='outer')
+    node_df_merged[['Node', 'max_freq']] = node_df_merged.apply(select_max_freq_and_node, axis=1, result_type='expand')
+    node_df_merged.drop(columns = ['Node1', 'Node2', 'Freq1', 'Freq2'], inplace = True)
+
+    node_df_merged.sort_values('max_freq', ascending= False, inplace = True)
+    node_df_merged.to_csv(node_freq_filename, sep = "\t",header=True, index=False)
+
+    y_true = [1 if node in gold_standard_nodes else 0 for node in node_df_merged['Node']]
+    y_scores = node_df_merged['max_freq'].tolist()
+
+    # print(f"y_true:\n{y_true}")
+    # print(f"y_score:\n{y_scores}")
+
+    plt.figure()
+    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
+    # print(f"precision:{precision}\n recall:{recall}\n thresholds:{thresholds}\n")
+    auc_precision_recall = average_precision_score(y_true, y_scores)
+
+    plt.plot(recall, precision, marker='o', label='Precision-Recall curve')
+    plt.axhline(y=auc_precision_recall, color='r', linestyle='--', label=f'Avg Precision: {auc_precision_recall:.4f}')
+    plt.xlabel('Recall')
+    plt.ylabel('Precision')
+    plt.title('Precision-Recall Curve')
+    plt.legend()
+    plt.grid(True)
+    plt.savefig(output_filename)
+
+    # print(f"overlapping nodes: {len(set(node_df_merged['Node'].tolist()) & gold_standard_nodes)}")
+    # print(f"average_precision_score: {auc_precision_recall}")
+
+# TODO: fix mincostflow bug with summarize networks
+algorithms = ['mincostflow', 'omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'allpairs', 'domino']
+
+gold_standard_file = "output/gs_egfr-merged.pickle"
+node_table = Evaluation.from_file(gold_standard_file).node_table
+new_folder_path = 'parameter-tuning/ensembling-parameter-tuning/'
+
+for algo in algorithms:
+    ensemble_filename = f"output/tps_egfr-ml/{algo}-ensemble-pathway.txt"
+    node_freq_filename = f"{new_folder_path}{algo}-frequencies.txt"
+    output_filename = f"{new_folder_path}{algo}-pr.png"
+    try:
+        precision_recall(ensemble_filename, node_table, node_freq_filename, output_filename)
+    except Exception as error:
+        print(error)
+
+# code to work for MEO
+algorithms = ['meo']
+
+for algo in algorithms:
+    ensemble_filename = f"output/tps_egfr-ml/{algo}-ensemble-pathway.txt"
+    df = pd.read_table(ensemble_filename, sep="\t", header=0)
+    df['Node1'] = df['Node1'] + '_HUMAN'
+    df['Node2'] = df['Node2'] + '_HUMAN'
+    df['Node1'] = df['Node1'].replace({
+    'Ca++_HUMAN': 'Ca++_PSEUDONODE',
+    'PI3,4,5P3_HUMAN': 'PI3,4,5P3_PSEUDONODE',
+    'DAG_HUMAN': 'DAG_PSEUDONODE'
+    })
+    df['Node2'] = df['Node2'].replace({
+    'Ca++_HUMAN': 'Ca++_PSEUDONODE',
+    'PI3,4,5P3_HUMAN': 'PI3,4,5P3_PSEUDONODE',
+    'DAG_HUMAN': 'DAG_PSEUDONODE'
+    })
+
+    updated_ensemble_filename = f"{new_folder_path}meo-ensemble-pathway-updated.txt"
+    df.to_csv(updated_ensemble_filename, sep="\t", header=True, index=False)
+    node_freq_filename = f"{new_folder_path}{algo}-frequencies.txt"
+    output_filename = f"{new_folder_path}{algo}-pr.png"
+    try:
+        precision_recall(updated_ensemble_filename, node_table, node_freq_filename, output_filename)
+    except Exception as error:
+        print(error)
+
+
+#################################################################################################################################################
+# No Parameter Tuning
+
+def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str):
+    """
+    Takes in file paths for a specific dataset and an associated gold standard node table.
+    Calculates precision and recall for each pathway file
+    Writes the results to output_file and returns them as a DataFrame
+    @param file_paths: file paths of pathway reconstruction algorithm outputs
+    @param node_table: the gold standard nodes
+    @param output_file: the filename to save the precision and recall of each pathway
+    """
+    y_true = set(node_table['NODEID'])
+    results = []
+
+    for file in file_paths:
+        df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"])
+        y_pred = set(df['Node1']).union(set(df['Node2']))
+        all_nodes = y_true.union(y_pred)
+        y_true_binary = [1 if node in y_true else 0 for node in all_nodes]
+        y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes]
+
+        # default to 0.0 if there is a divide by 0 error
+        precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0)
+        recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0)
+        results.append({"Pathway": file, "Precision": precision, "Recall": recall})
+
+    pr_df = pd.DataFrame(results)
+    pr_df.sort_values(by=["Recall"], axis=0, ascending=True, inplace=True)
+    pr_df.to_csv(output_file, sep="\t", index=False)
+    return pr_df
+
+
+algorithms = ['mincostflow', 'omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'allpairs', 'domino']
+
+gold_standard_file = "output/gs_egfr-merged.pickle"
+node_table = Evaluation.from_file(gold_standard_file).node_table
+folder_path = 'output/'
+new_folder_path = 'parameter-tuning/no-parameter-tuning/'
+
+for algo in algorithms:
+    file_pattern = os.path.join(folder_path, f"tps_egfr-{algo}-*", "pathway.txt")
+    files = glob.glob(file_pattern)
+    output_file = f"{new_folder_path}{algo}-precision-and-recall.txt"
+    prcurve_filename = f"{new_folder_path}{algo}-precision-and-recall-curve.png"
+
+    pr_df = precision_and_recall(file_paths=files, node_table=node_table, output_file=output_file)
+
+    plt.figure(figsize=(8, 6))
+    plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="PR")
+    plt.xlabel("Recall")
+    plt.ylabel("Precision")
+    plt.title(f"{algo} Precision-Recall Curve")
+    plt.legend()
+    plt.grid(True)
+    plt.savefig(prcurve_filename)
+
+
+# code to work for MEO
+def precision_and_recall_meo(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str):
+    """
+    Takes in file paths for a specific dataset and an associated gold standard node table.
+    Calculates precision and recall for each pathway file
+    Writes the results to output_file and returns them as a DataFrame
+    @param file_paths: file paths of pathway reconstruction algorithm outputs
+    @param node_table: the gold standard nodes
+    @param output_file: the filename to save the precision and recall of each pathway
+    """
+    y_true = set(node_table['NODEID'])
+    results = []
+
+    for file in file_paths:
+        df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"])
+        df['Node1'] = df['Node1'] + '_HUMAN'
+        df['Node2'] = df['Node2'] + '_HUMAN'
+        df['Node1'] = df['Node1'].replace({
+        'Ca++_HUMAN': 'Ca++_PSEUDONODE',
+        'PI3,4,5P3_HUMAN': 'PI3,4,5P3_PSEUDONODE',
+        'DAG_HUMAN': 'DAG_PSEUDONODE'
+        })
+        df['Node2'] = df['Node2'].replace({
+        'Ca++_HUMAN': 'Ca++_PSEUDONODE',
+        'PI3,4,5P3_HUMAN': 'PI3,4,5P3_PSEUDONODE',
+        'DAG_HUMAN': 'DAG_PSEUDONODE'
+        })
+
+        y_pred = set(df['Node1']).union(set(df['Node2']))
+        all_nodes = y_true.union(y_pred)
+        y_true_binary = [1 if node in y_true else 0 for node in all_nodes]
+        y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes]
+
+        # default to 0.0 if there is a divide by 0 error
+        precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0)
+        recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0)
+        results.append({"Pathway": file, "Precision": precision, "Recall": recall})
+
+    pr_df = pd.DataFrame(results)
+    pr_df.sort_values(by=["Recall"], axis=0, ascending=True, inplace=True)
+    pr_df.to_csv(output_file, sep="\t", index=False)
+    return pr_df
+
+algorithms = ['meo']
+
+for algo in algorithms:
+
+    file_pattern = os.path.join(folder_path, f"tps_egfr-{algo}-*", "pathway.txt")
+    files = glob.glob(file_pattern)
+    output_file = f"{new_folder_path}{algo}-precision-and-recall.txt"
+    prcurve_filename = f"{new_folder_path}{algo}-precision-and-recall-curve.png"
+
+    pr_df = precision_and_recall_meo(file_paths=files, node_table=node_table, output_file=output_file)
+
+    plt.figure(figsize=(8, 6))
+    plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="PR")
+    plt.xlabel("Recall")
+    plt.ylabel("Precision")
+    plt.title(f"{algo} Precision-Recall Curve")
+    plt.legend()
+    plt.grid(True)
+    plt.savefig(prcurve_filename)
+
+#################################################################################################################################################
+# PCA parameter tuning
+
+algorithms = ['omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'domino', 'meo', 'allpairs']
+folder_path = 'output/'
+gold_standard_file = "output/gs_egfr-merged.pickle"
+node_table = Evaluation.from_file(gold_standard_file).node_table
+new_folder_path = 'parameter-tuning/pca-parameter-tuning/'
+
+for algo in algorithms:
+    file_path = os.path.join(folder_path, f"tps_egfr-ml", f"{algo}-pca-coordinates.txt")
+    try:
+        coord_df = pd.read_csv(file_path, delimiter="\t", header=0)
+    except Exception as error:
+        print(f"PCA parameter tuning: {error}")
+        continue
+
+    # centroid 
+    centroid_row = coord_df[coord_df['algorithm'] == 'centroid']
+    centroid = centroid_row.iloc[0, 1:].tolist()
+
+    # update df to exclude centroid point
+    coord_df = coord_df[coord_df['algorithm'] != 'centroid']
+
+    # euclidean distance
+    pc_columns = [col for col in coord_df.columns if col.startswith('PC')]
+    coord_df['Distance To Centroid'] = np.sqrt(sum((coord_df[pc] - centroid[i]) ** 2 for i, pc in enumerate(pc_columns)))
+    closest_to_centroid = coord_df.sort_values(by='Distance To Centroid').iloc[0]
+    
+    # finding the rep pathway
+    rep_pathway = [os.path.join(folder_path, f"{closest_to_centroid['algorithm']}", "pathway.txt")]
+    output_file = f"{new_folder_path}{algo}-precision-and-recall.txt"
+    precision_and_recall(rep_pathway, node_table, output_file)
diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py
index 3dad8775..9fdc6e03 100644
--- a/spras/analysis/ml.py
+++ b/spras/analysis/ml.py
@@ -146,6 +146,8 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord:
     scaler.fit(X)  # calc mean and standard deviation
     X_scaled = scaler.transform(X)
 
+    # TODO: add in centroid code from other branch
+
     # choosing the PCA
     pca_instance = PCA(n_components=components)
     pca_instance.fit(X_scaled)
@@ -163,9 +165,11 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord:
     # saving the coordinates of each algorithm
     make_required_dirs(output_coord)
     coordinates_df = pd.DataFrame(X_pca, columns=['PC' + str(i) for i in range(1, components+1)])
-    coordinates_df.insert(0, 'algorithm', columns.tolist())
+    coordinates_df.insert(0, 'algorithm', columns.tolist()) # update the algortihms to somehting else (datapoints labels?)
     coordinates_df.to_csv(output_coord, sep='\t', index=False)
 
+    # TODO: do we want a seperate file for the centroid, or add it to the end of the coordinates_df df as a new datapoint
+
     # saving the principal components
     make_required_dirs(output_var)
     with open(output_var, "w") as f:
diff --git a/spras/evaluation.py b/spras/evaluation.py
index 5d00e7d4..344e06a5 100644
--- a/spras/evaluation.py
+++ b/spras/evaluation.py
@@ -71,7 +71,7 @@ def load_files_from_dict(self, gold_standard_dict: Dict):
 
         # TODO: later iteration - chose between node and edge file, or allow both
 
-    @staticmethod
+    @staticmethod # TODO update to do precision and recall in the same function for the nodes
     def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str):
         """
         Takes in file paths for a specific dataset and an associated gold standard node table.
@@ -98,3 +98,9 @@ def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file:
 
         precision_df = pd.DataFrame(results)
         precision_df.to_csv(output_file, sep="\t", index=False)
+
+    # TODO make PR curves for the nodes from ensembled files outputs
+    # TODO make the edge frequency node ensembles 
+
+    # TODO PCA chosen pathway, will need to use precision and recall code for the nodes of the chosen pathway
+    
\ No newline at end of file

From 7e0a990d98ec77c3be8b28f140f816a17154f71e Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Tue, 29 Oct 2024 10:59:17 -0500
Subject: [PATCH 02/22] update to ml code for centroid, update to eval code to
 have recall

---
 Snakefile            |  5 +++--
 spras/analysis/ml.py | 20 ++++++++++++++++----
 spras/evaluation.py  | 23 +++++++++++++++++------
 3 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/Snakefile b/Snakefile
index 65143cf3..cc39780b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -373,18 +373,19 @@ def get_dataset_label(wildcards):
     return dataset
 
 # Run evaluation code for a specific dataset's pathway outputs against its paired gold standard
-rule evaluation:
+rule evaluation: # update to be per algorithm and for all algortihms
     input: 
         gold_standard_file = get_gold_standard_pickle_file,
         pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label),
     output: eval_file = SEP.join([out_dir, "{dataset_gold_standard_pairs}-evaluation.txt"])
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
-        Evaluation.precision(input.pathways, node_table, output.eval_file)
+        Evaluation.precision_and_recall(input.pathways, node_table, output.eval_file)
         # add recall
         # Run "PR" curves for output pathays precision and recall 
         # Run PR curves for ensemble files only 
         # Run PCA "tuning" idea
+        # - will either need to read file from ml_analysis or rerun pca rule
 
 # parameter tuning section? 
 # does there need to be a seperate section for parameter tuning if evaluation will deal with it
diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py
index 9fdc6e03..1585db89 100644
--- a/spras/analysis/ml.py
+++ b/spras/analysis/ml.py
@@ -146,7 +146,6 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord:
     scaler.fit(X)  # calc mean and standard deviation
     X_scaled = scaler.transform(X)
 
-    # TODO: add in centroid code from other branch
 
     # choosing the PCA
     pca_instance = PCA(n_components=components)
@@ -154,21 +153,34 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord:
     X_pca = pca_instance.transform(X_scaled)
     variance = pca_instance.explained_variance_ratio_ * 100
 
+    # TODO: add in centroid code from other branch
+    # calculating the centroid
+    centroid = np.mean(X_pca, axis=0) # mean of each principal component across all samples
+
     # making the plot
     label_color_map = create_palette(column_names)
     plt.figure(figsize=(10, 7))
-    sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], s=70, hue=column_names, legend=True, palette=label_color_map)
+    sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], s=70, hue=column_names, palette=label_color_map)
+    plt.scatter(centroid[0], centroid[1], color='red', marker='X', s=100, label='Centroid')
     plt.title("PCA")
+    plt.legend()
     plt.xlabel(f"PC1 ({variance[0]:.1f}% variance)")
     plt.ylabel(f"PC2 ({variance[1]:.1f}% variance)")
 
+    # saving the coordinates of each algorithm
+    # make_required_dirs(output_coord)
+    # coordinates_df = pd.DataFrame(X_pca, columns=['PC' + str(i) for i in range(1, components+1)])
+    # coordinates_df.insert(0, 'algorithm', columns.tolist())
+    # coordinates_df.to_csv(output_coord, sep='\t', index=False)
+
     # saving the coordinates of each algorithm
     make_required_dirs(output_coord)
     coordinates_df = pd.DataFrame(X_pca, columns=['PC' + str(i) for i in range(1, components+1)])
-    coordinates_df.insert(0, 'algorithm', columns.tolist()) # update the algortihms to somehting else (datapoints labels?)
+    coordinates_df.insert(0, 'datapoint_labels', columns.tolist())
+    centroid_row = ['centroid'] + centroid.tolist() # TODO: do we want a seperate file for the centroid, or add it to the end of the coordinates_df df as a new datapoint
+    coordinates_df.loc[len(coordinates_df)] = centroid_row
     coordinates_df.to_csv(output_coord, sep='\t', index=False)
 
-    # TODO: do we want a seperate file for the centroid, or add it to the end of the coordinates_df df as a new datapoint
 
     # saving the principal components
     make_required_dirs(output_var)
diff --git a/spras/evaluation.py b/spras/evaluation.py
index 344e06a5..1330ee2b 100644
--- a/spras/evaluation.py
+++ b/spras/evaluation.py
@@ -4,7 +4,7 @@
 from typing import Dict, Iterable
 
 import pandas as pd
-from sklearn.metrics import precision_score
+from sklearn.metrics import precision_score, recall_score
 
 
 class Evaluation:
@@ -71,11 +71,11 @@ def load_files_from_dict(self, gold_standard_dict: Dict):
 
         # TODO: later iteration - chose between node and edge file, or allow both
 
-    @staticmethod # TODO update to do precision and recall in the same function for the nodes
-    def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str):
+    @staticmethod
+    def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str):
         """
         Takes in file paths for a specific dataset and an associated gold standard node table.
-        Calculates precision for each pathway file
+        Calculates precision and recall for each pathway file
         Returns output back to output_file
         @param file_paths: file paths of pathway reconstruction algorithm outputs
         @param node_table: the gold standard nodes
@@ -93,14 +93,25 @@ def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file:
 
             # default to 0.0 if there is a divide by 0 error
             precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0)
-
-            results.append({"Pathway": file, "Precision": precision})
+            recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0)
+            results.append({"Pathway": file, "Precision": precision, "Recall": recall})
 
         precision_df = pd.DataFrame(results)
         precision_df.to_csv(output_file, sep="\t", index=False)
 
+        # TODO make "PR" curves from the precision_and_recall file
+
+    def edge_frequency_nodes(ensemble_file: str, node_table:pd.DataFrame, output_file: str, output_png: str):
+        None
+        # create one per ensemble file 
+    
+    def pr_curves ():
+        None
+
     # TODO make PR curves for the nodes from ensembled files outputs
     # TODO make the edge frequency node ensembles 
 
+    def pca_chosen_pathway():
+        None
     # TODO PCA chosen pathway, will need to use precision and recall code for the nodes of the chosen pathway
     
\ No newline at end of file

From d5d0461598eb4c2d2f7e2fe0a9b27227794cea67 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Fri, 1 Nov 2024 12:52:05 -0500
Subject: [PATCH 03/22] integrated all the code

---
 Snakefile                     |  103 +-
 config/egfr-param-tuning.yaml | 3459 +++++++++++++++++++++++++++++++++
 input/gs-egfr.txt             |  324 +++
 spras/evaluation.py           |  108 +-
 4 files changed, 3959 insertions(+), 35 deletions(-)
 create mode 100644 config/egfr-param-tuning.yaml
 create mode 100644 input/gs-egfr.txt

diff --git a/Snakefile b/Snakefile
index cc39780b..bfd09d35 100644
--- a/Snakefile
+++ b/Snakefile
@@ -42,7 +42,6 @@ def algo_has_mult_param_combos(algo):
     return len(algorithm_params.get(algo, {})) > 1
 
 algorithms_mult_param_combos = [algo for algo in algorithms if algo_has_mult_param_combos(algo)]
-
 # Get the parameter dictionary for the specified
 # algorithm and parameter combination hash
 def reconstruction_params(algorithm, params_hash):
@@ -105,8 +104,18 @@ def make_final_input(wildcards):
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms))
 
     if _config.config.analysis_include_evaluation:
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
-    
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curves-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-plot.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-pca-chosen.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-pca-chosen.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
+        
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-plot.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-pca-chosen.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-pca-chosen.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-pr-curves-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
+
     if len(final_input) == 0:
         # No analysis added yet, so add reconstruction output files if they exist.
         # (if analysis is specified, these should be implicitly run).
@@ -372,29 +381,81 @@ def get_dataset_label(wildcards):
     dataset = parts[0]
     return dataset
 
-# Run evaluation code for a specific dataset's pathway outputs against its paired gold standard
-rule evaluation: # update to be per algorithm and for all algortihms
+
+# Run evaluation for all pathway outputs and ensemble.txt for a dataset against its paired gold standard
+rule evaluation:
     input: 
         gold_standard_file = get_gold_standard_pickle_file,
         pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label),
-    output: eval_file = SEP.join([out_dir, "{dataset_gold_standard_pairs}-evaluation.txt"])
+        ensemble_file=lambda wildcards: f"{out_dir}{SEP}{get_dataset_label(wildcards)}-ml{SEP}ensemble-pathway.txt",
+        # add PCA coordinates file 
+        pca_coordinates_file =lambda wildcards: f"{out_dir}{SEP}{get_dataset_label(wildcards)}-ml{SEP}pca-coordinates.txt"
+    output: 
+        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall.txt"]),
+        pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'pr-curves-ensemble-nodes.png']),
+        pr_plot_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-and-recall-plot.png']),
+        # add pca png and file that is needed by Evaluation.precision_and_recall
+        pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall-pca-chosen.txt"]),
+        pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall-pca-chosen.png"]),
+    run:
+        node_table = Evaluation.from_file(input.gold_standard_file).node_table
+        Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_plot_png)
+        node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file, node_table)
+        Evaluation.pr_curves_ensemble_nodes(node_ensemble, node_table, output.pr_curve_png)
+        pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir)
+        Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file, output.pca_chosen_pr_png)
+
+        
+
+# Run evaluation per algorithm for all associated pathway outputs and ensemble.txt for a dataset against its paired gold standard
+
+def collect_pathways_per_algo_per_dataset(wildcards):
+    dataset_label = get_dataset_label(wildcards)
+    filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param]
+    return expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params, dataset_label= dataset_label)
+
+def collect_ensemble_per_algo_per_dataset(wildcards):
+    dataset_label = get_dataset_label(wildcards)
+    print(dataset_label)
+    return f"{out_dir}{SEP}{dataset_label}-ml{SEP}{wildcards.algorithm}-ensemble-pathway.txt"
+
+def collect_pca_coordinates_per_algo_per_dataset(wildcards):
+    dataset_label = get_dataset_label(wildcards)
+    return f"{out_dir}{SEP}{dataset_label}-ml{SEP}{wildcards.algorithm}-pca-coordinates.txt"
+
+rule evaluation_per_algo_pathways:
+    input: 
+        gold_standard_file = get_gold_standard_pickle_file,
+        pathways =  collect_pathways_per_algo_per_dataset,
+    output: 
+        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-and-recall.txt"]), # these all need to be updated to use the algortihm in it
+        pr_plot_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-and-recall-plot.png']),
+    run:
+        node_table = Evaluation.from_file(input.gold_standard_file).node_table
+        Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_plot_png)
+
+rule evaluation_per_algo_ensemble_pr_curve:
+    input: 
+        gold_standard_file = get_gold_standard_pickle_file,
+        ensemble_file = collect_ensemble_per_algo_per_dataset,
+    output: 
+        pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-pr-curves-ensemble-nodes.png']),
+    run:
+        node_table = Evaluation.from_file(input.gold_standard_file).node_table
+        node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file, node_table)
+        Evaluation.pr_curves_ensemble_nodes(node_ensemble, node_table, output.pr_curve_png)
+
+rule evaluation_per_algo_pca_chosen:
+    input: 
+        gold_standard_file = get_gold_standard_pickle_file,
+        pca_coordinates_file = collect_pca_coordinates_per_algo_per_dataset
+    output: 
+        pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-and-recall-pca-chosen.txt"]),
+        pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-and-recall-pca-chosen.png"]),
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
-        Evaluation.precision_and_recall(input.pathways, node_table, output.eval_file)
-        # add recall
-        # Run "PR" curves for output pathays precision and recall 
-        # Run PR curves for ensemble files only 
-        # Run PCA "tuning" idea
-        # - will either need to read file from ml_analysis or rerun pca rule
-
-# parameter tuning section? 
-# does there need to be a seperate section for parameter tuning if evaluation will deal with it
-# PCA
-# - only one that isn't taken care of by the evaluation code directly, but can be added as something to look at in evaluation
-# no parameter tuning
-# - will use the outputs that can be put into evaluation
-# ensembling 
-# - will use the outputs that can be put into evaluation
+        pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir)
+        Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file, output.pca_chosen_pr_png)
 
 
 # Remove the output directory
diff --git a/config/egfr-param-tuning.yaml b/config/egfr-param-tuning.yaml
new file mode 100644
index 00000000..c219a471
--- /dev/null
+++ b/config/egfr-param-tuning.yaml
@@ -0,0 +1,3459 @@
+hash_length: 7
+container_framework: docker
+unpack_singularity: false
+container_registry:
+  base_url: docker.io
+  owner: reedcompbio
+algorithms:
+  - name: omicsintegrator2
+    params:
+      include: true
+      run1:
+        b:
+          - 1
+        g:
+          - 3
+        w:
+          - 5
+      run2:
+        b:
+          - 10
+        g:
+          - 7
+        w:
+          - 6
+      run3:
+        b:
+          - 1
+        g:
+          - 5
+        w:
+          - 8
+      run4:
+        b:
+          - 9
+        g:
+          - 7
+        w:
+          - 8
+      run5:
+        b:
+          - 7
+        g:
+          - 7
+        w:
+          - 10
+      run6:
+        b:
+          - 6
+        g:
+          - 7
+        w:
+          - 4
+      run7:
+        b:
+          - 9
+        g:
+          - 7
+        w:
+          - 3
+      run8:
+        b:
+          - 1
+        g:
+          - 6
+        w:
+          - 2
+      run9:
+        b:
+          - 9
+        g:
+          - 7
+        w:
+          - 4
+      run10:
+        b:
+          - 3
+        g:
+          - 6
+        w:
+          - 3
+      run11:
+        b:
+          - 4
+        g:
+          - 6
+        w:
+          - 2
+      run12:
+        b:
+          - 6
+        g:
+          - 7
+        w:
+          - 6
+      run13:
+        b:
+          - 5
+        g:
+          - 7
+        w:
+          - 5
+      run14:
+        b:
+          - 7
+        g:
+          - 6
+        w:
+          - 1
+      run15:
+        b:
+          - 1
+        g:
+          - 4
+        w:
+          - 8
+      run16:
+        b:
+          - 1
+        g:
+          - 4
+        w:
+          - 1
+      run17:
+        b:
+          - 1
+        g:
+          - 3
+        w:
+          - 3
+      run18:
+        b:
+          - 2
+        g:
+          - 6
+        w:
+          - 2
+      run19:
+        b:
+          - 1
+        g:
+          - 6
+        w:
+          - 7
+      run20:
+        b:
+          - 1
+        g:
+          - 4
+        w:
+          - 4
+      run21:
+        b:
+          - 8
+        g:
+          - 7
+        w:
+          - 9
+      run22:
+        b:
+          - 1
+        g:
+          - 2
+        w:
+          - 4
+      run23:
+        b:
+          - 1
+        g:
+          - 6
+        w:
+          - 3
+      run24:
+        b:
+          - 6
+        g:
+          - 7
+        w:
+          - 9
+      run25:
+        b:
+          - 10
+        g:
+          - 7
+        w:
+          - 8
+      run26:
+        b:
+          - 5
+        g:
+          - 5
+        w:
+          - 1
+      run27:
+        b:
+          - 9
+        g:
+          - 7
+        w:
+          - 7
+      run28:
+        b:
+          - 7
+        g:
+          - 7
+        w:
+          - 4
+      run29:
+        b:
+          - 1
+        g:
+          - 3
+        w:
+          - 9
+      run30:
+        b:
+          - 8
+        g:
+          - 7
+        w:
+          - 4
+      run31:
+        b:
+          - 10
+        g:
+          - 7
+        w:
+          - 5
+      run32:
+        b:
+          - 7
+        g:
+          - 7
+        w:
+          - 9
+      run33:
+        b:
+          - 4
+        g:
+          - 6
+        w:
+          - 1
+      run34:
+        b:
+          - 9
+        g:
+          - 6
+        w:
+          - 2
+      run35:
+        b:
+          - 8
+        g:
+          - 6
+        w:
+          - 2
+      run36:
+        b:
+          - 8
+        g:
+          - 7
+        w:
+          - 10
+      run37:
+        b:
+          - 7
+        g:
+          - 7
+        w:
+          - 8
+      run38:
+        b:
+          - 2
+        g:
+          - 6
+        w:
+          - 10
+      run39:
+        b:
+          - 6
+        g:
+          - 7
+        w:
+          - 10
+      run40:
+        b:
+          - 1
+        g:
+          - 5
+        w:
+          - 4
+      run41:
+        b:
+          - 8
+        g:
+          - 7
+        w:
+          - 5
+      run42:
+        b:
+          - 1
+        g:
+          - 3
+        w:
+          - 1
+      run43:
+        b:
+          - 1
+        g:
+          - 5
+        w:
+          - 1
+      run44:
+        b:
+          - 3
+        g:
+          - 6
+        w:
+          - 2
+      run45:
+        b:
+          - 3
+        g:
+          - 6
+        w:
+          - 4
+      run46:
+        b:
+          - 6
+        g:
+          - 7
+        w:
+          - 7
+      run47:
+        b:
+          - 6
+        g:
+          - 5
+        w:
+          - 1
+      run48:
+        b:
+          - 10
+        g:
+          - 5
+        w:
+          - 1
+      run49:
+        b:
+          - 1
+        g:
+          - 2
+        w:
+          - 2
+      run50:
+        b:
+          - 2
+        g:
+          - 6
+        w:
+          - 6
+      run51:
+        b:
+          - 10
+        g:
+          - 7
+        w:
+          - 10
+      run52:
+        b:
+          - 10
+        g:
+          - 7
+        w:
+          - 9
+      run53:
+        b:
+          - 8
+        g:
+          - 5
+        w:
+          - 1
+      run54:
+        b:
+          - 6
+        g:
+          - 6
+        w:
+          - 2
+      run55:
+        b:
+          - 6
+        g:
+          - 7
+        w:
+          - 5
+      run56:
+        b:
+          - 2
+        g:
+          - 5
+        w:
+          - 1
+      run57:
+        b:
+          - 2
+        g:
+          - 6
+        w:
+          - 5
+      run58:
+        b:
+          - 9
+        g:
+          - 7
+        w:
+          - 10
+      run59:
+        b:
+          - 7
+        g:
+          - 7
+        w:
+          - 6
+      run60:
+        b:
+          - 5
+        g:
+          - 6
+        w:
+          - 1
+      run61:
+        b:
+          - 4
+        g:
+          - 5
+        w:
+          - 1
+      run62:
+        b:
+          - 8
+        g:
+          - 7
+        w:
+          - 8
+      run63:
+        b:
+          - 10
+        g:
+          - 6
+        w:
+          - 2
+      run64:
+        b:
+          - 4
+        g:
+          - 6
+        w:
+          - 3
+      run65:
+        b:
+          - 7
+        g:
+          - 6
+        w:
+          - 2
+      run66:
+        b:
+          - 2
+        g:
+          - 6
+        w:
+          - 3
+      run67:
+        b:
+          - 2
+        g:
+          - 6
+        w:
+          - 1
+      run68:
+        b:
+          - 5
+        g:
+          - 6
+        w:
+          - 2
+      run69:
+        b:
+          - 8
+        g:
+          - 7
+        w:
+          - 6
+      run70:
+        b:
+          - 10
+        g:
+          - 7
+        w:
+          - 7
+      run71:
+        b:
+          - 1
+        g:
+          - 5
+        w:
+          - 6
+      run72:
+        b:
+          - 1
+        g:
+          - 5
+        w:
+          - 7
+      run73:
+        b:
+          - 2
+        g:
+          - 6
+        w:
+          - 4
+  - name: domino
+    params:
+      include: true
+      run1:
+        module_threshold:
+          - 0.001
+        slice_threshold:
+          - 0.1
+      run2:
+        module_threshold:
+          - 0.001
+        slice_threshold:
+          - 0.001
+      run3:
+        module_threshold:
+          - 0.02
+        slice_threshold:
+          - 0.1
+      run4:
+        module_threshold:
+          - 0.01
+        slice_threshold:
+          - 0.001
+      run5:
+        module_threshold:
+          - 0.01
+        slice_threshold:
+          - 0.1
+      run6:
+        module_threshold:
+          - 0.02
+        slice_threshold:
+          - 0.001
+      run7:
+        module_threshold:
+          - 0.001
+        slice_threshold:
+          - 0.9
+      run8:
+        module_threshold:
+          - 0.001
+        slice_threshold:
+          - 0.3
+      run9:
+        module_threshold:
+          - 0.001
+        slice_threshold:
+          - 1
+  - name: mincostflow
+    params:
+      include: false
+      run1:
+        capacity:
+          - 15
+        flow:
+          - 80
+      run2:
+        capacity:
+          - 1
+        flow:
+          - 6
+      run3:
+        capacity:
+          - 5
+        flow:
+          - 60
+      run4:
+        capacity:
+          - 1
+        flow:
+          - 8
+      run5:
+        capacity:
+          - 5
+        flow:
+          - 50
+      run6:
+        capacity:
+          - 10
+        flow:
+          - 150
+      run7:
+        capacity:
+          - 1
+        flow:
+          - 20
+      run8:
+        capacity:
+          - 5
+        flow:
+          - 150
+      run9:
+        capacity:
+          - 5
+        flow:
+          - 90
+      run10:
+        capacity:
+          - 5
+        flow:
+          - 70
+  - name: pathlinker
+    params:
+      include: true
+      run1:
+        k:
+          - 200
+      run2:
+        k:
+          - 10
+      run3:
+        k:
+          - 50
+      run4:
+        k:
+          - 30
+      run5:
+        k:
+          - 40
+      run6:
+        k:
+          - 500
+      run7:
+        k:
+          - 20
+      run8:
+        k:
+          - 60
+      run9:
+        k:
+          - 100
+  - name: allpairs
+    params:
+      include: true
+  - name: meo
+    params:
+      include: true
+      run1:
+        local_search:
+          - 'No'
+        max_path_length:
+          - 2
+        rand_restarts:
+          - 10
+  - name: omicsintegrator1
+    params:
+      include: true
+      run1:
+        b:
+          - 5
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 0.5
+      run2:
+        b:
+          - 2
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 8
+      run3:
+        b:
+          - 5
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 0.1
+      run4:
+        b:
+          - 2
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 0.1
+      run5:
+        b:
+          - 2
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.001
+        r:
+          - 0.1
+        w:
+          - 0.001
+      run6:
+        b:
+          - 2
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 0.5
+      run7:
+        b:
+          - 5
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 0.001
+      run8:
+        b:
+          - 5
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 0.1
+      run9:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 0.5
+      run10:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 8
+      run11:
+        b:
+          - 10
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run12:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 8
+      run13:
+        b:
+          - 2
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run14:
+        b:
+          - 5
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run15:
+        b:
+          - 10
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.001
+        r:
+          - 0.01
+        w:
+          - 0.1
+      run16:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 0.1
+      run17:
+        b:
+          - 5
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.01
+        w:
+          - 0.1
+      run18:
+        b:
+          - 2
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run19:
+        b:
+          - 5
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 0.001
+      run20:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.001
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run21:
+        b:
+          - 2
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 8
+      run22:
+        b:
+          - 5
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 0.5
+      run23:
+        b:
+          - 2
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run24:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 8
+      run25:
+        b:
+          - 10
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run26:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run27:
+        b:
+          - 10
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 0.1
+      run28:
+        b:
+          - 5
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run29:
+        b:
+          - 5
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 0.1
+      run30:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 0.1
+      run31:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run32:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run33:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 2
+      run34:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run35:
+        b:
+          - 10
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.01
+        w:
+          - 0.1
+      run36:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 2
+      run37:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 0.1
+      run38:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run39:
+        b:
+          - 5
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run40:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.001
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run41:
+        b:
+          - 2
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.01
+        w:
+          - 0.1
+      run42:
+        b:
+          - 2
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run43:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 8
+      run44:
+        b:
+          - 10
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 0.1
+      run45:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 8
+      run46:
+        b:
+          - 5
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 0.1
+      run47:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.01
+        w:
+          - 8
+      run48:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 0.1
+      run49:
+        b:
+          - 5
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run50:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run51:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 2
+      run52:
+        b:
+          - 5
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 0.1
+      run53:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 2
+      run54:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 2
+      run55:
+        b:
+          - 2
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run56:
+        b:
+          - 10
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.001
+        r:
+          - 0.1
+        w:
+          - 0.1
+      run57:
+        b:
+          - 0.01
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 0.1
+      run58:
+        b:
+          - 2
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 0.001
+      run59:
+        b:
+          - 2
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run60:
+        b:
+          - 5
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 0.1
+      run61:
+        b:
+          - 5
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run62:
+        b:
+          - 2
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run63:
+        b:
+          - 2
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 2
+      run64:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.01
+        w:
+          - 8
+      run65:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.01
+        w:
+          - 2
+      run66:
+        b:
+          - 2
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 0.001
+      run67:
+        b:
+          - 5
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run68:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 8
+      run69:
+        b:
+          - 10
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run70:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 8
+      run71:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 2
+      run72:
+        b:
+          - 2
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run73:
+        b:
+          - 2
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run74:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 8
+      run75:
+        b:
+          - 10
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run76:
+        b:
+          - 2
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 2
+      run77:
+        b:
+          - 2
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run78:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.001
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run79:
+        b:
+          - 2
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 2
+      run80:
+        b:
+          - 5
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 0.1
+      run81:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run82:
+        b:
+          - 5
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run83:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 0.5
+      run84:
+        b:
+          - 5
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 0.1
+      run85:
+        b:
+          - 2
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run86:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 2
+      run87:
+        b:
+          - 10
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run88:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 2
+      run89:
+        b:
+          - 2
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run90:
+        b:
+          - 2
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 8
+      run91:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 8
+      run92:
+        b:
+          - 2
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 0.1
+      run93:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 8
+      run94:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.001
+        r:
+          - 0.1
+        w:
+          - 0.1
+      run95:
+        b:
+          - 5
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run96:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 8
+      run97:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 0.001
+      run98:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 2
+      run99:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.001
+        r:
+          - 0.1
+        w:
+          - 0.001
+      run100:
+        b:
+          - 10
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 0.1
+      run101:
+        b:
+          - 10
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run102:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 8
+      run103:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 2
+      run104:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 8
+      run105:
+        b:
+          - 10
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run106:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 2
+      run107:
+        b:
+          - 10
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 0.001
+      run108:
+        b:
+          - 2
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 2
+      run109:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 8
+      run110:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 2
+      run111:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 2
+      run112:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 8
+      run113:
+        b:
+          - 2
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run114:
+        b:
+          - 0.01
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 1
+        w:
+          - 8
+      run115:
+        b:
+          - 10
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run116:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 8
+      run117:
+        b:
+          - 10
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run118:
+        b:
+          - 10
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 0.1
+      run119:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 2
+      run120:
+        b:
+          - 2
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 2
+      run121:
+        b:
+          - 2
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 2
+      run122:
+        b:
+          - 2
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 8
+      run123:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 8
+      run124:
+        b:
+          - 2
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 0.1
+      run125:
+        b:
+          - 2
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 8
+      run126:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run127:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 2
+      run128:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 2
+      run129:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 2
+      run130:
+        b:
+          - 2
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run131:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 8
+      run132:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 2
+      run133:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 2
+      run134:
+        b:
+          - 5
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 0.5
+      run135:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 8
+      run136:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run137:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 2
+      run138:
+        b:
+          - 5
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run139:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 2
+      run140:
+        b:
+          - 2
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 8
+      run141:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.001
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run142:
+        b:
+          - 5
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run143:
+        b:
+          - 2
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run144:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 2
+      run145:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.001
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run146:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 2
+      run147:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 2
+      run148:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.001
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run149:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run150:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 2
+      run151:
+        b:
+          - 5
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 0.001
+      run152:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 8
+      run153:
+        b:
+          - 10
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run154:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 8
+      run155:
+        b:
+          - 2
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 8
+      run156:
+        b:
+          - 5
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run157:
+        b:
+          - 10
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run158:
+        b:
+          - 5
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run159:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 8
+      run160:
+        b:
+          - 10
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run161:
+        b:
+          - 5
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 0.1
+      run162:
+        b:
+          - 5
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run163:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 2
+      run164:
+        b:
+          - 2
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run165:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run166:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 2
+      run167:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.001
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run168:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.01
+        w:
+          - 8
+      run169:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 8
+      run170:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 8
+      run171:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 8
+      run172:
+        b:
+          - 5
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run173:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 2
+      run174:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 8
+      run175:
+        b:
+          - 5
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 0.5
+      run176:
+        b:
+          - 5
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run177:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.01
+        w:
+          - 8
+      run178:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 8
+      run179:
+        b:
+          - 10
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run180:
+        b:
+          - 2
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 2
+      run181:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 8
+      run182:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 8
+      run183:
+        b:
+          - 10
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run184:
+        b:
+          - 2
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 0.5
+      run185:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 8
+      run186:
+        b:
+          - 2
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 8
+      run187:
+        b:
+          - 2
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 2
+      run188:
+        b:
+          - 2
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 8
+      run189:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 0.1
+      run190:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.001
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run191:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 8
+      run192:
+        b:
+          - 2
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 0.001
+      run193:
+        b:
+          - 0.55
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 2
+      run194:
+        b:
+          - 2
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run195:
+        b:
+          - 2
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 2
+      run196:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 8
+      run197:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 0.1
+        w:
+          - 2
+      run198:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.01
+        w:
+          - 2
+      run199:
+        b:
+          - 0.55
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.005
+        r:
+          - 0.1
+        w:
+          - 8
+      run200:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.02
+        r:
+          - 1
+        w:
+          - 8
+      run201:
+        b:
+          - 5
+        d:
+          - 40
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 1
+        w:
+          - 0.5
+      run202:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.001
+        r:
+          - 0.1
+        w:
+          - 0.5
+      run203:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run204:
+        b:
+          - 5
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run205:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 8
+      run206:
+        b:
+          - 2
+        d:
+          - 10
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 8
+      run207:
+        b:
+          - 2
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 8
+      run208:
+        b:
+          - 0.55
+        d:
+          - 40
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 2
+      run209:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.01
+        w:
+          - 2
+      run210:
+        b:
+          - 0.55
+        d:
+          - 30
+        g:
+          - 0.0001
+        mu:
+          - 0.008
+        r:
+          - 0.1
+        w:
+          - 2
+      run211:
+        b:
+          - 10
+        d:
+          - 30
+        g:
+          - 0.001
+        mu:
+          - 0.03
+        r:
+          - 0.01
+        w:
+          - 0.5
+      run212:
+        b:
+          - 2
+        d:
+          - 20
+        g:
+          - 0.0001
+        mu:
+          - 0.03
+        r:
+          - 0.1
+        w:
+          - 8
+datasets:
+  - label: tps_egfr
+    node_files:
+      - tps-egfr-prizes.txt
+    edge_files:
+      - phosphosite-irefindex13.0-uniprot.txt
+    other_files: []
+    data_dir: input
+gold_standards:
+  - label: gs_egfr
+    node_files:
+      - gs-egfr.txt
+    data_dir: input
+    dataset_labels:
+      - tps_egfr
+reconstruction_settings:
+  locations:
+    reconstruction_dir: output
+  run: true
+analysis:
+  summary:
+    include: true
+  graphspace:
+    include: false
+  cytoscape:
+    include: false
+  ml:
+    include: true
+    aggregate_per_algorithm: true
+    components: 4
+    labels: false
+    linkage: ward
+    metric: euclidean
+  evaluation:
+    include: true
diff --git a/input/gs-egfr.txt b/input/gs-egfr.txt
new file mode 100644
index 00000000..4b880cd4
--- /dev/null
+++ b/input/gs-egfr.txt
@@ -0,0 +1,324 @@
+1433B_HUMAN
+1433E_HUMAN
+1433T_HUMAN
+4EBP1_HUMAN
+ABI1_HUMAN
+ABL1_HUMAN
+ACK1_HUMAN
+ACTS_HUMAN
+AHSA1_HUMAN
+AIMP2_HUMAN
+AKT1_HUMAN
+AKT2_HUMAN
+AKT3_HUMAN
+ANDR_HUMAN
+AP2A1_HUMAN
+AP2B1_HUMAN
+AP2M1_HUMAN
+AP2S1_HUMAN
+ARAF_HUMAN
+AREG_HUMAN
+ARF4_HUMAN
+ARF6_HUMAN
+ARHG2_HUMAN
+ARHG7_HUMAN
+ARRB1_HUMAN
+ASAP1_HUMAN
+ASAP2_HUMAN
+ATF1_HUMAN
+ATF2_HUMAN
+ATX1_HUMAN
+B2CL1_HUMAN
+BAD_HUMAN
+BCAR1_HUMAN
+BCL2_HUMAN
+BDNF_HUMAN
+BRAF_HUMAN
+BTC_HUMAN
+Ca++_PSEUDONODE
+CASP3_HUMAN
+CASP9_HUMAN
+CAV1_HUMAN
+CAV2_HUMAN
+CBL_HUMAN
+CBLB_HUMAN
+CBLC_HUMAN
+CCND1_HUMAN
+CDC42_HUMAN
+CDN1A_HUMAN
+CEAM1_HUMAN
+CEBPA_HUMAN
+CEBPB_HUMAN
+CLCA_HUMAN
+CREB1_HUMAN
+CRK_HUMAN
+CRKL_HUMAN
+CSK_HUMAN
+CTND1_HUMAN
+CXA1_HUMAN
+CYH3_HUMAN
+DAG_PSEUDONODE
+DAXX_HUMAN
+DDIT3_HUMAN
+DOK2_HUMAN
+DP13A_HUMAN
+DP13B_HUMAN
+DYN1_HUMAN
+ECSIT_HUMAN
+EF1A1_HUMAN
+EF1A2_HUMAN
+EF2K_HUMAN
+EGF_HUMAN
+EGFR_HUMAN
+ELF3_HUMAN
+ELK1_HUMAN
+ELK4_HUMAN
+EP15R_HUMAN
+EPHB2_HUMAN
+EPIPL_HUMAN
+EPN1_HUMAN
+EPS15_HUMAN
+EPS8_HUMAN
+ERBB2_HUMAN
+ERBB3_HUMAN
+ERBB4_HUMAN
+EREG_HUMAN
+ESR1_HUMAN
+FAK1_HUMAN
+FAK2_HUMAN
+FGF1_HUMAN
+FGFR1_HUMAN
+FLNA_HUMAN
+FLNB_HUMAN
+FLNC_HUMAN
+FOS_HUMAN
+FOXO1_HUMAN
+GA45G_HUMAN
+GAB1_HUMAN
+GAB2_HUMAN
+GELS_HUMAN
+GIT1_HUMAN
+GNA12_HUMAN
+GNAI1_HUMAN
+GNAI3_HUMAN
+GNDS_HUMAN
+GRAP2_HUMAN
+GRB10_HUMAN
+GRB14_HUMAN
+GRB2_HUMAN
+GRB7_HUMAN
+GSK3B_HUMAN
+H31T_HUMAN
+HAT1_HUMAN
+HBEGF_HUMAN
+HD_HUMAN
+HDAC1_HUMAN
+HDAC2_HUMAN
+HDAC3_HUMAN
+HGS_HUMAN
+HIP1_HUMAN
+HSPB1_HUMAN
+ICEF1_HUMAN
+IFIT3_HUMAN
+IKKA_HUMAN
+IL1A_HUMAN
+IL1R1_HUMAN
+ITCH_HUMAN
+JAK1_HUMAN
+JAK2_HUMAN
+JIP2_HUMAN
+JIP3_HUMAN
+JUN_HUMAN
+JUNB_HUMAN
+JUND_HUMAN
+K1C17_HUMAN
+K1C18_HUMAN
+K2C7_HUMAN
+K2C8_HUMAN
+KAP1_HUMAN
+KAP2_HUMAN
+KAP3_HUMAN
+KAPCA_HUMAN
+KAPCB_HUMAN
+KCC2G_HUMAN
+KLF11_HUMAN
+KPCA_HUMAN
+KPCD1_HUMAN
+KPCG_HUMAN
+KPCI_HUMAN
+KPCZ_HUMAN
+KS6A1_HUMAN
+KS6A2_HUMAN
+KS6A3_HUMAN
+KS6A4_HUMAN
+KS6A5_HUMAN
+KS6B1_HUMAN
+LTOR3_HUMAN
+M3K1_HUMAN
+M3K11_HUMAN
+M3K12_HUMAN
+M3K13_HUMAN
+M3K14_HUMAN
+M3K2_HUMAN
+M3K3_HUMAN
+M3K4_HUMAN
+M3K5_HUMAN
+M3K7_HUMAN
+M3K8_HUMAN
+M4K1_HUMAN
+M4K2_HUMAN
+M4K4_HUMAN
+MAPK3_HUMAN
+MAPK5_HUMAN
+MAX_HUMAN
+MCF2_HUMAN
+MED1_HUMAN
+MEF2C_HUMAN
+MK01_HUMAN
+MK03_HUMAN
+MK07_HUMAN
+MK08_HUMAN
+MK10_HUMAN
+MK14_HUMAN
+MKNK2_HUMAN
+MLTK_HUMAN
+MP2K1_HUMAN
+MP2K2_HUMAN
+MP2K3_HUMAN
+MP2K4_HUMAN
+MP2K5_HUMAN
+MP2K6_HUMAN
+MP2K7_HUMAN
+MTA2_HUMAN
+MTOR_HUMAN
+MYC_HUMAN
+NCK1_HUMAN
+NCK2_HUMAN
+NCOA1_HUMAN
+NF1_HUMAN
+NFAC4_HUMAN
+NGF_HUMAN
+NLK_HUMAN
+NRG1_HUMAN
+NRG2_HUMAN
+NRG3_HUMAN
+NRG4_HUMAN
+NTF3_HUMAN
+NTRK1_HUMAN
+P53_HUMAN
+P55G_HUMAN
+P63_HUMAN
+P85A_HUMAN
+P85B_HUMAN
+PAK1_HUMAN
+PAXI_HUMAN
+PDGFA_HUMAN
+PDPK1_HUMAN
+PEBP1_HUMAN
+PGFRA_HUMAN
+PI3,4,5P3_PSEUDONODE
+PI51C_HUMAN
+PIPNA_HUMAN
+PK3CA_HUMAN
+PK3CB_HUMAN
+PK3CD_HUMAN
+PK3CG_HUMAN
+PKD1_HUMAN
+PKN2_HUMAN
+PLCG1_HUMAN
+PLCG2_HUMAN
+PLD1_HUMAN
+PLD2_HUMAN
+PLEC_HUMAN
+PLS1_HUMAN
+PPM1B_HUMAN
+PPP5_HUMAN
+PRS6A_HUMAN
+PTK6_HUMAN
+PTN1_HUMAN
+PTN11_HUMAN
+PTN12_HUMAN
+PTN5_HUMAN
+PTN6_HUMAN
+PTN7_HUMAN
+PTPRH_HUMAN
+PTPRR_HUMAN
+RAB5A_HUMAN
+RAC2_HUMAN
+RAF1_HUMAN
+RALB_HUMAN
+RAP1A_HUMAN
+RASA1_HUMAN
+RASA2_HUMAN
+RASH_HUMAN
+RASK_HUMAN
+RASN_HUMAN
+RBBP7_HUMAN
+RBP1_HUMAN
+REPS1_HUMAN
+REPS2_HUMAN
+RGS16_HUMAN
+RHEB_HUMAN
+RHG01_HUMAN
+RIPK1_HUMAN
+RRAS2_HUMAN
+RSSA_HUMAN
+SH2D3_HUMAN
+SH3G2_HUMAN
+SH3G3_HUMAN
+SH3K1_HUMAN
+SH3L1_HUMAN
+SHC1_HUMAN
+SHC2_HUMAN
+SHIP2_HUMAN
+SHOC2_HUMAN
+SIN3A_HUMAN
+SMAD2_HUMAN
+SMAD3_HUMAN
+SMD2_HUMAN
+SOCS1_HUMAN
+SOCS3_HUMAN
+SOS1_HUMAN
+SOS2_HUMAN
+SP1_HUMAN
+SPY1_HUMAN
+SPY2_HUMAN
+SRC_HUMAN
+SRF_HUMAN
+STA5A_HUMAN
+STA5B_HUMAN
+STAM1_HUMAN
+STAT1_HUMAN
+STAT3_HUMAN
+STK3_HUMAN
+STXB1_HUMAN
+SYGP1_HUMAN
+SYHC_HUMAN
+SYUA_HUMAN
+TAB1_HUMAN
+TAB2_HUMAN
+TAU_HUMAN
+TE2IP_HUMAN
+TGFA_HUMAN
+TGFB1_HUMAN
+TGFR1_HUMAN
+TGIF1_HUMAN
+TLN1_HUMAN
+TNFA_HUMAN
+TNFL6_HUMAN
+TNR1A_HUMAN
+TNR6_HUMAN
+TRAF2_HUMAN
+TRAF6_HUMAN
+TSC1_HUMAN
+TSC2_HUMAN
+UBB_HUMAN
+UBC_HUMAN
+US6NL_HUMAN
+VAV_HUMAN
+VAV2_HUMAN
+VAV3_HUMAN
+WASL_HUMAN
+WNK1_HUMAN
+ZHX2_HUMAN
+ZPR1_HUMAN
\ No newline at end of file
diff --git a/spras/evaluation.py b/spras/evaluation.py
index 1330ee2b..67346f2f 100644
--- a/spras/evaluation.py
+++ b/spras/evaluation.py
@@ -4,7 +4,9 @@
 from typing import Dict, Iterable
 
 import pandas as pd
-from sklearn.metrics import precision_score, recall_score
+from sklearn.metrics import precision_score, recall_score, precision_recall_curve, average_precision_score
+import matplotlib.pyplot as plt
+import numpy as np
 
 
 class Evaluation:
@@ -72,7 +74,7 @@ def load_files_from_dict(self, gold_standard_dict: Dict):
         # TODO: later iteration - chose between node and edge file, or allow both
 
     @staticmethod
-    def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str):
+    def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str, output_png: str ):
         """
         Takes in file paths for a specific dataset and an associated gold standard node table.
         Calculates precision and recall for each pathway file
@@ -96,22 +98,100 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, o
             recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0)
             results.append({"Pathway": file, "Precision": precision, "Recall": recall})
 
-        precision_df = pd.DataFrame(results)
-        precision_df.to_csv(output_file, sep="\t", index=False)
+        pr_df = pd.DataFrame(results)
+        pr_df.sort_values(by=["Recall", "Pathway"], axis=0, ascending=True, inplace=True)
+        pr_df.to_csv(output_file, sep="\t", index=False)
 
         # TODO make "PR" curves from the precision_and_recall file
-
-    def edge_frequency_nodes(ensemble_file: str, node_table:pd.DataFrame, output_file: str, output_png: str):
-        None
-        # create one per ensemble file 
-    
-    def pr_curves ():
-        None
+        plt.figure(figsize=(8, 6))
+        plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="PR")
+        plt.xlabel("Recall")
+        plt.ylabel("Precision")
+        plt.title(f"Precision and Recall Plot")
+        plt.legend()
+        plt.grid(True)
+        plt.savefig(output_png)
 
     # TODO make PR curves for the nodes from ensembled files outputs
     # TODO make the edge frequency node ensembles 
+    def select_max_freq_and_node(row):
+        max_freq = 0
+        node = ""
+        if pd.isna(row['Node2']) and pd.isna(row['Freq2']):
+            max_freq = row['Freq1']
+            node = row['Node1']
+        elif pd.isna(row['Node1']) and pd.isna(row['Freq1']):
+            max_freq = row['Freq2']
+            node = row['Node2']
+        else:
+            max_freq = max(row['Freq1'], row['Freq2'])
+            node = row['Node1']
+        return node, max_freq
+
+    def edge_frequency_node_ensemble(ensemble_file: str, node_table:pd.DataFrame):
+        
+        print(node_table)
+        print(type(ensemble_file))
+        ensemble_df = pd.read_table(ensemble_file, sep="\t", header=0)
+        print(ensemble_df)
+        if not ensemble_df.empty:
+            node1_freq = ensemble_df.drop(columns = ['Node2', 'Direction'])
+            node2_freq = ensemble_df.drop(columns = ['Node1', 'Direction'])
+            max_node1_freq = node1_freq.groupby(['Node1']).max().reset_index()
+            max_node1_freq.rename(columns = {'Frequency': 'Freq1'}, inplace = True)
+            max_node2_freq = node2_freq.groupby(['Node2']).max().reset_index()
+            max_node2_freq.rename(columns = {'Frequency': 'Freq2'}, inplace = True)
+            node_df_merged = max_node1_freq.merge(max_node2_freq, left_on='Node1', right_on='Node2', how='outer')
+            node_df_merged[['Node', 'max_freq']] = node_df_merged.apply(Evaluation.select_max_freq_and_node, axis=1, result_type='expand')
+            node_df_merged.drop(columns = ['Node1', 'Node2', 'Freq1', 'Freq2'], inplace = True)
+            node_df_merged.sort_values('max_freq', ascending= False, inplace = True)
+            print(node_df_merged)
+            return node_df_merged
+        else:
+            return pd.DataFrame(columns = ['Node', 'max_freq'])
+        
+
+    def pr_curves_ensemble_nodes(node_ensemble:pd.DataFrame, node_table:pd.DataFrame, output_png: str):
+       
+        gold_standard_nodes = set(node_table['NODEID'])
+
+        if not node_ensemble.empty:
+            y_true = [1 if node in gold_standard_nodes else 0 for node in node_ensemble['Node']]
+            y_scores = node_ensemble['max_freq'].tolist()
+            precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
+            auc_precision_recall = average_precision_score(y_true, y_scores)
+
+            plt.figure()
+            plt.plot(recall, precision, marker='o', label='Precision-Recall curve')
+            plt.axhline(y=auc_precision_recall, color='r', linestyle='--', label=f'Avg Precision: {auc_precision_recall:.4f}')
+            plt.xlabel('Recall')
+            plt.ylabel('Precision')
+            plt.title('Precision-Recall Curve')
+            plt.legend()
+            plt.grid(True)
+            plt.savefig(output_png)
+        else: 
+            plt.figure()
+            plt.savefig(output_png)
 
-    def pca_chosen_pathway():
-        None
     # TODO PCA chosen pathway, will need to use precision and recall code for the nodes of the chosen pathway
-    
\ No newline at end of file
+    def pca_chosen_pathway(coordinates_file: str, output_dir:str):
+
+        print(output_dir)
+        coord_df = pd.read_csv(coordinates_file, delimiter="\t", header=0)
+
+        centroid_row = coord_df[coord_df['datapoint_labels'] == 'centroid']
+        centroid = centroid_row.iloc[0, 1:].tolist()
+
+        coord_df = coord_df[coord_df['datapoint_labels'] != 'centroid']
+
+        pc_columns = [col for col in coord_df.columns if col.startswith('PC')]
+        coord_df['Distance To Centroid'] = np.sqrt(sum((coord_df[pc] - centroid[i]) ** 2 for i, pc in enumerate(pc_columns)))
+        print(coord_df.sort_values(by='Distance To Centroid'))
+        closest_to_centroid = coord_df.sort_values(by='Distance To Centroid').iloc[0]
+        print(closest_to_centroid)
+        rep_pathway = [os.path.join(output_dir, f"{closest_to_centroid['datapoint_labels']}", "pathway.txt")]
+
+        print(rep_pathway)
+
+        return rep_pathway
\ No newline at end of file

From 20d20bfc28bd542fa905099fd99844a29513b90a Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 4 Nov 2024 11:34:29 -0600
Subject: [PATCH 04/22] new updates to the integration

---
 Snakefile                     |  33 ++--
 config/config.yaml            |   9 +-
 config/egfr-param-tuning.yaml |   1 +
 parameter-tuning.py           | 288 ----------------------------------
 spras/config.py               |   6 +
 spras/evaluation.py           |  83 ++++++----
 6 files changed, 80 insertions(+), 340 deletions(-)
 delete mode 100644 parameter-tuning.py

diff --git a/Snakefile b/Snakefile
index bfd09d35..f717f739 100644
--- a/Snakefile
+++ b/Snakefile
@@ -106,12 +106,13 @@ def make_final_input(wildcards):
     if _config.config.analysis_include_evaluation:
         final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
         final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curves-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-plot.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-pca-chosen.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-pca-chosen.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-plot.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs)) 
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-pca-chosen.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-pca-chosen.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs))
         
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos))
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-plot.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos))
+    if _config.config.analysis_include_evaluation_aggregate_algo:
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-plot.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
         final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-pca-chosen.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos))
         final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-pca-chosen.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos))
         final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-pr-curves-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
@@ -382,7 +383,8 @@ def get_dataset_label(wildcards):
     return dataset
 
 
-# Run evaluation for all pathway outputs and ensemble.txt for a dataset against its paired gold standard
+# Run evaluation for all pathway outputs, ensemble.txt, and pca_coordinates.txt for a dataset against its paired gold standard
+# TODO: figure out why this works when all one rule, but the per algorithm doesn't work like that
 rule evaluation:
     input: 
         gold_standard_file = get_gold_standard_pickle_file,
@@ -400,29 +402,29 @@ rule evaluation:
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
         Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_plot_png)
-        node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file, node_table)
-        Evaluation.pr_curves_ensemble_nodes(node_ensemble, node_table, output.pr_curve_png)
+        node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file)
+        Evaluation.PRC_node_ensemble(node_ensemble, node_table, output.pr_curve_png)
         pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir)
         Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file, output.pca_chosen_pr_png)
 
-        
-
-# Run evaluation per algortihm for all associated pathway outputs and ensemble.txt for a dataset against its paired gold standard
-
+# Returns all pathways for a specific algorithm and dataset
 def collect_pathways_per_algo_per_dataset(wildcards):
     dataset_label = get_dataset_label(wildcards)
     filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param]
     return expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params, dataset_label= dataset_label)
 
+# Returns ensemble file for a specific algorithm and dataset
 def collect_ensemble_per_algo_per_dataset(wildcards):
     dataset_label = get_dataset_label(wildcards)
-    print(dataset_label)
     return f"{out_dir}{SEP}{dataset_label}-ml{SEP}{wildcards.algorithm}-ensemble-pathway.txt"
 
+# Returns pca coordinates for a specific algorithm and dataset
 def collect_pca_coordinates_per_algo_per_dataset(wildcards):
     dataset_label = get_dataset_label(wildcards)
     return f"{out_dir}{SEP}{dataset_label}-ml{SEP}{wildcards.algorithm}-pca-coordinates.txt"
 
+# Run evaluation per algorithm for all associated pathway outputs, ensemble.txt, and pca_coordinates.txt for a dataset against its paired gold standard
+# TODO: only works when these rules are broken up
 rule evaluation_per_algo_pathways:
     input: 
         gold_standard_file = get_gold_standard_pickle_file,
@@ -442,8 +444,8 @@ rule evaluation_per_algo_ensemble_pr_curve:
         pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-pr-curves-ensemble-nodes.png']),
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
-        node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file, node_table)
-        Evaluation.pr_curves_ensemble_nodes(node_ensemble, node_table, output.pr_curve_png)
+        node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file)
+        Evaluation.PRC_node_ensemble(node_ensemble, node_table, output.pr_curve_png)
 
 rule evaluation_per_algo_pca_chosen:
     input: 
@@ -457,7 +459,6 @@ rule evaluation_per_algo_pca_chosen:
         pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir)
         Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file, output.pca_chosen_pr_png)
 
-
 # Remove the output directory
 rule clean:
     shell: f'rm -rf {out_dir}'
diff --git a/config/config.yaml b/config/config.yaml
index 4b473050..fd47638b 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -173,9 +173,8 @@ analysis:
         # 'euclidean', 'manhattan', 'cosine'
         metric: 'euclidean'
       evaluation:
+        # evaluation per dataset-goldstandard pair
         include: true
-        # update to decouple the evaluation parts? 
-        # - ensemble vs all pathways vs pca chosen pathway
-        # pr curves from ensemble files 
-        # "pr" curves from all pathways
-        # p and r from pca chosen pathway
+        # adds evaluation per algorithm per dataset-goldstandard pair
+        aggregate_per_algorithm: true
+
diff --git a/config/egfr-param-tuning.yaml b/config/egfr-param-tuning.yaml
index c219a471..ecc2a65f 100644
--- a/config/egfr-param-tuning.yaml
+++ b/config/egfr-param-tuning.yaml
@@ -3457,3 +3457,4 @@ analysis:
     metric: euclidean
   evaluation:
     include: true
+    aggregate_per_algorithm: true
diff --git a/parameter-tuning.py b/parameter-tuning.py
deleted file mode 100644
index 0094e3c1..00000000
--- a/parameter-tuning.py
+++ /dev/null
@@ -1,288 +0,0 @@
-import glob
-import os
-import pickle as pkl
-from pathlib import Path
-from typing import Dict, Iterable
-
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-from sklearn.metrics import (
-    PrecisionRecallDisplay,
-    average_precision_score,
-    precision_recall_curve,
-    precision_score,
-    recall_score,
-)
-
-from spras.analysis.ml import summarize_networks
-from spras.evaluation import Evaluation
-
-# make directories
-directories = ["parameter-tuning","parameter-tuning/ensembling-parameter-tuning", "parameter-tuning/no-parameter-tuning", "parameter-tuning/pca-parameter-tuning"]
-
-for directory in directories:
-    if not os.path.exists(directory):
-        os.makedirs(directory)
-        print(f"Directory {directory} was created.")
-    else:
-        print(f"Directory {directory} already exists.")
-
-
-# #################################################################################################################################################
-# Parameter Tuning with Ensemble networks
-
-def select_max_freq_and_node(row):
-        max_freq = 0
-        node = ""
-        if pd.isna(row['Node2']) and pd.isna(row['Freq2']):
-            max_freq = row['Freq1']
-            node = row['Node1']
-        elif pd.isna(row['Node1']) and pd.isna(row['Freq1']):
-            max_freq = row['Freq2']
-            node = row['Node2']
-        else:
-            max_freq = max(row['Freq1'], row['Freq2'])
-            node = row['Node1']
-        return node, max_freq
-
-def precision_recall(file, node_table, node_freq_filename, output_file):
-    gold_standard_nodes = set(node_table['NODEID'])
-
-    df = pd.read_table(file, sep="\t", header=0)
-
-    node1_freq = df.drop(columns = ['Node2', 'Direction'])
-    node2_freq = df.drop(columns = ['Node1', 'Direction'])
-    max_node1_freq = node1_freq.groupby(['Node1']).max().reset_index()
-    max_node1_freq.rename(columns = {'Frequency': 'Freq1'}, inplace = True)
-    max_node2_freq = node2_freq.groupby(['Node2']).max().reset_index()
-    max_node2_freq.rename(columns = {'Frequency': 'Freq2'}, inplace = True)
-    node_df_merged = max_node1_freq.merge(max_node2_freq, left_on='Node1', right_on='Node2', how='outer')
-    node_df_merged[['Node', 'max_freq']] = node_df_merged.apply(select_max_freq_and_node, axis=1, result_type='expand')
-    node_df_merged.drop(columns = ['Node1', 'Node2', 'Freq1', 'Freq2'], inplace = True)
-
-    node_df_merged.sort_values('max_freq', ascending= False, inplace = True)
-    node_df_merged.to_csv(node_freq_filename, sep = "\t",header=True, index=False)
-
-    y_true = [1 if node in gold_standard_nodes else 0 for node in node_df_merged['Node']]
-    y_scores = node_df_merged['max_freq'].tolist()
-
-    # print(f"y_true:\n{y_true}")
-    # print(f"y_score:\n{y_scores}")
-
-    plt.figure()
-    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
-    # print(f"precision:{precision}\n recall:{recall}\n thresholds:{thresholds}\n")
-    auc_precision_recall = average_precision_score(y_true, y_scores)
-
-    plt.plot(recall, precision, marker='o', label='Precision-Recall curve')
-    plt.axhline(y=auc_precision_recall, color='r', linestyle='--', label=f'Avg Precision: {auc_precision_recall:.4f}')
-    plt.xlabel('Recall')
-    plt.ylabel('Precision')
-    plt.title('Precision-Recall Curve')
-    plt.legend()
-    plt.grid(True)
-    plt.savefig(output_filename)
-
-    # print(f"overlapping nodes: {len(set(node_df_merged['Node'].tolist()) & gold_standard_nodes)}")
-    # print(f"average_precision_score: {auc_precision_recall}")
-
-# TODO: fix mincostflow bug with summarize networks
-algorithms = ['mincostflow', 'omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'allpairs', 'domino']
-
-gold_standard_file = "output/gs_egfr-merged.pickle"
-node_table = Evaluation.from_file(gold_standard_file).node_table
-new_folder_path = 'parameter-tuning/ensembling-parameter-tuning/'
-
-for algo in algorithms:
-    ensemble_filename = f"output/tps_egfr-ml/{algo}-ensemble-pathway.txt"
-    node_freq_filename = f"{new_folder_path}{algo}-frequencies.txt"
-    output_filename = f"{new_folder_path}{algo}-pr.png"
-    try:
-        precision_recall(ensemble_filename, node_table, node_freq_filename, output_filename)
-    except Exception as error:
-        print(error)
-
-# code to work for MEO
-algorithms = ['meo']
-
-for algo in algorithms:
-    ensemble_filename = f"output/tps_egfr-ml/{algo}-ensemble-pathway.txt"
-    df = pd.read_table(ensemble_filename, sep="\t", header=0)
-    df['Node1'] = df['Node1'] + '_HUMAN'
-    df['Node2'] = df['Node2'] + '_HUMAN'
-    df['Node1'] = df['Node1'].replace({
-    'Ca++_HUMAN': 'Ca++_PSEUDONODE',
-    'PI3,4,5P3_HUMAN': 'PI3,4,5P3_PSEUDONODE',
-    'DAG_HUMAN': 'DAG_PSEUDONODE'
-    })
-    df['Node2'] = df['Node2'].replace({
-    'Ca++_HUMAN': 'Ca++_PSEUDONODE',
-    'PI3,4,5P3_HUMAN': 'PI3,4,5P3_PSEUDONODE',
-    'DAG_HUMAN': 'DAG_PSEUDONODE'
-    })
-
-    updated_ensemble_filename = f"{new_folder_path}meo-ensemble-pathway-updated.txt"
-    df.to_csv(updated_ensemble_filename, sep="\t", header=True, index=False)
-    node_freq_filename = f"{new_folder_path}{algo}-frequencies.txt"
-    output_filename = f"{new_folder_path}{algo}-pr.png"
-    try:
-        precision_recall(updated_ensemble_filename, node_table, node_freq_filename, output_filename)
-    except Exception as error:
-        print(error)
-
-
-#################################################################################################################################################
-# No Parameter Tuning
-
-def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str):
-    """
-    Takes in file paths for a specific dataset and an associated gold standard node table.
-    Calculates recall for each pathway file
-    Returns output back to output_file
-    @param file_paths: file paths of pathway reconstruction algorithm outputs
-    @param node_table: the gold standard nodes
-    @param output_file: the filename to save the precision of each pathway
-    """
-    y_true = set(node_table['NODEID'])
-    results = []
-
-    for file in file_paths:
-        df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"])
-        y_pred = set(df['Node1']).union(set(df['Node2']))
-        all_nodes = y_true.union(y_pred)
-        y_true_binary = [1 if node in y_true else 0 for node in all_nodes]
-        y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes]
-
-        # default to 0.0 if there is a divide by 0 error
-        precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0)
-        recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0)
-        results.append({"Pathway": file, "Precision": precision, "Recall": recall})
-
-    pr_df = pd.DataFrame(results)
-    pr_df.sort_values(by=["Recall"], axis=0, ascending=True, inplace=True)
-    pr_df.to_csv(output_file, sep="\t", index=False)
-    return pr_df
-
-
-algorithms = ['mincostflow', 'omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'allpairs', 'domino']
-
-gold_standard_file = "output/gs_egfr-merged.pickle"
-node_table = Evaluation.from_file(gold_standard_file).node_table
-folder_path = 'output/'
-new_folder_path = 'parameter-tuning/no-parameter-tuning/'
-
-for algo in algorithms:
-    file_pattern = os.path.join(folder_path, f"tps_egfr-{algo}-*", "pathway.txt")
-    files = glob.glob(file_pattern)
-    output_file = f"{new_folder_path}{algo}-precision-and-recall.txt"
-    prcurve_filename = f"{new_folder_path}{algo}-precision-and-recall-curve.png"
-
-    pr_df = precision_and_recall(file_paths=files, node_table=node_table, output_file=output_file)
-
-    plt.figure(figsize=(8, 6))
-    plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="PR")
-    plt.xlabel("Recall")
-    plt.ylabel("Precision")
-    plt.title(f"{algo} Precision-Recall Curve")
-    plt.legend()
-    plt.grid(True)
-    plt.savefig(prcurve_filename)
-
-
-# code to work for MEO
-def precision_and_recall_meo(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str):
-    """
-    Takes in file paths for a specific dataset and an associated gold standard node table.
-    Calculates recall for each pathway file
-    Returns output back to output_file
-    @param file_paths: file paths of pathway reconstruction algorithm outputs
-    @param node_table: the gold standard nodes
-    @param output_file: the filename to save the precision of each pathway
-    """
-    y_true = set(node_table['NODEID'])
-    results = []
-
-    for file in file_paths:
-        df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"])
-        df['Node1'] = df['Node1'] + '_HUMAN'
-        df['Node2'] = df['Node2'] + '_HUMAN'
-        df['Node1'] = df['Node1'].replace({
-        'Ca++_HUMAN': 'Ca++_PSEUDONODE',
-        'PI3,4,5P3_HUMAN': 'PI3,4,5P3_PSEUDONODE',
-        'DAG_HUMAN': 'DAG_PSEUDONODE'
-        })
-        df['Node2'] = df['Node2'].replace({
-        'Ca++_HUMAN': 'Ca++_PSEUDONODE',
-        'PI3,4,5P3_HUMAN': 'PI3,4,5P3_PSEUDONODE',
-        'DAG_HUMAN': 'DAG_PSEUDONODE'
-        })
-
-        y_pred = set(df['Node1']).union(set(df['Node2']))
-        all_nodes = y_true.union(y_pred)
-        y_true_binary = [1 if node in y_true else 0 for node in all_nodes]
-        y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes]
-
-        # default to 0.0 if there is a divide by 0 error
-        precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0)
-        recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0)
-        results.append({"Pathway": file, "Precision": precision, "Recall": recall})
-
-    pr_df = pd.DataFrame(results)
-    pr_df.sort_values(by=["Recall"], axis=0, ascending=True, inplace=True)
-    pr_df.to_csv(output_file, sep="\t", index=False)
-    return pr_df
-
-algorithms = ['meo']
-
-for algo in algorithms:
-
-    file_pattern = os.path.join(folder_path, f"tps_egfr-{algo}-*", "pathway.txt")
-    files = glob.glob(file_pattern)
-    output_file = f"{new_folder_path}{algo}-precision-and-recall.txt"
-    prcurve_filename = f"{new_folder_path}{algo}-precision-and-recall-curve.png"
-
-    pr_df = precision_and_recall_meo(file_paths=files, node_table=node_table, output_file=output_file)
-
-    plt.figure(figsize=(8, 6))
-    plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="PR")
-    plt.xlabel("Recall")
-    plt.ylabel("Precision")
-    plt.title(f"{algo} Precision-Recall Curve")
-    plt.legend()
-    plt.grid(True)
-    plt.savefig(prcurve_filename)
-
-#################################################################################################################################################
-# PCA parameter tuning
-
-algorithms = ['omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'domino', 'meo', 'allpairs']
-folder_path = 'output/'
-gold_standard_file = "output/gs_egfr-merged.pickle"
-node_table = Evaluation.from_file(gold_standard_file).node_table
-new_folder_path = 'parameter-tuning/pca-parameter-tuning/'
-
-for algo in algorithms:
-    file_path = os.path.join(folder_path, f"tps_egfr-ml", f"{algo}-pca-coordinates.txt")
-    try:
-        coord_df = pd.read_csv(file_path, delimiter="\t", header=0)
-    except Exception as error:
-        print(f"PCA parameter tuning: {error}")
-        continue
-
-    # centroid 
-    centroid_row = coord_df[coord_df['algorithm'] == 'centroid']
-    centroid = centroid_row.iloc[0, 1:].tolist()
-
-    # update df to exclude centroid point
-    coord_df = coord_df[coord_df['algorithm'] != 'centroid']
-
-    # euclidean distance
-    pc_columns = [col for col in coord_df.columns if col.startswith('PC')]
-    coord_df['Distance To Centroid'] = np.sqrt(sum((coord_df[pc] - centroid[i]) ** 2 for i, pc in enumerate(pc_columns)))
-    closest_to_centroid = coord_df.sort_values(by='Distance To Centroid').iloc[0]
-    
-    # finding the rep pathway
-    rep_pathway = [os.path.join(folder_path, f"{closest_to_centroid['algorithm']}", "pathway.txt")]
-    output_file = f"{new_folder_path}{algo}-precision-and-recall.txt"
-    precision_and_recall(rep_pathway, node_table, output_file)
diff --git a/spras/config.py b/spras/config.py
index 14f1a926..b476f98f 100644
--- a/spras/config.py
+++ b/spras/config.py
@@ -233,6 +233,7 @@ def process_config(self, raw_config):
 
         self.analysis_params = raw_config["analysis"] if "analysis" in raw_config else {}
         self.ml_params = self.analysis_params["ml"] if "ml" in self.analysis_params else {}
+        self.evaluation_params = self.analysis_params["evaluation"] if "evaluation" in self.analysis_params else {}
 
         self.pca_params = {}
         if "components" in self.ml_params:
@@ -260,3 +261,8 @@ def process_config(self, raw_config):
             self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"]
         else:
             self.analysis_include_ml_aggregate_algo = False
+
+        if 'aggregate_per_algorithm' in self.evaluation_params and self.analysis_include_evaluation:
+            self.analysis_include_evaluation_aggregate_algo = raw_config["analysis"]["evaluation"]["aggregate_per_algorithm"]
+        else:
+            self.analysis_include_evaluation_aggregate_algo = False
diff --git a/spras/evaluation.py b/spras/evaluation.py
index 67346f2f..6757dcf9 100644
--- a/spras/evaluation.py
+++ b/spras/evaluation.py
@@ -3,10 +3,15 @@
 from pathlib import Path
 from typing import Dict, Iterable
 
-import pandas as pd
-from sklearn.metrics import precision_score, recall_score, precision_recall_curve, average_precision_score
 import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
+from sklearn.metrics import (
+    average_precision_score,
+    precision_recall_curve,
+    precision_score,
+    recall_score,
+)
 
 
 class Evaluation:
@@ -81,7 +86,8 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, o
         Returns output back to output_file
         @param file_paths: file paths of pathway reconstruction algorithm outputs
         @param node_table: the gold standard nodes
-        @param output_file: the filename to save the precision of each pathway
+        @param output_file: the filename to save the precision and recall of each pathway
+        @param output_png: the filename to plot the precision and recall of each pathway (not a PRC)
         """
         y_true = set(node_table['NODEID'])
         results = []
@@ -102,7 +108,6 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, o
         pr_df.sort_values(by=["Recall", "Pathway"], axis=0, ascending=True, inplace=True)
         pr_df.to_csv(output_file, sep="\t", index=False)
 
-        # TODO make "PR" curves from the precision_and_recall file
         plt.figure(figsize=(8, 6))
         plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="PR")
         plt.xlabel("Recall")
@@ -111,10 +116,13 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, o
         plt.legend()
         plt.grid(True)
         plt.savefig(output_png)
+        # TODO: what to do when this is empty
 
-    # TODO make PR curves for the nodes from ensembled files outputs
-    # TODO make the edge frequency node ensembles 
-    def select_max_freq_and_node(row):
+    def select_max_freq_and_node(row): # TODO: what (:type) would this row be
+        """
+        Selects the node and frequency with the highest frequency value from two potential nodes in a row.
+        Handles cases where one of the nodes or frequencies may be missing and returns the node associated with the maximum frequency.
+        """
         max_freq = 0
         node = ""
         if pd.isna(row['Node2']) and pd.isna(row['Freq2']):
@@ -128,31 +136,42 @@ def select_max_freq_and_node(row):
             node = row['Node1']
         return node, max_freq
 
-    def edge_frequency_node_ensemble(ensemble_file: str, node_table:pd.DataFrame):
-        
-        print(node_table)
-        print(type(ensemble_file))
+    def edge_frequency_node_ensemble(ensemble_file: str):
+        """
+        Processes an ensemble of edge frequencies to identify the highest frequency associated with each node
+        Reads ensemble_file, separates frequencies by node, and then calculates the maximum frequency for each node.
+        Returns a DataFrame of nodes with their respective maximum frequencies, or an empty DataFrame if ensemble_file is empty.
+        @param ensemble_file: path to the tab-separated edge ensemble file (Node1, Node2, Frequency, Direction columns)
+        """
         ensemble_df = pd.read_table(ensemble_file, sep="\t", header=0)
-        print(ensemble_df)
+
         if not ensemble_df.empty:
             node1_freq = ensemble_df.drop(columns = ['Node2', 'Direction'])
             node2_freq = ensemble_df.drop(columns = ['Node1', 'Direction'])
+
             max_node1_freq = node1_freq.groupby(['Node1']).max().reset_index()
             max_node1_freq.rename(columns = {'Frequency': 'Freq1'}, inplace = True)
             max_node2_freq = node2_freq.groupby(['Node2']).max().reset_index()
             max_node2_freq.rename(columns = {'Frequency': 'Freq2'}, inplace = True)
-            node_df_merged = max_node1_freq.merge(max_node2_freq, left_on='Node1', right_on='Node2', how='outer')
-            node_df_merged[['Node', 'max_freq']] = node_df_merged.apply(Evaluation.select_max_freq_and_node, axis=1, result_type='expand')
-            node_df_merged.drop(columns = ['Node1', 'Node2', 'Freq1', 'Freq2'], inplace = True)
-            node_df_merged.sort_values('max_freq', ascending= False, inplace = True)
-            print(node_df_merged)
-            return node_df_merged
+
+            node_ensemble = max_node1_freq.merge(max_node2_freq, left_on='Node1', right_on='Node2', how='outer')
+            node_ensemble[['Node', 'max_freq']] = node_ensemble.apply(Evaluation.select_max_freq_and_node, axis=1, result_type='expand')
+            node_ensemble.drop(columns = ['Node1', 'Node2', 'Freq1', 'Freq2'], inplace = True)
+            node_ensemble.sort_values('max_freq', ascending= False, inplace = True)
+            return node_ensemble
         else:
+            # TODO: figure out how to deal with empty ensemble files
             return pd.DataFrame(columns = ['Node', 'max_freq'])
-        
 
-    def pr_curves_ensemble_nodes(node_ensemble:pd.DataFrame, node_table:pd.DataFrame, output_png: str):
-       
+    def PRC_node_ensemble(node_ensemble:pd.DataFrame, node_table:pd.DataFrame, output_png: str):
+        """
+        Takes in a node ensemble for a specific dataset or a specific algorithm in a dataset, and an associated gold standard node table.
+        Plots a precision and recall curve for the node ensemble against its associated gold standard node table
+        Returns output back to output_png
+        @param node_ensemble: the pre-computed node_ensemble
+        @param node_table: the gold standard nodes
+        @param output_png: the filename to save the precision and recall curves
+        """
         gold_standard_nodes = set(node_table['NODEID'])
 
         if not node_ensemble.empty:
@@ -170,28 +189,30 @@ def pr_curves_ensemble_nodes(node_ensemble:pd.DataFrame, node_table:pd.DataFrame
             plt.legend()
             plt.grid(True)
             plt.savefig(output_png)
-        else: 
+        else:
+            # TODO figure out how to deal with empty ensemble files (still will have the header)
             plt.figure()
+            plt.text(0.5, 0.5, "empty ensemble file", ha='center', va='center', fontsize=12, color='red')
+            plt.axis('off')
             plt.savefig(output_png)
 
-    # TODO PCA chosen pathway, will need to use precision and recall code for the nodes of the chosen pathway
     def pca_chosen_pathway(coordinates_file: str, output_dir:str):
-
-        print(output_dir)
+        """
+        Identifies the pathway closest to a specified centroid based on PCA coordinates
+        Calculates the Euclidean distance from each data point to the centroid, then selects the closest pathway.
+        Returns the file path for the representative pathway associated with the closest data point.
+        @param coordinates_file: the pca coordinates file for a dataset or specific algorithm in a dataset
+        @param output_dir: the main reconstruction directory
+        """
         coord_df = pd.read_csv(coordinates_file, delimiter="\t", header=0)
 
         centroid_row = coord_df[coord_df['datapoint_labels'] == 'centroid']
         centroid = centroid_row.iloc[0, 1:].tolist()
-
         coord_df = coord_df[coord_df['datapoint_labels'] != 'centroid']
 
         pc_columns = [col for col in coord_df.columns if col.startswith('PC')]
         coord_df['Distance To Centroid'] = np.sqrt(sum((coord_df[pc] - centroid[i]) ** 2 for i, pc in enumerate(pc_columns)))
-        print(coord_df.sort_values(by='Distance To Centroid'))
         closest_to_centroid = coord_df.sort_values(by='Distance To Centroid').iloc[0]
-        print(closest_to_centroid)
         rep_pathway = [os.path.join(output_dir, f"{closest_to_centroid['datapoint_labels']}", "pathway.txt")]
 
-        print(rep_pathway)
-
-        return rep_pathway
\ No newline at end of file
+        return rep_pathway

From 71aa43eda389d58111423c294dde84c17438fe22 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 4 Nov 2024 11:41:39 -0600
Subject: [PATCH 05/22] clean up of comments

---
 Snakefile            |  3 +--
 spras/analysis/ml.py | 10 +---------
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/Snakefile b/Snakefile
index f717f739..e1e8cdbd 100644
--- a/Snakefile
+++ b/Snakefile
@@ -396,7 +396,6 @@ rule evaluation:
         pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall.txt"]),
         pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'pr-curves-ensemble-nodes.png']),
         pr_plot_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-and-recall-plot.png']),
-        # add pca png and file that is needed by Evaluation.precision_and_recall
         pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall-pca-chosen.txt"]),
         pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall-pca-chosen.png"]),
     run:
@@ -430,7 +429,7 @@ rule evaluation_per_algo_pathways:
         gold_standard_file = get_gold_standard_pickle_file,
         pathways =  collect_pathways_per_algo_per_dataset,
     output: 
-        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-and-recall.txt"]), # these all need to be updated to use the algortihm in it
+        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-and-recall.txt"]),
         pr_plot_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-and-recall-plot.png']),
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py
index 1585db89..b477e0f9 100644
--- a/spras/analysis/ml.py
+++ b/spras/analysis/ml.py
@@ -146,14 +146,12 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord:
     scaler.fit(X)  # calc mean and standard deviation
     X_scaled = scaler.transform(X)
 
-
     # choosing the PCA
     pca_instance = PCA(n_components=components)
     pca_instance.fit(X_scaled)
     X_pca = pca_instance.transform(X_scaled)
     variance = pca_instance.explained_variance_ratio_ * 100
 
-    # TODO: add in centroid code from other branch
     # calculating the centroid
     centroid = np.mean(X_pca, axis=0) # mean of each principal component across all samples
 
@@ -167,17 +165,11 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord:
     plt.xlabel(f"PC1 ({variance[0]:.1f}% variance)")
     plt.ylabel(f"PC2 ({variance[1]:.1f}% variance)")
 
-    # saving the coordinates of each algorithm
-    # make_required_dirs(output_coord)
-    # coordinates_df = pd.DataFrame(X_pca, columns=['PC' + str(i) for i in range(1, components+1)])
-    # coordinates_df.insert(0, 'algorithm', columns.tolist())
-    # coordinates_df.to_csv(output_coord, sep='\t', index=False)
-
     # saving the coordinates of each algorithm
     make_required_dirs(output_coord)
     coordinates_df = pd.DataFrame(X_pca, columns=['PC' + str(i) for i in range(1, components+1)])
     coordinates_df.insert(0, 'datapoint_labels', columns.tolist())
-    centroid_row = ['centroid'] + centroid.tolist() # TODO: do we want a seperate file for the centroid, or add it to the end of the coordinates_df df as a new datapoint
+    centroid_row = ['centroid'] + centroid.tolist()
     coordinates_df.loc[len(coordinates_df)] = centroid_row
     coordinates_df.to_csv(output_coord, sep='\t', index=False)
 

From 691673c95008424f8df58c0c1437c1ea1297c34b Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 4 Nov 2024 12:09:36 -0600
Subject: [PATCH 06/22] updated test_ml.py to work with new updates

---
 test/ml/expected/expected-pca-coordinates.tsv | 9 +++++----
 test/ml/test_ml.py                            | 7 ++++---
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/test/ml/expected/expected-pca-coordinates.tsv b/test/ml/expected/expected-pca-coordinates.tsv
index b6371c84..ac10f2db 100644
--- a/test/ml/expected/expected-pca-coordinates.tsv
+++ b/test/ml/expected/expected-pca-coordinates.tsv
@@ -1,4 +1,5 @@
-algorithm	PC1	PC2
-test-data-s1	-2.006650210482033	-0.9865875190637743
-test-data-s2	-1.5276508866841987	1.0799457247533237
-test-data-s3	3.534301097166232	-0.0933582056895495
\ No newline at end of file
+datapoint_labels	PC1	PC2
+test-data-s1	-2.0066502104820323	-0.9865875190637746
+test-data-s2	-1.5276508866841985	1.0799457247533233
+test-data-s3	3.5343010971662308	-0.09335820568954915
+centroid	0.0	-1.6190752442450199e-16
diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py
index 2b5720ae..b9ca69ca 100644
--- a/test/ml/test_ml.py
+++ b/test/ml/test_ml.py
@@ -77,14 +77,15 @@ def test_pca_robustness(self):
         dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
         expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.tsv')
         expected = expected.round(5)
+        expected.sort_values(by='datapoint_labels', ignore_index=True, inplace=True)
+
         for _ in range(5):
             dataframe_shuffled = dataframe.sample(frac=1, axis=1)  # permute the columns
             ml.pca(dataframe_shuffled, OUT_DIR + 'pca-shuffled-columns.png', OUT_DIR + 'pca-shuffled-columns-variance.txt',
                 OUT_DIR + 'pca-shuffled-columns-coordinates.tsv')
             coord = pd.read_table(OUT_DIR + 'pca-shuffled-columns-coordinates.tsv')
             coord = coord.round(5)  # round values to 5 digits to account for numeric differences across machines
-            coord.sort_values(by='algorithm', ignore_index=True, inplace=True)
-
+            coord.sort_values(by='datapoint_labels', ignore_index=True, inplace=True)
             assert coord.equals(expected)
 
         for _ in range(5):
@@ -93,7 +94,7 @@ def test_pca_robustness(self):
                     OUT_DIR + 'pca-shuffled-rows-coordinates.tsv')
             coord = pd.read_table(OUT_DIR + 'pca-shuffled-rows-coordinates.tsv')
             coord = coord.round(5)  # round values to 5 digits to account for numeric differences across machines
-            coord.sort_values(by='algorithm', ignore_index=True, inplace=True)
+            coord.sort_values(by='datapoint_labels', ignore_index=True, inplace=True)
 
             assert coord.equals(expected)
 

From 26178f98136ecf19e158fa7397520eca4bf65c2f Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Thu, 7 Nov 2024 11:34:17 -0600
Subject: [PATCH 07/22] in progress of testing

---
 .../expected/expected-node-ensemble.csv       | 13 +++++++
 test/evaluate/input/ensemble-network.tsv      | 10 ++++++
 test/evaluate/test_evaluate.py                | 36 +++++++++++++++++++
 3 files changed, 59 insertions(+)
 create mode 100644 test/evaluate/expected/expected-node-ensemble.csv
 create mode 100644 test/evaluate/input/ensemble-network.tsv
 create mode 100644 test/evaluate/test_evaluate.py

diff --git a/test/evaluate/expected/expected-node-ensemble.csv b/test/evaluate/expected/expected-node-ensemble.csv
new file mode 100644
index 00000000..ba467d55
--- /dev/null
+++ b/test/evaluate/expected/expected-node-ensemble.csv
@@ -0,0 +1,13 @@
+Node	max_freq
+C	0.75
+E	0.75
+D	0.75
+F	0.75
+A	0.5
+B	0.5
+L	0.5
+M	0.5
+O	0.25
+P	0.25
+N	0.25
+Q	0.25
diff --git a/test/evaluate/input/ensemble-network.tsv b/test/evaluate/input/ensemble-network.tsv
new file mode 100644
index 00000000..293ec3f5
--- /dev/null
+++ b/test/evaluate/input/ensemble-network.tsv
@@ -0,0 +1,10 @@
+Node1	Node2	Frequency	Direction
+A	B	0.5	U
+C	D	0.75	U
+E	F	0.75	U
+L	M	0.5	U
+M	N	0.25	U
+O	P	0.25	U
+P	Q	0.25	U
+A	B	0.25	D
+B	A	0.25	D
\ No newline at end of file
diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py
new file mode 100644
index 00000000..b0a60196
--- /dev/null
+++ b/test/evaluate/test_evaluate.py
@@ -0,0 +1,36 @@
+import filecmp
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from spras.evaluation import Evaluation
+
+INPUT_DIR = 'test/evaluate/input/'
+OUT_DIR = 'test/evaluate/output/'
+EXPECT_DIR = 'test/evaluate/expected/'
+
+
+class TestEvaluate:
+    @classmethod
+    def setup_class(cls):
+        """
+        Create the expected output directory
+        """
+        Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
+
+    def test_node_ensemble(self):
+        ensemble_file = INPUT_DIR + 'ensemble-network.tsv'
+        edge_freq = Evaluation.edge_frequency_node_ensemble(ensemble_file)
+        edge_freq.to_csv(OUT_DIR + 'node-ensemble.csv', sep="\t", index=False)
+        assert filecmp.cmp(OUT_DIR + 'node-ensemble.csv', EXPECT_DIR + 'expected-node-ensemble.csv', shallow=False)
+       
+    def test_PRC_node_ensemble(self):
+        None
+        
+    def test_precision_and_recall(self):
+        None
+    
+    def test_pca_chosen_pathway(self):
+        None
+    

From a5b320545d015102e4dd819294b9512969bfef68 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Thu, 14 Nov 2024 11:03:49 -0600
Subject: [PATCH 08/22] spras/con

---
 spras/config.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/spras/config.py b/spras/config.py
index b476f98f..85aa3875 100644
--- a/spras/config.py
+++ b/spras/config.py
@@ -101,6 +101,10 @@ def __init__(self, raw_config):
         self.analysis_include_ml = None
         # A Boolean specifying whether to run the Evaluation analysis
         self.analysis_include_evaluation = None
+        # A Boolean specifying whether to run the ML per algorithm analysis
+        self.analysis_include_ml_aggregate_algo = None
+        # A Boolean specifying whether to run the Evaluation per algorithm analysis
+        self.analysis_include_evaluation_aggregate_algo = None
 
         _raw_config = copy.deepcopy(raw_config)
         self.process_config(_raw_config)
@@ -253,16 +257,26 @@ def process_config(self, raw_config):
         self.analysis_include_ml = raw_config["analysis"]["ml"]["include"]
         self.analysis_include_evaluation = raw_config["analysis"]["evaluation"]["include"]
 
-        if self.gold_standards == {} and self.analysis_include_evaluation:
-            raise ValueError("Evaluation analysis cannot run as gold standard data not provided. "
-                             "Please set evaluation include to false or provide gold standard data.")
-
+        # only run ml aggregate_per_algorithm if analysis_include_ml is set to true
         if 'aggregate_per_algorithm' in self.ml_params and self.analysis_include_ml:
             self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"]
         else:
             self.analysis_include_ml_aggregate_algo = False
 
+        if self.gold_standards == {} and self.analysis_include_evaluation:
+            raise ValueError("Evaluation analysis cannot run as gold standard data not provided. "
+                             "Please set evaluation include to false or provide gold standard data.")
+
+         # only run evaluation if ml is set to true
+        if not self.analysis_include_ml and self.analysis_include_evaluation:
+            self.analysis_include_evaluation = False
+
+        # only run evaluation aggregate_per_algorithm if analysis_include_ml is set to true
         if 'aggregate_per_algorithm' in self.evaluation_params and self.analysis_include_evaluation:
             self.analysis_include_evaluation_aggregate_algo = raw_config["analysis"]["evaluation"]["aggregate_per_algorithm"]
         else:
             self.analysis_include_evaluation_aggregate_algo = False
+
+        # only run evaluation per algo if ml per algo is set to true
+        if not self.analysis_include_ml_aggregate_algo and self.analysis_include_evaluation_aggregate_algo:
+            self.analysis_include_evaluation_aggregate_algo = False

From 0aeda9514c19d272eff4231706c1f881baf15a9a Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Fri, 15 Nov 2024 10:00:01 -0600
Subject: [PATCH 09/22] update to config.py to deal with ml and eval coupling

---
 spras/config.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/spras/config.py b/spras/config.py
index 85aa3875..cd8c228b 100644
--- a/spras/config.py
+++ b/spras/config.py
@@ -257,26 +257,27 @@ def process_config(self, raw_config):
         self.analysis_include_ml = raw_config["analysis"]["ml"]["include"]
         self.analysis_include_evaluation = raw_config["analysis"]["evaluation"]["include"]
 
-        # only run ml aggregate_per_algorithm if analysis_include_ml is set to true
+        # Only run ML aggregate per algorithm if analysis include ML is set to True
         if 'aggregate_per_algorithm' in self.ml_params and self.analysis_include_ml:
             self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"]
         else:
             self.analysis_include_ml_aggregate_algo = False
 
+        # Raises an error if Evaluation is enabled but no gold standard data is provided
         if self.gold_standards == {} and self.analysis_include_evaluation:
             raise ValueError("Evaluation analysis cannot run as gold standard data not provided. "
                              "Please set evaluation include to false or provide gold standard data.")
 
-         # only run evaluation if ml is set to true
+         # Only run Evaluation if ML is set to True
         if not self.analysis_include_ml and self.analysis_include_evaluation:
             self.analysis_include_evaluation = False
 
-        # only run evaluation aggregate_per_algorithm if analysis_include_ml is set to true
+        # Only run Evaluation aggregate per algorithm if analysis include Evaluation is set to True
         if 'aggregate_per_algorithm' in self.evaluation_params and self.analysis_include_evaluation:
             self.analysis_include_evaluation_aggregate_algo = raw_config["analysis"]["evaluation"]["aggregate_per_algorithm"]
         else:
             self.analysis_include_evaluation_aggregate_algo = False
 
-        # only run evaluation per algo if ml per algo is set to true
+        # Only run Evaluation per algorithm if ML per algorithm is set to True
         if not self.analysis_include_ml_aggregate_algo and self.analysis_include_evaluation_aggregate_algo:
             self.analysis_include_evaluation_aggregate_algo = False

From 46c87fc5cb136693a347e37cbc3aa0e049984621 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Fri, 15 Nov 2024 10:00:51 -0600
Subject: [PATCH 10/22] added TODO comments on ideas to scale the binary data,
 still not sure what idea to use

---
 spras/analysis/ml.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py
index b477e0f9..7d45e091 100644
--- a/spras/analysis/ml.py
+++ b/spras/analysis/ml.py
@@ -10,7 +10,7 @@
 from scipy.cluster.hierarchy import dendrogram, fcluster
 from sklearn.cluster import AgglomerativeClustering
 from sklearn.decomposition import PCA
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
 
 from spras.util import make_required_dirs
 
@@ -142,8 +142,14 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord:
     if not isinstance(labels, bool):
         raise ValueError(f"labels={labels} must be True or False")
 
-    scaler = StandardScaler()
+    #TODO: MinMaxScaler changes nothing about the data
+    # scaler = MinMaxScaler()
+    # scaler.fit(X)  # calc mean and standard deviation
+    # X_scaled = scaler.transform(X)
+
+    scaler = StandardScaler()  # TODO: StandardScaler doesn't make sense on binary data because the mean and variance lead to values outside the binary range
     scaler.fit(X)  # calc mean and standard deviation
+    scaler.transform(X)
     X_scaled = scaler.transform(X)
 
     # choosing the PCA

From 5265c53ba7ad3845db7c8deee4c2837eb5cb01c7 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Fri, 15 Nov 2024 10:28:59 -0600
Subject: [PATCH 11/22] cleaned up file names and left TODOs

---
 Snakefile | 51 ++++++++++++++++++++++-----------------------------
 1 file changed, 22 insertions(+), 29 deletions(-)

diff --git a/Snakefile b/Snakefile
index e1e8cdbd..c8ab2407 100644
--- a/Snakefile
+++ b/Snakefile
@@ -104,19 +104,17 @@ def make_final_input(wildcards):
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms))
 
     if _config.config.analysis_include_evaluation:
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curves-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-plot.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs)) 
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-pca-chosen.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs))
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-pca-chosen.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs))
-        
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-per-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-per-pathway.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs)) 
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-pca-chosen-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
+        # TODO: should we provide the node ensemble frequencies       
     if _config.config.analysis_include_evaluation_aggregate_algo:
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-plot.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-pca-chosen.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos))
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-pca-chosen.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos))
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-pr-curves-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
-
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-per-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-per-pathway.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-pca-chosen-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
+        # TODO: should we provide the node ensemble frequencies per algorithm
     if len(final_input) == 0:
         # No analysis added yet, so add reconstruction output files if they exist.
         # (if analysis is specified, these should be implicitly run).
@@ -384,27 +382,24 @@ def get_dataset_label(wildcards):
 
 
 # Run evaluation for all pathway outputs, ensemble.txt, and pca_coordinates.txt for a dataset against its paired gold standard
-# TODO: figure out why this works when all one rule, but the per algorithm doesn't work like that
 rule evaluation:
     input: 
         gold_standard_file = get_gold_standard_pickle_file,
         pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label),
         ensemble_file=lambda wildcards: f"{out_dir}{SEP}{get_dataset_label(wildcards)}-ml{SEP}ensemble-pathway.txt",
-        # add PCA coordinates file 
         pca_coordinates_file =lambda wildcards: f"{out_dir}{SEP}{get_dataset_label(wildcards)}-ml{SEP}pca-coordinates.txt"
     output: 
-        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall.txt"]),
-        pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'pr-curves-ensemble-nodes.png']),
-        pr_plot_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-and-recall-plot.png']),
-        pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall-pca-chosen.txt"]),
-        pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall-pca-chosen.png"]),
+        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-per-pathway.txt"]),
+        pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-per-pathway.png']),
+        pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-curve-ensemble-nodes.png']),
+        pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-pca-chosen-pathway.txt"]),
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
-        Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_plot_png)
+        Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_png)
         node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file)
         Evaluation.PRC_node_ensemble(node_ensemble, node_table, output.pr_curve_png)
         pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir)
-        Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file, output.pca_chosen_pr_png)
+        Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file)
 
 # Returns all pathways for a specific algorithm and dataset
 def collect_pathways_per_algo_per_dataset(wildcards):
@@ -423,24 +418,23 @@ def collect_pca_coordinates_per_algo_per_dataset(wildcards):
     return f"{out_dir}{SEP}{dataset_label}-ml{SEP}{wildcards.algorithm}-pca-coordinates.txt"
 
 # Run evaluation per algortihm for all associated pathway outputs, ensemble.txt, and pca_coordinates.txt for a dataset against its paired gold standard
-# TODO: only works when these rules are broken up
 rule evaluation_per_algo_pathways:
     input: 
         gold_standard_file = get_gold_standard_pickle_file,
         pathways =  collect_pathways_per_algo_per_dataset,
     output: 
-        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-and-recall.txt"]),
-        pr_plot_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-and-recall-plot.png']),
+        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-recall-per-pathway.txt"]),
+        pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-recall-per-pathway.png']),
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
-        Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_plot_png)
+        Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_png)
 
 rule evaluation_per_algo_ensemble_pr_curve:
     input: 
         gold_standard_file = get_gold_standard_pickle_file,
         ensemble_file = collect_ensemble_per_algo_per_dataset,
     output: 
-        pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-pr-curves-ensemble-nodes.png']),
+        pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-recall-curve-ensemble-nodes.png']),
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
         node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file)
@@ -451,12 +445,11 @@ rule evaluation_per_algo_pca_chosen:
         gold_standard_file = get_gold_standard_pickle_file,
         pca_coordinates_file = collect_pca_coordinates_per_algo_per_dataset
     output: 
-        pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-and-recall-pca-chosen.txt"]),
-        pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-and-recall-pca-chosen.png"]),
+        pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-recall-pca-chosen-pathway.txt"]),
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
         pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir)
-        Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file, output.pca_chosen_pr_png)
+        Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file)
 
 # Remove the output directory
 rule clean:

From 9b7e6875b6ba6219cb1db4e1543a70fe3ec810a3 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Fri, 15 Nov 2024 14:40:17 -0600
Subject: [PATCH 12/22] added the eval test cases, made a todo for config test
 case

---
 ...ted-precision-recall-per-pathway-empty.txt |  2 +
 ...recision-recall-per-pathway-pca-chosen.txt |  2 +
 .../expected-precision-recall-per-pathway.txt |  5 ++
 .../input/data-test-params-123/pathway.txt    |  3 +
 .../input/data-test-params-456/pathway.txt    |  2 +
 .../input/data-test-params-789/pathway.txt    |  3 +
 .../input/data-test-params-empty/pathway.txt  |  1 +
 test/evaluate/input/node-ensemble-empty.csv   |  2 +
 test/evaluate/input/node-ensemble.csv         | 13 ++++
 test/evaluate/input/node_table.csv            |  4 ++
 test/evaluate/input/pca-coordinates.tsv       |  6 ++
 test/evaluate/test_evaluate.py                | 64 +++++++++++++++----
 test/test_config.py                           |  2 +
 13 files changed, 97 insertions(+), 12 deletions(-)
 create mode 100644 test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt
 create mode 100644 test/evaluate/expected/expected-precision-recall-per-pathway-pca-chosen.txt
 create mode 100644 test/evaluate/expected/expected-precision-recall-per-pathway.txt
 create mode 100644 test/evaluate/input/data-test-params-123/pathway.txt
 create mode 100644 test/evaluate/input/data-test-params-456/pathway.txt
 create mode 100644 test/evaluate/input/data-test-params-789/pathway.txt
 create mode 100644 test/evaluate/input/data-test-params-empty/pathway.txt
 create mode 100644 test/evaluate/input/node-ensemble-empty.csv
 create mode 100644 test/evaluate/input/node-ensemble.csv
 create mode 100644 test/evaluate/input/node_table.csv
 create mode 100644 test/evaluate/input/pca-coordinates.tsv

diff --git a/test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt b/test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt
new file mode 100644
index 00000000..6c97ff7e
--- /dev/null
+++ b/test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt
@@ -0,0 +1,2 @@
+Pathway	Precision	Recall
+test/evaluate/input/data-test-params-empty/pathway.txt	0.0	0.0
diff --git a/test/evaluate/expected/expected-precision-recall-per-pathway-pca-chosen.txt b/test/evaluate/expected/expected-precision-recall-per-pathway-pca-chosen.txt
new file mode 100644
index 00000000..6c97ff7e
--- /dev/null
+++ b/test/evaluate/expected/expected-precision-recall-per-pathway-pca-chosen.txt
@@ -0,0 +1,2 @@
+Pathway	Precision	Recall
+test/evaluate/input/data-test-params-empty/pathway.txt	0.0	0.0
diff --git a/test/evaluate/expected/expected-precision-recall-per-pathway.txt b/test/evaluate/expected/expected-precision-recall-per-pathway.txt
new file mode 100644
index 00000000..02e17a7c
--- /dev/null
+++ b/test/evaluate/expected/expected-precision-recall-per-pathway.txt
@@ -0,0 +1,5 @@
+Pathway	Precision	Recall
+test/evaluate/input/data-test-params-456/pathway.txt	0.0	0.0
+test/evaluate/input/data-test-params-empty/pathway.txt	0.0	0.0
+test/evaluate/input/data-test-params-123/pathway.txt	0.6666666666666666	0.6666666666666666
+test/evaluate/input/data-test-params-789/pathway.txt	1.0	1.0
diff --git a/test/evaluate/input/data-test-params-123/pathway.txt b/test/evaluate/input/data-test-params-123/pathway.txt
new file mode 100644
index 00000000..21768464
--- /dev/null
+++ b/test/evaluate/input/data-test-params-123/pathway.txt
@@ -0,0 +1,3 @@
+Node1	Node2	Rank	Direction
+A	B	1	U
+B	C	1	U
diff --git a/test/evaluate/input/data-test-params-456/pathway.txt b/test/evaluate/input/data-test-params-456/pathway.txt
new file mode 100644
index 00000000..d445d80f
--- /dev/null
+++ b/test/evaluate/input/data-test-params-456/pathway.txt
@@ -0,0 +1,2 @@
+Node1	Node2	Rank	Direction
+F	L	1	U
diff --git a/test/evaluate/input/data-test-params-789/pathway.txt b/test/evaluate/input/data-test-params-789/pathway.txt
new file mode 100644
index 00000000..352698a0
--- /dev/null
+++ b/test/evaluate/input/data-test-params-789/pathway.txt
@@ -0,0 +1,3 @@
+Node1	Node2	Rank	Direction
+A	B	1	U
+B	Q	1	U
diff --git a/test/evaluate/input/data-test-params-empty/pathway.txt b/test/evaluate/input/data-test-params-empty/pathway.txt
new file mode 100644
index 00000000..63fda2b1
--- /dev/null
+++ b/test/evaluate/input/data-test-params-empty/pathway.txt
@@ -0,0 +1 @@
+Node1	Node2	Rank	Direction
\ No newline at end of file
diff --git a/test/evaluate/input/node-ensemble-empty.csv b/test/evaluate/input/node-ensemble-empty.csv
new file mode 100644
index 00000000..e488f56a
--- /dev/null
+++ b/test/evaluate/input/node-ensemble-empty.csv
@@ -0,0 +1,2 @@
+Node	max_freq
+
diff --git a/test/evaluate/input/node-ensemble.csv b/test/evaluate/input/node-ensemble.csv
new file mode 100644
index 00000000..ba467d55
--- /dev/null
+++ b/test/evaluate/input/node-ensemble.csv
@@ -0,0 +1,13 @@
+Node	max_freq
+C	0.75
+E	0.75
+D	0.75
+F	0.75
+A	0.5
+B	0.5
+L	0.5
+M	0.5
+O	0.25
+P	0.25
+N	0.25
+Q	0.25
diff --git a/test/evaluate/input/node_table.csv b/test/evaluate/input/node_table.csv
new file mode 100644
index 00000000..5b9cd41b
--- /dev/null
+++ b/test/evaluate/input/node_table.csv
@@ -0,0 +1,4 @@
+NODEID
+A
+B
+Q
\ No newline at end of file
diff --git a/test/evaluate/input/pca-coordinates.tsv b/test/evaluate/input/pca-coordinates.tsv
new file mode 100644
index 00000000..92fc6b3d
--- /dev/null
+++ b/test/evaluate/input/pca-coordinates.tsv
@@ -0,0 +1,6 @@
+datapoint_labels	PC1	PC2
+data-test-params-123	-1.3973472526239425	-1.632993161855452
+data-test-params-456	2.025440509784659	1.9566080710032526e-16
+data-test-params-789	-1.3973472526239425	1.632993161855452
+data-test-params-empty	0.7692539954632259	-4.1496185644351084e-16
+centroid	-2.7755575615628914e-17	-4.822931287961988e-17
diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py
index b0a60196..1c1e1e9b 100644
--- a/test/evaluate/test_evaluate.py
+++ b/test/evaluate/test_evaluate.py
@@ -4,13 +4,13 @@
 import pandas as pd
 import pytest
 
+import spras.analysis.ml as ml
 from spras.evaluation import Evaluation
 
 INPUT_DIR = 'test/evaluate/input/'
 OUT_DIR = 'test/evaluate/output/'
 EXPECT_DIR = 'test/evaluate/expected/'
-
-
+NODE_TABLE = pd.read_csv(INPUT_DIR + "node_table.csv", header=0)
 class TestEvaluate:
     @classmethod
     def setup_class(cls):
@@ -24,13 +24,53 @@ def test_node_ensemble(self):
         edge_freq = Evaluation.edge_frequency_node_ensemble(ensemble_file)
         edge_freq.to_csv(OUT_DIR + 'node-ensemble.csv', sep="\t", index=False)
         assert filecmp.cmp(OUT_DIR + 'node-ensemble.csv', EXPECT_DIR + 'expected-node-ensemble.csv', shallow=False)
-       
-    def test_PRC_node_ensemble(self):
-        None
-        
-    def test_precision_and_recall(self):
-        None
-    
-    def test_pca_chosen_pathway(self):
-        None
-    
+
+    def test_precision_recal_curve_ensemble_nodes(self):
+        out_path = Path(OUT_DIR+"test-precision-recall-curve-ensemble-nodes.png")
+        out_path.unlink(missing_ok=True)
+        ensemble_file = pd.read_csv(INPUT_DIR + 'node-ensemble.csv', sep="\t", header=0)
+        Evaluation.precision_recall_curve_node_ensemble(ensemble_file, NODE_TABLE, out_path)
+        assert out_path.exists()
+
+    def test_precision_recal_curve_ensemble_nodes_empty(self):
+        out_path = Path(OUT_DIR+"test-precision-recall-curve-ensemble-nodes-empty.png")
+        out_path.unlink(missing_ok=True)
+        ensemble_file = pd.read_csv(INPUT_DIR + 'node-ensemble-empty.csv', sep="\t", header=0)
+        Evaluation.precision_recall_curve_node_ensemble(ensemble_file, NODE_TABLE, out_path)
+        assert out_path.exists()
+
+    def test_precision_recall_per_pathway(self):
+        file_paths = [INPUT_DIR + "data-test-params-123/pathway.txt", INPUT_DIR + "data-test-params-456/pathway.txt",  INPUT_DIR + "data-test-params-789/pathway.txt",  INPUT_DIR + "data-test-params-empty/pathway.txt"]
+        algorithms = ["test"]
+        output_file = OUT_DIR + "test-precision-recall-per-pathway.txt"
+        output_png = OUT_DIR + "test-precision-recall-per-pathway.png"
+
+        Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
+        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway.txt', shallow=False)
+
+
+
+    def test_precision_recall_per_pathway_empty(self):
+
+        file_paths = [INPUT_DIR + "data-test-params-empty/pathway.txt"]
+        algorithms = ["test"]
+        output_file = OUT_DIR +"test-precision-recall-per-pathway-empty.txt"
+        output_png = OUT_DIR + "test-precision-recall-per-pathway-empty.png"
+
+        Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
+        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-empty.txt', shallow=False)
+
+
+    def  test_precision_recall_pca_chosen_pathway(self):
+        file_paths = [INPUT_DIR + "data-test-params-123/pathway.txt", INPUT_DIR + "data-test-params-456/pathway.txt",  INPUT_DIR + "data-test-params-789/pathway.txt",  INPUT_DIR + "data-test-params-empty/pathway.txt"]
+        algorithms = ["test"]
+        output_file = OUT_DIR +"test-precision-recall-per-pathway-pca-chosen.txt"
+        output_png = OUT_DIR + "test-precision-recall-per-pathway-pca-chosen.png"
+
+        dataframe = ml.summarize_networks(file_paths)
+        ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt', OUT_DIR + 'pca-coordinates.tsv')
+
+        pathway = Evaluation.pca_chosen_pathway(OUT_DIR + 'pca-coordinates.tsv', INPUT_DIR)
+        print(pathway)
+        Evaluation.precision_and_recall(pathway, NODE_TABLE, algorithms, output_file, output_png)
+        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-pca-chosen.txt', shallow=False)
diff --git a/test/test_config.py b/test/test_config.py
index bf13cd6e..0f0d813e 100644
--- a/test/test_config.py
+++ b/test/test_config.py
@@ -142,3 +142,5 @@ def test_error_gs_dataset_mismatch(self):
 
         with pytest.raises(ValueError):
             config.init_global(test_config)
+
+    # TODO: should I add a test case on the new config eval / ml couple code

From 23d1070a8b7dcf348077cc20968fc61e6250df05 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Fri, 15 Nov 2024 14:40:46 -0600
Subject: [PATCH 13/22] added algorithms to be used for eval code

---
 Snakefile | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Snakefile b/Snakefile
index c8ab2407..caee3428 100644
--- a/Snakefile
+++ b/Snakefile
@@ -395,11 +395,11 @@ rule evaluation:
         pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-pca-chosen-pathway.txt"]),
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
-        Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_png)
+        Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png)
         node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file)
-        Evaluation.PRC_node_ensemble(node_ensemble, node_table, output.pr_curve_png)
+        Evaluation.precision_recall_curve_node_ensemble(node_ensemble, node_table, output.pr_curve_png)
         pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir)
-        Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file)
+        Evaluation.precision_and_recall(pca_chosen_pathway, node_table, algorithms, output.pca_chosen_pr_file)
 
 # Returns all pathways for a specific algorithm and dataset
 def collect_pathways_per_algo_per_dataset(wildcards):
@@ -427,7 +427,7 @@ rule evaluation_per_algo_pathways:
         pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-recall-per-pathway.png']),
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
-        Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_png)
+        Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png)
 
 rule evaluation_per_algo_ensemble_pr_curve:
     input: 
@@ -438,7 +438,7 @@ rule evaluation_per_algo_ensemble_pr_curve:
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
         node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file)
-        Evaluation.PRC_node_ensemble(node_ensemble, node_table, output.pr_curve_png)
+        Evaluation.precision_recall_curve_node_ensemble(node_ensemble, node_table, output.pr_curve_png)
 
 rule evaluation_per_algo_pca_chosen:
     input: 
@@ -449,7 +449,7 @@ rule evaluation_per_algo_pca_chosen:
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
         pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir)
-        Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file)
+        Evaluation.precision_and_recall(pca_chosen_pathway, node_table, algorithms, output.pca_chosen_pr_file)
 
 # Remove the output directory
 rule clean:

From 0408a202c9e3b08d9ec402391bbb011fe06231dd Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Fri, 15 Nov 2024 14:41:43 -0600
Subject: [PATCH 14/22] pre commit test_evaluate.py

---
 test/evaluate/test_evaluate.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py
index 1c1e1e9b..cfb2e8a2 100644
--- a/test/evaluate/test_evaluate.py
+++ b/test/evaluate/test_evaluate.py
@@ -48,8 +48,6 @@ def test_precision_recall_per_pathway(self):
         Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
         assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway.txt', shallow=False)
 
-
-
     def test_precision_recall_per_pathway_empty(self):
 
         file_paths = [INPUT_DIR + "data-test-params-empty/pathway.txt"]

From f1f58e79261336c29c07c2a528959a405f536c3f Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Fri, 15 Nov 2024 14:42:24 -0600
Subject: [PATCH 15/22] updated evaluation.py

---
 spras/evaluation.py | 66 ++++++++++++++++++++++++++++++---------------
 1 file changed, 45 insertions(+), 21 deletions(-)

diff --git a/spras/evaluation.py b/spras/evaluation.py
index 6757dcf9..36dc59b4 100644
--- a/spras/evaluation.py
+++ b/spras/evaluation.py
@@ -79,27 +79,28 @@ def load_files_from_dict(self, gold_standard_dict: Dict):
         # TODO: later iteration - chose between node and edge file, or allow both
 
     @staticmethod
-    def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str, output_png: str ):
+    def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, algorithms: list, output_file: str, output_png:str=None):
         """
         Takes in file paths for a specific dataset and an associated gold standard node table.
         Calculates precision and recall for each pathway file
         Returns output back to output_file
         @param file_paths: file paths of pathway reconstruction algorithm outputs
         @param node_table: the gold standard nodes
+        @param algorithms: list of algorithms used in current run of SPRAS
         @param output_file: the filename to save the precision and recall of each pathway
-        @param output_png: the filename to plot the precision and recall of each pathway (not a PRC)
+        @param output_png (optional): the filename to plot the precision and recall of each pathway (not a PRC)
         """
         y_true = set(node_table['NODEID'])
         results = []
-
         for file in file_paths:
             df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"])
+            # TODO: do we want to include the pathways that are empty for evaluation / in the pr_df?
             y_pred = set(df['Node1']).union(set(df['Node2']))
             all_nodes = y_true.union(y_pred)
             y_true_binary = [1 if node in y_true else 0 for node in all_nodes]
             y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes]
-
             # default to 0.0 if there is a divide by 0 error
+            # not using precision_recall_curve because thresholds are binary (0 or 1); rather we are directly calculating precision and recall per pathway
             precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0)
             recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0)
             results.append({"Pathway": file, "Precision": precision, "Recall": recall})
@@ -107,18 +108,43 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, o
         pr_df = pd.DataFrame(results)
         pr_df.sort_values(by=["Recall", "Pathway"], axis=0, ascending=True, inplace=True)
         pr_df.to_csv(output_file, sep="\t", index=False)
-
-        plt.figure(figsize=(8, 6))
-        plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="PR")
-        plt.xlabel("Recall")
-        plt.ylabel("Precision")
-        plt.title(f"Precision and Recall Plot")
-        plt.legend()
-        plt.grid(True)
-        plt.savefig(output_png)
-        # TODO: what to do when this is empty
-
-    def select_max_freq_and_node(row): # TODO: what (:type) would this row be
+        print(pr_df)
+
+        num_of_algorithms_used = 0
+        if output_png is not None:
+            if not pr_df.empty:
+                plt.figure(figsize=(8, 6))
+                # plot a line per algorithm
+                for algorithm in algorithms: #TODO I think there is a better way than doing this; using split on the filepaths doesn't work bc it is not adaptable
+                    subset = pr_df[pr_df["Pathway"].str.contains(algorithm)]
+                    if not subset.empty:
+                        plt.plot(
+                            subset["Recall"],
+                            subset["Precision"],
+                            marker='o',
+                            linestyle='-',
+                            label=f"{algorithm}"
+                        )
+                        num_of_algorithms_used += 1
+
+                # plot overall precision and recall from all the algorithms
+                if num_of_algorithms_used > 1:
+                    plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="Overall Precision-Recall")
+
+                plt.xlabel("Recall")
+                plt.ylabel("Precision")
+                plt.title(f"Precision and Recall Plot")
+                plt.legend()
+                plt.grid(True)
+                plt.savefig(output_png)
+            else:
+                plt.figure()
+                plt.plot([], [])
+                plt.title("Empty Pathway Files")
+                plt.savefig(output_png)
+
+
+    def select_max_freq_and_node(row: pd.Series):
         """
         Selects the node and frequency with the highest frequency value from two potential nodes in a row.
         Handles cases where one of the nodes or frequencies may be missing and returns the node associated with the maximum frequency.
@@ -160,10 +186,9 @@ def edge_frequency_node_ensemble(ensemble_file: str):
             node_ensemble.sort_values('max_freq', ascending= False, inplace = True)
             return node_ensemble
         else:
-            # TODO: figure out how to deal with empty ensemble files
             return pd.DataFrame(columns = ['Node', 'max_freq'])
 
-    def PRC_node_ensemble(node_ensemble:pd.DataFrame, node_table:pd.DataFrame, output_png: str):
+    def precision_recall_curve_node_ensemble(node_ensemble:pd.DataFrame, node_table:pd.DataFrame, output_png: str):
         """
         Takes in an node ensemble for specific dataset or specific algorithm in a dataset, and an associated gold standard node table.
         Plots a precision and recall curve for the node ensemble against its associated gold standard node table
@@ -190,10 +215,9 @@ def PRC_node_ensemble(node_ensemble:pd.DataFrame, node_table:pd.DataFrame, outpu
             plt.grid(True)
             plt.savefig(output_png)
         else:
-            # TODO figure out how to deal with empty ensemble files (still will have the header)
             plt.figure()
-            plt.text(0.5, 0.5, "empty ensemble file", ha='center', va='center', fontsize=12, color='red')
-            plt.axis('off')
+            plt.plot([], [])
+            plt.title("Empty Ensemble File")
             plt.savefig(output_png)
 
     def pca_chosen_pathway(coordinates_file: str, output_dir:str):

From 35074755db2017747a0bce8cf5fd5cde91474131 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Fri, 15 Nov 2024 14:42:45 -0600
Subject: [PATCH 16/22] updated all config files

---
 config/config.yaml            | 6 +++++-
 config/egfr-param-tuning.yaml | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index fd47638b..79b7d086 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -177,4 +177,8 @@ analysis:
         include: true
         # adds evaluation per algorithm per dataset-goldstandard pair
         aggregate_per_algorithm: true
-
+        # TODO: should we decouple parts of eval that involve ml
+        # it will be good to seperate them otherwise if ml doesn't work then eval won't work at all
+          # pca_chosen
+          # ensemble
+          # precision and recall
diff --git a/config/egfr-param-tuning.yaml b/config/egfr-param-tuning.yaml
index ecc2a65f..a0a965b7 100644
--- a/config/egfr-param-tuning.yaml
+++ b/config/egfr-param-tuning.yaml
@@ -3439,7 +3439,7 @@ gold_standards:
       - tps_egfr
 reconstruction_settings:
   locations:
-    reconstruction_dir: output
+    reconstruction_dir: output/tps_egfr
   run: true
 analysis:
   summary:

From dd0359f58f3674fdffa1ec18433a7335e0824171 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Fri, 15 Nov 2024 14:48:08 -0600
Subject: [PATCH 17/22] cleaned up spras/evaluation.py

---
 spras/evaluation.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spras/evaluation.py b/spras/evaluation.py
index 36dc59b4..b621daad 100644
--- a/spras/evaluation.py
+++ b/spras/evaluation.py
@@ -108,7 +108,6 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a
         pr_df = pd.DataFrame(results)
         pr_df.sort_values(by=["Recall", "Pathway"], axis=0, ascending=True, inplace=True)
         pr_df.to_csv(output_file, sep="\t", index=False)
-        print(pr_df)
 
         num_of_algorithms_used = 0
         if output_png is not None:

From ef15799f3bf653638b1139557887aa5de49975da Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 18 Nov 2024 12:15:02 -0600
Subject: [PATCH 18/22] updated spacing and added comments to the config files

---
 config/config.yaml | 2 ++
 config/egfr.yaml   | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/config/config.yaml b/config/config.yaml
index 79b7d086..76231276 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -174,8 +174,10 @@ analysis:
         metric: 'euclidean'
       evaluation:
         # evaluation per dataset-goldstandard pair
+        # evaluation will not run unless ml include is set to true
         include: true
         # adds evaluation per algorithm per dataset-goldstandard pair
+        # evaluation per algorithm will not run unless ml include and ml aggregate_per_algorithm are both set to true
         aggregate_per_algorithm: true
         # TODO: should we decouple parts of eval that involve ml
         # it will be good to seperate them otherwise if ml doesn't work then eval won't work at all
diff --git a/config/egfr.yaml b/config/egfr.yaml
index 0b41f0a5..93cbccec 100644
--- a/config/egfr.yaml
+++ b/config/egfr.yaml
@@ -90,4 +90,4 @@ analysis:
   ml:
     include: false
   evaluation:
-        include: false
+    include: false

From 47dab1a4ea313f7fdaa3d1d7e2666e89ba17678c Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 18 Nov 2024 12:15:16 -0600
Subject: [PATCH 19/22] updated evaluation.py code

---
 spras/evaluation.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/spras/evaluation.py b/spras/evaluation.py
index b621daad..3e2b1e0b 100644
--- a/spras/evaluation.py
+++ b/spras/evaluation.py
@@ -109,12 +109,11 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a
         pr_df.sort_values(by=["Recall", "Pathway"], axis=0, ascending=True, inplace=True)
         pr_df.to_csv(output_file, sep="\t", index=False)
 
-        num_of_algorithms_used = 0
         if output_png is not None:
             if not pr_df.empty:
                 plt.figure(figsize=(8, 6))
                 # plot a line per algorithm
-                for algorithm in algorithms: #TODO I think there is a better way than doing this; using split on the filepaths doesn't work bc it is not adaptable
+                for algorithm in algorithms:
                     subset = pr_df[pr_df["Pathway"].str.contains(algorithm)]
                     if not subset.empty:
                         plt.plot(
@@ -124,11 +123,10 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a
                             linestyle='-',
                             label=f"{algorithm}"
                         )
-                        num_of_algorithms_used += 1
 
-                # plot overall precision and recall from all the algorithms
-                if num_of_algorithms_used > 1:
-                    plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="Overall Precision-Recall")
+                # plot combined precision and recall from all the algorithms
+                if len(algorithms) > 1:
+                    plt.plot(pr_df["Recall"], pr_df["Precision"], linestyle='--', color='b', label="Overall Precision-Recall", alpha = 0.3)
 
                 plt.xlabel("Recall")
                 plt.ylabel("Precision")

From b3504b52cb3644903ac18c7abba65b2dd2751f2b Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 18 Nov 2024 12:15:42 -0600
Subject: [PATCH 20/22] cleaned up eval tests and added coupling tests to
 config

---
 test/evaluate/test_evaluate.py |   1 -
 test/test_config.py            | 135 ++++++++++++++++++++++++++++++++-
 2 files changed, 132 insertions(+), 4 deletions(-)

diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py
index cfb2e8a2..5dc0b8f3 100644
--- a/test/evaluate/test_evaluate.py
+++ b/test/evaluate/test_evaluate.py
@@ -69,6 +69,5 @@ def  test_precision_recall_pca_chosen_pathway(self):
         ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt', OUT_DIR + 'pca-coordinates.tsv')
 
         pathway = Evaluation.pca_chosen_pathway(OUT_DIR + 'pca-coordinates.tsv', INPUT_DIR)
-        print(pathway)
         Evaluation.precision_and_recall(pathway, NODE_TABLE, algorithms, output_file, output_png)
         assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-pca-chosen.txt', shallow=False)
diff --git a/test/test_config.py b/test/test_config.py
index 0f0d813e..c89d7123 100644
--- a/test/test_config.py
+++ b/test/test_config.py
@@ -27,7 +27,8 @@ def get_test_config():
                 "include": False
             },
             "ml": {
-                "include": False
+                "include": False,
+                "aggregate_per_algorithm": False
             },
             "graphspace": {
                 "include": False
@@ -36,7 +37,8 @@ def get_test_config():
                 "include": False
             },
             "evaluation": {
-                "include": False
+                "include": False,
+                 "aggregate_per_algorithm": False
             },
         },
     }
@@ -143,4 +145,131 @@ def test_error_gs_dataset_mismatch(self):
         with pytest.raises(ValueError):
             config.init_global(test_config)
 
-    # TODO: should I add a test case on the new config eval / ml couple code
+    def test_eval_ml_coupling(self):
+        test_config = get_test_config()
+        include_combos = [(True, True), (True, False), (False, True), (False, False)]
+
+        # ml: True evaluation: True
+        test_config["analysis"]["ml"]["include"] = include_combos[0][0]
+        test_config["analysis"]["evaluation"]["include"] = include_combos[0][1]
+        config.init_global(test_config)
+        assert config.config.analysis_include_ml == True and config.config.analysis_include_evaluation == True
+
+        # ml: True evaluation: False
+        test_config["analysis"]["ml"]["include"] = include_combos[1][0]
+        test_config["analysis"]["evaluation"]["include"] = include_combos[1][1]
+        config.init_global(test_config)
+        assert config.config.analysis_include_ml == True and config.config.analysis_include_evaluation == False
+
+        # ml: False evaluation: True
+        test_config["analysis"]["ml"]["include"] = include_combos[2][0]
+        test_config["analysis"]["evaluation"]["include"] = include_combos[2][1]
+        config.init_global(test_config)
+        assert config.config.analysis_include_ml == False and config.config.analysis_include_evaluation == False
+
+        # ml: False evaluation: False
+        test_config["analysis"]["ml"]["include"] = include_combos[3][0]
+        test_config["analysis"]["evaluation"]["include"] = include_combos[3][1]
+        config.init_global(test_config)
+        assert config.config.analysis_include_ml == False and config.config.analysis_include_evaluation == False
+
+
+    def test_ml_agg_algo_coupling(self):
+
+        test_config = get_test_config()
+        include_combos = [(True, True), (True, False), (False, True), (False, False)]
+
+        test_config["analysis"]["ml"]["include"] = include_combos[0][0]
+        test_config["analysis"]["ml"]["aggregate_per_algorithm"] = include_combos[0][1]
+        config.init_global(test_config)
+        assert config.config.analysis_include_ml == True and config.config.analysis_include_ml_aggregate_algo == True
+
+
+        test_config["analysis"]["ml"]["include"] = include_combos[1][0]
+        test_config["analysis"]["ml"]["aggregate_per_algorithm"] = include_combos[1][1]
+        config.init_global(test_config)
+        assert config.config.analysis_include_ml == True and config.config.analysis_include_ml_aggregate_algo == False
+
+
+        test_config["analysis"]["ml"]["include"] = include_combos[2][0]
+        test_config["analysis"]["ml"]["aggregate_per_algorithm"] = include_combos[2][1]
+        config.init_global(test_config)
+        assert config.config.analysis_include_ml == False and config.config.analysis_include_ml_aggregate_algo == False
+
+
+        test_config["analysis"]["ml"]["include"] = include_combos[3][0]
+        test_config["analysis"]["ml"]["aggregate_per_algorithm"] = include_combos[3][1]
+        config.init_global(test_config)
+        assert config.config.analysis_include_ml == False and config.config.analysis_include_ml_aggregate_algo == False
+
+    def test_eval_agg_algo_coupling(self):
+
+        test_config = get_test_config()
+        test_config["analysis"]["ml"]["include"] = True
+        test_config["analysis"]["ml"]["aggregate_per_algorithm"] = True
+
+        include_combos = [(True, True), (True, False), (False, True), (False, False)]
+
+        test_config["analysis"]["evaluation"]["include"] = include_combos[0][0]
+        test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = include_combos[0][1]
+        config.init_global(test_config)
+        assert config.config.analysis_include_evaluation == True and config.config.analysis_include_evaluation_aggregate_algo == True
+
+
+        test_config["analysis"]["evaluation"]["include"] = include_combos[1][0]
+        test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = include_combos[1][1]
+        config.init_global(test_config)
+        assert config.config.analysis_include_evaluation == True and config.config.analysis_include_evaluation_aggregate_algo == False
+
+
+        test_config["analysis"]["evaluation"]["include"] = include_combos[2][0]
+        test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = include_combos[2][1]
+        config.init_global(test_config)
+        assert config.config.analysis_include_evaluation == False and config.config.analysis_include_evaluation_aggregate_algo == False
+
+
+        test_config["analysis"]["evaluation"]["include"] = include_combos[3][0]
+        test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = include_combos[3][1]
+        config.init_global(test_config)
+        assert config.config.analysis_include_evaluation == False and config.config.analysis_include_evaluation_aggregate_algo == False
+
+    def test_eval_ml_agg_algo_coupling(self):
+
+        # the value of ml include and ml aggregate_per_algorithm can affect the value of evaluation include and evaluation aggregate_per_algorithm
+
+        test_config = get_test_config()
+
+        test_config["analysis"]["ml"]["include"] = False
+        test_config["analysis"]["ml"]["aggregate_per_algorithm"] = True
+        test_config["analysis"]["evaluation"]["include"] = True
+        test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = True
+        config.init_global(test_config)
+        assert config.config.analysis_include_evaluation == False and config.config.analysis_include_evaluation_aggregate_algo == False and config.config.analysis_include_ml == False and config.config.analysis_include_ml_aggregate_algo == False
+
+        test_config["analysis"]["ml"]["include"] = True
+        test_config["analysis"]["ml"]["aggregate_per_algorithm"] = False
+        test_config["analysis"]["evaluation"]["include"] = True
+        test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = True
+        config.init_global(test_config)
+        assert config.config.analysis_include_evaluation == True and config.config.analysis_include_evaluation_aggregate_algo == False and config.config.analysis_include_ml == True and config.config.analysis_include_ml_aggregate_algo == False
+
+        test_config["analysis"]["ml"]["include"] = False
+        test_config["analysis"]["ml"]["aggregate_per_algorithm"] = False
+        test_config["analysis"]["evaluation"]["include"] = True
+        test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = True
+        config.init_global(test_config)
+        assert config.config.analysis_include_evaluation == False and config.config.analysis_include_evaluation_aggregate_algo == False and config.config.analysis_include_ml == False and config.config.analysis_include_ml_aggregate_algo == False
+
+        test_config["analysis"]["ml"]["include"] = True
+        test_config["analysis"]["ml"]["aggregate_per_algorithm"] = True
+        test_config["analysis"]["evaluation"]["include"] = True
+        test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = True
+        config.init_global(test_config)
+        assert config.config.analysis_include_evaluation == True and config.config.analysis_include_evaluation_aggregate_algo == True and config.config.analysis_include_ml == True and config.config.analysis_include_ml_aggregate_algo == True
+
+        test_config["analysis"]["ml"]["include"] = True
+        test_config["analysis"]["ml"]["aggregate_per_algorithm"] = False
+        test_config["analysis"]["evaluation"]["include"] = False
+        test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = False
+        config.init_global(test_config)
+        assert config.config.analysis_include_evaluation == False and config.config.analysis_include_evaluation_aggregate_algo == False and config.config.analysis_include_ml == True and config.config.analysis_include_ml_aggregate_algo == False

From dfcd302c554f50a3a2b2f93c9e495debd8ad28a7 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Thu, 5 Dec 2024 16:03:05 -0600
Subject: [PATCH 21/22] change how plot is

---
 spras/evaluation.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/spras/evaluation.py b/spras/evaluation.py
index 3e2b1e0b..61628f59 100644
--- a/spras/evaluation.py
+++ b/spras/evaluation.py
@@ -120,14 +120,11 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a
                             subset["Recall"],
                             subset["Precision"],
                             marker='o',
-                            linestyle='-',
+                            linestyle='',
                             label=f"{algorithm}"
                         )
 
-                # plot combined precision and recall from all the algorithms
-                if len(algorithms) > 1:
-                    plt.plot(pr_df["Recall"], pr_df["Precision"], linestyle='--', color='b', label="Overall Precision-Recall", alpha = 0.3)
-
+               
                 plt.xlabel("Recall")
                 plt.ylabel("Precision")
                 plt.title(f"Precision and Recall Plot")

From 97a7d7be46ec2516710cb4ca0122a013bc88d897 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 9 Dec 2024 16:01:16 -0600
Subject: [PATCH 22/22] precommit

---
 spras/evaluation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spras/evaluation.py b/spras/evaluation.py
index 61628f59..e6f60c0b 100644
--- a/spras/evaluation.py
+++ b/spras/evaluation.py
@@ -124,7 +124,7 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a
                             label=f"{algorithm}"
                         )
 
-               
+
                 plt.xlabel("Recall")
                 plt.ylabel("Precision")
                 plt.title(f"Precision and Recall Plot")