From 43f18bf40b573e30b61d81cd3f034cbbe1992782 Mon Sep 17 00:00:00 2001
From: ctrlaltaf
Date: Tue, 11 Jun 2024 10:42:56 -0700
Subject: [PATCH] Fix output file naming and prepare for pytest

---
 .gitignore                                   |  4 +-
 classes/hypergeometric_distribution_class.py |  2 +-
 .../hypergeometric_distribution_class_V2.py  |  2 +-
 .../hypergeometric_distribution_class_V3.py  |  2 +-
 .../hypergeometric_distribution_class_V4.py  |  2 +-
 main.py                                      | 13 ++++-
 main_replicates.py                           | 44 +++++++++++--------
 tools/workflow.py                            | 23 +++++++---
 8 files changed, 59 insertions(+), 33 deletions(-)

diff --git a/.gitignore b/.gitignore
index b1862fd..312d7be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,4 +45,6 @@ env/
 *.log
 
 output/
-input/
\ No newline at end of file
+input/
+
+.ipynb_checkpoints
diff --git a/classes/hypergeometric_distribution_class.py b/classes/hypergeometric_distribution_class.py
index 1076cd4..e508c84 100644
--- a/classes/hypergeometric_distribution_class.py
+++ b/classes/hypergeometric_distribution_class.py
@@ -132,7 +132,7 @@ def predict(
         df = df.sort_values(by="norm_score", ascending=False)
 
         df.to_csv(
-            Path(output_path, "hypergeometricdistribution.csv"),
+            Path(output_path, "hypergeometric_distribution.csv"),
             index=False,
             sep="\t",
         )
diff --git a/classes/hypergeometric_distribution_class_V2.py b/classes/hypergeometric_distribution_class_V2.py
index 7215725..5847929 100644
--- a/classes/hypergeometric_distribution_class_V2.py
+++ b/classes/hypergeometric_distribution_class_V2.py
@@ -132,7 +132,7 @@ def predict(
         df = df.sort_values(by="norm_score", ascending=False)
 
         df.to_csv(
-            Path(output_path, "hypergeometricdistributionV3.csv"),
+            Path(output_path, "hypergeometric_distribution_V2.csv"),
             index=False,
             sep="\t",
         )
diff --git a/classes/hypergeometric_distribution_class_V3.py b/classes/hypergeometric_distribution_class_V3.py
index 9654e8f..44584c6 100644
--- a/classes/hypergeometric_distribution_class_V3.py
+++ b/classes/hypergeometric_distribution_class_V3.py
@@ -134,7 +134,7 @@ def predict(
         df = df.sort_values(by="norm_score", ascending=False)
 
         df.to_csv(
-            Path(output_path, "hypergeometricdistribution.csv"),
+            Path(output_path, "hypergeometric_distribution_v3.csv"),
             index=False,
             sep="\t",
         )
diff --git a/classes/hypergeometric_distribution_class_V4.py b/classes/hypergeometric_distribution_class_V4.py
index 27c16f4..7e11186 100644
--- a/classes/hypergeometric_distribution_class_V4.py
+++ b/classes/hypergeometric_distribution_class_V4.py
@@ -135,7 +135,7 @@ def predict(
         df = df.sort_values(by="norm_score", ascending=False)
 
         df.to_csv(
-            Path(output_path, "hypergeometricdistribution.csv"),
+            Path(output_path, "hypergeometric_distribution_v4_data.csv"),
            index=False,
             sep="\t",
         )
diff --git a/main.py b/main.py
index ca7cc9b..0419d29 100644
--- a/main.py
+++ b/main.py
@@ -45,6 +45,11 @@ def main():
     graph_file_path = Path(dataset_directory_path, "graph.pickle")
     sample_size = 1000
 
+    testing_output_data_path = Path("./output/data/")
+    testing_output_image_path = Path("./output/images/")
+    testing_input_directory_path = Path("./tests/testing-dataset/")
+    testing_graph_file_path = Path(testing_input_directory_path, "graph.pickle")
+
     interactome_columns = [0, 1, 4, 5]
     interactome = read_specific_columns(interactome_path, interactome_columns, "\t")
 
@@ -81,10 +86,10 @@ def main():
 
     results = run_workflow(
         algorithm_classes,
-        dataset_directory_path,
-        graph_file_path,
-        output_data_path,
-        output_image_path,
+        testing_input_directory_path,
+        testing_graph_file_path,
+        testing_output_data_path,
+        testing_output_image_path,
         True,
         True,
     )
diff --git a/main_replicates.py b/main_replicates.py
index 0e28998..eacb356 100644
--- a/main_replicates.py
+++ b/main_replicates.py
@@ -75,22 +75,24 @@ def main():
         "HypergeometricDistributionV3": HypergeometricDistributionV3,
         "HypergeometricDistributionV4": HypergeometricDistributionV4,
     }
-
-    x = 20 #Number of replicates
+
+    x = 2  # Number of replicates
     print_graphs = False
     auc = {}
-    #index 0 is ROC, index 1 is Precision Recall
+    # index 0 is ROC, index 1 is Precision Recall
     for i in algorithm_classes.keys():
-        auc[i] = [[],[]]
+        auc[i] = [[], []]
 
-    for i in range(x): #Creates a pos/neg list each replicate then runs workflow like normal
+    for i in range(
+        x
+    ):  # Creates a pos/neg list each replicate then runs workflow like normal
         print("\n\nReplicate: " + str(i) + "\n")
-
+
         # if there is no sample dataset, uncomment the following lines. otherwise, the dataset in outputs will be used
         positive_dataset, negative_dataset = sample_data(
             go_protein_pairs, sample_size, protein_list, G, dataset_directory_path
         )
-
+
         results = run_workflow(
             algorithm_classes,
             dataset_directory_path,
@@ -101,29 +103,33 @@ def main():
             print_graphs,
         )
 
-        #each loop adds the roc and pr values, index 0 for roc and 1 for pr, for each algorithm
+        # each loop adds the roc and pr values, index 0 for roc and 1 for pr, for each algorithm
         for i in algorithm_classes.keys():
-            auc[i][0].append(results[i]['roc_auc'])
-            auc[i][1].append(results[i]['pr_auc'])
+            auc[i][0].append(results[i]["roc_auc"])
+            auc[i][1].append(results[i]["pr_auc"])
 
-    #Finds mean and sd of values, ROC mean index 0, ROC sd index 1, PR mean index 2, and PR sd index 3
+    # Finds mean and sd of values, ROC mean index 0, ROC sd index 1, PR mean index 2, and PR sd index 3
     for i in auc.keys():
-        meanROC = round(stat.mean(auc[i][0]),5)
-        auc[i].append(round(stat.mean(auc[i][1]),5))
-        auc[i].append(round(stat.stdev(auc[i][1]),5))
-        auc[i][1] = round(stat.stdev(auc[i][0]),5)
+        meanROC = round(stat.mean(auc[i][0]), 5)
+        auc[i].append(round(stat.mean(auc[i][1]), 5))
+        auc[i].append(round(stat.stdev(auc[i][1]), 5))
+        auc[i][1] = round(stat.stdev(auc[i][0]), 5)
         auc[i][0] = meanROC
 
-    #Prints the roc and pr table, then saves to .csv file
-    df = pd.DataFrame.from_dict(auc, orient = 'index', columns = ['ROC mean', 'ROC sd', 'Precision/Recall mean', 'Precision/Recall sd'])
+    # Prints the roc and pr table, then saves to .csv file
+    df = pd.DataFrame.from_dict(
+        auc,
+        orient="index",
+        columns=["ROC mean", "ROC sd", "Precision/Recall mean", "Precision/Recall sd"],
+    )
     print()
     print(df)
     df.to_csv(
-        Path(output_data_path, "auc_values.csv"),
+        Path(output_data_path, "repeated_auc_values.csv"),
         index=True,
         sep="\t",
     )
-
+
     sys.exit()
diff --git a/tools/workflow.py b/tools/workflow.py
index 7837b0b..a5ae925 100644
--- a/tools/workflow.py
+++ b/tools/workflow.py
@@ -38,7 +38,7 @@ def run_workflow(
     if threshold:
         run_thresholds(results, algorithm_classes, output_data_path)
     if figures:
-        generate_figures(algorithm_classes, results, output_image_path)
+        generate_figures(algorithm_classes, results, output_image_path, output_data_path)
 
     return results
 
@@ -135,12 +135,12 @@ def run_thresholds(results, algorithm_classes, output_data_path):
     )
 
 
-def generate_figures(algorithm_classes, results, output_image_path):
+def generate_figures(algorithm_classes, results, output_image_path, output_data_path):
     # Generate ROC and PR figures to compare methods
     colors = generate_random_colors(len(algorithm_classes))
 
-    sorted_results = sort_results_by(results, "roc_auc")
"roc_auc") + sorted_results = sort_results_by(results, "roc_auc", output_data_path) i = 0 plt.figure() for algorithm_name, metrics in sorted_results.items(): @@ -162,7 +162,7 @@ def generate_figures(algorithm_classes, results, output_image_path): plt.savefig(Path(output_image_path, "multiple_roc_curves.png")) plt.show() - sorted_results = sort_results_by(results, "pr_auc") + sorted_results = sort_results_by(results, "pr_auc", output_data_path) i = 0 plt.figure() for algorithm_name, metrics in sorted_results.items(): @@ -245,12 +245,25 @@ def get_datasets(input_directory_path): return positive_dataset, negative_dataset -def sort_results_by(results, key): +def sort_results_by(results, key, output_path): algorithm_tuple_list = [] + data = {"algorithm" : [], key: []} + output_file_path = Path(output_path, key + "_results.csv") # make a list of tuples where a tuple is (algorithm_name, the metric we will be sorting by) for algorithm_name, metrics in results.items(): algorithm_tuple_list.append((algorithm_name, metrics[key])) + data["algorithm"].append(algorithm_name) + data[key].append(metrics[key]) + + df = pd.DataFrame(data) + df = df.sort_values(by=key, ascending=False) + + df.to_csv( + output_file_path, + index=False, + sep="\t", + ) algorithm_tuple_list = sorted(algorithm_tuple_list, key=itemgetter(1), reverse=True)