Fixed algorithms to account for self edges and added .csv output for the list of all ROC and PR values that result from running repetitions.
Reed-CompBio · Jun 18, 2024 · 67043d6 · 67043d6
1 parent f160609
commit 67043d6
Show file tree

Hide file tree

Showing 9 changed files with 139 additions and 43 deletions.
diff --git a/classes/hypergeometric_distribution_class.py b/classes/hypergeometric_distribution_class.py
@@ -76,14 +76,18 @@ def predict(
                 )
             )
 
+            c = 0
+            if G.has_edge(positive_protein, positive_protein):
+                c = 1 #Removes extra node if there is an edge to self 
+
             N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
-            pos_n = len(positive_pro_pro_neighbor) #Number of protein neighbors the protein of interest has
+            pos_n = len(positive_pro_pro_neighbor) - c #Number of protein neighbors the protein of interest has
             K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include protein of interest (but does not change significantly if protein is included)
-            pos_k = positive_go_annotated_pro_pro_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
-
+            pos_k = positive_go_annotated_pro_pro_neighbor_count - c #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
+            
             #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
             positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
-
+                
             # calculate the score for the negative set
             negative_pro_pro_neighbor = get_neighbors(
                 G, negative_protein, "protein_protein"
@@ -95,11 +99,16 @@ def predict(
                 )
             )
 
-            neg_n = len(negative_pro_pro_neighbor) #Negative protein of interest neighbors
+            c = 0
+            if G.has_edge(negative_protein, negative_protein):
+                c = 1
+
+            neg_n = len(negative_pro_pro_neighbor) - c #Negative protein of interest neighbors
             neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)
 
             negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
 
+
             # input positive and negative score to data
             data["protein"].append(positive_protein)
             data["go_term"].append(positive_go)

diff --git a/classes/hypergeometric_distribution_class_V2.py b/classes/hypergeometric_distribution_class_V2.py
@@ -75,11 +75,15 @@ def predict(
                     G, positive_pro_pro_neighbor, positive_go
                 )
             )
-
+
+            c = 1
+            if G.has_edge(positive_protein, positive_protein):
+                c = 0
+
             N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
-            pos_n = len(positive_pro_pro_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
+            pos_n = len(positive_pro_pro_neighbor) + c #Number of protein neighbors the protein of interest has (includes the protein of interest)
             K = len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
-            pos_k = positive_go_annotated_pro_pro_neighbor_count + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
+            pos_k = positive_go_annotated_pro_pro_neighbor_count + c #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
 
             #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
             positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
@@ -95,7 +99,11 @@ def predict(
                 )
             )
 
-            neg_n = len(negative_pro_pro_neighbor) + 1 #Negative protein of interest neighbors (includes self)
+            c = 1
+            if G.has_edge(negative_protein, negative_protein):
+                c = 0
+
+            neg_n = len(negative_pro_pro_neighbor) + c #Negative protein of interest neighbors (includes self)
             neg_k = negative_go_annotated_protein_neighbor_count #Overlap betweesn go neighbors and protein neighbors (should be fewer for neg than pos)
 
             negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
@@ -159,4 +167,4 @@ def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm):
     for element in nodeList:
         if G.has_edge(element[0], goTerm):
             count += 1
-    return count
+    return count
diff --git a/classes/overlapping_neighbors_class.py b/classes/overlapping_neighbors_class.py
@@ -55,30 +55,40 @@ def predict(
 
         positive_dataset, negative_dataset = get_datasets(input_directory_path)
         G = import_graph_from_pickle(graph_file_path)
-
         i = 1
         for positive_protein, positive_go, negative_protein, negative_go in zip(
             positive_dataset["protein"],
             positive_dataset["go"],
             negative_dataset["protein"],
             negative_dataset["go"],
         ):
-
+            c = 0
+            if G.has_edge(positive_protein, positive_protein):
+                c = 1
             # calculate the score for the positive set
             positive_pro_pro_neighbor = get_neighbors(
                 G, positive_protein, "protein_protein"
             )
+
+            # print("\nPositive protein neighbors: " + str(positive_pro_pro_neighbor))
             positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
             positive_go_annotated_pro_pro_neighbor_count = (
                 get_go_annotated_pro_pro_neighbor_count(
                     G, positive_pro_pro_neighbor, positive_go
                 )
-            )
-            positive_score = (1 + positive_go_annotated_pro_pro_neighbor_count) / (
-                len(positive_pro_pro_neighbor) + len(positive_go_neighbor)
-            )
+            ) - c
+
+            if len(positive_pro_pro_neighbor) == 0:
+                positive_score = 0
+            else:
+                positive_score = (1 + positive_go_annotated_pro_pro_neighbor_count) / (
+                    len(positive_pro_pro_neighbor) -c + len(positive_go_neighbor)
+                )
 
             # calculate the score for the negative set
+            c = 0
+            if G.has_edge(negative_protein, negative_protein):
+                c = 1
             negative_pro_pro_neighbor = get_neighbors(
                 G, negative_protein, "protein_protein"
             )
@@ -88,9 +98,13 @@ def predict(
                     G, negative_pro_pro_neighbor, negative_go
                 )
             )
-            negative_score = (1 + negative_go_annotated_protein_neighbor_count) / (
-                len(negative_pro_pro_neighbor) + len(negative_go_neighbor)
-            )
+
+            if len(negative_pro_pro_neighbor) == 0:
+                negative_score = 0
+            else:
+                negative_score = (1 + negative_go_annotated_protein_neighbor_count) / (
+                    len(negative_pro_pro_neighbor) - c + len(negative_go_neighbor)
+                )
 
             # input positive and negative score to data
             data["protein"].append(positive_protein)
@@ -141,7 +155,7 @@ def get_neighbors(G: nx.Graph, node, edgeType):
     for edge in res:
         if edge[2]["type"] == edgeType:
             neighborNode = [edge[1], edge[2]]
-            neighbors.append(neighborNode)
+            neighbors.append(neighborNode) 
 
     return neighbors
 

diff --git a/classes/overlapping_neighbors_v2_class.py b/classes/overlapping_neighbors_v2_class.py
@@ -60,7 +60,9 @@ def predict(
             negative_dataset["protein"],
             negative_dataset["go"],
         ):
-
+            c = 0
+            if G.has_edge(positive_protein, positive_protein):
+                c = 1
             # calculate the score for the positive set
             positive_pro_pro_neighbor = get_neighbors(
                 G, positive_protein, "protein_protein"
@@ -70,13 +72,16 @@ def predict(
                 get_go_annotated_pro_pro_neighbor_count(
                     G, positive_pro_pro_neighbor, positive_go
                 )
-            )
+            ) - c
             positive_score = positive_go_annotated_pro_pro_neighbor_count + (
                 1
-                + len(positive_pro_pro_neighbor)
+                + (len(positive_pro_pro_neighbor) - c)
                 * positive_go_annotated_pro_pro_neighbor_count
             ) / (len(positive_go_neighbor) / 2)
 
+            c = 0 
+            if G.has_edge(negative_protein, negative_protein):
+                c = 1
             # calculate the score for the negative set
             negative_pro_pro_neighbor = get_neighbors(
                 G, negative_protein, "protein_protein"
@@ -89,7 +94,7 @@ def predict(
             )
             negative_score = negative_go_annotated_pro_pro_neighbor_count + (
                 1
-                + len(negative_pro_pro_neighbor)
+                + (len(negative_pro_pro_neighbor) - c)
                 * negative_go_annotated_pro_pro_neighbor_count
             ) / (len(negative_go_neighbor) / 2)
 

diff --git a/classes/overlapping_neighbors_v3_class.py b/classes/overlapping_neighbors_v3_class.py
@@ -60,6 +60,9 @@ def predict(
             negative_dataset["protein"],
             negative_dataset["go"],
         ):
+            c = 0
+            if G.has_edge(positive_protein, positive_protein):
+                c = 1
             # calculate the score for the positive set
             positive_pro_pro_neighbor = get_neighbors(
                 G, positive_protein, "protein_protein"
@@ -69,7 +72,7 @@ def predict(
                 get_go_annotated_pro_pro_neighbor_count(
                     G, positive_pro_pro_neighbor, positive_go
                 )
-            )
+            ) - c
             positive_score = positive_go_annotated_pro_pro_neighbor_count + (
                 1 + positive_go_annotated_pro_pro_neighbor_count
             ) / (len(positive_go_neighbor))
@@ -83,7 +86,7 @@ def predict(
                 get_go_annotated_pro_pro_neighbor_count(
                     G, negative_pro_pro_neighbor, negative_go
                 )
-            )
+            ) 
             negative_score = negative_go_annotated_pro_pro_neighbor_count + (
                 1 + negative_go_annotated_pro_pro_neighbor_count
             ) / (len(negative_go_neighbor))

diff --git a/classes/protein_degree_class.py b/classes/protein_degree_class.py
@@ -52,14 +52,20 @@ def predict(
             negative_dataset["go"],
         ):
 
+            c = 0 
+            if G.has_edge(positive_protein, positive_protein):
+                c = 1
             data["protein"].append(positive_protein)
             data["go_term"].append(positive_go)
-            data["degree"].append(G.degree(positive_protein))
+            data["degree"].append(G.degree(positive_protein) - c)
             data["true_label"].append(1)
 
+            c = 0
+            if G.has_edge(negative_protein, negative_protein):
+                c = 1
             data["protein"].append(negative_protein)
             data["go_term"].append(negative_go)
-            data["degree"].append(G.degree(negative_protein))
+            data["degree"].append(G.degree(negative_protein) - c)
             data["true_label"].append(0)
             print_progress(i, len(positive_dataset["protein"]))
             i += 1

diff --git a/classes/protein_degree_v2_class.py b/classes/protein_degree_v2_class.py
@@ -54,17 +54,23 @@ def predict(
             negative_dataset["go"],
         ):
 
+            c = 0
+            if G.has_edge(positive_protein, positive_protein):
+                c = 1
             data["protein"].append(positive_protein)
             data["go_term"].append(positive_go)
             data["degree"].append(
-                len(get_neighbors(G, positive_protein, "protein_protein"))
+                len(get_neighbors(G, positive_protein, "protein_protein")) - c
             )
             data["true_label"].append(1)
 
+            c = 0
+            if G.has_edge(negative_protein, negative_protein):
+                c = 1
             data["protein"].append(negative_protein)
             data["go_term"].append(negative_go)
             data["degree"].append(
-                len(get_neighbors(G, negative_protein, "protein_protein"))
+                len(get_neighbors(G, negative_protein, "protein_protein")) - c
             )
             data["true_label"].append(0)
             print_progress(i, len(positive_dataset["protein"]))

diff --git a/main.py b/main.py
@@ -47,7 +47,7 @@ def main():
     output_image_path = Path("./output/images/")
     dataset_directory_path = Path("./output/dataset")
     graph_file_path = Path(dataset_directory_path, "graph.pickle")
-    sample_size = 100
+    sample_size = 1000
 
     testing_output_data_path = Path("./output/data/")
     testing_output_image_path = Path("./output/images/")
@@ -85,15 +85,15 @@ def main():
         "HypergeometricDistributionV2": HypergeometricDistributionV2,
     }
 
-    repeats = 1
+    repeats = 20
 
     run_workflow(
         algorithm_classes,
         go_protein_pairs,
         sample_size,
         protein_list,
-        testing_graph_file_path,
-        testing_input_directory_path,
+        graph_file_path,
+        dataset_directory_path,
         output_data_path,
         output_image_path,
         repeats,

diff --git a/tools/workflow.py b/tools/workflow.py
@@ -55,10 +55,15 @@ def run_workflow(
     for i in algorithm_classes.keys():
         auc[i] = [[], []]
 
+    #Generate number of datasets if they don't already exist 
+
+
+
     for i in range(
         x
     ):  # Creates a pos/neg list each replicate then runs workflow like normal
-        print("\n\nReplicate: " + str(i) + "\n")
+        if x > 1:
+            print("\n\nReplicate: " + str(i+1) + "\n")
 
         # positive_dataset, negative_dataset = sample_data(
         #     go_protein_pairs, sample_size, protein_list, G, dataset_directory_path
@@ -76,11 +81,24 @@ def run_workflow(
 
         # each loop adds the roc and pr values, index 0 for roc and 1 for pr, for each algorithm
         for i in algorithm_classes.keys():
-            auc[i][0].append(results[i]["roc_auc"])
-            auc[i][1].append(results[i]["pr_auc"])
+            auc[i][0].append(round(results[i]["roc_auc"],5))
+            auc[i][1].append(round(results[i]["pr_auc"],5))
+
+    #Creates a dictionary for all pr values and all roc values 
+    roc = {}
+    pr = {}
 
-    # Finds mean and sd of values, ROC mean index 0, ROC sd index 1, PR mean index 2, and PR sd index 3
+    for i in algorithm_classes.keys():
+        roc[i] = auc[i][0]
+        pr[i] = auc[i][1]
+
     if x > 1:
+        cols = []
+        for i in range(x):
+            cols.append("Replicate " + str(i+1))
+        name = "_replicate_list"
+
+        # Finds mean and sd of values, ROC mean index 0, ROC sd index 1, PR mean index 2, and PR sd index 3
         for i in auc.keys():
             meanROC = round(stat.mean(auc[i][0]), 5)
             auc[i].append(round(stat.mean(auc[i][1]), 5))
@@ -106,7 +124,34 @@ def run_workflow(
             index=True,
             sep="\t",
         )
+    else:
+        cols = ["AUC"]
+        name = "_auc_results"
+
+    dfr = pd.DataFrame.from_dict(
+        roc,
+        orient = 'index',
+        columns = cols
+    )
 
+    dfp = pd.DataFrame.from_dict(
+        pr,
+        orient = 'index',
+        columns = cols
+    )
+
+    dfr.to_csv(
+        Path(output_data_path, "roc" + name + ".csv"),
+        index = True,
+        sep = "\t"
+    )
+
+    dfp.to_csv(
+        Path(output_data_path, "pr" + name + ".csv"),
+        index = True,
+        sep = "\t"
+    )
+
 
 def run_experiement(
     algorithm_classes,
@@ -461,11 +506,11 @@ def sort_results_by(results, key, output_path):
     df = pd.DataFrame(data)
     df = df.sort_values(by=key, ascending=False)
 
-    df.to_csv(
-        output_file_path,
-        index=False,
-        sep="\t",
-    )
+    # df.to_csv(
+    #     output_file_path,
+    #     index=False,
+    #     sep="\t",
+    # )
 
     algorithm_tuple_list = sorted(algorithm_tuple_list, key=itemgetter(1), reverse=True)