Merge pull request #19 from Reed-CompBio/figures

Figures
Reed-CompBio · Sep 13, 2024 · d2fdf84 · d2fdf84
2 parents f6ddffc + 635ee73
commit d2fdf84
Show file tree

Hide file tree

Showing 53 changed files with 2,542,058 additions and 180 deletions.
diff --git a/classes/hypergeometric_distribution_class.py b/classes/hypergeometric_distribution_class.py
@@ -16,10 +16,10 @@ def __init__(self):
 
     def get_y_score(self):
         return self.y_score
-    
+
     def get_y_true(self):
         return self.y_true
-    
+
     def set_y_score(self, y_score):
         self.y_score = y_score
 
@@ -35,7 +35,7 @@ def predict(
         name,
     ):
         """
-        Uses a Hypergeometric distribution to calculate a confidence value for the relationship between a protein of 
+        Uses a Hypergeometric distribution to calculate a confidence value for the relationship between a protein of
         interest and a GO term. Does not include protein of interest in calculations.
         """
         colorama_init()
@@ -56,15 +56,17 @@ def predict(
             "true_label": [],
         }
 
-        positive_dataset, negative_dataset = get_datasets(input_directory_path, rep_num, name)
+        positive_dataset, negative_dataset = get_datasets(
+            input_directory_path, rep_num, name
+        )
         G = import_graph_from_pickle(graph_file_path)
-
+        N = len(
+            [x for x, y in G.nodes(data=True) if y["type"] == "protein"]
+        )  # Total number of protein nodes in the entire graph
         i = 1
-        for positive_protein, positive_go, negative_protein, negative_go in zip(
+        for positive_protein, positive_go in zip(
             positive_dataset["protein"],
             positive_dataset["go"],
-            negative_dataset["protein"],
-            negative_dataset["go"],
         ):
 
             # calculate the score for the positive set
@@ -78,23 +80,40 @@ def predict(
                 )
             )
 
-            N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
-            pos_n = len(positive_protein_neighbor) #Number of protein neighbors the protein of interest has
-            K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include protein of interest (but does not change significantly if protein is included)
-            pos_k = positive_go_annotated_protein_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
+            pos_n = len(
+                positive_protein_neighbor
+            )  # Number of protein neighbors the protein of interest has
+            K = (
+                len(positive_go_neighbor) - 1
+            )  # Number of protein neighbors the GO term of interest has, does not include protein of interest (but does not change significantly if protein is included)
+            pos_k = positive_go_annotated_protein_neighbor_count  # The overlap between the GO protein neighbors and protein neighbors of the protein of interest
+
+            # The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
+            positive_score = 1 - (
+                (math.comb(K, pos_k) * math.comb(N - K, pos_n - pos_k))
+                / math.comb(N, pos_n)
+            )
 
+            # input positive score to data
+            data["protein"].append(positive_protein)
+            data["go_term"].append(positive_go)
+            data["protein_neighbor"].append(len(positive_protein_neighbor))
+            data["go_neighbor"].append(len(positive_go_neighbor))
+            data["go_annotated_protein_neighbors"].append(
+                positive_go_annotated_protein_neighbor_count
+            )
+            data["score"].append(positive_score)
+            data["true_label"].append(1)
 
-            # if K == -1:
-            #     K = 1
+            print_progress(i, len(positive_dataset["protein"]))
+            i += 1
 
-            # print("N: ", N)
-            # print("pos_n: ", pos_n)
-            # print("K: ", K)
-            # print("pos_k: ", pos_k)
+        i = 1
+        for negative_protein, negative_go in zip(
+            negative_dataset["protein"],
+            negative_dataset["go"],
+        ):
 
-            #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
-            positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
-
             # calculate the score for the negative set
             negative_protein_neighbor = get_neighbors(
                 G, negative_protein, "protein_protein"
@@ -105,23 +124,19 @@ def predict(
                     G, negative_protein_neighbor, negative_go
                 )
             )
-
-            neg_n = len(negative_protein_neighbor) #Negative protein of interest neighbors
-            neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)
-
-            negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
-
-
-            # input positive and negative score to data
-            data["protein"].append(positive_protein)
-            data["go_term"].append(positive_go)
-            data["protein_neighbor"].append(len(positive_protein_neighbor))
-            data["go_neighbor"].append(len(positive_go_neighbor))
-            data["go_annotated_protein_neighbors"].append(
-                positive_go_annotated_protein_neighbor_count
+            K = (
+                len(negative_go_neighbor) - 1
+            )  # Number of protein neighbors the negative GO term has, minus 1 as in the positive loop (computed per negative pair, not shared with the positive pair)
+
+            neg_n = len(
+                negative_protein_neighbor
+            )  # Negative protein of interest neighbors
+            neg_k = negative_go_annotated_protein_neighbor_count  # Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)
+
+            negative_score = 1 - (
+                (math.comb(K, neg_k) * math.comb(N - K, neg_n - neg_k))
+                / math.comb(N, neg_n)
             )
-            data["score"].append(positive_score)
-            data["true_label"].append(1)
 
             data["protein"].append(negative_protein)
             data["go_term"].append(negative_go)
@@ -133,7 +148,7 @@ def predict(
             data["score"].append(negative_score)
             data["true_label"].append(0)
 
-            print_progress(i, len(positive_dataset["protein"]))
+            print_progress(i, len(negative_dataset["protein"]))
             i += 1
 
         normalized_data = normalize(data["score"])

diff --git a/classes/hypergeometric_distribution_class_V2.py b/classes/hypergeometric_distribution_class_V2.py
@@ -16,10 +16,10 @@ def __init__(self):
 
     def get_y_score(self):
         return self.y_score
-    
+
     def get_y_true(self):
         return self.y_true
-    
+
     def set_y_score(self, y_score):
         self.y_score = y_score
 
@@ -35,8 +35,8 @@ def predict(
         name,
     ):
         """
-        Uses a Hypergeometric distribution to calculate a confidence value for the relationship between a protein of 
-        interest and a GO term. Includes the protein of interest in calculations. 
+        Uses a Hypergeometric distribution to calculate a confidence value for the relationship between a protein of
+        interest and a GO term. Includes the protein of interest in calculations.
         """
         colorama_init()
 
@@ -56,15 +56,17 @@ def predict(
             "true_label": [],
         }
 
-        positive_dataset, negative_dataset = get_datasets(input_directory_path, rep_num, name)
+        positive_dataset, negative_dataset = get_datasets(
+            input_directory_path, rep_num, name
+        )
         G = import_graph_from_pickle(graph_file_path)
-
+        N = len(
+            [x for x, y in G.nodes(data=True) if y["type"] == "protein"]
+        )  # Total number of protein nodes in the entire graph
         i = 1
-        for positive_protein, positive_go, negative_protein, negative_go in zip(
+        for positive_protein, positive_go in zip(
             positive_dataset["protein"],
             positive_dataset["go"],
-            negative_dataset["protein"],
-            negative_dataset["go"],
         ):
 
             # calculate the score for the positive set
@@ -77,15 +79,41 @@ def predict(
                     G, positive_protein_neighbor, positive_go
                 )
             )
-
-            N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
-            pos_n = len(positive_protein_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
-            K = len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
-            pos_k = positive_go_annotated_protein_neighbor_count  + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
+            pos_n = (
+                len(positive_protein_neighbor) + 1
+            )  # Number of protein neighbors the protein of interest has (includes the protein of interest)
+            K = len(
+                positive_go_neighbor
+            )  # Number of protein neighbors the GO term of interest has
+            pos_k = (
+                positive_go_annotated_protein_neighbor_count + 1
+            )  # The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
+
+            # The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
+            positive_score = 1 - (
+                (math.comb(K, pos_k) * math.comb(N - K, pos_n - pos_k))
+                / math.comb(N, pos_n)
+            )
+
+            # input positive score to data
+            data["protein"].append(positive_protein)
+            data["go_term"].append(positive_go)
+            data["protein_neighbor"].append(len(positive_protein_neighbor))
+            data["go_neighbor"].append(len(positive_go_neighbor))
+            data["go_annotated_protein_neighbors"].append(
+                positive_go_annotated_protein_neighbor_count
+            )
+            data["score"].append(positive_score)
+            data["true_label"].append(1)
 
-            #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
-            positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
+            print_progress(i, len(positive_dataset["protein"]))
+            i += 1
 
+        i = 1
+        for negative_protein, negative_go in zip(
+            negative_dataset["protein"],
+            negative_dataset["go"],
+        ):
             # calculate the score for the negative set
             negative_protein_neighbor = get_neighbors(
                 G, negative_protein, "protein_protein"
@@ -96,22 +124,18 @@ def predict(
                     G, negative_protein_neighbor, negative_go
                 )
             )
-
-            neg_n = len(negative_protein_neighbor) + 1 #Negative protein of interest neighbors (includes self)
-            neg_k = negative_go_annotated_protein_neighbor_count #Overlap betweesn go neighbors and protein neighbors (should be fewer for neg than pos)
-
-            negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
-
-            # input positive and negative score to data
-            data["protein"].append(positive_protein)
-            data["go_term"].append(positive_go)
-            data["protein_neighbor"].append(len(positive_protein_neighbor))
-            data["go_neighbor"].append(len(positive_go_neighbor))
-            data["go_annotated_protein_neighbors"].append(
-                positive_go_annotated_protein_neighbor_count
+            K = len(
+                negative_go_neighbor
+            )  # Number of protein neighbors the negative GO term has (computed per negative pair)
+            neg_n = (
+                len(negative_protein_neighbor) + 1
+            )  # Negative protein of interest neighbors (includes self)
+            neg_k = negative_go_annotated_protein_neighbor_count  # Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)
+
+            negative_score = 1 - (
+                (math.comb(K, neg_k) * math.comb(N - K, neg_n - neg_k))
+                / math.comb(N, neg_n)
             )
-            data["score"].append(positive_score)
-            data["true_label"].append(1)
 
             data["protein"].append(negative_protein)
             data["go_term"].append(negative_go)
@@ -123,7 +147,7 @@ def predict(
             data["score"].append(negative_score)
             data["true_label"].append(0)
 
-            print_progress(i, len(positive_dataset["protein"]))
+            print_progress(i, len(negative_dataset["protein"]))
             i += 1
 
         normalized_data = normalize(data["score"])
@@ -161,4 +185,4 @@ def get_go_annotated_protein_neighbor_count(G: nx.Graph, nodeList, goTerm):
     for element in nodeList:
         if G.has_edge(element[0], goTerm):
             count += 1
-    return count
+    return count