all methods work with new database

Reed-CompBio · Aug 27, 2024 · 71fbbe7 · 71fbbe7
1 parent c94f66d
commit 71fbbe7
Show file tree

Hide file tree

Showing 12 changed files with 69,222 additions and 150 deletions.
diff --git a/classes/hypergeometric_distribution_class.py b/classes/hypergeometric_distribution_class.py
@@ -48,9 +48,9 @@ def predict(
         data = {
             "protein": [],
             "go_term": [],
-            "pro_pro_neighbor": [],
+            "protein_neighbor": [],
             "go_neighbor": [],
-            "go_annotated_pro_pro_neighbors": [],
+            "go_annotated_protein_neighbors": [],
             "score": [],
             "norm_score": [],
             "true_label": [],
@@ -68,36 +68,36 @@ def predict(
         ):
 
             # calculate the score for the positive set
-            positive_pro_pro_neighbor = get_neighbors(
-                G, positive_protein, "protein_protein"
+            positive_protein_neighbor = get_neighbors(
+                G, positive_protein, ["protein_protein", "regulatory"]
             )
-            positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
-            positive_go_annotated_pro_pro_neighbor_count = (
-                get_go_annotated_pro_pro_neighbor_count(
-                    G, positive_pro_pro_neighbor, positive_go
+            positive_go_neighbor = get_neighbors(G, positive_go, ["protein_go_term"])
+            positive_go_annotated_protein_neighbor_count = (
+                get_go_annotated_protein_neighbor_count(
+                    G, positive_protein_neighbor, positive_go
                 )
             )
 
             N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
-            pos_n = len(positive_pro_pro_neighbor) #Number of protein neighbors the protein of interest has
+            pos_n = len(positive_protein_neighbor) #Number of protein neighbors the protein of interest has
             K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include protein of interest (but does not change significantly if protein is included)
-            pos_k = positive_go_annotated_pro_pro_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
-            
+            pos_k = positive_go_annotated_protein_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
+
             #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
             positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
 
             # calculate the score for the negative set
-            negative_pro_pro_neighbor = get_neighbors(
+            negative_protein_neighbor = get_neighbors(
                 G, negative_protein, "protein_protein"
             )
             negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term")
             negative_go_annotated_protein_neighbor_count = (
-                get_go_annotated_pro_pro_neighbor_count(
-                    G, negative_pro_pro_neighbor, negative_go
+                get_go_annotated_protein_neighbor_count(
+                    G, negative_protein_neighbor, negative_go
                 )
             )
 
-            neg_n = len(negative_pro_pro_neighbor) #Negative protein of interest neighbors
+            neg_n = len(negative_protein_neighbor) #Negative protein of interest neighbors
             neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)
 
             negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
@@ -106,19 +106,19 @@ def predict(
             # input positive and negative score to data
             data["protein"].append(positive_protein)
             data["go_term"].append(positive_go)
-            data["pro_pro_neighbor"].append(len(positive_pro_pro_neighbor))
+            data["protein_neighbor"].append(len(positive_protein_neighbor))
             data["go_neighbor"].append(len(positive_go_neighbor))
-            data["go_annotated_pro_pro_neighbors"].append(
-                positive_go_annotated_pro_pro_neighbor_count
+            data["go_annotated_protein_neighbors"].append(
+                positive_go_annotated_protein_neighbor_count
             )
             data["score"].append(positive_score)
             data["true_label"].append(1)
 
             data["protein"].append(negative_protein)
             data["go_term"].append(negative_go)
-            data["pro_pro_neighbor"].append(len(negative_pro_pro_neighbor))
+            data["protein_neighbor"].append(len(negative_protein_neighbor))
             data["go_neighbor"].append(len(negative_go_neighbor))
-            data["go_annotated_pro_pro_neighbors"].append(
+            data["go_annotated_protein_neighbors"].append(
                 negative_go_annotated_protein_neighbor_count
             )
             data["score"].append(negative_score)
@@ -146,18 +146,18 @@ def predict(
         return y_score, y_true
 
 
-def get_neighbors(G: nx.Graph, node, edgeType):
+def get_neighbors(G: nx.DiGraph, node, edgeTypes):
     res = G.edges(node, data=True)
     neighbors = []
     for edge in res:
-        if edge[2]["type"] == edgeType:
+        if edge[2]["type"] in edgeTypes:
             neighborNode = [edge[1], edge[2]]
             neighbors.append(neighborNode)
 
     return neighbors
 
 
-def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm):
+def get_go_annotated_protein_neighbor_count(G: nx.Graph, nodeList, goTerm):
     count = 0
     for element in nodeList:
         if G.has_edge(element[0], goTerm):

diff --git a/classes/hypergeometric_distribution_class_V2.py b/classes/hypergeometric_distribution_class_V2.py
@@ -48,9 +48,9 @@ def predict(
         data = {
             "protein": [],
             "go_term": [],
-            "pro_pro_neighbor": [],
+            "protein_neighbor": [],
             "go_neighbor": [],
-            "go_annotated_pro_pro_neighbors": [],
+            "go_annotated_protein_neighbors": [],
             "score": [],
             "norm_score": [],
             "true_label": [],
@@ -68,56 +68,56 @@ def predict(
         ):
 
             # calculate the score for the positive set
-            positive_pro_pro_neighbor = get_neighbors(
-                G, positive_protein, "protein_protein"
+            positive_protein_neighbor = get_neighbors(
+                G, positive_protein, ["protein_protein", "regulatory"]
             )
-            positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
-            positive_go_annotated_pro_pro_neighbor_count = (
-                get_go_annotated_pro_pro_neighbor_count(
-                    G, positive_pro_pro_neighbor, positive_go
+            positive_go_neighbor = get_neighbors(G, positive_go, ["protein_go_term"])
+            positive_go_annotated_protein_neighbor_count = (
+                get_go_annotated_protein_neighbor_count(
+                    G, positive_protein_neighbor, positive_go
                 )
             )
 
             N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
-            pos_n = len(positive_pro_pro_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
+            pos_n = len(positive_protein_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
             K = len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
-            pos_k = positive_go_annotated_pro_pro_neighbor_count  + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
+            pos_k = positive_go_annotated_protein_neighbor_count  + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
 
             #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
             positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
 
             # calculate the score for the negative set
-            negative_pro_pro_neighbor = get_neighbors(
+            negative_protein_neighbor = get_neighbors(
                 G, negative_protein, "protein_protein"
             )
             negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term")
             negative_go_annotated_protein_neighbor_count = (
-                get_go_annotated_pro_pro_neighbor_count(
-                    G, negative_pro_pro_neighbor, negative_go
+                get_go_annotated_protein_neighbor_count(
+                    G, negative_protein_neighbor, negative_go
                 )
             )
 
-            neg_n = len(negative_pro_pro_neighbor) + 1 #Negative protein of interest neighbors (includes self)
+            neg_n = len(negative_protein_neighbor) + 1 #Negative protein of interest neighbors (includes self)
             neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)
 
             negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
 
             # input positive and negative score to data
             data["protein"].append(positive_protein)
             data["go_term"].append(positive_go)
-            data["pro_pro_neighbor"].append(len(positive_pro_pro_neighbor))
+            data["protein_neighbor"].append(len(positive_protein_neighbor))
             data["go_neighbor"].append(len(positive_go_neighbor))
-            data["go_annotated_pro_pro_neighbors"].append(
-                positive_go_annotated_pro_pro_neighbor_count
+            data["go_annotated_protein_neighbors"].append(
+                positive_go_annotated_protein_neighbor_count
             )
             data["score"].append(positive_score)
             data["true_label"].append(1)
 
             data["protein"].append(negative_protein)
             data["go_term"].append(negative_go)
-            data["pro_pro_neighbor"].append(len(negative_pro_pro_neighbor))
+            data["protein_neighbor"].append(len(negative_protein_neighbor))
             data["go_neighbor"].append(len(negative_go_neighbor))
-            data["go_annotated_pro_pro_neighbors"].append(
+            data["go_annotated_protein_neighbors"].append(
                 negative_go_annotated_protein_neighbor_count
             )
             data["score"].append(negative_score)
@@ -149,14 +149,14 @@ def get_neighbors(G: nx.Graph, node, edgeType):
     res = G.edges(node, data=True)
     neighbors = []
     for edge in res:
-        if edge[2]["type"] == edgeType:
+        if edge[2]["type"] in edgeType:
             neighborNode = [edge[1], edge[2]]
             neighbors.append(neighborNode)
 
     return neighbors
 
 
-def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm):
+def get_go_annotated_protein_neighbor_count(G: nx.Graph, nodeList, goTerm):
     count = 0
     for element in nodeList:
         if G.has_edge(element[0], goTerm):

diff --git a/classes/overlapping_neighbors_class.py b/classes/overlapping_neighbors_class.py
@@ -47,9 +47,9 @@ def predict(
         data = {
             "protein": [],
             "go_term": [],
-            "pro_pro_neighbor": [],
+            "protein_neighbor": [],
             "go_neighbor": [],
-            "go_annotated_pro_pro_neighbors": [],
+            "go_annotated_protein_neighbors": [],
             "score": [],
             "norm_score": [],
             "true_label": [],
@@ -65,59 +65,59 @@ def predict(
             negative_dataset["go"],
         ):
             # calculate the score for the positive set
-            positive_pro_pro_neighbor = get_neighbors(
-                G, positive_protein, "protein_protein"
+            positive_protein_neighbor = get_neighbors(
+                G, positive_protein, ["protein_protein", "regulatory"]
             )
 
             # print("\nPositive protein neighbors: " + str(positive_protein_neighbor))
-            positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
-            positive_go_annotated_pro_pro_neighbor_count = (
-                get_go_annotated_pro_pro_neighbor_count(
-                    G, positive_pro_pro_neighbor, positive_go
+            positive_go_neighbor = get_neighbors(G, positive_go, ["protein_go_term"])
+            positive_go_annotated_protein_neighbor_count = (
+                get_go_annotated_protein_neighbor_count(
+                    G, positive_protein_neighbor, positive_go
                 )
             )
 
-            if len(positive_pro_pro_neighbor) == 0:
+            if len(positive_protein_neighbor) == 0:
                 positive_score = 0
             else:
-                positive_score = (1 + positive_go_annotated_pro_pro_neighbor_count) / (
-                    len(positive_pro_pro_neighbor) + len(positive_go_neighbor)
+                positive_score = (1 + positive_go_annotated_protein_neighbor_count) / (
+                    len(positive_protein_neighbor) + len(positive_go_neighbor)
                 )
 
             # calculate the score for the negative set
-            negative_pro_pro_neighbor = get_neighbors(
+            negative_protein_neighbor = get_neighbors(
                 G, negative_protein, "protein_protein"
             )
             negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term")
             negative_go_annotated_protein_neighbor_count = (
-                get_go_annotated_pro_pro_neighbor_count(
-                    G, negative_pro_pro_neighbor, negative_go
+                get_go_annotated_protein_neighbor_count(
+                    G, negative_protein_neighbor, negative_go
                 )
             )
 
-            if len(negative_pro_pro_neighbor) == 0:
+            if len(negative_protein_neighbor) == 0:
                 negative_score = 0
             else:
                 negative_score = (1 + negative_go_annotated_protein_neighbor_count) / (
-                    len(negative_pro_pro_neighbor) + len(negative_go_neighbor)
+                    len(negative_protein_neighbor) + len(negative_go_neighbor)
                 )
 
             # input positive and negative score to data
             data["protein"].append(positive_protein)
             data["go_term"].append(positive_go)
-            data["pro_pro_neighbor"].append(len(positive_pro_pro_neighbor))
+            data["protein_neighbor"].append(len(positive_protein_neighbor))
             data["go_neighbor"].append(len(positive_go_neighbor))
-            data["go_annotated_pro_pro_neighbors"].append(
-                positive_go_annotated_pro_pro_neighbor_count
+            data["go_annotated_protein_neighbors"].append(
+                positive_go_annotated_protein_neighbor_count
             )
             data["score"].append(positive_score)
             data["true_label"].append(1)
 
             data["protein"].append(negative_protein)
             data["go_term"].append(negative_go)
-            data["pro_pro_neighbor"].append(len(negative_pro_pro_neighbor))
+            data["protein_neighbor"].append(len(negative_protein_neighbor))
             data["go_neighbor"].append(len(negative_go_neighbor))
-            data["go_annotated_pro_pro_neighbors"].append(
+            data["go_annotated_protein_neighbors"].append(
                 negative_go_annotated_protein_neighbor_count
             )
             data["score"].append(negative_score)
@@ -145,18 +145,18 @@ def predict(
         return y_score, y_true
 
 
-def get_neighbors(G: nx.Graph, node, edgeType):
+def get_neighbors(G: nx.DiGraph, node, edgeTypes):
     res = G.edges(node, data=True)
     neighbors = []
     for edge in res:
-        if edge[2]["type"] == edgeType:
+        if edge[2]["type"] in edgeTypes:
             neighborNode = [edge[1], edge[2]]
             neighbors.append(neighborNode) 
 
     return neighbors
 
 
-def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm):
+def get_go_annotated_protein_neighbor_count(G: nx.DiGraph, nodeList, goTerm):
     count = 0
     for element in nodeList:
         if G.has_edge(element[0], goTerm):