Skip to content

Commit

Permalink
all methods work with new database
Browse files Browse the repository at this point in the history
  • Loading branch information
ctrlaltaf committed Aug 27, 2024
1 parent c94f66d commit 71fbbe7
Show file tree
Hide file tree
Showing 12 changed files with 69,222 additions and 150 deletions.
46 changes: 23 additions & 23 deletions classes/hypergeometric_distribution_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def predict(
data = {
"protein": [],
"go_term": [],
"pro_pro_neighbor": [],
"protein_neighbor": [],
"go_neighbor": [],
"go_annotated_pro_pro_neighbors": [],
"go_annotated_protein_neighbors": [],
"score": [],
"norm_score": [],
"true_label": [],
Expand All @@ -68,36 +68,36 @@ def predict(
):

# calculate the score for the positive set
positive_pro_pro_neighbor = get_neighbors(
G, positive_protein, "protein_protein"
positive_protein_neighbor = get_neighbors(
G, positive_protein, ["protein_protein", "regulatory"]
)
positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
positive_go_annotated_pro_pro_neighbor_count = (
get_go_annotated_pro_pro_neighbor_count(
G, positive_pro_pro_neighbor, positive_go
positive_go_neighbor = get_neighbors(G, positive_go, ["protein_go_term"])
positive_go_annotated_protein_neighbor_count = (
get_go_annotated_protein_neighbor_count(
G, positive_protein_neighbor, positive_go
)
)

N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
pos_n = len(positive_pro_pro_neighbor) #Number of protein neighbors the protein of interest has
pos_n = len(positive_protein_neighbor) #Number of protein neighbors the protein of interest has
K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include protein of interest (but does not change significantly if protein is included)
pos_k = positive_go_annotated_pro_pro_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
pos_k = positive_go_annotated_protein_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest

#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))

# calculate the score for the negative set
negative_pro_pro_neighbor = get_neighbors(
negative_protein_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
)
negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term")
negative_go_annotated_protein_neighbor_count = (
get_go_annotated_pro_pro_neighbor_count(
G, negative_pro_pro_neighbor, negative_go
get_go_annotated_protein_neighbor_count(
G, negative_protein_neighbor, negative_go
)
)

neg_n = len(negative_pro_pro_neighbor) #Negative protein of interest neighbors
neg_n = len(negative_protein_neighbor) #Negative protein of interest neighbors
neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)

negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
Expand All @@ -106,19 +106,19 @@ def predict(
# input positive and negative score to data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["pro_pro_neighbor"].append(len(positive_pro_pro_neighbor))
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_pro_pro_neighbors"].append(
positive_go_annotated_pro_pro_neighbor_count
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
)
data["score"].append(positive_score)
data["true_label"].append(1)

data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["pro_pro_neighbor"].append(len(negative_pro_pro_neighbor))
data["protein_neighbor"].append(len(negative_protein_neighbor))
data["go_neighbor"].append(len(negative_go_neighbor))
data["go_annotated_pro_pro_neighbors"].append(
data["go_annotated_protein_neighbors"].append(
negative_go_annotated_protein_neighbor_count
)
data["score"].append(negative_score)
Expand Down Expand Up @@ -146,18 +146,18 @@ def predict(
return y_score, y_true


def get_neighbors(G: "nx.DiGraph", node, edgeTypes):
    """Return the neighbors of ``node`` reached over edges of the given types.

    Args:
        G: Graph object; only ``G.edges(node, data=True)`` is used, and it
            must yield ``(source, target, attribute_dict)`` triples.
            NOTE(review): on a directed graph this presumably reports only
            outgoing edges -- confirm against how the graph is built.
        node: Node whose incident edges are inspected.
        edgeTypes: A single edge type name or a collection of edge type
            names, compared against each edge's ``"type"`` attribute.

    Returns:
        A list of ``[neighbor_node, edge_attribute_dict]`` pairs, in the
        order the edges are reported by ``G``.

    Raises:
        KeyError: If an inspected edge has no ``"type"`` attribute.
    """
    # A bare string would make ``in`` a substring test (for example
    # "protein" in "protein_protein" is True), so wrap it so that only
    # exact edge type names match.
    if isinstance(edgeTypes, str):
        edgeTypes = (edgeTypes,)

    return [
        [neighbor, attributes]
        for _, neighbor, attributes in G.edges(node, data=True)
        if attributes["type"] in edgeTypes
    ]


def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm):
def get_go_annotated_protein_neighbor_count(G: nx.Graph, nodeList, goTerm):
count = 0
for element in nodeList:
if G.has_edge(element[0], goTerm):
Expand Down
42 changes: 21 additions & 21 deletions classes/hypergeometric_distribution_class_V2.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def predict(
data = {
"protein": [],
"go_term": [],
"pro_pro_neighbor": [],
"protein_neighbor": [],
"go_neighbor": [],
"go_annotated_pro_pro_neighbors": [],
"go_annotated_protein_neighbors": [],
"score": [],
"norm_score": [],
"true_label": [],
Expand All @@ -68,56 +68,56 @@ def predict(
):

# calculate the score for the positive set
positive_pro_pro_neighbor = get_neighbors(
G, positive_protein, "protein_protein"
positive_protein_neighbor = get_neighbors(
G, positive_protein, ["protein_protein", "regulatory"]
)
positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
positive_go_annotated_pro_pro_neighbor_count = (
get_go_annotated_pro_pro_neighbor_count(
G, positive_pro_pro_neighbor, positive_go
positive_go_neighbor = get_neighbors(G, positive_go, ["protein_go_term"])
positive_go_annotated_protein_neighbor_count = (
get_go_annotated_protein_neighbor_count(
G, positive_protein_neighbor, positive_go
)
)

N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
pos_n = len(positive_pro_pro_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
pos_n = len(positive_protein_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
K = len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
pos_k = positive_go_annotated_pro_pro_neighbor_count + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
pos_k = positive_go_annotated_protein_neighbor_count + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)

#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))

# calculate the score for the negative set
negative_pro_pro_neighbor = get_neighbors(
negative_protein_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
)
negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term")
negative_go_annotated_protein_neighbor_count = (
get_go_annotated_pro_pro_neighbor_count(
G, negative_pro_pro_neighbor, negative_go
get_go_annotated_protein_neighbor_count(
G, negative_protein_neighbor, negative_go
)
)

neg_n = len(negative_pro_pro_neighbor) + 1 #Negative protein of interest neighbors (includes self)
neg_n = len(negative_protein_neighbor) + 1 #Negative protein of interest neighbors (includes self)
neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)

negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))

# input positive and negative score to data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["pro_pro_neighbor"].append(len(positive_pro_pro_neighbor))
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_pro_pro_neighbors"].append(
positive_go_annotated_pro_pro_neighbor_count
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
)
data["score"].append(positive_score)
data["true_label"].append(1)

data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["pro_pro_neighbor"].append(len(negative_pro_pro_neighbor))
data["protein_neighbor"].append(len(negative_protein_neighbor))
data["go_neighbor"].append(len(negative_go_neighbor))
data["go_annotated_pro_pro_neighbors"].append(
data["go_annotated_protein_neighbors"].append(
negative_go_annotated_protein_neighbor_count
)
data["score"].append(negative_score)
Expand Down Expand Up @@ -149,14 +149,14 @@ def get_neighbors(G: nx.Graph, node, edgeType):
res = G.edges(node, data=True)
neighbors = []
for edge in res:
if edge[2]["type"] == edgeType:
if edge[2]["type"] in edgeType:
neighborNode = [edge[1], edge[2]]
neighbors.append(neighborNode)

return neighbors


def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm):
def get_go_annotated_protein_neighbor_count(G: nx.Graph, nodeList, goTerm):
count = 0
for element in nodeList:
if G.has_edge(element[0], goTerm):
Expand Down
48 changes: 24 additions & 24 deletions classes/overlapping_neighbors_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ def predict(
data = {
"protein": [],
"go_term": [],
"pro_pro_neighbor": [],
"protein_neighbor": [],
"go_neighbor": [],
"go_annotated_pro_pro_neighbors": [],
"go_annotated_protein_neighbors": [],
"score": [],
"norm_score": [],
"true_label": [],
Expand All @@ -65,59 +65,59 @@ def predict(
negative_dataset["go"],
):
# calculate the score for the positive set
positive_pro_pro_neighbor = get_neighbors(
G, positive_protein, "protein_protein"
positive_protein_neighbor = get_neighbors(
G, positive_protein, ["protein_protein", "regulatory"]
)

# print("\nPositive protein neighbors: " + str(positive_pro_pro_neighbor))
positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
positive_go_annotated_pro_pro_neighbor_count = (
get_go_annotated_pro_pro_neighbor_count(
G, positive_pro_pro_neighbor, positive_go
positive_go_neighbor = get_neighbors(G, positive_go, ["protein_go_term"])
positive_go_annotated_protein_neighbor_count = (
get_go_annotated_protein_neighbor_count(
G, positive_protein_neighbor, positive_go
)
)

if len(positive_pro_pro_neighbor) == 0:
if len(positive_protein_neighbor) == 0:
positive_score = 0
else:
positive_score = (1 + positive_go_annotated_pro_pro_neighbor_count) / (
len(positive_pro_pro_neighbor) + len(positive_go_neighbor)
positive_score = (1 + positive_go_annotated_protein_neighbor_count) / (
len(positive_protein_neighbor) + len(positive_go_neighbor)
)

# calculate the score for the negative set
negative_pro_pro_neighbor = get_neighbors(
negative_protein_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
)
negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term")
negative_go_annotated_protein_neighbor_count = (
get_go_annotated_pro_pro_neighbor_count(
G, negative_pro_pro_neighbor, negative_go
get_go_annotated_protein_neighbor_count(
G, negative_protein_neighbor, negative_go
)
)

if len(negative_pro_pro_neighbor) == 0:
if len(negative_protein_neighbor) == 0:
negative_score = 0
else:
negative_score = (1 + negative_go_annotated_protein_neighbor_count) / (
len(negative_pro_pro_neighbor) + len(negative_go_neighbor)
len(negative_protein_neighbor) + len(negative_go_neighbor)
)

# input positive and negative score to data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["pro_pro_neighbor"].append(len(positive_pro_pro_neighbor))
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_pro_pro_neighbors"].append(
positive_go_annotated_pro_pro_neighbor_count
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
)
data["score"].append(positive_score)
data["true_label"].append(1)

data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["pro_pro_neighbor"].append(len(negative_pro_pro_neighbor))
data["protein_neighbor"].append(len(negative_protein_neighbor))
data["go_neighbor"].append(len(negative_go_neighbor))
data["go_annotated_pro_pro_neighbors"].append(
data["go_annotated_protein_neighbors"].append(
negative_go_annotated_protein_neighbor_count
)
data["score"].append(negative_score)
Expand Down Expand Up @@ -145,18 +145,18 @@ def predict(
return y_score, y_true


def get_neighbors(G: "nx.DiGraph", node, edgeTypes):
    """Collect the neighbors of ``node`` connected by edges of the given types.

    Args:
        G: Graph object; only ``G.edges(node, data=True)`` is called, and it
            must yield ``(source, target, attribute_dict)`` triples.
            NOTE(review): on a directed graph this presumably reports only
            outgoing edges -- confirm against how the graph is built.
        node: Node whose incident edges are inspected.
        edgeTypes: Either one edge type name or a collection of edge type
            names; each edge's ``"type"`` attribute is checked against it.

    Returns:
        A list of ``[neighbor_node, edge_attribute_dict]`` pairs, in the
        order ``G`` reports the edges.

    Raises:
        KeyError: If an inspected edge has no ``"type"`` attribute.
    """
    # Passing a plain string would turn ``in`` into a substring check
    # (for example "protein" in "protein_protein" is True); wrapping it
    # restores exact matching of the edge type name.
    if isinstance(edgeTypes, str):
        edgeTypes = (edgeTypes,)

    neighbors = []
    for _, neighbor, attributes in G.edges(node, data=True):
        if attributes["type"] in edgeTypes:
            neighbors.append([neighbor, attributes])
    return neighbors


def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm):
def get_go_annotated_protein_neighbor_count(G: nx.DiGraph, nodeList, goTerm):
count = 0
for element in nodeList:
if G.has_edge(element[0], goTerm):
Expand Down
Loading

0 comments on commit 71fbbe7

Please sign in to comment.