Skip to content

Commit

Permalink
Merge pull request #19 from Reed-CompBio/figures
Browse files Browse the repository at this point in the history
Figures
  • Loading branch information
ctrlaltaf authored Sep 13, 2024
2 parents f6ddffc + 635ee73 commit d2fdf84
Show file tree
Hide file tree
Showing 53 changed files with 2,542,058 additions and 180 deletions.
91 changes: 53 additions & 38 deletions classes/hypergeometric_distribution_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ def __init__(self):

# Accessor: return the value currently stored in this instance's y_score attribute.
def get_y_score(self):
return self.y_score

# Accessor: return the value currently stored in this instance's y_true attribute.
def get_y_true(self):
return self.y_true

# Mutator: overwrite this instance's y_score attribute with the given value.
def set_y_score(self, y_score):
self.y_score = y_score

Expand All @@ -35,7 +35,7 @@ def predict(
name,
):
"""
Uses a Hypergeometric distribution to calculate a confidence value for the relationship between a protein of
Uses a Hypergeometric distribution to calculate a confidence value for the relationship between a protein of
interest and a GO term. Does not include protein of interest in calculations.
"""
colorama_init()
Expand All @@ -56,15 +56,17 @@ def predict(
"true_label": [],
}

positive_dataset, negative_dataset = get_datasets(input_directory_path, rep_num, name)
positive_dataset, negative_dataset = get_datasets(
input_directory_path, rep_num, name
)
G = import_graph_from_pickle(graph_file_path)

N = len(
[x for x, y in G.nodes(data=True) if y["type"] == "protein"]
) # Total number of protein nodes in the entire graph
i = 1
for positive_protein, positive_go, negative_protein, negative_go in zip(
for positive_protein, positive_go in zip(
positive_dataset["protein"],
positive_dataset["go"],
negative_dataset["protein"],
negative_dataset["go"],
):

# calculate the score for the positive set
Expand All @@ -78,23 +80,40 @@ def predict(
)
)

N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
pos_n = len(positive_protein_neighbor) #Number of protein neighbors the protein of interest has
K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include protein of interest (but does not change significantly if protein is included)
pos_k = positive_go_annotated_protein_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
pos_n = len(
positive_protein_neighbor
) # Number of protein neighbors the protein of interest has
K = (
len(positive_go_neighbor) - 1
) # Number of protein neighbors the GO term of interest has, same for pos & neg, does not include protein of interest (but does not change significantly if protein is included)
pos_k = positive_go_annotated_protein_neighbor_count # The overlap between the GO protein neighbors and protein neighbors of the protein of interest

# The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
positive_score = 1 - (
(math.comb(K, pos_k) * math.comb(N - K, pos_n - pos_k))
/ math.comb(N, pos_n)
)

# record the positive sample's features, score and label (1) in data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
)
data["score"].append(positive_score)
data["true_label"].append(1)

# if K == -1:
# K = 1
print_progress(i, len(positive_dataset["protein"]))
i += 1

# print("N: ", N)
# print("pos_n: ", pos_n)
# print("K: ", K)
# print("pos_k: ", pos_k)
i = 1
for negative_protein, negative_go in zip(
negative_dataset["protein"],
negative_dataset["go"],
):

#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))

# calculate the score for the negative set
negative_protein_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
Expand All @@ -105,23 +124,19 @@ def predict(
G, negative_protein_neighbor, negative_go
)
)

neg_n = len(negative_protein_neighbor) #Negative protein of interest neighbors
neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)

negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))


# input positive and negative score to data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
K = (
len(negative_go_neighbor) - 1
) # Number of protein neighbors the negative GO term has (recomputed here for the negative sample), minus one for the protein of interest (does not change significantly if protein is included)

neg_n = len(
negative_protein_neighbor
) # Negative protein of interest neighbors
neg_k = negative_go_annotated_protein_neighbor_count # Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)

negative_score = 1 - (
(math.comb(K, neg_k) * math.comb(N - K, neg_n - neg_k))
/ math.comb(N, neg_n)
)
data["score"].append(positive_score)
data["true_label"].append(1)

data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
Expand All @@ -133,7 +148,7 @@ def predict(
data["score"].append(negative_score)
data["true_label"].append(0)

print_progress(i, len(positive_dataset["protein"]))
print_progress(i, len(negative_dataset["protein"]))
i += 1

normalized_data = normalize(data["score"])
Expand Down
90 changes: 57 additions & 33 deletions classes/hypergeometric_distribution_class_V2.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ def __init__(self):

# Accessor: return the value currently stored in this instance's y_score attribute.
def get_y_score(self):
return self.y_score

# Accessor: return the value currently stored in this instance's y_true attribute.
def get_y_true(self):
return self.y_true

# Mutator: overwrite this instance's y_score attribute with the given value.
def set_y_score(self, y_score):
self.y_score = y_score

Expand All @@ -35,8 +35,8 @@ def predict(
name,
):
"""
Uses a Hypergeometric distribution to calculate a confidence value for the relationship between a protein of
interest and a GO term. Includes the protein of interest in calculations.
Uses a Hypergeometric distribution to calculate a confidence value for the relationship between a protein of
interest and a GO term. Includes the protein of interest in calculations.
"""
colorama_init()

Expand All @@ -56,15 +56,17 @@ def predict(
"true_label": [],
}

positive_dataset, negative_dataset = get_datasets(input_directory_path, rep_num, name)
positive_dataset, negative_dataset = get_datasets(
input_directory_path, rep_num, name
)
G = import_graph_from_pickle(graph_file_path)

N = len(
[x for x, y in G.nodes(data=True) if y["type"] == "protein"]
) # Total number of protein nodes in the entire graph
i = 1
for positive_protein, positive_go, negative_protein, negative_go in zip(
for positive_protein, positive_go in zip(
positive_dataset["protein"],
positive_dataset["go"],
negative_dataset["protein"],
negative_dataset["go"],
):

# calculate the score for the positive set
Expand All @@ -77,15 +79,41 @@ def predict(
G, positive_protein_neighbor, positive_go
)
)

N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
pos_n = len(positive_protein_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
K = len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
pos_k = positive_go_annotated_protein_neighbor_count + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
pos_n = (
len(positive_protein_neighbor) + 1
) # Number of protein neighbors the protein of interest has (includes the protein of interest)
K = len(
positive_go_neighbor
) # Number of protein neighbors the GO term of interest has, same for pos & neg
pos_k = (
positive_go_annotated_protein_neighbor_count + 1
) # The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)

# The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
positive_score = 1 - (
(math.comb(K, pos_k) * math.comb(N - K, pos_n - pos_k))
/ math.comb(N, pos_n)
)

# record the positive sample's features, score and label (1) in data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
)
data["score"].append(positive_score)
data["true_label"].append(1)

#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
print_progress(i, len(positive_dataset["protein"]))
i += 1

i = 1
for negative_protein, negative_go in zip(
negative_dataset["protein"],
negative_dataset["go"],
):
# calculate the score for the negative set
negative_protein_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
Expand All @@ -96,22 +124,18 @@ def predict(
G, negative_protein_neighbor, negative_go
)
)

neg_n = len(negative_protein_neighbor) + 1 #Negative protein of interest neighbors (includes self)
neg_k = negative_go_annotated_protein_neighbor_count #Overlap betweesn go neighbors and protein neighbors (should be fewer for neg than pos)

negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))

# input positive and negative score to data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
K = len(
negative_go_neighbor
) # Number of protein neighbors the negative GO term has (recomputed here for the negative sample)
neg_n = (
len(negative_protein_neighbor) + 1
) # Negative protein of interest neighbors (includes self)
neg_k = negative_go_annotated_protein_neighbor_count # Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)

negative_score = 1 - (
(math.comb(K, neg_k) * math.comb(N - K, neg_n - neg_k))
/ math.comb(N, neg_n)
)
data["score"].append(positive_score)
data["true_label"].append(1)

data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
Expand All @@ -123,7 +147,7 @@ def predict(
data["score"].append(negative_score)
data["true_label"].append(0)

print_progress(i, len(positive_dataset["protein"]))
print_progress(i, len(negative_dataset["protein"]))
i += 1

normalized_data = normalize(data["score"])
Expand Down Expand Up @@ -161,4 +185,4 @@ def get_go_annotated_protein_neighbor_count(G: nx.Graph, nodeList, goTerm):
for element in nodeList:
if G.has_edge(element[0], goTerm):
count += 1
return count
return count
Loading

0 comments on commit d2fdf84

Please sign in to comment.