Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Regulatory #18

Merged
merged 27 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
9ab178d
updated difference function for more recent workflow, added a go_neig…
Jun 26, 2024
66095a3
updated reverse_sample to run faster and added proteins with no annot…
Jun 27, 2024
3969307
added random walk
Jul 8, 2024
5daac45
new random walk and test file using small graphs
Jul 9, 2024
aacbbda
Changed numbering of random walks and added V3 & V5 (refer to google …
Jul 19, 2024
60cbf57
test file with neighbor ranks and .csv with a list of all unique go t…
Jul 22, 2024
d8c3690
fixed merge conflict
ctrlaltaf Jul 22, 2024
ccc00d5
Cleaned up files, added to README for .py files, updated pytests
Aug 1, 2024
e59110f
fixed pytest
Aug 1, 2024
13ae1cd
testing fly mixed networks
ctrlaltaf Aug 21, 2024
c94f66d
added new species data
ctrlaltaf Aug 22, 2024
71fbbe7
all methods work with new database
ctrlaltaf Aug 27, 2024
90571ad
working sampling method
ctrlaltaf Aug 27, 2024
2bd7f6c
added go term sampling
ctrlaltaf Aug 28, 2024
baf918c
cleaned up code
ctrlaltaf Aug 28, 2024
d617b81
added degree and ratio sampling method
ctrlaltaf Aug 29, 2024
5a6b4cb
optimized sampling method
ctrlaltaf Aug 30, 2024
a44a4e9
new methods
ctrlaltaf Sep 5, 2024
f42e113
testing final figures workflow
ctrlaltaf Sep 9, 2024
f1e9686
testing final figures workflow
ctrlaltaf Sep 9, 2024
cb5ddb8
changing data structure
ctrlaltaf Sep 9, 2024
731266c
pipeline for one species data
ctrlaltaf Sep 9, 2024
b10a552
pipeline for one species data
ctrlaltaf Sep 9, 2024
75fbd2a
multi species data
ctrlaltaf Sep 9, 2024
49b341c
subplot for two figures
ctrlaltaf Sep 9, 2024
bd65d5c
subplot for two figures
ctrlaltaf Sep 9, 2024
457fe84
generated multi subplot for a test case
ctrlaltaf Sep 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,12 @@ A python program that aims to predict protein function prediction using protein-
- Now you have a conda environment that has all the necessary packages for this project
- To test that everything is working, you can run `python main.py`

# Files
- main.py: File used to run the algorithms
- neighbor_accuracy.py: Computes how accurately the neighbors of a GO term can be predicted using random walk
- accuracy_stats.py: Computes stats on neighbor_accuracy
- difference.py: Takes two matched outputs from main and prints a table comparing them
- distribution.py: Visualizes the distribution of GO term neighbor counts
- small_graph.py: Visualizes the impact of pagerank on a directed and undirected graph using a test dataset
- subgraph.py: Visualizes a subgraph of the one built in main based on pagerank node ranks
- paired_sample.py: An additional way to generate pos/neg samples keeping specific aspects constant (then run main with new_random_lists set to False)
68 changes: 68 additions & 0 deletions accuracy_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import matplotlib.pyplot as plt
import statistics as stat
from pathlib import Path
from tools.helper import read_specific_columns

'''
Takes the dataframe output by neighbor_accuracy.py and generates a graph (initially used to change alpha value without regenerating all the data). Additionally, the frequency of the number of neighbors and associated score is calculated and printed.
'''

# Load the two columns used below from the neighbor-rank output: the first is
# parsed as the neighbor count and the second as the percent score.
ranked = read_specific_columns(
    "./output/data/go_neighbor_tests/neighbor_rank/all_neighbor_rank_under_100.csv",
    [1, 2],
    "\t",
)

# Drop rows without a score ("N/A"), then convert the remaining columns.
scored_rows = [row for row in ranked if row[1] != "N/A"]
all_go_neighbor_num = [int(row[0]) for row in scored_rows]
all_go_neighbor_rank = [float(row[1]) for row in scored_rows]

# Scatter plot of score (x) against neighbor count (y), drawn mostly
# transparent (alpha 0.05), then saved next to the input data and shown.
fig, ax = plt.subplots()
plt.scatter(all_go_neighbor_rank, all_go_neighbor_num, alpha=0.05)
ax.set_xlim([-5, 105])
plt.xlabel("% Go Neighbors Accurately Predicted")
plt.ylabel("Number of Neighbors")
plt.savefig("./output/data/go_neighbor_tests/neighbor_rank/under_100_rank.png")
plt.show()

# Summary statistics, rounded to two decimals.
mean_rank = round(stat.mean(all_go_neighbor_rank), 2)
mean_num = round(stat.mean(all_go_neighbor_num), 2)
print(f"Mean Rank: {mean_rank}")
print(f"Mean Number of Neighbors: {mean_num}")

# Tally how often each (neighbor count, score) pair occurs, keyed as
# "<count>_<score>". Rows without a score ("N/A") are ignored. To also leave
# out pairs whose score is zero, additionally skip rows where
# float(row[1]) == 0.0.
freq_dict = {}
for row in ranked:
    if row[1] == "N/A":
        continue
    pair_key = "_".join((row[0], row[1]))
    freq_dict[pair_key] = freq_dict.get(pair_key, 0) + 1

# Prints the frequency table from most to least frequent and returns it as a list
def order(freq_dict):
    """Print and return the entries of ``freq_dict`` from most to least frequent.

    Args:
        freq_dict: Mapping of ``"<neighbor count>_<score>"`` keys to the number
            of times that combination occurs. The mapping is not modified.

    Returns:
        A list of ``[key, frequency]`` pairs sorted by descending frequency.
        Keys with the same frequency keep the order they have in
        ``freq_dict``. An empty mapping gives an empty list.
    """
    # sorted() is stable (also with reverse=True), so ties stay in insertion
    # order -- the same order a repeated "find the first maximum and remove it"
    # scan yields. Sorting once also covers every entry, including the least
    # frequent one, and does not recurse once per key.
    ranked_entries = sorted(
        freq_dict.items(), key=lambda entry: entry[1], reverse=True
    )

    result = []
    for key, freq in ranked_entries:
        neighbor_count, _, score = key.partition("_")
        print(
            f"{neighbor_count} Neighbors with a percent accuracy of {score}"
            f" occurs {freq} times"
        )
        result.append([key, freq])
    return result

# Print the neighbor-count/score combinations by descending frequency and keep
# the [key, frequency] list that order() returns.
lst = order(freq_dict)
print(lst)

# The resulting list is only printed; it is not written to disk (but could be).



55 changes: 32 additions & 23 deletions classes/hypergeometric_distribution_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def predict(
data = {
"protein": [],
"go_term": [],
"pro_pro_neighbor": [],
"protein_neighbor": [],
"go_neighbor": [],
"go_annotated_pro_pro_neighbors": [],
"go_annotated_protein_neighbors": [],
"score": [],
"norm_score": [],
"true_label": [],
Expand All @@ -68,36 +68,45 @@ def predict(
):

# calculate the score for the positive set
positive_pro_pro_neighbor = get_neighbors(
G, positive_protein, "protein_protein"
positive_protein_neighbor = get_neighbors(
G, positive_protein, ["protein_protein", "regulatory"]
)
positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
positive_go_annotated_pro_pro_neighbor_count = (
get_go_annotated_pro_pro_neighbor_count(
G, positive_pro_pro_neighbor, positive_go
positive_go_neighbor = get_neighbors(G, positive_go, ["protein_go_term"])
positive_go_annotated_protein_neighbor_count = (
get_go_annotated_protein_neighbor_count(
G, positive_protein_neighbor, positive_go
)
)

N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
pos_n = len(positive_pro_pro_neighbor) #Number of protein neighbors the protein of interest has
pos_n = len(positive_protein_neighbor) #Number of protein neighbors the protein of interest has
K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include protein of interest (but does not change significantly if protein is included)
pos_k = positive_go_annotated_pro_pro_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest

pos_k = positive_go_annotated_protein_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest


# if K == -1:
# K = 1

# print("N: ", N)
# print("pos_n: ", pos_n)
# print("K: ", K)
# print("pos_k: ", pos_k)

#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))

# calculate the score for the negative set
negative_pro_pro_neighbor = get_neighbors(
negative_protein_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
)
negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term")
negative_go_annotated_protein_neighbor_count = (
get_go_annotated_pro_pro_neighbor_count(
G, negative_pro_pro_neighbor, negative_go
get_go_annotated_protein_neighbor_count(
G, negative_protein_neighbor, negative_go
)
)

neg_n = len(negative_pro_pro_neighbor) #Negative protein of interest neighbors
neg_n = len(negative_protein_neighbor) #Negative protein of interest neighbors
neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)

negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
Expand All @@ -106,19 +115,19 @@ def predict(
# input positive and negative score to data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["pro_pro_neighbor"].append(len(positive_pro_pro_neighbor))
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_pro_pro_neighbors"].append(
positive_go_annotated_pro_pro_neighbor_count
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
)
data["score"].append(positive_score)
data["true_label"].append(1)

data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["pro_pro_neighbor"].append(len(negative_pro_pro_neighbor))
data["protein_neighbor"].append(len(negative_protein_neighbor))
data["go_neighbor"].append(len(negative_go_neighbor))
data["go_annotated_pro_pro_neighbors"].append(
data["go_annotated_protein_neighbors"].append(
negative_go_annotated_protein_neighbor_count
)
data["score"].append(negative_score)
Expand Down Expand Up @@ -146,18 +155,18 @@ def predict(
return y_score, y_true


def get_neighbors(G: nx.DiGraph, node, edgeTypes):
    """Return the neighbors of ``node`` reached through edges of the given type(s).

    Args:
        G: Graph whose edges carry a ``"type"`` entry in their data dict.
        node: Node whose edges, as reported by ``G.edges(node, data=True)``,
            are inspected.
        edgeTypes: A single edge-type string, or an iterable of edge-type
            strings, to keep.

    Returns:
        A list of ``[neighbor_node, edge_data]`` pairs, one per matching edge.
    """
    # A bare string has to be wrapped: `x in "protein_protein"` would be a
    # substring test, so e.g. the type "protein" would match by accident.
    # Callers in this file pass both plain strings and lists.
    if isinstance(edgeTypes, str):
        wanted_types = (edgeTypes,)
    else:
        wanted_types = tuple(edgeTypes)

    # NOTE(review): on a DiGraph, G.edges(node) presumably reports only edges
    # leaving `node` -- confirm that incoming "regulatory" edges are meant to
    # be excluded.
    return [
        [edge[1], edge[2]]
        for edge in G.edges(node, data=True)
        if edge[2]["type"] in wanted_types
    ]


def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm):
def get_go_annotated_protein_neighbor_count(G: nx.Graph, nodeList, goTerm):
count = 0
for element in nodeList:
if G.has_edge(element[0], goTerm):
Expand Down
42 changes: 21 additions & 21 deletions classes/hypergeometric_distribution_class_V2.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def predict(
data = {
"protein": [],
"go_term": [],
"pro_pro_neighbor": [],
"protein_neighbor": [],
"go_neighbor": [],
"go_annotated_pro_pro_neighbors": [],
"go_annotated_protein_neighbors": [],
"score": [],
"norm_score": [],
"true_label": [],
Expand All @@ -68,56 +68,56 @@ def predict(
):

# calculate the score for the positive set
positive_pro_pro_neighbor = get_neighbors(
G, positive_protein, "protein_protein"
positive_protein_neighbor = get_neighbors(
G, positive_protein, ["protein_protein", "regulatory"]
)
positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
positive_go_annotated_pro_pro_neighbor_count = (
get_go_annotated_pro_pro_neighbor_count(
G, positive_pro_pro_neighbor, positive_go
positive_go_neighbor = get_neighbors(G, positive_go, ["protein_go_term"])
positive_go_annotated_protein_neighbor_count = (
get_go_annotated_protein_neighbor_count(
G, positive_protein_neighbor, positive_go
)
)

N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
pos_n = len(positive_pro_pro_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
pos_n = len(positive_protein_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
K = len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
pos_k = positive_go_annotated_pro_pro_neighbor_count + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
pos_k = positive_go_annotated_protein_neighbor_count + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)

#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))

# calculate the score for the negative set
negative_pro_pro_neighbor = get_neighbors(
negative_protein_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
)
negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term")
negative_go_annotated_protein_neighbor_count = (
get_go_annotated_pro_pro_neighbor_count(
G, negative_pro_pro_neighbor, negative_go
get_go_annotated_protein_neighbor_count(
G, negative_protein_neighbor, negative_go
)
)

neg_n = len(negative_pro_pro_neighbor) + 1 #Negative protein of interest neighbors (includes self)
neg_n = len(negative_protein_neighbor) + 1 #Negative protein of interest neighbors (includes self)
neg_k = negative_go_annotated_protein_neighbor_count #Overlap betweesn go neighbors and protein neighbors (should be fewer for neg than pos)

negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))

# input positive and negative score to data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["pro_pro_neighbor"].append(len(positive_pro_pro_neighbor))
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_pro_pro_neighbors"].append(
positive_go_annotated_pro_pro_neighbor_count
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
)
data["score"].append(positive_score)
data["true_label"].append(1)

data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["pro_pro_neighbor"].append(len(negative_pro_pro_neighbor))
data["protein_neighbor"].append(len(negative_protein_neighbor))
data["go_neighbor"].append(len(negative_go_neighbor))
data["go_annotated_pro_pro_neighbors"].append(
data["go_annotated_protein_neighbors"].append(
negative_go_annotated_protein_neighbor_count
)
data["score"].append(negative_score)
Expand Down Expand Up @@ -149,14 +149,14 @@ def get_neighbors(G: nx.Graph, node, edgeType):
res = G.edges(node, data=True)
neighbors = []
for edge in res:
if edge[2]["type"] == edgeType:
if edge[2]["type"] in edgeType:
neighborNode = [edge[1], edge[2]]
neighbors.append(neighborNode)

return neighbors


def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm):
def get_go_annotated_protein_neighbor_count(G: nx.Graph, nodeList, goTerm):
count = 0
for element in nodeList:
if G.has_edge(element[0], goTerm):
Expand Down
Loading
Loading