Skip to content

Commit

Permalink
fixed algorithms to account for self edge and added .csv output for l…
Browse files Browse the repository at this point in the history
…ist of all roc and pr values that result from running repetitions.
  • Loading branch information
amnorman committed Jun 18, 2024
1 parent f160609 commit 67043d6
Show file tree
Hide file tree
Showing 9 changed files with 139 additions and 43 deletions.
19 changes: 14 additions & 5 deletions classes/hypergeometric_distribution_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,18 @@ def predict(
)
)

c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1 #Removes extra node if there is an edge to self

N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
pos_n = len(positive_pro_pro_neighbor) #Number of protein neighbors the protein of interest has
pos_n = len(positive_pro_pro_neighbor) - c #Number of protein neighbors the protein of interest has
K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include protein of interest (but does not change significantly if protein is included)
pos_k = positive_go_annotated_pro_pro_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest

pos_k = positive_go_annotated_pro_pro_neighbor_count - c #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))

# calculate the score for the negative set
negative_pro_pro_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
Expand All @@ -95,11 +99,16 @@ def predict(
)
)

neg_n = len(negative_pro_pro_neighbor) #Negative protein of interest neighbors
c = 0
if G.has_edge(negative_protein, negative_protein):
c = 1

neg_n = len(negative_pro_pro_neighbor) - c #Negative protein of interest neighbors
neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)

negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))


# input positive and negative score to data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
Expand Down
18 changes: 13 additions & 5 deletions classes/hypergeometric_distribution_class_V2.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,15 @@ def predict(
G, positive_pro_pro_neighbor, positive_go
)
)


c = 1
if G.has_edge(positive_protein, positive_protein):
c = 0

N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
pos_n = len(positive_pro_pro_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
pos_n = len(positive_pro_pro_neighbor) + c #Number of protein neighbors the protein of interest has (includes the protein of interest)
K = len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
pos_k = positive_go_annotated_pro_pro_neighbor_count + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
pos_k = positive_go_annotated_pro_pro_neighbor_count + c #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)

#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
Expand All @@ -95,7 +99,11 @@ def predict(
)
)

neg_n = len(negative_pro_pro_neighbor) + 1 #Negative protein of interest neighbors (includes self)
c = 1
if G.has_edge(negative_protein, negative_protein):
c = 0

neg_n = len(negative_pro_pro_neighbor) + c #Negative protein of interest neighbors (includes self)
neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)

negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
Expand Down Expand Up @@ -159,4 +167,4 @@ def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm):
for element in nodeList:
if G.has_edge(element[0], goTerm):
count += 1
return count
return count
34 changes: 24 additions & 10 deletions classes/overlapping_neighbors_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,30 +55,40 @@ def predict(

positive_dataset, negative_dataset = get_datasets(input_directory_path)
G = import_graph_from_pickle(graph_file_path)

i = 1
for positive_protein, positive_go, negative_protein, negative_go in zip(
positive_dataset["protein"],
positive_dataset["go"],
negative_dataset["protein"],
negative_dataset["go"],
):

c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1
# calculate the score for the positive set
positive_pro_pro_neighbor = get_neighbors(
G, positive_protein, "protein_protein"
)

# print("\nPositive protein neighbors: " + str(positive_pro_pro_neighbor))
positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
positive_go_annotated_pro_pro_neighbor_count = (
get_go_annotated_pro_pro_neighbor_count(
G, positive_pro_pro_neighbor, positive_go
)
)
positive_score = (1 + positive_go_annotated_pro_pro_neighbor_count) / (
len(positive_pro_pro_neighbor) + len(positive_go_neighbor)
)
) - c

if len(positive_pro_pro_neighbor) == 0:
positive_score = 0
else:
positive_score = (1 + positive_go_annotated_pro_pro_neighbor_count) / (
len(positive_pro_pro_neighbor) -c + len(positive_go_neighbor)
)

# calculate the score for the negative set
c = 0
if G.has_edge(negative_protein, negative_protein):
c = 1
negative_pro_pro_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
)
Expand All @@ -88,9 +98,13 @@ def predict(
G, negative_pro_pro_neighbor, negative_go
)
)
negative_score = (1 + negative_go_annotated_protein_neighbor_count) / (
len(negative_pro_pro_neighbor) + len(negative_go_neighbor)
)

if len(negative_pro_pro_neighbor) == 0:
negative_score = 0
else:
negative_score = (1 + negative_go_annotated_protein_neighbor_count) / (
len(negative_pro_pro_neighbor) - c + len(negative_go_neighbor)
)

# input positive and negative score to data
data["protein"].append(positive_protein)
Expand Down Expand Up @@ -141,7 +155,7 @@ def get_neighbors(G: nx.Graph, node, edgeType):
for edge in res:
if edge[2]["type"] == edgeType:
neighborNode = [edge[1], edge[2]]
neighbors.append(neighborNode)
neighbors.append(neighborNode)

return neighbors

Expand Down
13 changes: 9 additions & 4 deletions classes/overlapping_neighbors_v2_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ def predict(
negative_dataset["protein"],
negative_dataset["go"],
):

c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1
# calculate the score for the positive set
positive_pro_pro_neighbor = get_neighbors(
G, positive_protein, "protein_protein"
Expand All @@ -70,13 +72,16 @@ def predict(
get_go_annotated_pro_pro_neighbor_count(
G, positive_pro_pro_neighbor, positive_go
)
)
) - c
positive_score = positive_go_annotated_pro_pro_neighbor_count + (
1
+ len(positive_pro_pro_neighbor)
+ (len(positive_pro_pro_neighbor) - c)
* positive_go_annotated_pro_pro_neighbor_count
) / (len(positive_go_neighbor) / 2)

c = 0
if G.has_edge(negative_protein, negative_protein):
c = 1
# calculate the score for the negative set
negative_pro_pro_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
Expand All @@ -89,7 +94,7 @@ def predict(
)
negative_score = negative_go_annotated_pro_pro_neighbor_count + (
1
+ len(negative_pro_pro_neighbor)
+ (len(negative_pro_pro_neighbor) - c)
* negative_go_annotated_pro_pro_neighbor_count
) / (len(negative_go_neighbor) / 2)

Expand Down
7 changes: 5 additions & 2 deletions classes/overlapping_neighbors_v3_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ def predict(
negative_dataset["protein"],
negative_dataset["go"],
):
c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1
# calculate the score for the positive set
positive_pro_pro_neighbor = get_neighbors(
G, positive_protein, "protein_protein"
Expand All @@ -69,7 +72,7 @@ def predict(
get_go_annotated_pro_pro_neighbor_count(
G, positive_pro_pro_neighbor, positive_go
)
)
) - c
positive_score = positive_go_annotated_pro_pro_neighbor_count + (
1 + positive_go_annotated_pro_pro_neighbor_count
) / (len(positive_go_neighbor))
Expand All @@ -83,7 +86,7 @@ def predict(
get_go_annotated_pro_pro_neighbor_count(
G, negative_pro_pro_neighbor, negative_go
)
)
)
negative_score = negative_go_annotated_pro_pro_neighbor_count + (
1 + negative_go_annotated_pro_pro_neighbor_count
) / (len(negative_go_neighbor))
Expand Down
10 changes: 8 additions & 2 deletions classes/protein_degree_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,20 @@ def predict(
negative_dataset["go"],
):

c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["degree"].append(G.degree(positive_protein))
data["degree"].append(G.degree(positive_protein) - c)
data["true_label"].append(1)

c = 0
if G.has_edge(negative_protein, negative_protein):
c = 1
data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["degree"].append(G.degree(negative_protein))
data["degree"].append(G.degree(negative_protein) - c)
data["true_label"].append(0)
print_progress(i, len(positive_dataset["protein"]))
i += 1
Expand Down
10 changes: 8 additions & 2 deletions classes/protein_degree_v2_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,23 @@ def predict(
negative_dataset["go"],
):

c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["degree"].append(
len(get_neighbors(G, positive_protein, "protein_protein"))
len(get_neighbors(G, positive_protein, "protein_protein")) - c
)
data["true_label"].append(1)

c = 0
if G.has_edge(negative_protein, negative_protein):
c = 1
data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["degree"].append(
len(get_neighbors(G, negative_protein, "protein_protein"))
len(get_neighbors(G, negative_protein, "protein_protein")) - c
)
data["true_label"].append(0)
print_progress(i, len(positive_dataset["protein"]))
Expand Down
8 changes: 4 additions & 4 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def main():
output_image_path = Path("./output/images/")
dataset_directory_path = Path("./output/dataset")
graph_file_path = Path(dataset_directory_path, "graph.pickle")
sample_size = 100
sample_size = 1000

testing_output_data_path = Path("./output/data/")
testing_output_image_path = Path("./output/images/")
Expand Down Expand Up @@ -85,15 +85,15 @@ def main():
"HypergeometricDistributionV2": HypergeometricDistributionV2,
}

repeats = 1
repeats = 20

run_workflow(
algorithm_classes,
go_protein_pairs,
sample_size,
protein_list,
testing_graph_file_path,
testing_input_directory_path,
graph_file_path,
dataset_directory_path,
output_data_path,
output_image_path,
repeats,
Expand Down
63 changes: 54 additions & 9 deletions tools/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,15 @@ def run_workflow(
for i in algorithm_classes.keys():
auc[i] = [[], []]

#Generate number of datasets if they don't already exist



for i in range(
x
): # Creates a pos/neg list each replicate then runs workflow like normal
print("\n\nReplicate: " + str(i) + "\n")
if x > 1:
print("\n\nReplicate: " + str(i+1) + "\n")

# positive_dataset, negative_dataset = sample_data(
# go_protein_pairs, sample_size, protein_list, G, dataset_directory_path
Expand All @@ -76,11 +81,24 @@ def run_workflow(

# each loop adds the roc and pr values, index 0 for roc and 1 for pr, for each algorithm
for i in algorithm_classes.keys():
auc[i][0].append(results[i]["roc_auc"])
auc[i][1].append(results[i]["pr_auc"])
auc[i][0].append(round(results[i]["roc_auc"],5))
auc[i][1].append(round(results[i]["pr_auc"],5))

#Creates a dictionary for all pr values and all roc values
roc = {}
pr = {}

# Finds mean and sd of values, ROC mean index 0, ROC sd index 1, PR mean index 2, and PR sd index 3
for i in algorithm_classes.keys():
roc[i] = auc[i][0]
pr[i] = auc[i][1]

if x > 1:
cols = []
for i in range(x):
cols.append("Replicate " + str(i+1))
name = "_replicate_list"

# Finds mean and sd of values, ROC mean index 0, ROC sd index 1, PR mean index 2, and PR sd index 3
for i in auc.keys():
meanROC = round(stat.mean(auc[i][0]), 5)
auc[i].append(round(stat.mean(auc[i][1]), 5))
Expand All @@ -106,7 +124,34 @@ def run_workflow(
index=True,
sep="\t",
)
else:
cols = ["AUC"]
name = "_auc_results"

dfr = pd.DataFrame.from_dict(
roc,
orient = 'index',
columns = cols
)

dfp = pd.DataFrame.from_dict(
pr,
orient = 'index',
columns = cols
)

dfr.to_csv(
Path(output_data_path, "roc" + name + ".csv"),
index = True,
sep = "\t"
)

dfp.to_csv(
Path(output_data_path, "pr" + name + ".csv"),
index = True,
sep = "\t"
)


def run_experiement(
algorithm_classes,
Expand Down Expand Up @@ -461,11 +506,11 @@ def sort_results_by(results, key, output_path):
df = pd.DataFrame(data)
df = df.sort_values(by=key, ascending=False)

df.to_csv(
output_file_path,
index=False,
sep="\t",
)
# df.to_csv(
# output_file_path,
# index=False,
# sep="\t",
# )

algorithm_tuple_list = sorted(algorithm_tuple_list, key=itemgetter(1), reverse=True)

Expand Down

0 comments on commit 67043d6

Please sign in to comment.