From a44a4e9940f6897d45c3c787bb01a804ede4f102 Mon Sep 17 00:00:00 2001 From: ctrlaltaf Date: Thu, 5 Sep 2024 12:43:59 -0700 Subject: [PATCH] new methods --- .../hypergeometric_distribution_class_V3.py | 171 ++++++++++++++++++ classes/one_hop_go_degree_class.py | 167 +++++++++++++++++ classes/random_walk_class_v3.py | 34 ---- main.py | 20 +- tools/helper.py | 2 +- tools/workflow.py | 2 - 6 files changed, 352 insertions(+), 44 deletions(-) create mode 100644 classes/hypergeometric_distribution_class_V3.py create mode 100644 classes/one_hop_go_degree_class.py diff --git a/classes/hypergeometric_distribution_class_V3.py b/classes/hypergeometric_distribution_class_V3.py new file mode 100644 index 0000000..85c60ce --- /dev/null +++ b/classes/hypergeometric_distribution_class_V3.py @@ -0,0 +1,171 @@ +from classes.base_algorithm_class import BaseAlgorithm +import networkx as nx +import pandas as pd +from colorama import init as colorama_init +from colorama import Fore, Back, Style +from pathlib import Path +import math +from tools.helper import print_progress, normalize, import_graph_from_pickle +from tools.workflow import get_datasets + + +class HypergeometricDistributionV3(BaseAlgorithm): + def __init__(self): + self.y_score = [] + self.y_true = [] + + def get_y_score(self): + return self.y_score + + def get_y_true(self): + return self.y_true + + def set_y_score(self, y_score): + self.y_score = y_score + + def set_y_true(self, y_true): + self.y_true = y_true + + def predict( + self, + input_directory_path, + graph_file_path, + output_path, + rep_num, + name, + ): + """ + Uses a Hypergeometric distribution to calculate a confidence value for the relationship between a protein of + interest and a GO term. Does not include protein of interest in calculations. + """ + colorama_init() + + # have two sets of positive and negative protein-go_term pairs + # for each pair, calculate the score of how well they predict whether a protein should be annotated to a GO term. + # 50% of the data are proteins that are annotated to a GO term + # 50% of the data are proteins that are not annotated to a GO term + + data = { + "protein": [], + "go_term": [], + "protein_neighbor": [], + "go_neighbor": [], + "go_annotated_protein_neighbors": [], + "score": [], + "norm_score": [], + "true_label": [], + } + + positive_dataset, negative_dataset = get_datasets(input_directory_path, rep_num, name) + G = import_graph_from_pickle(graph_file_path) + + i = 1 + for positive_protein, positive_go, negative_protein, negative_go in zip( + positive_dataset["protein"], + positive_dataset["go"], + negative_dataset["protein"], + negative_dataset["go"], + ): + + # calculate the score for the positive set + positive_protein_neighbor = get_neighbors( + G, positive_protein, ["protein_protein", "regulatory"] + ) + positive_go_neighbor = get_neighbors(G, positive_go, ["protein_go_term"]) + positive_go_annotated_protein_neighbor_count = ( + get_go_annotated_protein_neighbor_count( + G, positive_protein_neighbor, positive_go + ) + ) + + N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph + pos_n = len(positive_protein_neighbor) #Number of protein neighbors the protein of interest has + K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include protein of interest (but does not change significantly if protein is included) + pos_k = positive_go_annotated_protein_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest + + if pos_n == 0: + positive_score = 0 + else: + #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function + positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n)) + + # calculate the score for the negative set + negative_protein_neighbor = get_neighbors( + G, negative_protein, "protein_protein" + ) + negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term") + negative_go_annotated_protein_neighbor_count = ( + get_go_annotated_protein_neighbor_count( + G, negative_protein_neighbor, negative_go + ) + ) + + neg_n = len(negative_protein_neighbor) #Negative protein of interest neighbors + neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos) + + if neg_n == 0: + negative_score = 0 + else: + negative_score = 1 - ((math.comb(K + 1,neg_k)*math.comb(N-K + 1,neg_n-neg_k))/math.comb(N,neg_n)) + + + # input positive and negative score to data + data["protein"].append(positive_protein) + data["go_term"].append(positive_go) + data["protein_neighbor"].append(len(positive_protein_neighbor)) + data["go_neighbor"].append(len(positive_go_neighbor)) + data["go_annotated_protein_neighbors"].append( + positive_go_annotated_protein_neighbor_count + ) + data["score"].append(positive_score) + data["true_label"].append(1) + + data["protein"].append(negative_protein) + data["go_term"].append(negative_go) + data["protein_neighbor"].append(len(negative_protein_neighbor)) + data["go_neighbor"].append(len(negative_go_neighbor)) + data["go_annotated_protein_neighbors"].append( + negative_go_annotated_protein_neighbor_count + ) + data["score"].append(negative_score) + data["true_label"].append(0) + + print_progress(i, len(positive_dataset["protein"])) + i += 1 + + normalized_data = normalize(data["score"]) + for item in normalized_data: + data["norm_score"].append(item) + + df = pd.DataFrame(data) + df = df.sort_values(by="norm_score", ascending=False) + + df.to_csv( + Path(output_path, "hypergeometric_distribution.csv"), + index=False, + sep="\t", + ) + + y_score = df["norm_score"].to_list() + y_true = df["true_label"].to_list() + + return y_score, y_true + + +def get_neighbors(G: nx.DiGraph, node, edgeTypes): + res = G.edges(node, data=True) + neighbors = [] + for edge in res: + if edge[2]["type"] in edgeTypes: + neighborNode = [edge[1], edge[2]] + neighbors.append(neighborNode) + + return neighbors + + +def get_go_annotated_protein_neighbor_count(G: nx.Graph, nodeList, goTerm): + count = 0 + for element in nodeList: + if G.has_edge(element[0], goTerm): + count += 1 + return count diff --git a/classes/one_hop_go_degree_class.py b/classes/one_hop_go_degree_class.py new file mode 100644 index 0000000..8a9c0f2 --- /dev/null +++ b/classes/one_hop_go_degree_class.py @@ -0,0 +1,167 @@ +from classes.base_algorithm_class import BaseAlgorithm +import networkx as nx +import pandas as pd +from colorama import init as colorama_init +from colorama import Fore, Back, Style +from pathlib import Path +from tools.helper import print_progress, normalize, import_graph_from_pickle +from tools.workflow import get_datasets + + +class OneHopGODegree(BaseAlgorithm): + def __init__(self): + self.y_score = [] + self.y_true = [] + + def get_y_score(self): + return self.y_score + + def get_y_true(self): + return self.y_true + + def set_y_score(self, y_score): + self.y_score = y_score + + def set_y_true(self, y_true): + self.y_true = y_true + + def predict( + self, + input_directory_path, + graph_file_path, + output_path, + rep_num, + name, + ): + """ + evaluate overlapping neighbors method on a protein protein interaction network with go term annotation. + """ + colorama_init() + + # have two sets of positive and negative protein-go_term pairs + # for each pair, calculate the score of how well they predict whether a protein should be annotated to a GO term. + # 50% of the data are proteins that are annotated to a GO term + # 50% of the data are proteins that are not annotated to a GO term + # score equation (1 + number of ProProNeighbor that are annotated to the go term) / (number of ProProNeighbor + number of GoNeighbor) + + data = { + "protein": [], + "go_term": [], + "protein_neighbor": [], + "go_neighbor": [], + "go_annotated_protein_neighbors": [], + "score": [], + "norm_score": [], + "true_label": [], + } + + positive_dataset, negative_dataset = get_datasets( + input_directory_path, rep_num, name + ) + G = import_graph_from_pickle(graph_file_path) + i = 1 + for positive_protein, positive_go in zip( + positive_dataset["protein"], + positive_dataset["go"], + ): + # calculate the score for the positive set + positive_protein_neighbor = get_neighbors( + G, positive_protein, ["protein_protein", "regulatory"] + ) + + positive_go_neighbor = get_neighbors(G, positive_go, ["protein_go_term"]) + positive_go_annotated_protein_neighbor_count = ( + get_go_annotated_protein_neighbor_count( + G, positive_protein_neighbor, positive_go + ) + ) + + if len(positive_protein_neighbor) == 0: + positive_score = 0 + else: + positive_score = positive_go_annotated_protein_neighbor_count + + # input positive and negative score to data + data["protein"].append(positive_protein) + data["go_term"].append(positive_go) + data["protein_neighbor"].append(len(positive_protein_neighbor)) + data["go_neighbor"].append(len(positive_go_neighbor)) + data["go_annotated_protein_neighbors"].append( + positive_go_annotated_protein_neighbor_count + ) + data["score"].append(positive_score) + data["true_label"].append(1) + + print_progress(i, len(positive_dataset["protein"])) + i += 1 + + for negative_protein, negative_go in zip( + negative_dataset["protein"], + negative_dataset["go"], + ): + + # calculate the score for the negative set + negative_protein_neighbor = get_neighbors( + G, negative_protein, "protein_protein" + ) + negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term") + negative_go_annotated_protein_neighbor_count = ( + get_go_annotated_protein_neighbor_count( + G, negative_protein_neighbor, negative_go + ) + ) + + if len(negative_protein_neighbor) == 0: + negative_score = 0 + else: + negative_score = negative_go_annotated_protein_neighbor_count + + data["protein"].append(negative_protein) + data["go_term"].append(negative_go) + data["protein_neighbor"].append(len(negative_protein_neighbor)) + data["go_neighbor"].append(len(negative_go_neighbor)) + data["go_annotated_protein_neighbors"].append( + negative_go_annotated_protein_neighbor_count + ) + data["score"].append(negative_score) + data["true_label"].append(0) + + print_progress(i, len(negative_dataset["protein"])) + i += 1 + + normalized_data = normalize(data["score"]) + for item in normalized_data: + data["norm_score"].append(item) + + df = pd.DataFrame(data) + df = df.sort_values(by="norm_score", ascending=False) + + df.to_csv( + Path(output_path, "overlapping_neighbor_data.csv"), + index=False, + sep="\t", + ) + + y_score = df["norm_score"].to_list() + y_true = df["true_label"].to_list() + + return y_score, y_true + + +def get_neighbors(G: nx.DiGraph, node, edgeTypes): + res = G.edges(node, data=True) + neighbors = [] + for edge in res: + if edge[2]["type"] in edgeTypes: + neighborNode = [edge[1], edge[2]] + neighbors.append(neighborNode) + + return neighbors + + +def get_go_annotated_protein_neighbor_count(G: nx.DiGraph, nodeList, goTerm): + count = 0 + for element in nodeList: + if G.has_edge(element[0], goTerm): + count += 1 + return count diff --git a/classes/random_walk_class_v3.py b/classes/random_walk_class_v3.py index 8057f01..78fd89b 100644 --- a/classes/random_walk_class_v3.py +++ b/classes/random_walk_class_v3.py @@ -107,40 +107,6 @@ def predict( print_progress(i, len(negative_dataset["protein"])) i += 1 - # for positive_protein, positive_go, negative_protein, negative_go in zip( - # positive_dataset["protein"], - # positive_dataset["go"], - # negative_dataset["protein"], - # negative_dataset["go"], - # ): - # go_neighbors = get_neighbors(G, positive_go, "protein_go_term") - - # go_neighbor_dict = {} - # for j in go_neighbors: - # if j[0] != positive_protein: - # go_neighbor_dict[j[0]] = 1 - # G.remove_edge(j[0], positive_go) - # if len(go_neighbor_dict) != 0: - # p = nx.pagerank(G, alpha=0.7, personalization=go_neighbor_dict) - # data["walk"].append(p[positive_protein]) - # data["walk"].append(p[negative_protein]) - # else: - # data["walk"].append(0) - # data["walk"].append(0) - - # data["protein"].append(positive_protein) - # data["go_term"].append(positive_go) - # data["true_label"].append(1) - - # data["protein"].append(negative_protein) - # data["go_term"].append(negative_go) - # data["true_label"].append(0) - # for j in go_neighbors: - # G.add_edge(j[0], positive_go, type="protein_go_term") - - # print_progress(i, len(positive_dataset["protein"])) - # i += 1 - normalized_data = normalize(data["walk"]) for item in normalized_data: data["norm_score"].append(item) diff --git a/main.py b/main.py index bf1da43..826d1d5 100644 --- a/main.py +++ b/main.py @@ -1,12 +1,14 @@ from classes.overlapping_neighbors_class import OverlappingNeighbors from classes.overlapping_neighbors_v2_class import OverlappingNeighborsV2 from classes.overlapping_neighbors_v3_class import OverlappingNeighborsV3 +from classes.one_hop_go_degree_class import OneHopGODegree from classes.protein_degree_class import ProteinDegree from classes.protein_degree_v2_class import ProteinDegreeV2 from classes.protein_degree_v3_class import ProteinDegreeV3 from classes.sample_algorithm import SampleAlgorithm from classes.hypergeometric_distribution_class import HypergeometricDistribution from classes.hypergeometric_distribution_class_V2 import HypergeometricDistributionV2 +from classes.hypergeometric_distribution_class_V3 import HypergeometricDistributionV3 from classes.random_walk_class import RandomWalk from classes.random_walk_class_v2 import RandomWalkV2 from classes.random_walk_class_v3 import RandomWalkV3 @@ -121,9 +123,11 @@ def main(): short_name = short_name + "_cel" interactome_columns = [0, 1] - interactome = read_specific_columns(fly_interactome_path, interactome_columns, ",") + interactome = read_specific_columns( + elegans_interactome_path, interactome_columns, "," + ) regulatory_interactome = read_specific_columns( - fly_reg_path, interactome_columns, "," + elegans_reg_path, interactome_columns, "," ) go_inferred_columns = [0, 2, 3] # Adds relationship_type column @@ -131,7 +135,7 @@ def main(): go_inferred_columns.append(1) go_protein_pairs = read_pro_go_data( - fly_go_association_mixed_path, go_inferred_columns, go_term_type, "," + elegans_go_association_mixed_path, go_inferred_columns, go_term_type, "," ) # Uses relationship_type column to sort through which proGO edges are inferred if no_inferred_edges: @@ -155,23 +159,25 @@ def main(): # P, protein_list = create_only_protein_network(interactome,regulatory_interactome, go_protein_pairs, go_depth_dict) # export_graph_to_pickle(P, "./output/dataset/protein.pickle") # Creates a graph with only protein-GO term edges (used for RandomWalkV5) - # D, protein_list = create_go_protein_only_network(interactome,regulatory_interactome, go_protein_pairs, go_depth_dict) - # export_graph_to_pickle(D, "./output/dataset/go_protein.pickle") + D = create_go_protein_only_network(interactome,regulatory_interactome, go_protein_pairs, go_depth_dict) + export_graph_to_pickle(D, "./output/dataset/go_protein.pickle") # Define algorithm classes and their names algorithm_classes = { "OverlappingNeighbors": OverlappingNeighbors, "OverlappingNeighborsV2": OverlappingNeighborsV2, "OverlappingNeighborsV3": OverlappingNeighborsV3, + "OneHopGODegree": OneHopGODegree, "ProteinDegree": ProteinDegree, "ProteinDegreeV2": ProteinDegreeV2, "ProteinDegreeV3": ProteinDegreeV3, "SampleAlgorithm": SampleAlgorithm, "HypergeometricDistribution": HypergeometricDistribution, "HypergeometricDistributionV2": HypergeometricDistributionV2, + "HypergeometricDistributionV3": HypergeometricDistributionV3, "RandomWalk": RandomWalk, - "RandomWalkV2": RandomWalkV2, - "RandomWalkV3": RandomWalkV3, + # "RandomWalkV2": RandomWalkV2, + # "RandomWalkV3": RandomWalkV3, # "RandomWalkV4": RandomWalkV4, #need protein-only network # "RandomWalkV5": RandomWalkV5, #need protein-goterm only network } diff --git a/tools/helper.py b/tools/helper.py index 496cf7a..3abc3d0 100644 --- a/tools/helper.py +++ b/tools/helper.py @@ -285,7 +285,7 @@ def create_go_protein_only_network(ppi_interactome,regulatory_interactome,GO_ter print("total edge count: ", len(G.edges())) print("total node count: ", len(G.nodes())) - return G, protein_list + return G def read_specific_columns(file_path, columns, delimit): diff --git a/tools/workflow.py b/tools/workflow.py index 4e34a36..13f067c 100644 --- a/tools/workflow.py +++ b/tools/workflow.py @@ -90,8 +90,6 @@ def run_workflow( name, ) - sys.exit() - for i in range( x ): # Creates a pos/neg list each replicate then runs workflow like normal