diff --git a/classes/.ipynb_checkpoints/hypergeometric_distribution_class_V2-checkpoint.py b/classes/.ipynb_checkpoints/hypergeometric_distribution_class_V2-checkpoint.py deleted file mode 100644 index 7215725..0000000 --- a/classes/.ipynb_checkpoints/hypergeometric_distribution_class_V2-checkpoint.py +++ /dev/null @@ -1,162 +0,0 @@ -from classes.base_algorithm_class import BaseAlgorithm -import networkx as nx -import pandas as pd -from colorama import init as colorama_init -from colorama import Fore, Back, Style -from pathlib import Path -import math -from tools.helper import print_progress, normalize, import_graph_from_pickle -from tools.workflow import get_datasets - - -class HypergeometricDistributionV2(BaseAlgorithm): - def __init__(self): - self.y_score = [] - self.y_true = [] - - def get_y_score(self): - return self.y_score - - def get_y_true(self): - return self.y_true - - def set_y_score(self, y_score): - self.y_score = y_score - - def set_y_true(self, y_true): - self.y_true = y_true - - def predict( - self, - input_directory_path, - graph_file_path, - output_path, - ): - """ - Uses a Hypergeometric distribution to calculate a confidence value for the relationship between a protein of - interest and a GO term. Includes the protein of interest in calculations. - """ - colorama_init() - - # have two sets of positive and negative protein-go_term pairs - # for each pair, calculate the score of how well they predict whether a protein should be annotated to a GO term. - # 50% of the data are proteins that are annotated to a GO term - # 50% of the data are proteins that are not annotated to a GO term - - data = { - "protein": [], - "go_term": [], - "pro_pro_neighbor": [], - "go_neighbor": [], - "go_annotated_pro_pro_neighbors": [], - "score": [], - "norm_score": [], - "true_label": [], - } - - positive_dataset, negative_dataset = get_datasets(input_directory_path) - G = import_graph_from_pickle(graph_file_path) - - i = 1 - for positive_protein, positive_go, negative_protein, negative_go in zip( - positive_dataset["protein"], - positive_dataset["go"], - negative_dataset["protein"], - negative_dataset["go"], - ): - - # calculate the score for the positive set - positive_pro_pro_neighbor = get_neighbors( - G, positive_protein, "protein_protein" - ) - positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term") - positive_go_annotated_pro_pro_neighbor_count = ( - get_go_annotated_pro_pro_neighbor_count( - G, positive_pro_pro_neighbor, positive_go - ) - ) - - N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph - pos_n = len(positive_pro_pro_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest) - K = len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg - pos_k = positive_go_annotated_pro_pro_neighbor_count + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest) - - #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function - positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n)) - - # calculate the score for the negative set - negative_pro_pro_neighbor = get_neighbors( - G, negative_protein, "protein_protein" - ) - negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term") - negative_go_annotated_protein_neighbor_count = ( - get_go_annotated_pro_pro_neighbor_count( - G, negative_pro_pro_neighbor, negative_go - ) - ) - - neg_n = len(negative_pro_pro_neighbor) + 1 #Negative protein of interest neighbors (includes self) - neg_k = negative_go_annotated_protein_neighbor_count #Overlap betweesn go neighbors and protein neighbors (should be fewer for neg than pos) - - negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n)) - - # input positive and negative score to data - data["protein"].append(positive_protein) - data["go_term"].append(positive_go) - data["pro_pro_neighbor"].append(len(positive_pro_pro_neighbor)) - data["go_neighbor"].append(len(positive_go_neighbor)) - data["go_annotated_pro_pro_neighbors"].append( - positive_go_annotated_pro_pro_neighbor_count - ) - data["score"].append(positive_score) - data["true_label"].append(1) - - data["protein"].append(negative_protein) - data["go_term"].append(negative_go) - data["pro_pro_neighbor"].append(len(negative_pro_pro_neighbor)) - data["go_neighbor"].append(len(negative_go_neighbor)) - data["go_annotated_pro_pro_neighbors"].append( - negative_go_annotated_protein_neighbor_count - ) - data["score"].append(negative_score) - data["true_label"].append(0) - - print_progress(i, len(positive_dataset["protein"])) - i += 1 - - normalized_data = normalize(data["score"]) - for item in normalized_data: - data["norm_score"].append(item) - - df = pd.DataFrame(data) - df = df.sort_values(by="norm_score", ascending=False) - - df.to_csv( - Path(output_path, "hypergeometricdistributionV3.csv"), - index=False, - sep="\t", - ) - - y_score = df["norm_score"].to_list() - y_true = df["true_label"].to_list() - - return y_score, y_true - - -def get_neighbors(G: nx.Graph, node, edgeType): - res = G.edges(node, data=True) - neighbors = [] - for edge in res: - if edge[2]["type"] == edgeType: - neighborNode = [edge[1], edge[2]] - neighbors.append(neighborNode) - - return neighbors - - -def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm): - count = 0 - for element in nodeList: - if G.has_edge(element[0], goTerm): - count += 1 - return count diff --git a/classes/.ipynb_checkpoints/hypergeometric_distribution_class_V3-checkpoint.py b/classes/.ipynb_checkpoints/hypergeometric_distribution_class_V3-checkpoint.py deleted file mode 100644 index 9654e8f..0000000 --- a/classes/.ipynb_checkpoints/hypergeometric_distribution_class_V3-checkpoint.py +++ /dev/null @@ -1,164 +0,0 @@ -from classes.base_algorithm_class import BaseAlgorithm -import networkx as nx -import pandas as pd -from colorama import init as colorama_init -from colorama import Fore, Back, Style -from pathlib import Path -import math -from tools.helper import print_progress, normalize, import_graph_from_pickle -from tools.workflow import get_datasets - - -class HypergeometricDistributionV3(BaseAlgorithm): - def __init__(self): - self.y_score = [] - self.y_true = [] - - def get_y_score(self): - return self.y_score - - def get_y_true(self): - return self.y_true - - def set_y_score(self, y_score): - self.y_score = y_score - - def set_y_true(self, y_true): - self.y_true = y_true - - def predict( - self, - input_directory_path, - graph_file_path, - output_path, - ): - """ - Uses a Hypergeometric distribution to calculate a confidence value for the relationship between a protein of - interest and a GO term. Only uses proteins inside the sub-network (comprised of proteins linked with the protein - of interest and/or the GO term). Does not include the protein of interest. - """ - colorama_init() - - # have two sets of positive and negative protein-go_term pairs - # for each pair, calculate the score of how well they predict whether a protein should be annotated to a GO term. - # 50% of the data are proteins that are annotated to a GO term - # 50% of the data are proteins that are not annotated to a GO term - - data = { - "protein": [], - "go_term": [], - "pro_pro_neighbor": [], - "go_neighbor": [], - "go_annotated_pro_pro_neighbors": [], - "score": [], - "norm_score": [], - "true_label": [], - } - - positive_dataset, negative_dataset = get_datasets(input_directory_path) - G = import_graph_from_pickle(graph_file_path) - - i = 1 - for positive_protein, positive_go, negative_protein, negative_go in zip( - positive_dataset["protein"], - positive_dataset["go"], - negative_dataset["protein"], - negative_dataset["go"], - ): - - # calculate the score for the positive set - positive_pro_pro_neighbor = get_neighbors( - G, positive_protein, "protein_protein" - ) - positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term") - positive_go_annotated_pro_pro_neighbor_count = ( - get_go_annotated_pro_pro_neighbor_count( - G, positive_pro_pro_neighbor, positive_go - ) - ) - - pos_N = len(positive_pro_pro_neighbor) + len(positive_go_neighbor) -positive_go_annotated_pro_pro_neighbor_count - 1 #Sample size is only the neighbors of the protein & GO term of interest - pos_n = len(positive_pro_pro_neighbor) #Number of protein neighbors the protein of interest has - K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include the protein of interest - pos_k = positive_go_annotated_pro_pro_neighbor_count #The overlap between the GO term and the protein of interst's neighbor proteins - - #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function - positive_score = 1 - ((math.comb(K,pos_k)*math.comb(pos_N-K,pos_n-pos_k))/math.comb(pos_N,pos_n)) - - # calculate the score for the negative set - negative_pro_pro_neighbor = get_neighbors( - G, negative_protein, "protein_protein" - ) - negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term") - negative_go_annotated_protein_neighbor_count = ( - get_go_annotated_pro_pro_neighbor_count( - G, negative_pro_pro_neighbor, negative_go - ) - ) - - neg_N = len(negative_pro_pro_neighbor) + len(negative_go_neighbor) - negative_go_annotated_protein_neighbor_count - neg_n = len(negative_pro_pro_neighbor) - neg_k = negative_go_annotated_protein_neighbor_count - - negative_score = 1 - ((math.comb(K,neg_k)*math.comb(neg_N-K,neg_n-neg_k))/math.comb(neg_N,neg_n)) - - # input positive and negative score to data - data["protein"].append(positive_protein) - data["go_term"].append(positive_go) - data["pro_pro_neighbor"].append(len(positive_pro_pro_neighbor)) - data["go_neighbor"].append(len(positive_go_neighbor)) - data["go_annotated_pro_pro_neighbors"].append( - positive_go_annotated_pro_pro_neighbor_count - ) - data["score"].append(positive_score) - data["true_label"].append(1) - - data["protein"].append(negative_protein) - data["go_term"].append(negative_go) - data["pro_pro_neighbor"].append(len(negative_pro_pro_neighbor)) - data["go_neighbor"].append(len(negative_go_neighbor)) - data["go_annotated_pro_pro_neighbors"].append( - negative_go_annotated_protein_neighbor_count - ) - data["score"].append(negative_score) - data["true_label"].append(0) - - print_progress(i, len(positive_dataset["protein"])) - i += 1 - - normalized_data = normalize(data["score"]) - for item in normalized_data: - data["norm_score"].append(item) - - df = pd.DataFrame(data) - df = df.sort_values(by="norm_score", ascending=False) - - df.to_csv( - Path(output_path, "hypergeometricdistribution.csv"), - index=False, - sep="\t", - ) - - y_score = df["norm_score"].to_list() - y_true = df["true_label"].to_list() - - return y_score, y_true - - -def get_neighbors(G: nx.Graph, node, edgeType): - res = G.edges(node, data=True) - neighbors = [] - for edge in res: - if edge[2]["type"] == edgeType: - neighborNode = [edge[1], edge[2]] - neighbors.append(neighborNode) - - return neighbors - - -def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm): - count = 0 - for element in nodeList: - if G.has_edge(element[0], goTerm): - count += 1 - return count diff --git a/tools/.ipynb_checkpoints/helper-checkpoint.py b/tools/.ipynb_checkpoints/helper-checkpoint.py deleted file mode 100644 index 3be6cee..0000000 --- a/tools/.ipynb_checkpoints/helper-checkpoint.py +++ /dev/null @@ -1,154 +0,0 @@ -from colorama import Fore, Style -import networkx as nx -import random -import numpy as np -import pickle - - -def print_progress(current, total, bar_length=65): - # Calculate the progress as a percentage - percent = float(current) / total - # Determine the number of hash marks in the progress bar - arrow = "-" * int(round(percent * bar_length) - 1) + ">" - spaces = " " * (bar_length - len(arrow)) - - # Choose color based on completion - if current < total: - color = Fore.YELLOW - else: - color = Fore.GREEN - - # Construct the progress bar string - progress_bar = f"[{arrow + spaces}] {int(round(percent * 100))}%" - - # Print the progress bar with color, overwriting the previous line - print(f"\r{color}{progress_bar}{Style.RESET_ALL}", end="") - - -def create_ppi_network(fly_interactome, fly_GO_term): - print("Initializing network") - i = 1 - total_progress = len(fly_interactome) + len(fly_GO_term) - G = nx.Graph() - protein_protein_edge = 0 - protein_go_edge = 0 - protein_node = 0 - go_node = 0 - protein_list = [] - go_term_list = [] - - # go through fly interactome, add a new node if it doesnt exists already, then add their physical interactions as edges - for line in fly_interactome: - if not G.has_node(line[2]): - G.add_node(line[2], name=line[0], type="protein") - protein_list.append({"id": line[2], "name": line[0]}) - protein_node += 1 - - if not G.has_node(line[3]): - G.add_node(line[3], name=line[1], type="protein") - protein_list.append({"id": line[3], "name": line[1]}) - protein_node += 1 - - G.add_edge(line[2], line[3], type="protein_protein") - protein_protein_edge += 1 - print_progress(i, total_progress) - i += 1 - - # Proteins annotated with a GO term have an edge to a GO term node - for line in fly_GO_term: - if not G.has_node(line[1]): - G.add_node(line[1], type="go_term") - go_term_list.append(line[1]) - go_node += 1 - - if not G.has_node(line[0]): - G.add_node(line[0], name=line[0], type="protein") - protein_list.append({"id": line[0], "name": line[0]}) - protein_node += 1 - - G.add_edge(line[1], line[0], type="protein_go_term") - protein_go_edge += 1 - print_progress(i, total_progress) - i += 1 - - print("") - print("") - print("network summary") - - print("protein-protein edge count: ", protein_protein_edge) - print("protein-go edge count: ", protein_go_edge) - print("protein node count: ", protein_node) - print("go node count: ", go_node) - print("total edge count: ", len(G.edges())) - print("total node count: ", len(G.nodes())) - - return G, protein_list - - -def read_specific_columns(file_path, columns, delimit): - try: - with open(file_path, "r") as file: - next(file) - data = [] - for line in file: - parts = line.strip().split(delimit) - selected_columns = [] - for col in columns: - selected_columns.append(parts[col].replace('"', "")) - data.append(selected_columns) - return data - except FileNotFoundError: - print(f"Error: File '{file_path}' not found.") - return None - except Exception as e: - print(f"An error occurred: {e}") - return None - - -def generate_random_colors(num_colors): - colors = [] - for _ in range(num_colors): - color = (random.random(), random.random(), random.random()) - colors.append(color) - return colors - - -def normalize(data): - data = np.array(data) - min_val = data.min() - max_val = data.max() - - if min_val == max_val: - return np.zeros_like(data) - - normalized_data = (data - min_val) / (max_val - min_val) - return normalized_data.tolist() - - -def get_neighbors(G: nx.Graph, node, edgeType): - res = G.edges(node, data=True) - neighbors = [] - for edge in res: - if edge[2]["type"] == edgeType: - neighborNode = [edge[1], edge[2]] - neighbors.append(neighborNode) - - return neighbors - - -def add_print_statements(filename, statements): - # Open the file in append mode (will create the file if it doesn't exist) - with open(filename, "w") as file: - for statement in statements: - # Write each statement to the file - file.write(f"{statement}\n") - - -def export_graph_to_pickle(graph, filename): - with open(filename, 'wb') as f: - pickle.dump(graph, f) - - -def import_graph_from_pickle(filename): - with open(filename, 'rb') as f: - return pickle.load(f)