From 43f18bf40b573e30b61d81cd3f034cbbe1992782 Mon Sep 17 00:00:00 2001
From: ctrlaltaf
Date: Tue, 11 Jun 2024 10:42:56 -0700
Subject: [PATCH] Fix output file naming and prepare for pytest

---
 .gitignore                                   |  4 +-
 classes/hypergeometric_distribution_class.py |  2 +-
 .../hypergeometric_distribution_class_V2.py  |  2 +-
 .../hypergeometric_distribution_class_V3.py  |  2 +-
 .../hypergeometric_distribution_class_V4.py  |  2 +-
 main.py                                      | 13 ++++-
 main_replicates.py                           | 44 +++++++++++--------
 tools/workflow.py                            | 23 +++++++---
 8 files changed, 59 insertions(+), 33 deletions(-)

diff --git a/.gitignore b/.gitignore
index b1862fd..312d7be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,4 +45,6 @@ env/
 *.log
 
 output/
-input/
\ No newline at end of file
+input/
+
+.ipynb_checkpoints
diff --git a/classes/hypergeometric_distribution_class.py b/classes/hypergeometric_distribution_class.py
index 1076cd4..e508c84 100644
--- a/classes/hypergeometric_distribution_class.py
+++ b/classes/hypergeometric_distribution_class.py
@@ -132,7 +132,7 @@ def predict(
         df = df.sort_values(by="norm_score", ascending=False)
 
         df.to_csv(
-            Path(output_path, "hypergeometricdistribution.csv"),
+            Path(output_path, "hypergeometric_distribution.csv"),
             index=False,
             sep="\t",
         )
diff --git a/classes/hypergeometric_distribution_class_V2.py b/classes/hypergeometric_distribution_class_V2.py
index 7215725..5847929 100644
--- a/classes/hypergeometric_distribution_class_V2.py
+++ b/classes/hypergeometric_distribution_class_V2.py
@@ -132,7 +132,7 @@ def predict(
         df = df.sort_values(by="norm_score", ascending=False)
 
         df.to_csv(
-            Path(output_path, "hypergeometricdistributionV3.csv"),
+            Path(output_path, "hypergeometric_distribution_V2.csv"),
             index=False,
             sep="\t",
         )
diff --git a/classes/hypergeometric_distribution_class_V3.py b/classes/hypergeometric_distribution_class_V3.py
index 9654e8f..44584c6 100644
--- a/classes/hypergeometric_distribution_class_V3.py
+++ b/classes/hypergeometric_distribution_class_V3.py
@@ -134,7 +134,7 @@ def predict(
         df = df.sort_values(by="norm_score", ascending=False)
 
         df.to_csv(
-            Path(output_path, "hypergeometricdistribution.csv"),
+            Path(output_path, "hypergeometric_distribution_v3.csv"),
             index=False,
             sep="\t",
         )
diff --git a/classes/hypergeometric_distribution_class_V4.py b/classes/hypergeometric_distribution_class_V4.py
index 27c16f4..7e11186 100644
--- a/classes/hypergeometric_distribution_class_V4.py
+++ b/classes/hypergeometric_distribution_class_V4.py
@@ -135,7 +135,7 @@ def predict(
         df = df.sort_values(by="norm_score", ascending=False)
 
         df.to_csv(
-            Path(output_path, "hypergeometricdistribution.csv"),
+            Path(output_path, "hypergeometric_distribution_v4_data.csv"),
            index=False,
             sep="\t",
         )
diff --git a/main.py b/main.py
index ca7cc9b..0419d29 100644
--- a/main.py
+++ b/main.py
@@ -45,6 +45,11 @@ def main():
     graph_file_path = Path(dataset_directory_path, "graph.pickle")
     sample_size = 1000
 
+    testing_output_data_path = Path("./output/data/")
+    testing_output_image_path = Path("./output/images/")
+    testing_input_directory_path = Path("./tests/testing-dataset/")
+    testing_graph_file_path = Path(testing_input_directory_path, "graph.pickle")
+
     interactome_columns = [0, 1, 4, 5]
     interactome = read_specific_columns(interactome_path, interactome_columns, "\t")
 
@@ -81,10 +86,10 @@ def main():
 
     results = run_workflow(
         algorithm_classes,
-        dataset_directory_path,
-        graph_file_path,
-        output_data_path,
-        output_image_path,
+        testing_input_directory_path,
+        testing_graph_file_path,
+        testing_output_data_path,
+        testing_output_image_path,
         True,
         True,
     )
diff --git a/main_replicates.py b/main_replicates.py
index 0e28998..eacb356 100644
--- a/main_replicates.py
+++ b/main_replicates.py
@@ -75,22 +75,24 @@ def main():
         "HypergeometricDistributionV3": HypergeometricDistributionV3,
         "HypergeometricDistributionV4": HypergeometricDistributionV4,
     }
-
-    x = 20 #Number of replicates
+
+    x = 2  # Number of replicates
     print_graphs = False
     auc = {}
-    #index 0 is ROC, index 1 is Precision Recall
+    # index 0 is ROC, index 1 is Precision Recall
     for i in algorithm_classes.keys():
-        auc[i] = [[],[]]
+        auc[i] = [[], []]
 
-    for i in range(x): #Creates a pos/neg list each replicate then runs workflow like normal
+    for i in range(
+        x
+    ):  # Creates a pos/neg list each replicate then runs workflow like normal
         print("\n\nReplicate: " + str(i) + "\n")
-
+
         # if there is no sample dataset, uncomment the following lines. otherwise, the dataset in outputs will be used
         positive_dataset, negative_dataset = sample_data(
             go_protein_pairs, sample_size, protein_list, G, dataset_directory_path
         )
-
+
         results = run_workflow(
             algorithm_classes,
             dataset_directory_path,
@@ -101,29 +103,33 @@ def main():
             print_graphs,
         )
 
-        #each loop adds the roc and pr values, index 0 for roc and 1 for pr, for each algorithm
+        # each loop adds the roc and pr values, index 0 for roc and 1 for pr, for each algorithm
         for i in algorithm_classes.keys():
-            auc[i][0].append(results[i]['roc_auc'])
-            auc[i][1].append(results[i]['pr_auc'])
+            auc[i][0].append(results[i]["roc_auc"])
+            auc[i][1].append(results[i]["pr_auc"])
 
-    #Finds mean and sd of values, ROC mean index 0, ROC sd index 1, PR mean index 2, and PR sd index 3
+    # Finds mean and sd of values, ROC mean index 0, ROC sd index 1, PR mean index 2, and PR sd index 3
     for i in auc.keys():
-        meanROC = round(stat.mean(auc[i][0]),5)
-        auc[i].append(round(stat.mean(auc[i][1]),5))
-        auc[i].append(round(stat.stdev(auc[i][1]),5))
-        auc[i][1] = round(stat.stdev(auc[i][0]),5)
+        meanROC = round(stat.mean(auc[i][0]), 5)
+        auc[i].append(round(stat.mean(auc[i][1]), 5))
+        auc[i].append(round(stat.stdev(auc[i][1]), 5))
+        auc[i][1] = round(stat.stdev(auc[i][0]), 5)
         auc[i][0] = meanROC
 
-    #Prints the roc and pr table, then saves to .csv file
-    df = pd.DataFrame.from_dict(auc, orient = 'index', columns = ['ROC mean', 'ROC sd', 'Precision/Recall mean', 'Precision/Recall sd'])
+    # Prints the roc and pr table, then saves to .csv file
+    df = pd.DataFrame.from_dict(
+        auc,
+        orient="index",
+        columns=["ROC mean", "ROC sd", "Precision/Recall mean", "Precision/Recall sd"],
+    )
     print()
     print(df)
     df.to_csv(
-        Path(output_data_path, "auc_values.csv"),
+        Path(output_data_path, "repeated_auc_values.csv"),
         index=True,
         sep="\t",
     )
-
+
     sys.exit()
diff --git a/tools/workflow.py b/tools/workflow.py
index 7837b0b..a5ae925 100644
--- a/tools/workflow.py
+++ b/tools/workflow.py
@@ -38,7 +38,7 @@ def run_workflow(
     if threshold:
         run_thresholds(results, algorithm_classes, output_data_path)
     if figures:
-        generate_figures(algorithm_classes, results, output_image_path)
+        generate_figures(algorithm_classes, results, output_image_path, output_data_path)
 
     return results
 
@@ -135,12 +135,12 @@ def run_thresholds(results, algorithm_classes, output_data_path):
     )
 
 
-def generate_figures(algorithm_classes, results, output_image_path):
+def generate_figures(algorithm_classes, results, output_image_path, output_data_path):
     # Generate ROC and PR figures to compare methods
     colors = generate_random_colors(len(algorithm_classes))
 
-    sorted_results = sort_results_by(results, "roc_auc")
"roc_auc") + sorted_results = sort_results_by(results, "roc_auc", output_data_path) i = 0 plt.figure() for algorithm_name, metrics in sorted_results.items(): @@ -162,7 +162,7 @@ def generate_figures(algorithm_classes, results, output_image_path): plt.savefig(Path(output_image_path, "multiple_roc_curves.png")) plt.show() - sorted_results = sort_results_by(results, "pr_auc") + sorted_results = sort_results_by(results, "pr_auc", output_data_path) i = 0 plt.figure() for algorithm_name, metrics in sorted_results.items(): @@ -245,12 +245,25 @@ def get_datasets(input_directory_path): return positive_dataset, negative_dataset -def sort_results_by(results, key): +def sort_results_by(results, key, output_path): algorithm_tuple_list = [] + data = {"algorithm" : [], key: []} + output_file_path = Path(output_path, key + "_results.csv") # make a list of tuples where a tuple is (algorithm_name, the metric we will be sorting by) for algorithm_name, metrics in results.items(): algorithm_tuple_list.append((algorithm_name, metrics[key])) + data["algorithm"].append(algorithm_name) + data[key].append(metrics[key]) + + df = pd.DataFrame(data) + df = df.sort_values(by=key, ascending=False) + + df.to_csv( + output_file_path, + index=False, + sep="\t", + ) algorithm_tuple_list = sorted(algorithm_tuple_list, key=itemgetter(1), reverse=True)