From 1872c4835ca9b1e68985249ae4316acb1b5febc9 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 28 Oct 2024 13:37:32 -0500 Subject: [PATCH 01/22] ideas for parameter tuning --- Snakefile | 14 +++ config/config.yaml | 5 + parameter-tuning.py | 288 +++++++++++++++++++++++++++++++++++++++++++ spras/analysis/ml.py | 6 +- spras/evaluation.py | 8 +- 5 files changed, 319 insertions(+), 2 deletions(-) create mode 100644 parameter-tuning.py diff --git a/Snakefile b/Snakefile index 9d2debe3..65143cf3 100644 --- a/Snakefile +++ b/Snakefile @@ -381,6 +381,20 @@ rule evaluation: run: node_table = Evaluation.from_file(input.gold_standard_file).node_table Evaluation.precision(input.pathways, node_table, output.eval_file) + # add recall + # Run "PR" curves for output pathays precision and recall + # Run PR curves for ensemble files only + # Run PCA "tuning" idea + +# parameter tuning section? +# does there need to be a seperate section for parameter tuning if evaluation will deal with it +# PCA +# - only one that isn't taken care of by the evaluation code directly, but can be added as something to look at in evaluation +# no parameter tuning +# - will use the outputs that can be put into evaluation +# ensembling +# - will use the outputs that can be put into evaluation + # Remove the output directory rule clean: diff --git a/config/config.yaml b/config/config.yaml index b87bcd45..4b473050 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -174,3 +174,8 @@ analysis: metric: 'euclidean' evaluation: include: true + # update to decouple the evaluation parts? 
+ # - ensemble vs all pathways vs pca chosen pathway + # pr curves from ensemble files + # "pr" curves from all pathways + # p and r from pca chosen pathway diff --git a/parameter-tuning.py b/parameter-tuning.py new file mode 100644 index 00000000..0094e3c1 --- /dev/null +++ b/parameter-tuning.py @@ -0,0 +1,288 @@ +import glob +import os +import pickle as pkl +from pathlib import Path +from typing import Dict, Iterable + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from sklearn.metrics import ( + PrecisionRecallDisplay, + average_precision_score, + precision_recall_curve, + precision_score, + recall_score, +) + +from spras.analysis.ml import summarize_networks +from spras.evaluation import Evaluation + +# make directories +directories = ["parameter-tuning","parameter-tuning/ensembling-parameter-tuning", "parameter-tuning/no-parameter-tuning", "parameter-tuning/pca-parameter-tuning"] + +for directory in directories: + if not os.path.exists(directory): + os.makedirs(directory) + print(f"Directory {directory} was created.") + else: + print(f"Directory {directory} already exists.") + + +# ################################################################################################################################################# +# Parameter Tuning with Ensemble networks + +def select_max_freq_and_node(row): + max_freq = 0 + node = "" + if pd.isna(row['Node2']) and pd.isna(row['Freq2']): + max_freq = row['Freq1'] + node = row['Node1'] + elif pd.isna(row['Node1']) and pd.isna(row['Freq1']): + max_freq = row['Freq2'] + node = row['Node2'] + else: + max_freq = max(row['Freq1'], row['Freq2']) + node = row['Node1'] + return node, max_freq + +def precision_recall(file, node_table, node_freq_filename, output_file): + gold_standard_nodes = set(node_table['NODEID']) + + df = pd.read_table(file, sep="\t", header=0) + + node1_freq = df.drop(columns = ['Node2', 'Direction']) + node2_freq = df.drop(columns = ['Node1', 'Direction']) + max_node1_freq 
= node1_freq.groupby(['Node1']).max().reset_index() + max_node1_freq.rename(columns = {'Frequency': 'Freq1'}, inplace = True) + max_node2_freq = node2_freq.groupby(['Node2']).max().reset_index() + max_node2_freq.rename(columns = {'Frequency': 'Freq2'}, inplace = True) + node_df_merged = max_node1_freq.merge(max_node2_freq, left_on='Node1', right_on='Node2', how='outer') + node_df_merged[['Node', 'max_freq']] = node_df_merged.apply(select_max_freq_and_node, axis=1, result_type='expand') + node_df_merged.drop(columns = ['Node1', 'Node2', 'Freq1', 'Freq2'], inplace = True) + + node_df_merged.sort_values('max_freq', ascending= False, inplace = True) + node_df_merged.to_csv(node_freq_filename, sep = "\t",header=True, index=False) + + y_true = [1 if node in gold_standard_nodes else 0 for node in node_df_merged['Node']] + y_scores = node_df_merged['max_freq'].tolist() + + # print(f"y_true:\n{y_true}") + # print(f"y_score:\n{y_scores}") + + plt.figure() + precision, recall, thresholds = precision_recall_curve(y_true, y_scores) + # print(f"precision:{precision}\n recall:{recall}\n thresholds:{thresholds}\n") + auc_precision_recall = average_precision_score(y_true, y_scores) + + plt.plot(recall, precision, marker='o', label='Precision-Recall curve') + plt.axhline(y=auc_precision_recall, color='r', linestyle='--', label=f'Avg Precision: {auc_precision_recall:.4f}') + plt.xlabel('Recall') + plt.ylabel('Precision') + plt.title('Precision-Recall Curve') + plt.legend() + plt.grid(True) + plt.savefig(output_filename) + + # print(f"overlapping nodes: {len(set(node_df_merged['Node'].tolist()) & gold_standard_nodes)}") + # print(f"average_precision_score: {auc_precision_recall}") + +# TODO: fix mincostflow bug with summarize networks +algorithms = ['mincostflow', 'omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'allpairs', 'domino'] + +gold_standard_file = "output/gs_egfr-merged.pickle" +node_table = Evaluation.from_file(gold_standard_file).node_table +new_folder_path = 
'parameter-tuning/ensembling-parameter-tuning/' + +for algo in algorithms: + ensemble_filename = f"output/tps_egfr-ml/{algo}-ensemble-pathway.txt" + node_freq_filename = f"{new_folder_path}{algo}-frequencies.txt" + output_filename = f"{new_folder_path}{algo}-pr.png" + try: + precision_recall(ensemble_filename, node_table, node_freq_filename, output_filename) + except Exception as error: + print(error) + +# code to work for MEO +algorithms = ['meo'] + +for algo in algorithms: + ensemble_filename = f"output/tps_egfr-ml/{algo}-ensemble-pathway.txt" + df = pd.read_table(ensemble_filename, sep="\t", header=0) + df['Node1'] = df['Node1'] + '_HUMAN' + df['Node2'] = df['Node2'] + '_HUMAN' + df['Node1'] = df['Node1'].replace({ + 'Ca++_HUMAN': 'Ca++_PSEUDONODE', + 'PI3,4,5P3_HUMAN': 'PI3,4,5P3_PSEUDONODE', + 'DAG_HUMAN': 'DAG_PSEUDONODE' + }) + df['Node2'] = df['Node2'].replace({ + 'Ca++_HUMAN': 'Ca++_PSEUDONODE', + 'PI3,4,5P3_HUMAN': 'PI3,4,5P3_PSEUDONODE', + 'DAG_HUMAN': 'DAG_PSEUDONODE' + }) + + updated_ensemble_filename = f"{new_folder_path}meo-ensemble-pathway-updated.txt" + df.to_csv(updated_ensemble_filename, sep="\t", header=True, index=False) + node_freq_filename = f"{new_folder_path}{algo}-frequencies.txt" + output_filename = f"{new_folder_path}{algo}-pr.png" + try: + precision_recall(updated_ensemble_filename, node_table, node_freq_filename, output_filename) + except Exception as error: + print(error) + + +################################################################################################################################################# +# No Parameter Tuning + +def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str): + """ + Takes in file paths for a specific dataset and an associated gold standard node table. 
+ Calculates recall for each pathway file + Returns output back to output_file + @param file_paths: file paths of pathway reconstruction algorithm outputs + @param node_table: the gold standard nodes + @param output_file: the filename to save the precision of each pathway + """ + y_true = set(node_table['NODEID']) + results = [] + + for file in file_paths: + df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"]) + y_pred = set(df['Node1']).union(set(df['Node2'])) + all_nodes = y_true.union(y_pred) + y_true_binary = [1 if node in y_true else 0 for node in all_nodes] + y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes] + + # default to 0.0 if there is a divide by 0 error + precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0) + recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0) + results.append({"Pathway": file, "Precision": precision, "Recall": recall}) + + pr_df = pd.DataFrame(results) + pr_df.sort_values(by=["Recall"], axis=0, ascending=True, inplace=True) + pr_df.to_csv(output_file, sep="\t", index=False) + return pr_df + + +algorithms = ['mincostflow', 'omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'allpairs', 'domino'] + +gold_standard_file = "output/gs_egfr-merged.pickle" +node_table = Evaluation.from_file(gold_standard_file).node_table +folder_path = 'output/' +new_folder_path = 'parameter-tuning/no-parameter-tuning/' + +for algo in algorithms: + file_pattern = os.path.join(folder_path, f"tps_egfr-{algo}-*", "pathway.txt") + files = glob.glob(file_pattern) + output_file = f"{new_folder_path}{algo}-precision-and-recall.txt" + prcurve_filename = f"{new_folder_path}{algo}-precision-and-recall-curve.png" + + pr_df = precision_and_recall(file_paths=files, node_table=node_table, output_file=output_file) + + plt.figure(figsize=(8, 6)) + plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="PR") + plt.xlabel("Recall") + plt.ylabel("Precision") + 
plt.title(f"{algo} Precision-Recall Curve") + plt.legend() + plt.grid(True) + plt.savefig(prcurve_filename) + + +# code to work for MEO +def precision_and_recall_meo(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str): + """ + Takes in file paths for a specific dataset and an associated gold standard node table. + Calculates recall for each pathway file + Returns output back to output_file + @param file_paths: file paths of pathway reconstruction algorithm outputs + @param node_table: the gold standard nodes + @param output_file: the filename to save the precision of each pathway + """ + y_true = set(node_table['NODEID']) + results = [] + + for file in file_paths: + df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"]) + df['Node1'] = df['Node1'] + '_HUMAN' + df['Node2'] = df['Node2'] + '_HUMAN' + df['Node1'] = df['Node1'].replace({ + 'Ca++_HUMAN': 'Ca++_PSEUDONODE', + 'PI3,4,5P3_HUMAN': 'PI3,4,5P3_PSEUDONODE', + 'DAG_HUMAN': 'DAG_PSEUDONODE' + }) + df['Node2'] = df['Node2'].replace({ + 'Ca++_HUMAN': 'Ca++_PSEUDONODE', + 'PI3,4,5P3_HUMAN': 'PI3,4,5P3_PSEUDONODE', + 'DAG_HUMAN': 'DAG_PSEUDONODE' + }) + + y_pred = set(df['Node1']).union(set(df['Node2'])) + all_nodes = y_true.union(y_pred) + y_true_binary = [1 if node in y_true else 0 for node in all_nodes] + y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes] + + # default to 0.0 if there is a divide by 0 error + precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0) + recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0) + results.append({"Pathway": file, "Precision": precision, "Recall": recall}) + + pr_df = pd.DataFrame(results) + pr_df.sort_values(by=["Recall"], axis=0, ascending=True, inplace=True) + pr_df.to_csv(output_file, sep="\t", index=False) + return pr_df + +algorithms = ['meo'] + +for algo in algorithms: + + file_pattern = os.path.join(folder_path, f"tps_egfr-{algo}-*", "pathway.txt") + files = 
glob.glob(file_pattern) + output_file = f"{new_folder_path}{algo}-precision-and-recall.txt" + prcurve_filename = f"{new_folder_path}{algo}-precision-and-recall-curve.png" + + pr_df = precision_and_recall_meo(file_paths=files, node_table=node_table, output_file=output_file) + + plt.figure(figsize=(8, 6)) + plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="PR") + plt.xlabel("Recall") + plt.ylabel("Precision") + plt.title(f"{algo} Precision-Recall Curve") + plt.legend() + plt.grid(True) + plt.savefig(prcurve_filename) + +################################################################################################################################################# +# PCA parameter tuning + +algorithms = ['omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'domino', 'meo', 'allpairs'] +folder_path = 'output/' +gold_standard_file = "output/gs_egfr-merged.pickle" +node_table = Evaluation.from_file(gold_standard_file).node_table +new_folder_path = 'parameter-tuning/pca-parameter-tuning/' + +for algo in algorithms: + file_path = os.path.join(folder_path, f"tps_egfr-ml", f"{algo}-pca-coordinates.txt") + try: + coord_df = pd.read_csv(file_path, delimiter="\t", header=0) + except Exception as error: + print(f"PCA parameter tuning: {error}") + continue + + # centroid + centroid_row = coord_df[coord_df['algorithm'] == 'centroid'] + centroid = centroid_row.iloc[0, 1:].tolist() + + # update df to exclude centroid point + coord_df = coord_df[coord_df['algorithm'] != 'centroid'] + + # euclidean distance + pc_columns = [col for col in coord_df.columns if col.startswith('PC')] + coord_df['Distance To Centroid'] = np.sqrt(sum((coord_df[pc] - centroid[i]) ** 2 for i, pc in enumerate(pc_columns))) + closest_to_centroid = coord_df.sort_values(by='Distance To Centroid').iloc[0] + + # finding the rep pathway + rep_pathway = [os.path.join(folder_path, f"{closest_to_centroid['algorithm']}", "pathway.txt")] + output_file = 
f"{new_folder_path}{algo}-precision-and-recall.txt" + precision_and_recall(rep_pathway, node_table, output_file) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 3dad8775..9fdc6e03 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -146,6 +146,8 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: scaler.fit(X) # calc mean and standard deviation X_scaled = scaler.transform(X) + # TODO: add in centroid code from other branch + # choosing the PCA pca_instance = PCA(n_components=components) pca_instance.fit(X_scaled) @@ -163,9 +165,11 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: # saving the coordinates of each algorithm make_required_dirs(output_coord) coordinates_df = pd.DataFrame(X_pca, columns=['PC' + str(i) for i in range(1, components+1)]) - coordinates_df.insert(0, 'algorithm', columns.tolist()) + coordinates_df.insert(0, 'algorithm', columns.tolist()) # update the algortihms to somehting else (datapoints labels?) coordinates_df.to_csv(output_coord, sep='\t', index=False) + # TODO: do we want a seperate file for the centroid, or add it to the end of the coordinates_df df as a new datapoint + # saving the principal components make_required_dirs(output_var) with open(output_var, "w") as f: diff --git a/spras/evaluation.py b/spras/evaluation.py index 5d00e7d4..344e06a5 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -71,7 +71,7 @@ def load_files_from_dict(self, gold_standard_dict: Dict): # TODO: later iteration - chose between node and edge file, or allow both - @staticmethod + @staticmethod # TODO update to do precision and recall in the same function for the nodes def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str): """ Takes in file paths for a specific dataset and an associated gold standard node table. 
@@ -98,3 +98,9 @@ def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: precision_df = pd.DataFrame(results) precision_df.to_csv(output_file, sep="\t", index=False) + + # TODO make PR curves for the nodes from ensembled files outputs + # TODO make the edge frequency node ensembles + + # TODO PCA chosen pathway, will need to use precision and recall code for the nodes of the chosen pathway + \ No newline at end of file From 7e0a990d98ec77c3be8b28f140f816a17154f71e Mon Sep 17 00:00:00 2001 From: ntalluri Date: Tue, 29 Oct 2024 10:59:17 -0500 Subject: [PATCH 02/22] update to ml code for centroid, update to eval code to have recall --- Snakefile | 5 +++-- spras/analysis/ml.py | 20 ++++++++++++++++---- spras/evaluation.py | 23 +++++++++++++++++------ 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/Snakefile b/Snakefile index 65143cf3..cc39780b 100644 --- a/Snakefile +++ b/Snakefile @@ -373,18 +373,19 @@ def get_dataset_label(wildcards): return dataset # Run evaluation code for a specific dataset's pathway outputs against its paired gold standard -rule evaluation: +rule evaluation: # update to be per algorithm and for all algortihms input: gold_standard_file = get_gold_standard_pickle_file, pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label), output: eval_file = SEP.join([out_dir, "{dataset_gold_standard_pairs}-evaluation.txt"]) run: node_table = Evaluation.from_file(input.gold_standard_file).node_table - Evaluation.precision(input.pathways, node_table, output.eval_file) + Evaluation.precision_and_recall(input.pathways, node_table, output.eval_file) # add recall # Run "PR" curves for output pathays precision and recall # Run PR curves for ensemble files only # Run PCA "tuning" idea + # - will either need to read file from ml_analysis or rerun pca rule # parameter tuning section? 
# does there need to be a seperate section for parameter tuning if evaluation will deal with it diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 9fdc6e03..1585db89 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -146,7 +146,6 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: scaler.fit(X) # calc mean and standard deviation X_scaled = scaler.transform(X) - # TODO: add in centroid code from other branch # choosing the PCA pca_instance = PCA(n_components=components) @@ -154,21 +153,34 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: X_pca = pca_instance.transform(X_scaled) variance = pca_instance.explained_variance_ratio_ * 100 + # TODO: add in centroid code from other branch + # calculating the centroid + centroid = np.mean(X_pca, axis=0) # mean of each principal component across all samples + # making the plot label_color_map = create_palette(column_names) plt.figure(figsize=(10, 7)) - sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], s=70, hue=column_names, legend=True, palette=label_color_map) + sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], s=70, hue=column_names, palette=label_color_map) + plt.scatter(centroid[0], centroid[1], color='red', marker='X', s=100, label='Centroid') plt.title("PCA") + plt.legend() plt.xlabel(f"PC1 ({variance[0]:.1f}% variance)") plt.ylabel(f"PC2 ({variance[1]:.1f}% variance)") + # saving the coordinates of each algorithm + # make_required_dirs(output_coord) + # coordinates_df = pd.DataFrame(X_pca, columns=['PC' + str(i) for i in range(1, components+1)]) + # coordinates_df.insert(0, 'algorithm', columns.tolist()) + # coordinates_df.to_csv(output_coord, sep='\t', index=False) + # saving the coordinates of each algorithm make_required_dirs(output_coord) coordinates_df = pd.DataFrame(X_pca, columns=['PC' + str(i) for i in range(1, components+1)]) - coordinates_df.insert(0, 'algorithm', columns.tolist()) # update the algortihms to somehting else 
(datapoints labels?) + coordinates_df.insert(0, 'datapoint_labels', columns.tolist()) + centroid_row = ['centroid'] + centroid.tolist() # TODO: do we want a seperate file for the centroid, or add it to the end of the coordinates_df df as a new datapoint + coordinates_df.loc[len(coordinates_df)] = centroid_row coordinates_df.to_csv(output_coord, sep='\t', index=False) - # TODO: do we want a seperate file for the centroid, or add it to the end of the coordinates_df df as a new datapoint # saving the principal components make_required_dirs(output_var) diff --git a/spras/evaluation.py b/spras/evaluation.py index 344e06a5..1330ee2b 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -4,7 +4,7 @@ from typing import Dict, Iterable import pandas as pd -from sklearn.metrics import precision_score +from sklearn.metrics import precision_score, recall_score class Evaluation: @@ -71,11 +71,11 @@ def load_files_from_dict(self, gold_standard_dict: Dict): # TODO: later iteration - chose between node and edge file, or allow both - @staticmethod # TODO update to do precision and recall in the same function for the nodes - def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str): + @staticmethod + def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str): """ Takes in file paths for a specific dataset and an associated gold standard node table. 
- Calculates precision for each pathway file + Calculates precision and recall for each pathway file Returns output back to output_file @param file_paths: file paths of pathway reconstruction algorithm outputs @param node_table: the gold standard nodes @@ -93,14 +93,25 @@ def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: # default to 0.0 if there is a divide by 0 error precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0) - - results.append({"Pathway": file, "Precision": precision}) + recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0) + results.append({"Pathway": file, "Precision": precision, "Recall": recall}) precision_df = pd.DataFrame(results) precision_df.to_csv(output_file, sep="\t", index=False) + # TODO make "PR" curves from the precision_and_recall file + + def edge_frequency_nodes(ensemble_file: str, node_table:pd.DataFrame, output_file: str, output_png: str): + None + # create one per ensemble file + + def pr_curves (): + None + # TODO make PR curves for the nodes from ensembled files outputs # TODO make the edge frequency node ensembles + def pca_chosen_pathway(): + None # TODO PCA chosen pathway, will need to use precision and recall code for the nodes of the chosen pathway \ No newline at end of file From d5d0461598eb4c2d2f7e2fe0a9b27227794cea67 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 1 Nov 2024 12:52:05 -0500 Subject: [PATCH 03/22] integrated all the code --- Snakefile | 103 +- config/egfr-param-tuning.yaml | 3459 +++++++++++++++++++++++++++++++++ input/gs-egfr.txt | 324 +++ spras/evaluation.py | 108 +- 4 files changed, 3959 insertions(+), 35 deletions(-) create mode 100644 config/egfr-param-tuning.yaml create mode 100644 input/gs-egfr.txt diff --git a/Snakefile b/Snakefile index cc39780b..bfd09d35 100644 --- a/Snakefile +++ b/Snakefile @@ -42,7 +42,6 @@ def algo_has_mult_param_combos(algo): return len(algorithm_params.get(algo, {})) > 1 algorithms_mult_param_combos 
= [algo for algo in algorithms if algo_has_mult_param_combos(algo)] - # Get the parameter dictionary for the specified # algorithm and parameter combination hash def reconstruction_params(algorithm, params_hash): @@ -105,8 +104,18 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms)) if _config.config.analysis_include_evaluation: - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) - + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curves-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-plot.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-pca-chosen.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-pca-chosen.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) + + 
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-plot.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-pca-chosen.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-pca-chosen.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-pr-curves-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms)) + if len(final_input) == 0: # No analysis added yet, so add reconstruction output files if they exist. # (if analysis is specified, these should be implicitly run). 
@@ -372,29 +381,81 @@ def get_dataset_label(wildcards): dataset = parts[0] return dataset -# Run evaluation code for a specific dataset's pathway outputs against its paired gold standard -rule evaluation: # update to be per algorithm and for all algortihms + +# Run evaluation for all pathway outputs and ensemble.txt for a dataset against its paired gold standard +rule evaluation: input: gold_standard_file = get_gold_standard_pickle_file, pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label), - output: eval_file = SEP.join([out_dir, "{dataset_gold_standard_pairs}-evaluation.txt"]) + ensemble_file=lambda wildcards: f"{out_dir}{SEP}{get_dataset_label(wildcards)}-ml{SEP}ensemble-pathway.txt", + # add PCA coordinates file + pca_coordinates_file =lambda wildcards: f"{out_dir}{SEP}{get_dataset_label(wildcards)}-ml{SEP}pca-coordinates.txt" + output: + pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall.txt"]), + pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'pr-curves-ensemble-nodes.png']), + pr_plot_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-and-recall-plot.png']), + # add pca png and file that is needed by Evaluation.precision_and_recall + pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall-pca-chosen.txt"]), + pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall-pca-chosen.png"]), + run: + node_table = Evaluation.from_file(input.gold_standard_file).node_table + Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_plot_png) + node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file, node_table) + Evaluation.pr_curves_ensemble_nodes(node_ensemble, node_table, output.pr_curve_png) + pca_chosen_pathway = 
Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir) + Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file, output.pca_chosen_pr_png) + + + +# Run evaluation per algortihm for all associated pathway outputs and ensemble.txt for a dataset against its paired gold standard + +def collect_pathways_per_algo_per_dataset(wildcards): + dataset_label = get_dataset_label(wildcards) + filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param] + return expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params, dataset_label= dataset_label) + +def collect_ensemble_per_algo_per_dataset(wildcards): + dataset_label = get_dataset_label(wildcards) + print(dataset_label) + return f"{out_dir}{SEP}{dataset_label}-ml{SEP}{wildcards.algorithm}-ensemble-pathway.txt" + +def collect_pca_coordinates_per_algo_per_dataset(wildcards): + dataset_label = get_dataset_label(wildcards) + return f"{out_dir}{SEP}{dataset_label}-ml{SEP}{wildcards.algorithm}-pca-coordinates.txt" + +rule evaluation_per_algo_pathways: + input: + gold_standard_file = get_gold_standard_pickle_file, + pathways = collect_pathways_per_algo_per_dataset, + output: + pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-and-recall.txt"]), # these all need to be updated to use the algortihm in it + pr_plot_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-and-recall-plot.png']), + run: + node_table = Evaluation.from_file(input.gold_standard_file).node_table + Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_plot_png) + +rule evaluation_per_algo_ensemble_pr_curve: + input: + gold_standard_file = get_gold_standard_pickle_file, + ensemble_file = collect_ensemble_per_algo_per_dataset, + output: + pr_curve_png = SEP.join([out_dir, 
'{dataset_gold_standard_pairs}-eval', '{algorithm}-pr-curves-ensemble-nodes.png']), + run: + node_table = Evaluation.from_file(input.gold_standard_file).node_table + node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file, node_table) + Evaluation.pr_curves_ensemble_nodes(node_ensemble, node_table, output.pr_curve_png) + +rule evaluation_per_algo_pca_chosen: + input: + gold_standard_file = get_gold_standard_pickle_file, + pca_coordinates_file = collect_pca_coordinates_per_algo_per_dataset + output: + pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-and-recall-pca-chosen.txt"]), + pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-and-recall-pca-chosen.png"]), run: node_table = Evaluation.from_file(input.gold_standard_file).node_table - Evaluation.precision_and_recall(input.pathways, node_table, output.eval_file) - # add recall - # Run "PR" curves for output pathays precision and recall - # Run PR curves for ensemble files only - # Run PCA "tuning" idea - # - will either need to read file from ml_analysis or rerun pca rule - -# parameter tuning section? 
-# does there need to be a seperate section for parameter tuning if evaluation will deal with it -# PCA -# - only one that isn't taken care of by the evaluation code directly, but can be added as something to look at in evaluation -# no parameter tuning -# - will use the outputs that can be put into evaluation -# ensembling -# - will use the outputs that can be put into evaluation + pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir) + Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file, output.pca_chosen_pr_png) # Remove the output directory diff --git a/config/egfr-param-tuning.yaml b/config/egfr-param-tuning.yaml new file mode 100644 index 00000000..c219a471 --- /dev/null +++ b/config/egfr-param-tuning.yaml @@ -0,0 +1,3459 @@ +hash_length: 7 +container_framework: docker +unpack_singularity: false +container_registry: + base_url: docker.io + owner: reedcompbio +algorithms: + - name: omicsintegrator2 + params: + include: true + run1: + b: + - 1 + g: + - 3 + w: + - 5 + run2: + b: + - 10 + g: + - 7 + w: + - 6 + run3: + b: + - 1 + g: + - 5 + w: + - 8 + run4: + b: + - 9 + g: + - 7 + w: + - 8 + run5: + b: + - 7 + g: + - 7 + w: + - 10 + run6: + b: + - 6 + g: + - 7 + w: + - 4 + run7: + b: + - 9 + g: + - 7 + w: + - 3 + run8: + b: + - 1 + g: + - 6 + w: + - 2 + run9: + b: + - 9 + g: + - 7 + w: + - 4 + run10: + b: + - 3 + g: + - 6 + w: + - 3 + run11: + b: + - 4 + g: + - 6 + w: + - 2 + run12: + b: + - 6 + g: + - 7 + w: + - 6 + run13: + b: + - 5 + g: + - 7 + w: + - 5 + run14: + b: + - 7 + g: + - 6 + w: + - 1 + run15: + b: + - 1 + g: + - 4 + w: + - 8 + run16: + b: + - 1 + g: + - 4 + w: + - 1 + run17: + b: + - 1 + g: + - 3 + w: + - 3 + run18: + b: + - 2 + g: + - 6 + w: + - 2 + run19: + b: + - 1 + g: + - 6 + w: + - 7 + run20: + b: + - 1 + g: + - 4 + w: + - 4 + run21: + b: + - 8 + g: + - 7 + w: + - 9 + run22: + b: + - 1 + g: + - 2 + w: + - 4 + run23: + b: + - 1 + g: + - 6 + w: + - 3 + run24: + b: + - 6 + g: 
+ - 7 + w: + - 9 + run25: + b: + - 10 + g: + - 7 + w: + - 8 + run26: + b: + - 5 + g: + - 5 + w: + - 1 + run27: + b: + - 9 + g: + - 7 + w: + - 7 + run28: + b: + - 7 + g: + - 7 + w: + - 4 + run29: + b: + - 1 + g: + - 3 + w: + - 9 + run30: + b: + - 8 + g: + - 7 + w: + - 4 + run31: + b: + - 10 + g: + - 7 + w: + - 5 + run32: + b: + - 7 + g: + - 7 + w: + - 9 + run33: + b: + - 4 + g: + - 6 + w: + - 1 + run34: + b: + - 9 + g: + - 6 + w: + - 2 + run35: + b: + - 8 + g: + - 6 + w: + - 2 + run36: + b: + - 8 + g: + - 7 + w: + - 10 + run37: + b: + - 7 + g: + - 7 + w: + - 8 + run38: + b: + - 2 + g: + - 6 + w: + - 10 + run39: + b: + - 6 + g: + - 7 + w: + - 10 + run40: + b: + - 1 + g: + - 5 + w: + - 4 + run41: + b: + - 8 + g: + - 7 + w: + - 5 + run42: + b: + - 1 + g: + - 3 + w: + - 1 + run43: + b: + - 1 + g: + - 5 + w: + - 1 + run44: + b: + - 3 + g: + - 6 + w: + - 2 + run45: + b: + - 3 + g: + - 6 + w: + - 4 + run46: + b: + - 6 + g: + - 7 + w: + - 7 + run47: + b: + - 6 + g: + - 5 + w: + - 1 + run48: + b: + - 10 + g: + - 5 + w: + - 1 + run49: + b: + - 1 + g: + - 2 + w: + - 2 + run50: + b: + - 2 + g: + - 6 + w: + - 6 + run51: + b: + - 10 + g: + - 7 + w: + - 10 + run52: + b: + - 10 + g: + - 7 + w: + - 9 + run53: + b: + - 8 + g: + - 5 + w: + - 1 + run54: + b: + - 6 + g: + - 6 + w: + - 2 + run55: + b: + - 6 + g: + - 7 + w: + - 5 + run56: + b: + - 2 + g: + - 5 + w: + - 1 + run57: + b: + - 2 + g: + - 6 + w: + - 5 + run58: + b: + - 9 + g: + - 7 + w: + - 10 + run59: + b: + - 7 + g: + - 7 + w: + - 6 + run60: + b: + - 5 + g: + - 6 + w: + - 1 + run61: + b: + - 4 + g: + - 5 + w: + - 1 + run62: + b: + - 8 + g: + - 7 + w: + - 8 + run63: + b: + - 10 + g: + - 6 + w: + - 2 + run64: + b: + - 4 + g: + - 6 + w: + - 3 + run65: + b: + - 7 + g: + - 6 + w: + - 2 + run66: + b: + - 2 + g: + - 6 + w: + - 3 + run67: + b: + - 2 + g: + - 6 + w: + - 1 + run68: + b: + - 5 + g: + - 6 + w: + - 2 + run69: + b: + - 8 + g: + - 7 + w: + - 6 + run70: + b: + - 10 + g: + - 7 + w: + - 7 + run71: + b: + - 1 + g: + - 5 + w: + 
- 6 + run72: + b: + - 1 + g: + - 5 + w: + - 7 + run73: + b: + - 2 + g: + - 6 + w: + - 4 + - name: domino + params: + include: true + run1: + module_threshold: + - 0.001 + slice_threshold: + - 0.1 + run2: + module_threshold: + - 0.001 + slice_threshold: + - 0.001 + run3: + module_threshold: + - 0.02 + slice_threshold: + - 0.1 + run4: + module_threshold: + - 0.01 + slice_threshold: + - 0.001 + run5: + module_threshold: + - 0.01 + slice_threshold: + - 0.1 + run6: + module_threshold: + - 0.02 + slice_threshold: + - 0.001 + run7: + module_threshold: + - 0.001 + slice_threshold: + - 0.9 + run8: + module_threshold: + - 0.001 + slice_threshold: + - 0.3 + run9: + module_threshold: + - 0.001 + slice_threshold: + - 1 + - name: mincostflow + params: + include: false + run1: + capacity: + - 15 + flow: + - 80 + run2: + capacity: + - 1 + flow: + - 6 + run3: + capacity: + - 5 + flow: + - 60 + run4: + capacity: + - 1 + flow: + - 8 + run5: + capacity: + - 5 + flow: + - 50 + run6: + capacity: + - 10 + flow: + - 150 + run7: + capacity: + - 1 + flow: + - 20 + run8: + capacity: + - 5 + flow: + - 150 + run9: + capacity: + - 5 + flow: + - 90 + run10: + capacity: + - 5 + flow: + - 70 + - name: pathlinker + params: + include: true + run1: + k: + - 200 + run2: + k: + - 10 + run3: + k: + - 50 + run4: + k: + - 30 + run5: + k: + - 40 + run6: + k: + - 500 + run7: + k: + - 20 + run8: + k: + - 60 + run9: + k: + - 100 + - name: allpairs + params: + include: true + - name: meo + params: + include: true + run1: + local_search: + - 'No' + max_path_length: + - 2 + rand_restarts: + - 10 + - name: omicsintegrator1 + params: + include: true + run1: + b: + - 5 + d: + - 20 + g: + - 0.0001 + mu: + - 0.03 + r: + - 1 + w: + - 0.5 + run2: + b: + - 2 + d: + - 20 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.01 + w: + - 8 + run3: + b: + - 5 + d: + - 30 + g: + - 0.0001 + mu: + - 0.03 + r: + - 1 + w: + - 0.1 + run4: + b: + - 2 + d: + - 20 + g: + - 0.0001 + mu: + - 0.02 + r: + - 0.1 + w: + - 0.1 + run5: + b: + - 2 + d: + 
- 20 + g: + - 0.0001 + mu: + - 0.001 + r: + - 0.1 + w: + - 0.001 + run6: + b: + - 2 + d: + - 30 + g: + - 0.001 + mu: + - 0.03 + r: + - 1 + w: + - 0.5 + run7: + b: + - 5 + d: + - 30 + g: + - 0.0001 + mu: + - 0.03 + r: + - 1 + w: + - 0.001 + run8: + b: + - 5 + d: + - 40 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.1 + w: + - 0.1 + run9: + b: + - 0.55 + d: + - 10 + g: + - 0.0001 + mu: + - 0.02 + r: + - 1 + w: + - 0.5 + run10: + b: + - 0.55 + d: + - 20 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.01 + w: + - 8 + run11: + b: + - 10 + d: + - 40 + g: + - 0.001 + mu: + - 0.005 + r: + - 0.1 + w: + - 0.5 + run12: + b: + - 0.55 + d: + - 10 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.1 + w: + - 8 + run13: + b: + - 2 + d: + - 10 + g: + - 0.0001 + mu: + - 0.02 + r: + - 0.01 + w: + - 0.5 + run14: + b: + - 5 + d: + - 40 + g: + - 0.0001 + mu: + - 0.008 + r: + - 0.01 + w: + - 0.5 + run15: + b: + - 10 + d: + - 40 + g: + - 0.0001 + mu: + - 0.001 + r: + - 0.01 + w: + - 0.1 + run16: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.008 + r: + - 0.1 + w: + - 0.1 + run17: + b: + - 5 + d: + - 10 + g: + - 0.0001 + mu: + - 0.008 + r: + - 0.01 + w: + - 0.1 + run18: + b: + - 2 + d: + - 20 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.01 + w: + - 0.5 + run19: + b: + - 5 + d: + - 30 + g: + - 0.001 + mu: + - 0.02 + r: + - 1 + w: + - 0.001 + run20: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.001 + r: + - 0.01 + w: + - 0.5 + run21: + b: + - 2 + d: + - 30 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.01 + w: + - 8 + run22: + b: + - 5 + d: + - 30 + g: + - 0.001 + mu: + - 0.03 + r: + - 1 + w: + - 0.5 + run23: + b: + - 2 + d: + - 20 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.1 + w: + - 0.5 + run24: + b: + - 0.55 + d: + - 40 + g: + - 0.0001 + mu: + - 0.02 + r: + - 0.1 + w: + - 8 + run25: + b: + - 10 + d: + - 20 + g: + - 0.001 + mu: + - 0.005 + r: + - 0.1 + w: + - 0.5 + run26: + b: + - 0.55 + d: + - 40 + g: + - 0.0001 + mu: + - 0.02 + r: + - 0.1 + w: + - 0.5 + run27: + b: + - 10 + d: + - 30 + g: + - 0.001 + 
mu: + - 0.005 + r: + - 0.1 + w: + - 0.1 + run28: + b: + - 5 + d: + - 20 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.1 + w: + - 0.5 + run29: + b: + - 5 + d: + - 30 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.01 + w: + - 0.1 + run30: + b: + - 0.55 + d: + - 20 + g: + - 0.001 + mu: + - 0.005 + r: + - 0.01 + w: + - 0.1 + run31: + b: + - 0.55 + d: + - 10 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.1 + w: + - 0.5 + run32: + b: + - 0.55 + d: + - 20 + g: + - 0.001 + mu: + - 0.008 + r: + - 0.01 + w: + - 0.5 + run33: + b: + - 0.55 + d: + - 30 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.1 + w: + - 2 + run34: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.1 + w: + - 0.5 + run35: + b: + - 10 + d: + - 10 + g: + - 0.0001 + mu: + - 0.008 + r: + - 0.01 + w: + - 0.1 + run36: + b: + - 0.55 + d: + - 10 + g: + - 0.0001 + mu: + - 0.03 + r: + - 1 + w: + - 2 + run37: + b: + - 0.55 + d: + - 40 + g: + - 0.0001 + mu: + - 0.03 + r: + - 1 + w: + - 0.1 + run38: + b: + - 0.55 + d: + - 10 + g: + - 0.0001 + mu: + - 0.02 + r: + - 0.01 + w: + - 0.5 + run39: + b: + - 5 + d: + - 10 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.1 + w: + - 0.5 + run40: + b: + - 0.55 + d: + - 20 + g: + - 0.0001 + mu: + - 0.001 + r: + - 0.1 + w: + - 0.5 + run41: + b: + - 2 + d: + - 10 + g: + - 0.001 + mu: + - 0.008 + r: + - 0.01 + w: + - 0.1 + run42: + b: + - 2 + d: + - 40 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.01 + w: + - 0.5 + run43: + b: + - 0.55 + d: + - 10 + g: + - 0.001 + mu: + - 0.02 + r: + - 1 + w: + - 8 + run44: + b: + - 10 + d: + - 30 + g: + - 0.0001 + mu: + - 0.02 + r: + - 0.1 + w: + - 0.1 + run45: + b: + - 0.55 + d: + - 20 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.01 + w: + - 8 + run46: + b: + - 5 + d: + - 40 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.1 + w: + - 0.1 + run47: + b: + - 0.55 + d: + - 40 + g: + - 0.001 + mu: + - 0.008 + r: + - 0.01 + w: + - 8 + run48: + b: + - 0.55 + d: + - 20 + g: + - 0.001 + mu: + - 0.02 + r: + - 1 + w: + - 0.1 + run49: + b: + - 5 + d: + - 40 + g: + - 0.001 + mu: + - 0.02 
+ r: + - 0.01 + w: + - 0.5 + run50: + b: + - 0.55 + d: + - 10 + g: + - 0.001 + mu: + - 0.008 + r: + - 0.1 + w: + - 0.5 + run51: + b: + - 0.55 + d: + - 10 + g: + - 0.0001 + mu: + - 0.02 + r: + - 0.01 + w: + - 2 + run52: + b: + - 5 + d: + - 20 + g: + - 0.001 + mu: + - 0.005 + r: + - 0.1 + w: + - 0.1 + run53: + b: + - 0.55 + d: + - 10 + g: + - 0.001 + mu: + - 0.005 + r: + - 0.01 + w: + - 2 + run54: + b: + - 0.55 + d: + - 20 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.1 + w: + - 2 + run55: + b: + - 2 + d: + - 20 + g: + - 0.001 + mu: + - 0.005 + r: + - 0.01 + w: + - 0.5 + run56: + b: + - 10 + d: + - 40 + g: + - 0.001 + mu: + - 0.001 + r: + - 0.1 + w: + - 0.1 + run57: + b: + - 0.01 + d: + - 40 + g: + - 0.001 + mu: + - 0.02 + r: + - 1 + w: + - 0.1 + run58: + b: + - 2 + d: + - 40 + g: + - 0.0001 + mu: + - 0.02 + r: + - 1 + w: + - 0.001 + run59: + b: + - 2 + d: + - 40 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.1 + w: + - 0.5 + run60: + b: + - 5 + d: + - 10 + g: + - 0.001 + mu: + - 0.02 + r: + - 1 + w: + - 0.1 + run61: + b: + - 5 + d: + - 10 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.01 + w: + - 0.5 + run62: + b: + - 2 + d: + - 20 + g: + - 0.001 + mu: + - 0.008 + r: + - 0.01 + w: + - 0.5 + run63: + b: + - 2 + d: + - 20 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.1 + w: + - 2 + run64: + b: + - 0.55 + d: + - 10 + g: + - 0.0001 + mu: + - 0.008 + r: + - 0.01 + w: + - 8 + run65: + b: + - 0.55 + d: + - 30 + g: + - 0.001 + mu: + - 0.008 + r: + - 0.01 + w: + - 2 + run66: + b: + - 2 + d: + - 40 + g: + - 0.0001 + mu: + - 0.03 + r: + - 1 + w: + - 0.001 + run67: + b: + - 5 + d: + - 20 + g: + - 0.0001 + mu: + - 0.008 + r: + - 0.1 + w: + - 0.5 + run68: + b: + - 0.55 + d: + - 20 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.01 + w: + - 8 + run69: + b: + - 10 + d: + - 40 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.01 + w: + - 0.5 + run70: + b: + - 0.55 + d: + - 10 + g: + - 0.001 + mu: + - 0.005 + r: + - 0.01 + w: + - 8 + run71: + b: + - 0.55 + d: + - 20 + g: + - 0.0001 + mu: + - 0.02 + r: + - 1 + w: + 
- 2 + run72: + b: + - 2 + d: + - 40 + g: + - 0.001 + mu: + - 0.005 + r: + - 0.01 + w: + - 0.5 + run73: + b: + - 2 + d: + - 10 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.1 + w: + - 0.5 + run74: + b: + - 0.55 + d: + - 20 + g: + - 0.001 + mu: + - 0.008 + r: + - 0.1 + w: + - 8 + run75: + b: + - 10 + d: + - 20 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.1 + w: + - 0.5 + run76: + b: + - 2 + d: + - 10 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.01 + w: + - 2 + run77: + b: + - 2 + d: + - 30 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.01 + w: + - 0.5 + run78: + b: + - 0.55 + d: + - 20 + g: + - 0.001 + mu: + - 0.001 + r: + - 0.1 + w: + - 0.5 + run79: + b: + - 2 + d: + - 10 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.01 + w: + - 2 + run80: + b: + - 5 + d: + - 20 + g: + - 0.001 + mu: + - 0.02 + r: + - 1 + w: + - 0.1 + run81: + b: + - 0.55 + d: + - 40 + g: + - 0.0001 + mu: + - 0.008 + r: + - 0.1 + w: + - 0.5 + run82: + b: + - 5 + d: + - 40 + g: + - 0.001 + mu: + - 0.005 + r: + - 0.1 + w: + - 0.5 + run83: + b: + - 0.55 + d: + - 30 + g: + - 0.001 + mu: + - 0.03 + r: + - 1 + w: + - 0.5 + run84: + b: + - 5 + d: + - 30 + g: + - 0.001 + mu: + - 0.02 + r: + - 1 + w: + - 0.1 + run85: + b: + - 2 + d: + - 10 + g: + - 0.001 + mu: + - 0.008 + r: + - 0.01 + w: + - 0.5 + run86: + b: + - 0.55 + d: + - 20 + g: + - 0.001 + mu: + - 0.03 + r: + - 1 + w: + - 2 + run87: + b: + - 10 + d: + - 40 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.01 + w: + - 0.5 + run88: + b: + - 0.55 + d: + - 10 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.1 + w: + - 2 + run89: + b: + - 2 + d: + - 40 + g: + - 0.0001 + mu: + - 0.008 + r: + - 0.1 + w: + - 0.5 + run90: + b: + - 2 + d: + - 40 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.1 + w: + - 8 + run91: + b: + - 0.55 + d: + - 10 + g: + - 0.001 + mu: + - 0.03 + r: + - 1 + w: + - 8 + run92: + b: + - 2 + d: + - 20 + g: + - 0.001 + mu: + - 0.03 + r: + - 1 + w: + - 0.1 + run93: + b: + - 0.55 + d: + - 40 + g: + - 0.0001 + mu: + - 0.03 + r: + - 1 + w: + - 8 + run94: + b: + - 0.55 + d: + - 40 + g: + - 
0.0001 + mu: + - 0.001 + r: + - 0.1 + w: + - 0.1 + run95: + b: + - 5 + d: + - 10 + g: + - 0.0001 + mu: + - 0.008 + r: + - 0.1 + w: + - 0.5 + run96: + b: + - 0.55 + d: + - 10 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.1 + w: + - 8 + run97: + b: + - 0.55 + d: + - 40 + g: + - 0.0001 + mu: + - 0.02 + r: + - 1 + w: + - 0.001 + run98: + b: + - 0.55 + d: + - 20 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.01 + w: + - 2 + run99: + b: + - 0.55 + d: + - 40 + g: + - 0.0001 + mu: + - 0.001 + r: + - 0.1 + w: + - 0.001 + run100: + b: + - 10 + d: + - 30 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.01 + w: + - 0.1 + run101: + b: + - 10 + d: + - 10 + g: + - 0.001 + mu: + - 0.008 + r: + - 0.01 + w: + - 0.5 + run102: + b: + - 0.55 + d: + - 40 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.1 + w: + - 8 + run103: + b: + - 0.55 + d: + - 40 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.01 + w: + - 2 + run104: + b: + - 0.55 + d: + - 40 + g: + - 0.0001 + mu: + - 0.02 + r: + - 0.01 + w: + - 8 + run105: + b: + - 10 + d: + - 20 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.1 + w: + - 0.5 + run106: + b: + - 0.55 + d: + - 40 + g: + - 0.001 + mu: + - 0.02 + r: + - 1 + w: + - 2 + run107: + b: + - 10 + d: + - 10 + g: + - 0.001 + mu: + - 0.03 + r: + - 1 + w: + - 0.001 + run108: + b: + - 2 + d: + - 30 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.1 + w: + - 2 + run109: + b: + - 0.55 + d: + - 30 + g: + - 0.001 + mu: + - 0.005 + r: + - 0.1 + w: + - 8 + run110: + b: + - 0.55 + d: + - 10 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.01 + w: + - 2 + run111: + b: + - 0.55 + d: + - 20 + g: + - 0.0001 + mu: + - 0.008 + r: + - 0.1 + w: + - 2 + run112: + b: + - 0.55 + d: + - 20 + g: + - 0.001 + mu: + - 0.005 + r: + - 0.1 + w: + - 8 + run113: + b: + - 2 + d: + - 30 + g: + - 0.001 + mu: + - 0.005 + r: + - 0.01 + w: + - 0.5 + run114: + b: + - 0.01 + d: + - 10 + g: + - 0.001 + mu: + - 0.008 + r: + - 1 + w: + - 8 + run115: + b: + - 10 + d: + - 30 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.1 + w: + - 0.5 + run116: + b: + - 0.55 + d: + - 30 + g: + - 
0.001 + mu: + - 0.005 + r: + - 0.01 + w: + - 8 + run117: + b: + - 10 + d: + - 40 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.01 + w: + - 0.5 + run118: + b: + - 10 + d: + - 20 + g: + - 0.0001 + mu: + - 0.03 + r: + - 1 + w: + - 0.1 + run119: + b: + - 0.55 + d: + - 20 + g: + - 0.001 + mu: + - 0.008 + r: + - 0.1 + w: + - 2 + run120: + b: + - 2 + d: + - 10 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.1 + w: + - 2 + run121: + b: + - 2 + d: + - 30 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.01 + w: + - 2 + run122: + b: + - 2 + d: + - 10 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.01 + w: + - 8 + run123: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.02 + r: + - 1 + w: + - 8 + run124: + b: + - 2 + d: + - 10 + g: + - 0.001 + mu: + - 0.02 + r: + - 1 + w: + - 0.1 + run125: + b: + - 2 + d: + - 10 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.1 + w: + - 8 + run126: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.1 + w: + - 0.5 + run127: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.01 + w: + - 2 + run128: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.1 + w: + - 2 + run129: + b: + - 0.55 + d: + - 10 + g: + - 0.001 + mu: + - 0.02 + r: + - 1 + w: + - 2 + run130: + b: + - 2 + d: + - 20 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.1 + w: + - 0.5 + run131: + b: + - 0.55 + d: + - 20 + g: + - 0.0001 + mu: + - 0.03 + r: + - 1 + w: + - 8 + run132: + b: + - 0.55 + d: + - 40 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.1 + w: + - 2 + run133: + b: + - 0.55 + d: + - 30 + g: + - 0.001 + mu: + - 0.005 + r: + - 0.01 + w: + - 2 + run134: + b: + - 5 + d: + - 30 + g: + - 0.0001 + mu: + - 0.03 + r: + - 1 + w: + - 0.5 + run135: + b: + - 0.55 + d: + - 30 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.1 + w: + - 8 + run136: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.01 + w: + - 0.5 + run137: + b: + - 0.55 + d: + - 10 + g: + - 0.0001 + mu: + - 0.02 + r: + - 0.1 + w: + - 2 + run138: + b: + - 5 + d: + - 10 + g: + - 0.001 + mu: + - 
0.008 + r: + - 0.1 + w: + - 0.5 + run139: + b: + - 0.55 + d: + - 20 + g: + - 0.0001 + mu: + - 0.03 + r: + - 1 + w: + - 2 + run140: + b: + - 2 + d: + - 20 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.01 + w: + - 8 + run141: + b: + - 0.55 + d: + - 40 + g: + - 0.001 + mu: + - 0.001 + r: + - 0.1 + w: + - 0.5 + run142: + b: + - 5 + d: + - 20 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.01 + w: + - 0.5 + run143: + b: + - 2 + d: + - 30 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.01 + w: + - 0.5 + run144: + b: + - 0.55 + d: + - 30 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.1 + w: + - 2 + run145: + b: + - 0.55 + d: + - 40 + g: + - 0.0001 + mu: + - 0.001 + r: + - 0.01 + w: + - 0.5 + run146: + b: + - 0.55 + d: + - 30 + g: + - 0.001 + mu: + - 0.008 + r: + - 0.1 + w: + - 2 + run147: + b: + - 0.55 + d: + - 40 + g: + - 0.0001 + mu: + - 0.03 + r: + - 1 + w: + - 2 + run148: + b: + - 0.55 + d: + - 30 + g: + - 0.001 + mu: + - 0.001 + r: + - 0.1 + w: + - 0.5 + run149: + b: + - 0.55 + d: + - 30 + g: + - 0.001 + mu: + - 0.008 + r: + - 0.01 + w: + - 0.5 + run150: + b: + - 0.55 + d: + - 20 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.01 + w: + - 2 + run151: + b: + - 5 + d: + - 20 + g: + - 0.0001 + mu: + - 0.02 + r: + - 1 + w: + - 0.001 + run152: + b: + - 0.55 + d: + - 10 + g: + - 0.0001 + mu: + - 0.02 + r: + - 0.01 + w: + - 8 + run153: + b: + - 10 + d: + - 10 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.1 + w: + - 0.5 + run154: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.02 + r: + - 0.01 + w: + - 8 + run155: + b: + - 2 + d: + - 30 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.1 + w: + - 8 + run156: + b: + - 5 + d: + - 20 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.1 + w: + - 0.5 + run157: + b: + - 10 + d: + - 30 + g: + - 0.0001 + mu: + - 0.008 + r: + - 0.1 + w: + - 0.5 + run158: + b: + - 5 + d: + - 30 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.1 + w: + - 0.5 + run159: + b: + - 0.55 + d: + - 20 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.01 + w: + - 8 + run160: + b: + - 10 + d: + - 20 + g: + - 0.001 + 
mu: + - 0.008 + r: + - 0.1 + w: + - 0.5 + run161: + b: + - 5 + d: + - 20 + g: + - 0.001 + mu: + - 0.005 + r: + - 0.01 + w: + - 0.1 + run162: + b: + - 5 + d: + - 30 + g: + - 0.0001 + mu: + - 0.008 + r: + - 0.1 + w: + - 0.5 + run163: + b: + - 0.55 + d: + - 20 + g: + - 0.001 + mu: + - 0.005 + r: + - 0.01 + w: + - 2 + run164: + b: + - 2 + d: + - 10 + g: + - 0.001 + mu: + - 0.005 + r: + - 0.1 + w: + - 0.5 + run165: + b: + - 0.55 + d: + - 10 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.01 + w: + - 0.5 + run166: + b: + - 0.55 + d: + - 30 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.01 + w: + - 2 + run167: + b: + - 0.55 + d: + - 10 + g: + - 0.001 + mu: + - 0.001 + r: + - 0.1 + w: + - 0.5 + run168: + b: + - 0.55 + d: + - 20 + g: + - 0.001 + mu: + - 0.008 + r: + - 0.01 + w: + - 8 + run169: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.02 + r: + - 0.1 + w: + - 8 + run170: + b: + - 0.55 + d: + - 10 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.1 + w: + - 8 + run171: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.1 + w: + - 8 + run172: + b: + - 5 + d: + - 10 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.1 + w: + - 0.5 + run173: + b: + - 0.55 + d: + - 20 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.01 + w: + - 2 + run174: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.1 + w: + - 8 + run175: + b: + - 5 + d: + - 40 + g: + - 0.0001 + mu: + - 0.03 + r: + - 1 + w: + - 0.5 + run176: + b: + - 5 + d: + - 30 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.01 + w: + - 0.5 + run177: + b: + - 0.55 + d: + - 30 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.01 + w: + - 8 + run178: + b: + - 0.55 + d: + - 40 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.1 + w: + - 8 + run179: + b: + - 10 + d: + - 10 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.01 + w: + - 0.5 + run180: + b: + - 2 + d: + - 40 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.01 + w: + - 2 + run181: + b: + - 0.55 + d: + - 10 + g: + - 0.0001 + mu: + - 0.02 + r: + - 1 + w: + - 8 + run182: + b: + - 0.55 + d: + - 30 + g: + - 
0.0001 + mu: + - 0.005 + r: + - 0.01 + w: + - 8 + run183: + b: + - 10 + d: + - 40 + g: + - 0.001 + mu: + - 0.008 + r: + - 0.1 + w: + - 0.5 + run184: + b: + - 2 + d: + - 40 + g: + - 0.001 + mu: + - 0.03 + r: + - 1 + w: + - 0.5 + run185: + b: + - 0.55 + d: + - 20 + g: + - 0.0001 + mu: + - 0.008 + r: + - 0.1 + w: + - 8 + run186: + b: + - 2 + d: + - 40 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.01 + w: + - 8 + run187: + b: + - 2 + d: + - 10 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.1 + w: + - 2 + run188: + b: + - 2 + d: + - 10 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.01 + w: + - 8 + run189: + b: + - 0.55 + d: + - 10 + g: + - 0.0001 + mu: + - 0.02 + r: + - 1 + w: + - 0.1 + run190: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.001 + r: + - 0.1 + w: + - 0.5 + run191: + b: + - 0.55 + d: + - 30 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.01 + w: + - 8 + run192: + b: + - 2 + d: + - 20 + g: + - 0.0001 + mu: + - 0.02 + r: + - 1 + w: + - 0.001 + run193: + b: + - 0.55 + d: + - 10 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.1 + w: + - 2 + run194: + b: + - 2 + d: + - 30 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.1 + w: + - 0.5 + run195: + b: + - 2 + d: + - 40 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.1 + w: + - 2 + run196: + b: + - 0.55 + d: + - 30 + g: + - 0.001 + mu: + - 0.02 + r: + - 0.1 + w: + - 8 + run197: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.02 + r: + - 0.1 + w: + - 2 + run198: + b: + - 0.55 + d: + - 20 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.01 + w: + - 2 + run199: + b: + - 0.55 + d: + - 20 + g: + - 0.0001 + mu: + - 0.005 + r: + - 0.1 + w: + - 8 + run200: + b: + - 0.55 + d: + - 40 + g: + - 0.0001 + mu: + - 0.02 + r: + - 1 + w: + - 8 + run201: + b: + - 5 + d: + - 40 + g: + - 0.001 + mu: + - 0.03 + r: + - 1 + w: + - 0.5 + run202: + b: + - 0.55 + d: + - 40 + g: + - 0.0001 + mu: + - 0.001 + r: + - 0.1 + w: + - 0.5 + run203: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.008 + r: + - 0.01 + w: + - 0.5 + run204: + b: + - 5 + d: + - 20 + g: + - 0.0001 + 
mu: + - 0.008 + r: + - 0.01 + w: + - 0.5 + run205: + b: + - 0.55 + d: + - 30 + g: + - 0.001 + mu: + - 0.008 + r: + - 0.1 + w: + - 8 + run206: + b: + - 2 + d: + - 10 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.1 + w: + - 8 + run207: + b: + - 2 + d: + - 30 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.01 + w: + - 8 + run208: + b: + - 0.55 + d: + - 40 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.01 + w: + - 2 + run209: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.008 + r: + - 0.01 + w: + - 2 + run210: + b: + - 0.55 + d: + - 30 + g: + - 0.0001 + mu: + - 0.008 + r: + - 0.1 + w: + - 2 + run211: + b: + - 10 + d: + - 30 + g: + - 0.001 + mu: + - 0.03 + r: + - 0.01 + w: + - 0.5 + run212: + b: + - 2 + d: + - 20 + g: + - 0.0001 + mu: + - 0.03 + r: + - 0.1 + w: + - 8 +datasets: + - label: tps_egfr + node_files: + - tps-egfr-prizes.txt + edge_files: + - phosphosite-irefindex13.0-uniprot.txt + other_files: [] + data_dir: input +gold_standards: + - label: gs_egfr + node_files: + - gs-egfr.txt + data_dir: input + dataset_labels: + - tps_egfr +reconstruction_settings: + locations: + reconstruction_dir: output + run: true +analysis: + summary: + include: true + graphspace: + include: false + cytoscape: + include: false + ml: + include: true + aggregate_per_algorithm: true + components: 4 + labels: false + linkage: ward + metric: euclidean + evaluation: + include: true diff --git a/input/gs-egfr.txt b/input/gs-egfr.txt new file mode 100644 index 00000000..4b880cd4 --- /dev/null +++ b/input/gs-egfr.txt @@ -0,0 +1,324 @@ +1433B_HUMAN +1433E_HUMAN +1433T_HUMAN +4EBP1_HUMAN +ABI1_HUMAN +ABL1_HUMAN +ACK1_HUMAN +ACTS_HUMAN +AHSA1_HUMAN +AIMP2_HUMAN +AKT1_HUMAN +AKT2_HUMAN +AKT3_HUMAN +ANDR_HUMAN +AP2A1_HUMAN +AP2B1_HUMAN +AP2M1_HUMAN +AP2S1_HUMAN +ARAF_HUMAN +AREG_HUMAN +ARF4_HUMAN +ARF6_HUMAN +ARHG2_HUMAN +ARHG7_HUMAN +ARRB1_HUMAN +ASAP1_HUMAN +ASAP2_HUMAN +ATF1_HUMAN +ATF2_HUMAN +ATX1_HUMAN +B2CL1_HUMAN +BAD_HUMAN +BCAR1_HUMAN +BCL2_HUMAN +BDNF_HUMAN +BRAF_HUMAN +BTC_HUMAN +Ca++_PSEUDONODE 
+CASP3_HUMAN +CASP9_HUMAN +CAV1_HUMAN +CAV2_HUMAN +CBL_HUMAN +CBLB_HUMAN +CBLC_HUMAN +CCND1_HUMAN +CDC42_HUMAN +CDN1A_HUMAN +CEAM1_HUMAN +CEBPA_HUMAN +CEBPB_HUMAN +CLCA_HUMAN +CREB1_HUMAN +CRK_HUMAN +CRKL_HUMAN +CSK_HUMAN +CTND1_HUMAN +CXA1_HUMAN +CYH3_HUMAN +DAG_PSEUDONODE +DAXX_HUMAN +DDIT3_HUMAN +DOK2_HUMAN +DP13A_HUMAN +DP13B_HUMAN +DYN1_HUMAN +ECSIT_HUMAN +EF1A1_HUMAN +EF1A2_HUMAN +EF2K_HUMAN +EGF_HUMAN +EGFR_HUMAN +ELF3_HUMAN +ELK1_HUMAN +ELK4_HUMAN +EP15R_HUMAN +EPHB2_HUMAN +EPIPL_HUMAN +EPN1_HUMAN +EPS15_HUMAN +EPS8_HUMAN +ERBB2_HUMAN +ERBB3_HUMAN +ERBB4_HUMAN +EREG_HUMAN +ESR1_HUMAN +FAK1_HUMAN +FAK2_HUMAN +FGF1_HUMAN +FGFR1_HUMAN +FLNA_HUMAN +FLNB_HUMAN +FLNC_HUMAN +FOS_HUMAN +FOXO1_HUMAN +GA45G_HUMAN +GAB1_HUMAN +GAB2_HUMAN +GELS_HUMAN +GIT1_HUMAN +GNA12_HUMAN +GNAI1_HUMAN +GNAI3_HUMAN +GNDS_HUMAN +GRAP2_HUMAN +GRB10_HUMAN +GRB14_HUMAN +GRB2_HUMAN +GRB7_HUMAN +GSK3B_HUMAN +H31T_HUMAN +HAT1_HUMAN +HBEGF_HUMAN +HD_HUMAN +HDAC1_HUMAN +HDAC2_HUMAN +HDAC3_HUMAN +HGS_HUMAN +HIP1_HUMAN +HSPB1_HUMAN +ICEF1_HUMAN +IFIT3_HUMAN +IKKA_HUMAN +IL1A_HUMAN +IL1R1_HUMAN +ITCH_HUMAN +JAK1_HUMAN +JAK2_HUMAN +JIP2_HUMAN +JIP3_HUMAN +JUN_HUMAN +JUNB_HUMAN +JUND_HUMAN +K1C17_HUMAN +K1C18_HUMAN +K2C7_HUMAN +K2C8_HUMAN +KAP1_HUMAN +KAP2_HUMAN +KAP3_HUMAN +KAPCA_HUMAN +KAPCB_HUMAN +KCC2G_HUMAN +KLF11_HUMAN +KPCA_HUMAN +KPCD1_HUMAN +KPCG_HUMAN +KPCI_HUMAN +KPCZ_HUMAN +KS6A1_HUMAN +KS6A2_HUMAN +KS6A3_HUMAN +KS6A4_HUMAN +KS6A5_HUMAN +KS6B1_HUMAN +LTOR3_HUMAN +M3K1_HUMAN +M3K11_HUMAN +M3K12_HUMAN +M3K13_HUMAN +M3K14_HUMAN +M3K2_HUMAN +M3K3_HUMAN +M3K4_HUMAN +M3K5_HUMAN +M3K7_HUMAN +M3K8_HUMAN +M4K1_HUMAN +M4K2_HUMAN +M4K4_HUMAN +MAPK3_HUMAN +MAPK5_HUMAN +MAX_HUMAN +MCF2_HUMAN +MED1_HUMAN +MEF2C_HUMAN +MK01_HUMAN +MK03_HUMAN +MK07_HUMAN +MK08_HUMAN +MK10_HUMAN +MK14_HUMAN +MKNK2_HUMAN +MLTK_HUMAN +MP2K1_HUMAN +MP2K2_HUMAN +MP2K3_HUMAN +MP2K4_HUMAN +MP2K5_HUMAN +MP2K6_HUMAN +MP2K7_HUMAN +MTA2_HUMAN +MTOR_HUMAN +MYC_HUMAN +NCK1_HUMAN +NCK2_HUMAN +NCOA1_HUMAN +NF1_HUMAN +NFAC4_HUMAN 
+NGF_HUMAN +NLK_HUMAN +NRG1_HUMAN +NRG2_HUMAN +NRG3_HUMAN +NRG4_HUMAN +NTF3_HUMAN +NTRK1_HUMAN +P53_HUMAN +P55G_HUMAN +P63_HUMAN +P85A_HUMAN +P85B_HUMAN +PAK1_HUMAN +PAXI_HUMAN +PDGFA_HUMAN +PDPK1_HUMAN +PEBP1_HUMAN +PGFRA_HUMAN +PI3,4,5P3_PSEUDONODE +PI51C_HUMAN +PIPNA_HUMAN +PK3CA_HUMAN +PK3CB_HUMAN +PK3CD_HUMAN +PK3CG_HUMAN +PKD1_HUMAN +PKN2_HUMAN +PLCG1_HUMAN +PLCG2_HUMAN +PLD1_HUMAN +PLD2_HUMAN +PLEC_HUMAN +PLS1_HUMAN +PPM1B_HUMAN +PPP5_HUMAN +PRS6A_HUMAN +PTK6_HUMAN +PTN1_HUMAN +PTN11_HUMAN +PTN12_HUMAN +PTN5_HUMAN +PTN6_HUMAN +PTN7_HUMAN +PTPRH_HUMAN +PTPRR_HUMAN +RAB5A_HUMAN +RAC2_HUMAN +RAF1_HUMAN +RALB_HUMAN +RAP1A_HUMAN +RASA1_HUMAN +RASA2_HUMAN +RASH_HUMAN +RASK_HUMAN +RASN_HUMAN +RBBP7_HUMAN +RBP1_HUMAN +REPS1_HUMAN +REPS2_HUMAN +RGS16_HUMAN +RHEB_HUMAN +RHG01_HUMAN +RIPK1_HUMAN +RRAS2_HUMAN +RSSA_HUMAN +SH2D3_HUMAN +SH3G2_HUMAN +SH3G3_HUMAN +SH3K1_HUMAN +SH3L1_HUMAN +SHC1_HUMAN +SHC2_HUMAN +SHIP2_HUMAN +SHOC2_HUMAN +SIN3A_HUMAN +SMAD2_HUMAN +SMAD3_HUMAN +SMD2_HUMAN +SOCS1_HUMAN +SOCS3_HUMAN +SOS1_HUMAN +SOS2_HUMAN +SP1_HUMAN +SPY1_HUMAN +SPY2_HUMAN +SRC_HUMAN +SRF_HUMAN +STA5A_HUMAN +STA5B_HUMAN +STAM1_HUMAN +STAT1_HUMAN +STAT3_HUMAN +STK3_HUMAN +STXB1_HUMAN +SYGP1_HUMAN +SYHC_HUMAN +SYUA_HUMAN +TAB1_HUMAN +TAB2_HUMAN +TAU_HUMAN +TE2IP_HUMAN +TGFA_HUMAN +TGFB1_HUMAN +TGFR1_HUMAN +TGIF1_HUMAN +TLN1_HUMAN +TNFA_HUMAN +TNFL6_HUMAN +TNR1A_HUMAN +TNR6_HUMAN +TRAF2_HUMAN +TRAF6_HUMAN +TSC1_HUMAN +TSC2_HUMAN +UBB_HUMAN +UBC_HUMAN +US6NL_HUMAN +VAV_HUMAN +VAV2_HUMAN +VAV3_HUMAN +WASL_HUMAN +WNK1_HUMAN +ZHX2_HUMAN +ZPR1_HUMAN \ No newline at end of file diff --git a/spras/evaluation.py b/spras/evaluation.py index 1330ee2b..67346f2f 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -4,7 +4,9 @@ from typing import Dict, Iterable import pandas as pd -from sklearn.metrics import precision_score, recall_score +from sklearn.metrics import precision_score, recall_score, precision_recall_curve, average_precision_score +import matplotlib.pyplot as plt 
+import numpy as np class Evaluation: @@ -72,7 +74,7 @@ def load_files_from_dict(self, gold_standard_dict: Dict): # TODO: later iteration - chose between node and edge file, or allow both @staticmethod - def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str): + def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str, output_png: str ): """ Takes in file paths for a specific dataset and an associated gold standard node table. Calculates precision and recall for each pathway file @@ -96,22 +98,100 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, o recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0) results.append({"Pathway": file, "Precision": precision, "Recall": recall}) - precision_df = pd.DataFrame(results) - precision_df.to_csv(output_file, sep="\t", index=False) + pr_df = pd.DataFrame(results) + pr_df.sort_values(by=["Recall", "Pathway"], axis=0, ascending=True, inplace=True) + pr_df.to_csv(output_file, sep="\t", index=False) # TODO make "PR" curves from the precision_and_recall file - - def edge_frequency_nodes(ensemble_file: str, node_table:pd.DataFrame, output_file: str, output_png: str): - None - # create one per ensemble file - - def pr_curves (): - None + plt.figure(figsize=(8, 6)) + plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="PR") + plt.xlabel("Recall") + plt.ylabel("Precision") + plt.title(f"Precision and Recall Plot") + plt.legend() + plt.grid(True) + plt.savefig(output_png) # TODO make PR curves for the nodes from ensembled files outputs # TODO make the edge frequency node ensembles + def select_max_freq_and_node(row): + max_freq = 0 + node = "" + if pd.isna(row['Node2']) and pd.isna(row['Freq2']): + max_freq = row['Freq1'] + node = row['Node1'] + elif pd.isna(row['Node1']) and pd.isna(row['Freq1']): + max_freq = row['Freq2'] + node = row['Node2'] + else: + max_freq = 
max(row['Freq1'], row['Freq2']) + node = row['Node1'] + return node, max_freq + + def edge_frequency_node_ensemble(ensemble_file: str, node_table:pd.DataFrame): + + print(node_table) + print(type(ensemble_file)) + ensemble_df = pd.read_table(ensemble_file, sep="\t", header=0) + print(ensemble_df) + if not ensemble_df.empty: + node1_freq = ensemble_df.drop(columns = ['Node2', 'Direction']) + node2_freq = ensemble_df.drop(columns = ['Node1', 'Direction']) + max_node1_freq = node1_freq.groupby(['Node1']).max().reset_index() + max_node1_freq.rename(columns = {'Frequency': 'Freq1'}, inplace = True) + max_node2_freq = node2_freq.groupby(['Node2']).max().reset_index() + max_node2_freq.rename(columns = {'Frequency': 'Freq2'}, inplace = True) + node_df_merged = max_node1_freq.merge(max_node2_freq, left_on='Node1', right_on='Node2', how='outer') + node_df_merged[['Node', 'max_freq']] = node_df_merged.apply(Evaluation.select_max_freq_and_node, axis=1, result_type='expand') + node_df_merged.drop(columns = ['Node1', 'Node2', 'Freq1', 'Freq2'], inplace = True) + node_df_merged.sort_values('max_freq', ascending= False, inplace = True) + print(node_df_merged) + return node_df_merged + else: + return pd.DataFrame(columns = ['Node', 'max_freq']) + + + def pr_curves_ensemble_nodes(node_ensemble:pd.DataFrame, node_table:pd.DataFrame, output_png: str): + + gold_standard_nodes = set(node_table['NODEID']) + + if not node_ensemble.empty: + y_true = [1 if node in gold_standard_nodes else 0 for node in node_ensemble['Node']] + y_scores = node_ensemble['max_freq'].tolist() + precision, recall, thresholds = precision_recall_curve(y_true, y_scores) + auc_precision_recall = average_precision_score(y_true, y_scores) + + plt.figure() + plt.plot(recall, precision, marker='o', label='Precision-Recall curve') + plt.axhline(y=auc_precision_recall, color='r', linestyle='--', label=f'Avg Precision: {auc_precision_recall:.4f}') + plt.xlabel('Recall') + plt.ylabel('Precision') + 
plt.title('Precision-Recall Curve') + plt.legend() + plt.grid(True) + plt.savefig(output_png) + else: + plt.figure() + plt.savefig(output_png) - def pca_chosen_pathway(): - None # TODO PCA chosen pathway, will need to use precision and recall code for the nodes of the chosen pathway - \ No newline at end of file + def pca_chosen_pathway(coordinates_file: str, output_dir:str): + + print(output_dir) + coord_df = pd.read_csv(coordinates_file, delimiter="\t", header=0) + + centroid_row = coord_df[coord_df['datapoint_labels'] == 'centroid'] + centroid = centroid_row.iloc[0, 1:].tolist() + + coord_df = coord_df[coord_df['datapoint_labels'] != 'centroid'] + + pc_columns = [col for col in coord_df.columns if col.startswith('PC')] + coord_df['Distance To Centroid'] = np.sqrt(sum((coord_df[pc] - centroid[i]) ** 2 for i, pc in enumerate(pc_columns))) + print(coord_df.sort_values(by='Distance To Centroid')) + closest_to_centroid = coord_df.sort_values(by='Distance To Centroid').iloc[0] + print(closest_to_centroid) + rep_pathway = [os.path.join(output_dir, f"{closest_to_centroid['datapoint_labels']}", "pathway.txt")] + + print(rep_pathway) + + return rep_pathway \ No newline at end of file From 20d20bfc28bd542fa905099fd99844a29513b90a Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 4 Nov 2024 11:34:29 -0600 Subject: [PATCH 04/22] new updates to the integration --- Snakefile | 33 ++-- config/config.yaml | 9 +- config/egfr-param-tuning.yaml | 1 + parameter-tuning.py | 288 ---------------------------------- spras/config.py | 6 + spras/evaluation.py | 83 ++++++---- 6 files changed, 80 insertions(+), 340 deletions(-) delete mode 100644 parameter-tuning.py diff --git a/Snakefile b/Snakefile index bfd09d35..f717f739 100644 --- a/Snakefile +++ b/Snakefile @@ -106,12 +106,13 @@ def make_final_input(wildcards): if _config.config.analysis_include_evaluation: 
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curves-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-plot.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-pca-chosen.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-pca-chosen.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-plot.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-pca-chosen.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-pca-chosen.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos)) - 
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-plot.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos)) + if _config.config.analysis_include_evaluation_aggregate_algo: + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-plot.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-pca-chosen.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-pca-chosen.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-pr-curves-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms)) @@ -382,7 +383,8 @@ def get_dataset_label(wildcards): return dataset -# Run evaluation for all pathway outputs and ensemble.txt for a dataset against its paired gold standard +# Run evaluation for all pathway outputs, ensemble.txt, and pca_coordinates.txt for a dataset against its paired gold standard +# TODO: figure out why this works when all one rule, but the per algorithm doesn't work like that rule evaluation: input: gold_standard_file = get_gold_standard_pickle_file, @@ -400,29 +402,29 @@ rule evaluation: run: 
node_table = Evaluation.from_file(input.gold_standard_file).node_table Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_plot_png) - node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file, node_table) - Evaluation.pr_curves_ensemble_nodes(node_ensemble, node_table, output.pr_curve_png) + node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file) + Evaluation.PRC_node_ensemble(node_ensemble, node_table, output.pr_curve_png) pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir) Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file, output.pca_chosen_pr_png) - - -# Run evaluation per algortihm for all associated pathway outputs and ensemble.txt for a dataset against its paired gold standard - +# Returns all pathways for a specific algorithm and dataset def collect_pathways_per_algo_per_dataset(wildcards): dataset_label = get_dataset_label(wildcards) filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param] return expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params, dataset_label= dataset_label) +# Returns ensemble file for a specific algorithm and dataset def collect_ensemble_per_algo_per_dataset(wildcards): dataset_label = get_dataset_label(wildcards) - print(dataset_label) return f"{out_dir}{SEP}{dataset_label}-ml{SEP}{wildcards.algorithm}-ensemble-pathway.txt" +# Returns pca coordinates for a specific algorithm and dataset def collect_pca_coordinates_per_algo_per_dataset(wildcards): dataset_label = get_dataset_label(wildcards) return f"{out_dir}{SEP}{dataset_label}-ml{SEP}{wildcards.algorithm}-pca-coordinates.txt" +# Run evaluation per algortihm for all associated pathway outputs, ensemble.txt, and pca_coordinates.txt for a dataset against its paired gold standard +# TODO: only 
works when these rules are broken up rule evaluation_per_algo_pathways: input: gold_standard_file = get_gold_standard_pickle_file, @@ -442,8 +444,8 @@ rule evaluation_per_algo_ensemble_pr_curve: pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-pr-curves-ensemble-nodes.png']), run: node_table = Evaluation.from_file(input.gold_standard_file).node_table - node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file, node_table) - Evaluation.pr_curves_ensemble_nodes(node_ensemble, node_table, output.pr_curve_png) + node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file) + Evaluation.PRC_node_ensemble(node_ensemble, node_table, output.pr_curve_png) rule evaluation_per_algo_pca_chosen: input: @@ -457,7 +459,6 @@ rule evaluation_per_algo_pca_chosen: pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir) Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file, output.pca_chosen_pr_png) - # Remove the output directory rule clean: shell: f'rm -rf {out_dir}' diff --git a/config/config.yaml b/config/config.yaml index 4b473050..fd47638b 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -173,9 +173,8 @@ analysis: # 'euclidean', 'manhattan', 'cosine' metric: 'euclidean' evaluation: + # evaluation per dataset-goldstandard pair include: true - # update to decouple the evaluation parts? 
- # - ensemble vs all pathways vs pca chosen pathway - # pr curves from ensemble files - # "pr" curves from all pathways - # p and r from pca chosen pathway + # adds evaluation per algorithm per dataset-goldstandard pair + aggregate_per_algorithm: true + diff --git a/config/egfr-param-tuning.yaml b/config/egfr-param-tuning.yaml index c219a471..ecc2a65f 100644 --- a/config/egfr-param-tuning.yaml +++ b/config/egfr-param-tuning.yaml @@ -3457,3 +3457,4 @@ analysis: metric: euclidean evaluation: include: true + aggregate_per_algorithm: true diff --git a/parameter-tuning.py b/parameter-tuning.py deleted file mode 100644 index 0094e3c1..00000000 --- a/parameter-tuning.py +++ /dev/null @@ -1,288 +0,0 @@ -import glob -import os -import pickle as pkl -from pathlib import Path -from typing import Dict, Iterable - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -from sklearn.metrics import ( - PrecisionRecallDisplay, - average_precision_score, - precision_recall_curve, - precision_score, - recall_score, -) - -from spras.analysis.ml import summarize_networks -from spras.evaluation import Evaluation - -# make directories -directories = ["parameter-tuning","parameter-tuning/ensembling-parameter-tuning", "parameter-tuning/no-parameter-tuning", "parameter-tuning/pca-parameter-tuning"] - -for directory in directories: - if not os.path.exists(directory): - os.makedirs(directory) - print(f"Directory {directory} was created.") - else: - print(f"Directory {directory} already exists.") - - -# ################################################################################################################################################# -# Parameter Tuning with Ensemble networks - -def select_max_freq_and_node(row): - max_freq = 0 - node = "" - if pd.isna(row['Node2']) and pd.isna(row['Freq2']): - max_freq = row['Freq1'] - node = row['Node1'] - elif pd.isna(row['Node1']) and pd.isna(row['Freq1']): - max_freq = row['Freq2'] - node = row['Node2'] - else: - max_freq 
= max(row['Freq1'], row['Freq2']) - node = row['Node1'] - return node, max_freq - -def precision_recall(file, node_table, node_freq_filename, output_file): - gold_standard_nodes = set(node_table['NODEID']) - - df = pd.read_table(file, sep="\t", header=0) - - node1_freq = df.drop(columns = ['Node2', 'Direction']) - node2_freq = df.drop(columns = ['Node1', 'Direction']) - max_node1_freq = node1_freq.groupby(['Node1']).max().reset_index() - max_node1_freq.rename(columns = {'Frequency': 'Freq1'}, inplace = True) - max_node2_freq = node2_freq.groupby(['Node2']).max().reset_index() - max_node2_freq.rename(columns = {'Frequency': 'Freq2'}, inplace = True) - node_df_merged = max_node1_freq.merge(max_node2_freq, left_on='Node1', right_on='Node2', how='outer') - node_df_merged[['Node', 'max_freq']] = node_df_merged.apply(select_max_freq_and_node, axis=1, result_type='expand') - node_df_merged.drop(columns = ['Node1', 'Node2', 'Freq1', 'Freq2'], inplace = True) - - node_df_merged.sort_values('max_freq', ascending= False, inplace = True) - node_df_merged.to_csv(node_freq_filename, sep = "\t",header=True, index=False) - - y_true = [1 if node in gold_standard_nodes else 0 for node in node_df_merged['Node']] - y_scores = node_df_merged['max_freq'].tolist() - - # print(f"y_true:\n{y_true}") - # print(f"y_score:\n{y_scores}") - - plt.figure() - precision, recall, thresholds = precision_recall_curve(y_true, y_scores) - # print(f"precision:{precision}\n recall:{recall}\n thresholds:{thresholds}\n") - auc_precision_recall = average_precision_score(y_true, y_scores) - - plt.plot(recall, precision, marker='o', label='Precision-Recall curve') - plt.axhline(y=auc_precision_recall, color='r', linestyle='--', label=f'Avg Precision: {auc_precision_recall:.4f}') - plt.xlabel('Recall') - plt.ylabel('Precision') - plt.title('Precision-Recall Curve') - plt.legend() - plt.grid(True) - plt.savefig(output_filename) - - # print(f"overlapping nodes: {len(set(node_df_merged['Node'].tolist()) & 
gold_standard_nodes)}") - # print(f"average_precision_score: {auc_precision_recall}") - -# TODO: fix mincostflow bug with summarize networks -algorithms = ['mincostflow', 'omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'allpairs', 'domino'] - -gold_standard_file = "output/gs_egfr-merged.pickle" -node_table = Evaluation.from_file(gold_standard_file).node_table -new_folder_path = 'parameter-tuning/ensembling-parameter-tuning/' - -for algo in algorithms: - ensemble_filename = f"output/tps_egfr-ml/{algo}-ensemble-pathway.txt" - node_freq_filename = f"{new_folder_path}{algo}-frequencies.txt" - output_filename = f"{new_folder_path}{algo}-pr.png" - try: - precision_recall(ensemble_filename, node_table, node_freq_filename, output_filename) - except Exception as error: - print(error) - -# code to work for MEO -algorithms = ['meo'] - -for algo in algorithms: - ensemble_filename = f"output/tps_egfr-ml/{algo}-ensemble-pathway.txt" - df = pd.read_table(ensemble_filename, sep="\t", header=0) - df['Node1'] = df['Node1'] + '_HUMAN' - df['Node2'] = df['Node2'] + '_HUMAN' - df['Node1'] = df['Node1'].replace({ - 'Ca++_HUMAN': 'Ca++_PSEUDONODE', - 'PI3,4,5P3_HUMAN': 'PI3,4,5P3_PSEUDONODE', - 'DAG_HUMAN': 'DAG_PSEUDONODE' - }) - df['Node2'] = df['Node2'].replace({ - 'Ca++_HUMAN': 'Ca++_PSEUDONODE', - 'PI3,4,5P3_HUMAN': 'PI3,4,5P3_PSEUDONODE', - 'DAG_HUMAN': 'DAG_PSEUDONODE' - }) - - updated_ensemble_filename = f"{new_folder_path}meo-ensemble-pathway-updated.txt" - df.to_csv(updated_ensemble_filename, sep="\t", header=True, index=False) - node_freq_filename = f"{new_folder_path}{algo}-frequencies.txt" - output_filename = f"{new_folder_path}{algo}-pr.png" - try: - precision_recall(updated_ensemble_filename, node_table, node_freq_filename, output_filename) - except Exception as error: - print(error) - - -################################################################################################################################################# -# No Parameter Tuning - -def 
precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str): - """ - Takes in file paths for a specific dataset and an associated gold standard node table. - Calculates recall for each pathway file - Returns output back to output_file - @param file_paths: file paths of pathway reconstruction algorithm outputs - @param node_table: the gold standard nodes - @param output_file: the filename to save the precision of each pathway - """ - y_true = set(node_table['NODEID']) - results = [] - - for file in file_paths: - df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"]) - y_pred = set(df['Node1']).union(set(df['Node2'])) - all_nodes = y_true.union(y_pred) - y_true_binary = [1 if node in y_true else 0 for node in all_nodes] - y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes] - - # default to 0.0 if there is a divide by 0 error - precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0) - recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0) - results.append({"Pathway": file, "Precision": precision, "Recall": recall}) - - pr_df = pd.DataFrame(results) - pr_df.sort_values(by=["Recall"], axis=0, ascending=True, inplace=True) - pr_df.to_csv(output_file, sep="\t", index=False) - return pr_df - - -algorithms = ['mincostflow', 'omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'allpairs', 'domino'] - -gold_standard_file = "output/gs_egfr-merged.pickle" -node_table = Evaluation.from_file(gold_standard_file).node_table -folder_path = 'output/' -new_folder_path = 'parameter-tuning/no-parameter-tuning/' - -for algo in algorithms: - file_pattern = os.path.join(folder_path, f"tps_egfr-{algo}-*", "pathway.txt") - files = glob.glob(file_pattern) - output_file = f"{new_folder_path}{algo}-precision-and-recall.txt" - prcurve_filename = f"{new_folder_path}{algo}-precision-and-recall-curve.png" - - pr_df = precision_and_recall(file_paths=files, node_table=node_table, 
output_file=output_file) - - plt.figure(figsize=(8, 6)) - plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="PR") - plt.xlabel("Recall") - plt.ylabel("Precision") - plt.title(f"{algo} Precision-Recall Curve") - plt.legend() - plt.grid(True) - plt.savefig(prcurve_filename) - - -# code to work for MEO -def precision_and_recall_meo(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str): - """ - Takes in file paths for a specific dataset and an associated gold standard node table. - Calculates recall for each pathway file - Returns output back to output_file - @param file_paths: file paths of pathway reconstruction algorithm outputs - @param node_table: the gold standard nodes - @param output_file: the filename to save the precision of each pathway - """ - y_true = set(node_table['NODEID']) - results = [] - - for file in file_paths: - df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"]) - df['Node1'] = df['Node1'] + '_HUMAN' - df['Node2'] = df['Node2'] + '_HUMAN' - df['Node1'] = df['Node1'].replace({ - 'Ca++_HUMAN': 'Ca++_PSEUDONODE', - 'PI3,4,5P3_HUMAN': 'PI3,4,5P3_PSEUDONODE', - 'DAG_HUMAN': 'DAG_PSEUDONODE' - }) - df['Node2'] = df['Node2'].replace({ - 'Ca++_HUMAN': 'Ca++_PSEUDONODE', - 'PI3,4,5P3_HUMAN': 'PI3,4,5P3_PSEUDONODE', - 'DAG_HUMAN': 'DAG_PSEUDONODE' - }) - - y_pred = set(df['Node1']).union(set(df['Node2'])) - all_nodes = y_true.union(y_pred) - y_true_binary = [1 if node in y_true else 0 for node in all_nodes] - y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes] - - # default to 0.0 if there is a divide by 0 error - precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0) - recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0) - results.append({"Pathway": file, "Precision": precision, "Recall": recall}) - - pr_df = pd.DataFrame(results) - pr_df.sort_values(by=["Recall"], axis=0, ascending=True, inplace=True) - 
pr_df.to_csv(output_file, sep="\t", index=False) - return pr_df - -algorithms = ['meo'] - -for algo in algorithms: - - file_pattern = os.path.join(folder_path, f"tps_egfr-{algo}-*", "pathway.txt") - files = glob.glob(file_pattern) - output_file = f"{new_folder_path}{algo}-precision-and-recall.txt" - prcurve_filename = f"{new_folder_path}{algo}-precision-and-recall-curve.png" - - pr_df = precision_and_recall_meo(file_paths=files, node_table=node_table, output_file=output_file) - - plt.figure(figsize=(8, 6)) - plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="PR") - plt.xlabel("Recall") - plt.ylabel("Precision") - plt.title(f"{algo} Precision-Recall Curve") - plt.legend() - plt.grid(True) - plt.savefig(prcurve_filename) - -################################################################################################################################################# -# PCA parameter tuning - -algorithms = ['omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'domino', 'meo', 'allpairs'] -folder_path = 'output/' -gold_standard_file = "output/gs_egfr-merged.pickle" -node_table = Evaluation.from_file(gold_standard_file).node_table -new_folder_path = 'parameter-tuning/pca-parameter-tuning/' - -for algo in algorithms: - file_path = os.path.join(folder_path, f"tps_egfr-ml", f"{algo}-pca-coordinates.txt") - try: - coord_df = pd.read_csv(file_path, delimiter="\t", header=0) - except Exception as error: - print(f"PCA parameter tuning: {error}") - continue - - # centroid - centroid_row = coord_df[coord_df['algorithm'] == 'centroid'] - centroid = centroid_row.iloc[0, 1:].tolist() - - # update df to exclude centroid point - coord_df = coord_df[coord_df['algorithm'] != 'centroid'] - - # euclidean distance - pc_columns = [col for col in coord_df.columns if col.startswith('PC')] - coord_df['Distance To Centroid'] = np.sqrt(sum((coord_df[pc] - centroid[i]) ** 2 for i, pc in enumerate(pc_columns))) - closest_to_centroid = 
coord_df.sort_values(by='Distance To Centroid').iloc[0] - - # finding the rep pathway - rep_pathway = [os.path.join(folder_path, f"{closest_to_centroid['algorithm']}", "pathway.txt")] - output_file = f"{new_folder_path}{algo}-precision-and-recall.txt" - precision_and_recall(rep_pathway, node_table, output_file) diff --git a/spras/config.py b/spras/config.py index 14f1a926..b476f98f 100644 --- a/spras/config.py +++ b/spras/config.py @@ -233,6 +233,7 @@ def process_config(self, raw_config): self.analysis_params = raw_config["analysis"] if "analysis" in raw_config else {} self.ml_params = self.analysis_params["ml"] if "ml" in self.analysis_params else {} + self.evaluation_params = self.analysis_params["evaluation"] if "evaluation" in self.analysis_params else {} self.pca_params = {} if "components" in self.ml_params: @@ -260,3 +261,8 @@ def process_config(self, raw_config): self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] else: self.analysis_include_ml_aggregate_algo = False + + if 'aggregate_per_algorithm' in self.evaluation_params and self.analysis_include_evaluation: + self.analysis_include_evaluation_aggregate_algo = raw_config["analysis"]["evaluation"]["aggregate_per_algorithm"] + else: + self.analysis_include_evaluation_aggregate_algo = False diff --git a/spras/evaluation.py b/spras/evaluation.py index 67346f2f..6757dcf9 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -3,10 +3,15 @@ from pathlib import Path from typing import Dict, Iterable -import pandas as pd -from sklearn.metrics import precision_score, recall_score, precision_recall_curve, average_precision_score import matplotlib.pyplot as plt import numpy as np +import pandas as pd +from sklearn.metrics import ( + average_precision_score, + precision_recall_curve, + precision_score, + recall_score, +) class Evaluation: @@ -81,7 +86,8 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, o Returns output back to 
output_file @param file_paths: file paths of pathway reconstruction algorithm outputs @param node_table: the gold standard nodes - @param output_file: the filename to save the precision of each pathway + @param output_file: the filename to save the precision and recall of each pathway + @param output_png: the filename to plot the precision and recall of each pathway (not a PRC) """ y_true = set(node_table['NODEID']) results = [] @@ -102,7 +108,6 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, o pr_df.sort_values(by=["Recall", "Pathway"], axis=0, ascending=True, inplace=True) pr_df.to_csv(output_file, sep="\t", index=False) - # TODO make "PR" curves from the precision_and_recall file plt.figure(figsize=(8, 6)) plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="PR") plt.xlabel("Recall") @@ -111,10 +116,13 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, o plt.legend() plt.grid(True) plt.savefig(output_png) + # TODO: what to do when this is empty - # TODO make PR curves for the nodes from ensembled files outputs - # TODO make the edge frequency node ensembles - def select_max_freq_and_node(row): + def select_max_freq_and_node(row): # TODO: what (:type) would this row be + """ + Selects the node and frequency with the highest frequency value from two potential nodes in a row. + Handles cases where one of the nodes or frequencies may be missing and returns the node associated with the maximum frequency. 
+ """ max_freq = 0 node = "" if pd.isna(row['Node2']) and pd.isna(row['Freq2']): @@ -128,31 +136,42 @@ def select_max_freq_and_node(row): node = row['Node1'] return node, max_freq - def edge_frequency_node_ensemble(ensemble_file: str, node_table:pd.DataFrame): - - print(node_table) - print(type(ensemble_file)) + def edge_frequency_node_ensemble(ensemble_file: str): + """ + Processes an ensemble of edge frequencies to identify the highest frequency associated with each node + Reads ensemble_file, separates frequencies by node, and then calculates the maximum frequency for each node. + Returns a DataFrame of nodes with their respective maximum frequencies, or an empty DataFrame if ensemble_file is empty. + @param ensemble_file: the pre-computed node_ensemble + """ ensemble_df = pd.read_table(ensemble_file, sep="\t", header=0) - print(ensemble_df) + if not ensemble_df.empty: node1_freq = ensemble_df.drop(columns = ['Node2', 'Direction']) node2_freq = ensemble_df.drop(columns = ['Node1', 'Direction']) + max_node1_freq = node1_freq.groupby(['Node1']).max().reset_index() max_node1_freq.rename(columns = {'Frequency': 'Freq1'}, inplace = True) max_node2_freq = node2_freq.groupby(['Node2']).max().reset_index() max_node2_freq.rename(columns = {'Frequency': 'Freq2'}, inplace = True) - node_df_merged = max_node1_freq.merge(max_node2_freq, left_on='Node1', right_on='Node2', how='outer') - node_df_merged[['Node', 'max_freq']] = node_df_merged.apply(Evaluation.select_max_freq_and_node, axis=1, result_type='expand') - node_df_merged.drop(columns = ['Node1', 'Node2', 'Freq1', 'Freq2'], inplace = True) - node_df_merged.sort_values('max_freq', ascending= False, inplace = True) - print(node_df_merged) - return node_df_merged + + node_ensemble = max_node1_freq.merge(max_node2_freq, left_on='Node1', right_on='Node2', how='outer') + node_ensemble[['Node', 'max_freq']] = node_ensemble.apply(Evaluation.select_max_freq_and_node, axis=1, result_type='expand') + node_ensemble.drop(columns = 
['Node1', 'Node2', 'Freq1', 'Freq2'], inplace = True) + node_ensemble.sort_values('max_freq', ascending= False, inplace = True) + return node_ensemble else: + # TODO: figure out how to deal with empty ensemble files return pd.DataFrame(columns = ['Node', 'max_freq']) - - def pr_curves_ensemble_nodes(node_ensemble:pd.DataFrame, node_table:pd.DataFrame, output_png: str): - + def PRC_node_ensemble(node_ensemble:pd.DataFrame, node_table:pd.DataFrame, output_png: str): + """ + Takes in an node ensemble for specific dataset or specific algorithm in a dataset, and an associated gold standard node table. + Plots a precision and recall curve for the node ensemble against its associated gold standard node table + Returns output back to output_png + @param node_ensemble: the pre-computed node_ensemble + @param node_table: the gold standard nodes + @param output_file: the filename to save the precision and recall curves + """ gold_standard_nodes = set(node_table['NODEID']) if not node_ensemble.empty: @@ -170,28 +189,30 @@ def pr_curves_ensemble_nodes(node_ensemble:pd.DataFrame, node_table:pd.DataFrame plt.legend() plt.grid(True) plt.savefig(output_png) - else: + else: + # TODO figure out how to deal with empty ensemble files (still will have the header) plt.figure() + plt.text(0.5, 0.5, "empty ensemble file", ha='center', va='center', fontsize=12, color='red') + plt.axis('off') plt.savefig(output_png) - # TODO PCA chosen pathway, will need to use precision and recall code for the nodes of the chosen pathway def pca_chosen_pathway(coordinates_file: str, output_dir:str): - - print(output_dir) + """ + Identifies the pathway closest to a specified centroid based on PCA coordinates + Calculates the Euclidean distance from each data point to the centroid, then selects the closest pathway. + Returns the file path for the representative pathway associated with the closest data point. 
+ @param coordinates_file: the pca coordinates file for a dataset or specific algorithm in a datset + @param output_dir: the main reconstruction directory + """ coord_df = pd.read_csv(coordinates_file, delimiter="\t", header=0) centroid_row = coord_df[coord_df['datapoint_labels'] == 'centroid'] centroid = centroid_row.iloc[0, 1:].tolist() - coord_df = coord_df[coord_df['datapoint_labels'] != 'centroid'] pc_columns = [col for col in coord_df.columns if col.startswith('PC')] coord_df['Distance To Centroid'] = np.sqrt(sum((coord_df[pc] - centroid[i]) ** 2 for i, pc in enumerate(pc_columns))) - print(coord_df.sort_values(by='Distance To Centroid')) closest_to_centroid = coord_df.sort_values(by='Distance To Centroid').iloc[0] - print(closest_to_centroid) rep_pathway = [os.path.join(output_dir, f"{closest_to_centroid['datapoint_labels']}", "pathway.txt")] - print(rep_pathway) - - return rep_pathway \ No newline at end of file + return rep_pathway From 71aa43eda389d58111423c294dde84c17438fe22 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 4 Nov 2024 11:41:39 -0600 Subject: [PATCH 05/22] clean up of comments --- Snakefile | 3 +-- spras/analysis/ml.py | 10 +--------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/Snakefile b/Snakefile index f717f739..e1e8cdbd 100644 --- a/Snakefile +++ b/Snakefile @@ -396,7 +396,6 @@ rule evaluation: pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall.txt"]), pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'pr-curves-ensemble-nodes.png']), pr_plot_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-and-recall-plot.png']), - # add pca png and file that is needed by Evaluation.precision_and_recall pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall-pca-chosen.txt"]), pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall-pca-chosen.png"]), run: @@ 
-430,7 +429,7 @@ rule evaluation_per_algo_pathways: gold_standard_file = get_gold_standard_pickle_file, pathways = collect_pathways_per_algo_per_dataset, output: - pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-and-recall.txt"]), # these all need to be updated to use the algortihm in it + pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-and-recall.txt"]), pr_plot_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-and-recall-plot.png']), run: node_table = Evaluation.from_file(input.gold_standard_file).node_table diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 1585db89..b477e0f9 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -146,14 +146,12 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: scaler.fit(X) # calc mean and standard deviation X_scaled = scaler.transform(X) - # choosing the PCA pca_instance = PCA(n_components=components) pca_instance.fit(X_scaled) X_pca = pca_instance.transform(X_scaled) variance = pca_instance.explained_variance_ratio_ * 100 - # TODO: add in centroid code from other branch # calculating the centroid centroid = np.mean(X_pca, axis=0) # mean of each principal component across all samples @@ -167,17 +165,11 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: plt.xlabel(f"PC1 ({variance[0]:.1f}% variance)") plt.ylabel(f"PC2 ({variance[1]:.1f}% variance)") - # saving the coordinates of each algorithm - # make_required_dirs(output_coord) - # coordinates_df = pd.DataFrame(X_pca, columns=['PC' + str(i) for i in range(1, components+1)]) - # coordinates_df.insert(0, 'algorithm', columns.tolist()) - # coordinates_df.to_csv(output_coord, sep='\t', index=False) - # saving the coordinates of each algorithm make_required_dirs(output_coord) coordinates_df = pd.DataFrame(X_pca, columns=['PC' + str(i) for i in range(1, components+1)]) 
coordinates_df.insert(0, 'datapoint_labels', columns.tolist()) - centroid_row = ['centroid'] + centroid.tolist() # TODO: do we want a seperate file for the centroid, or add it to the end of the coordinates_df df as a new datapoint + centroid_row = ['centroid'] + centroid.tolist() coordinates_df.loc[len(coordinates_df)] = centroid_row coordinates_df.to_csv(output_coord, sep='\t', index=False) From 691673c95008424f8df58c0c1437c1ea1297c34b Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 4 Nov 2024 12:09:36 -0600 Subject: [PATCH 06/22] updated test_ml.py to work with new updates --- test/ml/expected/expected-pca-coordinates.tsv | 9 +++++---- test/ml/test_ml.py | 7 ++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/test/ml/expected/expected-pca-coordinates.tsv b/test/ml/expected/expected-pca-coordinates.tsv index b6371c84..ac10f2db 100644 --- a/test/ml/expected/expected-pca-coordinates.tsv +++ b/test/ml/expected/expected-pca-coordinates.tsv @@ -1,4 +1,5 @@ -algorithm PC1 PC2 -test-data-s1 -2.006650210482033 -0.9865875190637743 -test-data-s2 -1.5276508866841987 1.0799457247533237 -test-data-s3 3.534301097166232 -0.0933582056895495 \ No newline at end of file +datapoint_labels PC1 PC2 +test-data-s1 -2.0066502104820323 -0.9865875190637746 +test-data-s2 -1.5276508866841985 1.0799457247533233 +test-data-s3 3.5343010971662308 -0.09335820568954915 +centroid 0.0 -1.6190752442450199e-16 diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 2b5720ae..b9ca69ca 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -77,14 +77,15 @@ def test_pca_robustness(self): dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt']) expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.tsv') expected = expected.round(5) + expected.sort_values(by='datapoint_labels', ignore_index=True, inplace=True) + for _ in range(5): dataframe_shuffled = dataframe.sample(frac=1, axis=1) # 
permute the columns ml.pca(dataframe_shuffled, OUT_DIR + 'pca-shuffled-columns.png', OUT_DIR + 'pca-shuffled-columns-variance.txt', OUT_DIR + 'pca-shuffled-columns-coordinates.tsv') coord = pd.read_table(OUT_DIR + 'pca-shuffled-columns-coordinates.tsv') coord = coord.round(5) # round values to 5 digits to account for numeric differences across machines - coord.sort_values(by='algorithm', ignore_index=True, inplace=True) - + coord.sort_values(by='datapoint_labels', ignore_index=True, inplace=True) assert coord.equals(expected) for _ in range(5): @@ -93,7 +94,7 @@ def test_pca_robustness(self): OUT_DIR + 'pca-shuffled-rows-coordinates.tsv') coord = pd.read_table(OUT_DIR + 'pca-shuffled-rows-coordinates.tsv') coord = coord.round(5) # round values to 5 digits to account for numeric differences across machines - coord.sort_values(by='algorithm', ignore_index=True, inplace=True) + coord.sort_values(by='datapoint_labels', ignore_index=True, inplace=True) assert coord.equals(expected) From 26178f98136ecf19e158fa7397520eca4bf65c2f Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 7 Nov 2024 11:34:17 -0600 Subject: [PATCH 07/22] in progress of testing --- .../expected/expected-node-ensemble.csv | 13 +++++++ test/evaluate/input/ensemble-network.tsv | 10 ++++++ test/evaluate/test_evaluate.py | 36 +++++++++++++++++++ 3 files changed, 59 insertions(+) create mode 100644 test/evaluate/expected/expected-node-ensemble.csv create mode 100644 test/evaluate/input/ensemble-network.tsv create mode 100644 test/evaluate/test_evaluate.py diff --git a/test/evaluate/expected/expected-node-ensemble.csv b/test/evaluate/expected/expected-node-ensemble.csv new file mode 100644 index 00000000..ba467d55 --- /dev/null +++ b/test/evaluate/expected/expected-node-ensemble.csv @@ -0,0 +1,13 @@ +Node max_freq +C 0.75 +E 0.75 +D 0.75 +F 0.75 +A 0.5 +B 0.5 +L 0.5 +M 0.5 +O 0.25 +P 0.25 +N 0.25 +Q 0.25 diff --git a/test/evaluate/input/ensemble-network.tsv b/test/evaluate/input/ensemble-network.tsv new 
file mode 100644 index 00000000..293ec3f5 --- /dev/null +++ b/test/evaluate/input/ensemble-network.tsv @@ -0,0 +1,10 @@ +Node1 Node2 Frequency Direction +A B 0.5 U +C D 0.75 U +E F 0.75 U +L M 0.5 U +M N 0.25 U +O P 0.25 U +P Q 0.25 U +A B 0.25 D +B A 0.25 D \ No newline at end of file diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py new file mode 100644 index 00000000..b0a60196 --- /dev/null +++ b/test/evaluate/test_evaluate.py @@ -0,0 +1,36 @@ +import filecmp +from pathlib import Path + +import pandas as pd +import pytest + +from spras.evaluation import Evaluation + +INPUT_DIR = 'test/evaluate/input/' +OUT_DIR = 'test/evaluate/output/' +EXPECT_DIR = 'test/evaluate/expected/' + + +class TestEvaluate: + @classmethod + def setup_class(cls): + """ + Create the expected output directory + """ + Path(OUT_DIR).mkdir(parents=True, exist_ok=True) + + def test_node_ensemble(self): + ensemble_file = INPUT_DIR + 'ensemble-network.tsv' + edge_freq = Evaluation.edge_frequency_node_ensemble(ensemble_file) + edge_freq.to_csv(OUT_DIR + 'node-ensemble.csv', sep="\t", index=False) + assert filecmp.cmp(OUT_DIR + 'node-ensemble.csv', EXPECT_DIR + 'expected-node-ensemble.csv', shallow=False) + + def test_PRC_node_ensemble(self): + None + + def test_precision_and_recall(self): + None + + def test_pca_chosen_pathway(self): + None + From a5b320545d015102e4dd819294b9512969bfef68 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 14 Nov 2024 11:03:49 -0600 Subject: [PATCH 08/22] spras/con --- spras/config.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/spras/config.py b/spras/config.py index b476f98f..85aa3875 100644 --- a/spras/config.py +++ b/spras/config.py @@ -101,6 +101,10 @@ def __init__(self, raw_config): self.analysis_include_ml = None # A Boolean specifying whether to run the Evaluation analysis self.analysis_include_evaluation = None + # A Boolean specifying whether to run the ML per algorithm analysis + 
self.analysis_include_ml_aggregate_algo = None + # A Boolean specifying whether to run the Evaluation per algorithm analysis + self.analysis_include_evaluation_aggregate_algo = None _raw_config = copy.deepcopy(raw_config) self.process_config(_raw_config) @@ -253,16 +257,26 @@ def process_config(self, raw_config): self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] self.analysis_include_evaluation = raw_config["analysis"]["evaluation"]["include"] - if self.gold_standards == {} and self.analysis_include_evaluation: - raise ValueError("Evaluation analysis cannot run as gold standard data not provided. " - "Please set evaluation include to false or provide gold standard data.") - + # only run ml aggregate_per_algorithm if analysis_include_ml is set to true if 'aggregate_per_algorithm' in self.ml_params and self.analysis_include_ml: self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] else: self.analysis_include_ml_aggregate_algo = False + if self.gold_standards == {} and self.analysis_include_evaluation: + raise ValueError("Evaluation analysis cannot run as gold standard data not provided.
" + "Please set evaluation include to false or provide gold standard data.") + + # only run evaluation if ml is set to true + if not self.analysis_include_ml and self.analysis_include_evaluation: + self.analysis_include_evaluation = False + + # only run evaluation aggregate_per_algorithm if analysis_include_ml is set to true if 'aggregate_per_algorithm' in self.evaluation_params and self.analysis_include_evaluation: self.analysis_include_evaluation_aggregate_algo = raw_config["analysis"]["evaluation"]["aggregate_per_algorithm"] else: self.analysis_include_evaluation_aggregate_algo = False + + # only run evaluation per algo if ml per algo is set to true + if not self.analysis_include_ml_aggregate_algo and self.analysis_include_evaluation_aggregate_algo: + self.analysis_include_evaluation_aggregate_algo = False From 0aeda9514c19d272eff4231706c1f881baf15a9a Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 15 Nov 2024 10:00:01 -0600 Subject: [PATCH 09/22] update to config.py to deal with ml and eval coupling --- spras/config.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/spras/config.py b/spras/config.py index 85aa3875..cd8c228b 100644 --- a/spras/config.py +++ b/spras/config.py @@ -257,26 +257,27 @@ def process_config(self, raw_config): self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] self.analysis_include_evaluation = raw_config["analysis"]["evaluation"]["include"] - # only run ml aggregate_per_algorithm if analysis_include_ml is set to true + # Only run ML aggregate per algorithm if analysis include ML is set to True if 'aggregate_per_algorithm' in self.ml_params and self.analysis_include_ml: self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] else: self.analysis_include_ml_aggregate_algo = False + # Raises an error if Evaluation is enabled but no gold standard data is provided if self.gold_standards == {} and self.analysis_include_evaluation: raise 
ValueError("Evaluation analysis cannot run as gold standard data not provided. " "Please set evaluation include to false or provide gold standard data.") - # only run evaluation if ml is set to true + # Only run Evaluation if ML is set to True if not self.analysis_include_ml and self.analysis_include_evaluation: self.analysis_include_evaluation = False - # only run evaluation aggregate_per_algorithm if analysis_include_ml is set to true + # Only run Evaluation aggregate per algorithm if analysis include ML is set to True if 'aggregate_per_algorithm' in self.evaluation_params and self.analysis_include_evaluation: self.analysis_include_evaluation_aggregate_algo = raw_config["analysis"]["evaluation"]["aggregate_per_algorithm"] else: self.analysis_include_evaluation_aggregate_algo = False - # only run evaluation per algo if ml per algo is set to true + # Only run Evaluation per algorithm if ML per algorithm is set to True if not self.analysis_include_ml_aggregate_algo and self.analysis_include_evaluation_aggregate_algo: self.analysis_include_evaluation_aggregate_algo = False From 46c87fc5cb136693a347e37cbc3aa0e049984621 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 15 Nov 2024 10:00:51 -0600 Subject: [PATCH 10/22] added TODO comments on ideas to scale the binary data, still not sure what idea to use --- spras/analysis/ml.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index b477e0f9..7d45e091 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -10,7 +10,7 @@ from scipy.cluster.hierarchy import dendrogram, fcluster from sklearn.cluster import AgglomerativeClustering from sklearn.decomposition import PCA -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import MinMaxScaler, StandardScaler from spras.util import make_required_dirs @@ -142,8 +142,14 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: if not 
isinstance(labels, bool): raise ValueError(f"labels={labels} must be True or False") - scaler = StandardScaler() + #TODO: MinMaxScaler changes nothing about the data + # scaler = MinMaxScaler() + # scaler.fit(X) # calc mean and standard deviation + # X_scaled = scaler.transform(X) + + scaler = StandardScaler() # TODO: StandardScaler doesn't make sense on binary data because the mean and variance lead to values outside the binary range scaler.fit(X) # calc mean and standard deviation + scaler.transform(X) X_scaled = scaler.transform(X) # choosing the PCA From 5265c53ba7ad3845db7c8deee4c2837eb5cb01c7 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 15 Nov 2024 10:28:59 -0600 Subject: [PATCH 11/22] cleaned up file names and left TODOs --- Snakefile | 51 ++++++++++++++++++++++----------------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/Snakefile b/Snakefile index e1e8cdbd..c8ab2407 100644 --- a/Snakefile +++ b/Snakefile @@ -104,19 +104,17 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms)) if _config.config.analysis_include_evaluation: - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curves-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-plot.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs)) -
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-pca-chosen.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-and-recall-pca-chosen.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs)) - + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-per-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-per-pathway.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-pca-chosen-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) + # TODO: should we provide the node ensemble frequencies if _config.config.analysis_include_evaluation_aggregate_algo: - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-plot.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms)) - 
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-pca-chosen.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-and-recall-pca-chosen.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-pr-curves-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms)) - + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-per-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-per-pathway.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-pca-chosen-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms)) + # TODO: should we provide the node ensemble frequencies per algortihm if len(final_input) == 0: # No analysis added yet, so add reconstruction output files if they exist. # (if analysis is specified, these should be implicitly run). 
@@ -384,27 +382,24 @@ def get_dataset_label(wildcards): # Run evaluation for all pathway outputs, ensemble.txt, and pca_coordinates.txt for a dataset against its paired gold standard -# TODO: figure out why this works when all one rule, but the per algorithm doesn't work like that rule evaluation: input: gold_standard_file = get_gold_standard_pickle_file, pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label), ensemble_file=lambda wildcards: f"{out_dir}{SEP}{get_dataset_label(wildcards)}-ml{SEP}ensemble-pathway.txt", - # add PCA coordinates file pca_coordinates_file =lambda wildcards: f"{out_dir}{SEP}{get_dataset_label(wildcards)}-ml{SEP}pca-coordinates.txt" output: - pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall.txt"]), - pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'pr-curves-ensemble-nodes.png']), - pr_plot_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-and-recall-plot.png']), - pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall-pca-chosen.txt"]), - pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-and-recall-pca-chosen.png"]), + pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-per-pathway.txt"]), + pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-per-pathway.png']), + pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-curve-ensemble-nodes.png']), + pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-pca-chosen-pathway.txt"]), run: node_table = Evaluation.from_file(input.gold_standard_file).node_table - Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_plot_png) + 
Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_png) node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file) Evaluation.PRC_node_ensemble(node_ensemble, node_table, output.pr_curve_png) pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir) - Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file, output.pca_chosen_pr_png) + Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file) # Returns all pathways for a specific algorithm and dataset def collect_pathways_per_algo_per_dataset(wildcards): @@ -423,24 +418,23 @@ def collect_pca_coordinates_per_algo_per_dataset(wildcards): return f"{out_dir}{SEP}{dataset_label}-ml{SEP}{wildcards.algorithm}-pca-coordinates.txt" # Run evaluation per algortihm for all associated pathway outputs, ensemble.txt, and pca_coordinates.txt for a dataset against its paired gold standard -# TODO: only works when these rules are broken up rule evaluation_per_algo_pathways: input: gold_standard_file = get_gold_standard_pickle_file, pathways = collect_pathways_per_algo_per_dataset, output: - pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-and-recall.txt"]), - pr_plot_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-and-recall-plot.png']), + pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-recall-per-pathway.txt"]), + pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-recall-per-pathway.png']), run: node_table = Evaluation.from_file(input.gold_standard_file).node_table - Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_plot_png) + Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_png) rule evaluation_per_algo_ensemble_pr_curve: input: gold_standard_file = 
get_gold_standard_pickle_file, ensemble_file = collect_ensemble_per_algo_per_dataset, output: - pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-pr-curves-ensemble-nodes.png']), + pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-recall-curve-ensemble-nodes.png']), run: node_table = Evaluation.from_file(input.gold_standard_file).node_table node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file) @@ -451,12 +445,11 @@ rule evaluation_per_algo_pca_chosen: gold_standard_file = get_gold_standard_pickle_file, pca_coordinates_file = collect_pca_coordinates_per_algo_per_dataset output: - pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-and-recall-pca-chosen.txt"]), - pca_chosen_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-and-recall-pca-chosen.png"]), + pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-recall-pca-chosen-pathway.txt"]), run: node_table = Evaluation.from_file(input.gold_standard_file).node_table pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir) - Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file, output.pca_chosen_pr_png) + Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file) # Remove the output directory rule clean: From 9b7e6875b6ba6219cb1db4e1543a70fe3ec810a3 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 15 Nov 2024 14:40:17 -0600 Subject: [PATCH 12/22] added the eval test cases, made a todo for config test case --- ...ted-precision-recall-per-pathway-empty.txt | 2 + ...recision-recall-per-pathway-pca-chosen.txt | 2 + .../expected-precision-recall-per-pathway.txt | 5 ++ .../input/data-test-params-123/pathway.txt | 3 + .../input/data-test-params-456/pathway.txt | 2 + 
.../input/data-test-params-789/pathway.txt | 3 + .../input/data-test-params-empty/pathway.txt | 1 + test/evaluate/input/node-ensemble-empty.csv | 2 + test/evaluate/input/node-ensemble.csv | 13 ++++ test/evaluate/input/node_table.csv | 4 ++ test/evaluate/input/pca-coordinates.tsv | 6 ++ test/evaluate/test_evaluate.py | 64 +++++++++++++++---- test/test_config.py | 2 + 13 files changed, 97 insertions(+), 12 deletions(-) create mode 100644 test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt create mode 100644 test/evaluate/expected/expected-precision-recall-per-pathway-pca-chosen.txt create mode 100644 test/evaluate/expected/expected-precision-recall-per-pathway.txt create mode 100644 test/evaluate/input/data-test-params-123/pathway.txt create mode 100644 test/evaluate/input/data-test-params-456/pathway.txt create mode 100644 test/evaluate/input/data-test-params-789/pathway.txt create mode 100644 test/evaluate/input/data-test-params-empty/pathway.txt create mode 100644 test/evaluate/input/node-ensemble-empty.csv create mode 100644 test/evaluate/input/node-ensemble.csv create mode 100644 test/evaluate/input/node_table.csv create mode 100644 test/evaluate/input/pca-coordinates.tsv diff --git a/test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt b/test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt new file mode 100644 index 00000000..6c97ff7e --- /dev/null +++ b/test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt @@ -0,0 +1,2 @@ +Pathway Precision Recall +test/evaluate/input/data-test-params-empty/pathway.txt 0.0 0.0 diff --git a/test/evaluate/expected/expected-precision-recall-per-pathway-pca-chosen.txt b/test/evaluate/expected/expected-precision-recall-per-pathway-pca-chosen.txt new file mode 100644 index 00000000..6c97ff7e --- /dev/null +++ b/test/evaluate/expected/expected-precision-recall-per-pathway-pca-chosen.txt @@ -0,0 +1,2 @@ +Pathway Precision Recall 
+test/evaluate/input/data-test-params-empty/pathway.txt 0.0 0.0 diff --git a/test/evaluate/expected/expected-precision-recall-per-pathway.txt b/test/evaluate/expected/expected-precision-recall-per-pathway.txt new file mode 100644 index 00000000..02e17a7c --- /dev/null +++ b/test/evaluate/expected/expected-precision-recall-per-pathway.txt @@ -0,0 +1,5 @@ +Pathway Precision Recall +test/evaluate/input/data-test-params-456/pathway.txt 0.0 0.0 +test/evaluate/input/data-test-params-empty/pathway.txt 0.0 0.0 +test/evaluate/input/data-test-params-123/pathway.txt 0.6666666666666666 0.6666666666666666 +test/evaluate/input/data-test-params-789/pathway.txt 1.0 1.0 diff --git a/test/evaluate/input/data-test-params-123/pathway.txt b/test/evaluate/input/data-test-params-123/pathway.txt new file mode 100644 index 00000000..21768464 --- /dev/null +++ b/test/evaluate/input/data-test-params-123/pathway.txt @@ -0,0 +1,3 @@ +Node1 Node2 Rank Direction +A B 1 U +B C 1 U diff --git a/test/evaluate/input/data-test-params-456/pathway.txt b/test/evaluate/input/data-test-params-456/pathway.txt new file mode 100644 index 00000000..d445d80f --- /dev/null +++ b/test/evaluate/input/data-test-params-456/pathway.txt @@ -0,0 +1,2 @@ +Node1 Node2 Rank Direction +F L 1 U diff --git a/test/evaluate/input/data-test-params-789/pathway.txt b/test/evaluate/input/data-test-params-789/pathway.txt new file mode 100644 index 00000000..352698a0 --- /dev/null +++ b/test/evaluate/input/data-test-params-789/pathway.txt @@ -0,0 +1,3 @@ +Node1 Node2 Rank Direction +A B 1 U +B Q 1 U diff --git a/test/evaluate/input/data-test-params-empty/pathway.txt b/test/evaluate/input/data-test-params-empty/pathway.txt new file mode 100644 index 00000000..63fda2b1 --- /dev/null +++ b/test/evaluate/input/data-test-params-empty/pathway.txt @@ -0,0 +1 @@ +Node1 Node2 Rank Direction \ No newline at end of file diff --git a/test/evaluate/input/node-ensemble-empty.csv b/test/evaluate/input/node-ensemble-empty.csv new file mode 100644 
index 00000000..e488f56a --- /dev/null +++ b/test/evaluate/input/node-ensemble-empty.csv @@ -0,0 +1,2 @@ +Node max_freq + diff --git a/test/evaluate/input/node-ensemble.csv b/test/evaluate/input/node-ensemble.csv new file mode 100644 index 00000000..ba467d55 --- /dev/null +++ b/test/evaluate/input/node-ensemble.csv @@ -0,0 +1,13 @@ +Node max_freq +C 0.75 +E 0.75 +D 0.75 +F 0.75 +A 0.5 +B 0.5 +L 0.5 +M 0.5 +O 0.25 +P 0.25 +N 0.25 +Q 0.25 diff --git a/test/evaluate/input/node_table.csv b/test/evaluate/input/node_table.csv new file mode 100644 index 00000000..5b9cd41b --- /dev/null +++ b/test/evaluate/input/node_table.csv @@ -0,0 +1,4 @@ +NODEID +A +B +Q \ No newline at end of file diff --git a/test/evaluate/input/pca-coordinates.tsv b/test/evaluate/input/pca-coordinates.tsv new file mode 100644 index 00000000..92fc6b3d --- /dev/null +++ b/test/evaluate/input/pca-coordinates.tsv @@ -0,0 +1,6 @@ +datapoint_labels PC1 PC2 +data-test-params-123 -1.3973472526239425 -1.632993161855452 +data-test-params-456 2.025440509784659 1.9566080710032526e-16 +data-test-params-789 -1.3973472526239425 1.632993161855452 +data-test-params-empty 0.7692539954632259 -4.1496185644351084e-16 +centroid -2.7755575615628914e-17 -4.822931287961988e-17 diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py index b0a60196..1c1e1e9b 100644 --- a/test/evaluate/test_evaluate.py +++ b/test/evaluate/test_evaluate.py @@ -4,13 +4,13 @@ import pandas as pd import pytest +import spras.analysis.ml as ml from spras.evaluation import Evaluation INPUT_DIR = 'test/evaluate/input/' OUT_DIR = 'test/evaluate/output/' EXPECT_DIR = 'test/evaluate/expected/' - - +NODE_TABLE = pd.read_csv(INPUT_DIR + "node_table.csv", header=0) class TestEvaluate: @classmethod def setup_class(cls): @@ -24,13 +24,53 @@ def test_node_ensemble(self): edge_freq = Evaluation.edge_frequency_node_ensemble(ensemble_file) edge_freq.to_csv(OUT_DIR + 'node-ensemble.csv', sep="\t", index=False) assert filecmp.cmp(OUT_DIR + 
'node-ensemble.csv', EXPECT_DIR + 'expected-node-ensemble.csv', shallow=False) - - def test_PRC_node_ensemble(self): - None - - def test_precision_and_recall(self): - None - - def test_pca_chosen_pathway(self): - None - + + def test_precision_recal_curve_ensemble_nodes(self): + out_path = Path(OUT_DIR+"test-precision-recall-curve-ensemble-nodes.png") + out_path.unlink(missing_ok=True) + ensemble_file = pd.read_csv(INPUT_DIR + 'node-ensemble.csv', sep="\t", header=0) + Evaluation.precision_recall_curve_node_ensemble(ensemble_file, NODE_TABLE, out_path) + assert out_path.exists() + + def test_precision_recal_curve_ensemble_nodes_empty(self): + out_path = Path(OUT_DIR+"test-precision-recall-curve-ensemble-nodes-empty.png") + out_path.unlink(missing_ok=True) + ensemble_file = pd.read_csv(INPUT_DIR + 'node-ensemble-empty.csv', sep="\t", header=0) + Evaluation.precision_recall_curve_node_ensemble(ensemble_file, NODE_TABLE, out_path) + assert out_path.exists() + + def test_precision_recall_per_pathway(self): + file_paths = [INPUT_DIR + "data-test-params-123/pathway.txt", INPUT_DIR + "data-test-params-456/pathway.txt", INPUT_DIR + "data-test-params-789/pathway.txt", INPUT_DIR + "data-test-params-empty/pathway.txt"] + algorithms = ["test"] + output_file = OUT_DIR + "test-precision-recall-per-pathway.txt" + output_png = OUT_DIR + "test-precision-recall-per-pathway.png" + + Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png) + assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway.txt', shallow=False) + + + + def test_precision_recall_per_pathway_empty(self): + + file_paths = [INPUT_DIR + "data-test-params-empty/pathway.txt"] + algorithms = ["test"] + output_file = OUT_DIR +"test-precision-recall-per-pathway-empty.txt" + output_png = OUT_DIR + "test-precision-recall-per-pathway-empty.png" + + Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png) + assert 
filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-empty.txt', shallow=False) + + + def test_precision_recall_pca_chosen_pathway(self): + file_paths = [INPUT_DIR + "data-test-params-123/pathway.txt", INPUT_DIR + "data-test-params-456/pathway.txt", INPUT_DIR + "data-test-params-789/pathway.txt", INPUT_DIR + "data-test-params-empty/pathway.txt"] + algorithms = ["test"] + output_file = OUT_DIR +"test-precision-recall-per-pathway-pca-chosen.txt" + output_png = OUT_DIR + "test-precision-recall-per-pathway-pca-chosen.png" + + dataframe = ml.summarize_networks(file_paths) + ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt', OUT_DIR + 'pca-coordinates.tsv') + + pathway = Evaluation.pca_chosen_pathway(OUT_DIR + 'pca-coordinates.tsv', INPUT_DIR) + print(pathway) + Evaluation.precision_and_recall(pathway, NODE_TABLE, algorithms, output_file, output_png) + assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-pca-chosen.txt', shallow=False) diff --git a/test/test_config.py b/test/test_config.py index bf13cd6e..0f0d813e 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -142,3 +142,5 @@ def test_error_gs_dataset_mismatch(self): with pytest.raises(ValueError): config.init_global(test_config) + + # TODO: should I add a test case on the new config eval / ml couple code From 23d1070a8b7dcf348077cc20968fc61e6250df05 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 15 Nov 2024 14:40:46 -0600 Subject: [PATCH 13/22] added algorithms to be used for eval code --- Snakefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Snakefile b/Snakefile index c8ab2407..caee3428 100644 --- a/Snakefile +++ b/Snakefile @@ -395,11 +395,11 @@ rule evaluation: pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-pca-chosen-pathway.txt"]), run: node_table = Evaluation.from_file(input.gold_standard_file).node_table - 
Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_png) + Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png) node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file) - Evaluation.PRC_node_ensemble(node_ensemble, node_table, output.pr_curve_png) + Evaluation.precision_recall_curve_node_ensemble(node_ensemble, node_table, output.pr_curve_png) pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir) - Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file) + Evaluation.precision_and_recall(pca_chosen_pathway, node_table, algorithms, output.pca_chosen_pr_file) # Returns all pathways for a specific algorithm and dataset def collect_pathways_per_algo_per_dataset(wildcards): @@ -427,7 +427,7 @@ rule evaluation_per_algo_pathways: pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-recall-per-pathway.png']), run: node_table = Evaluation.from_file(input.gold_standard_file).node_table - Evaluation.precision_and_recall(input.pathways, node_table, output.pr_file, output.pr_png) + Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png) rule evaluation_per_algo_ensemble_pr_curve: input: @@ -438,7 +438,7 @@ rule evaluation_per_algo_ensemble_pr_curve: run: node_table = Evaluation.from_file(input.gold_standard_file).node_table node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file) - Evaluation.PRC_node_ensemble(node_ensemble, node_table, output.pr_curve_png) + Evaluation.precision_recall_curve_node_ensemble(node_ensemble, node_table, output.pr_curve_png) rule evaluation_per_algo_pca_chosen: input: @@ -449,7 +449,7 @@ rule evaluation_per_algo_pca_chosen: run: node_table = Evaluation.from_file(input.gold_standard_file).node_table pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, 
out_dir) - Evaluation.precision_and_recall(pca_chosen_pathway, node_table, output.pca_chosen_pr_file) + Evaluation.precision_and_recall(pca_chosen_pathway, node_table, algorithms, output.pca_chosen_pr_file) # Remove the output directory rule clean: From 0408a202c9e3b08d9ec402391bbb011fe06231dd Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 15 Nov 2024 14:41:43 -0600 Subject: [PATCH 14/22] pre commit test_evaluate.py --- test/evaluate/test_evaluate.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py index 1c1e1e9b..cfb2e8a2 100644 --- a/test/evaluate/test_evaluate.py +++ b/test/evaluate/test_evaluate.py @@ -48,8 +48,6 @@ def test_precision_recall_per_pathway(self): Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png) assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway.txt', shallow=False) - - def test_precision_recall_per_pathway_empty(self): file_paths = [INPUT_DIR + "data-test-params-empty/pathway.txt"] From f1f58e79261336c29c07c2a528959a405f536c3f Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 15 Nov 2024 14:42:24 -0600 Subject: [PATCH 15/22] updated evalute.py --- spras/evaluation.py | 66 ++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/spras/evaluation.py b/spras/evaluation.py index 6757dcf9..36dc59b4 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -79,27 +79,28 @@ def load_files_from_dict(self, gold_standard_dict: Dict): # TODO: later iteration - chose between node and edge file, or allow both @staticmethod - def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str, output_png: str ): + def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, algorithms: list, output_file: str, output_png:str=None): """ Takes in file paths for a specific dataset and an associated gold standard node 
table. Calculates precision and recall for each pathway file Returns output back to output_file @param file_paths: file paths of pathway reconstruction algorithm outputs @param node_table: the gold standard nodes + @param algorithms: list of algorithms used in current run of SPRAS @param output_file: the filename to save the precision and recall of each pathway - @param output_png: the filename to plot the precision and recall of each pathway (not a PRC) + @param output_png (optional): the filename to plot the precision and recall of each pathway (not a PRC) """ y_true = set(node_table['NODEID']) results = [] - for file in file_paths: df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"]) + # TODO: do we want to include the pathways that are empty for evaluation / in the pr_df? y_pred = set(df['Node1']).union(set(df['Node2'])) all_nodes = y_true.union(y_pred) y_true_binary = [1 if node in y_true else 0 for node in all_nodes] y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes] - # default to 0.0 if there is a divide by 0 error + # not using precision_recall_curve because thresholds are binary (0 or 1); rather we are directly calculating precision and recall per pathway precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0) recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0) results.append({"Pathway": file, "Precision": precision, "Recall": recall}) @@ -107,18 +108,43 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, o pr_df = pd.DataFrame(results) pr_df.sort_values(by=["Recall", "Pathway"], axis=0, ascending=True, inplace=True) pr_df.to_csv(output_file, sep="\t", index=False) - - plt.figure(figsize=(8, 6)) - plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="PR") - plt.xlabel("Recall") - plt.ylabel("Precision") - plt.title(f"Precision and Recall Plot") - plt.legend() - plt.grid(True) - plt.savefig(output_png) - # TODO: 
what to do when this is empty - - def select_max_freq_and_node(row): # TODO: what (:type) would this row be + print(pr_df) + + num_of_algorithms_used = 0 + if output_png is not None: + if not pr_df.empty: + plt.figure(figsize=(8, 6)) + # plot a line per algorithm + for algorithm in algorithms: #TODO I think there is a better way than doing this; using split on the filepaths doesn't work bc it is not adaptable + subset = pr_df[pr_df["Pathway"].str.contains(algorithm)] + if not subset.empty: + plt.plot( + subset["Recall"], + subset["Precision"], + marker='o', + linestyle='-', + label=f"{algorithm}" + ) + num_of_algorithms_used += 1 + + # plot overall precision and recall from all the algorithms + if num_of_algorithms_used > 1: + plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="Overall Precision-Recall") + + plt.xlabel("Recall") + plt.ylabel("Precision") + plt.title(f"Precision and Recall Plot") + plt.legend() + plt.grid(True) + plt.savefig(output_png) + else: + plt.figure() + plt.plot([], []) + plt.title("Empty Pathway Files") + plt.savefig(output_png) + + + def select_max_freq_and_node(row: pd.Series): """ Selects the node and frequency with the highest frequency value from two potential nodes in a row. Handles cases where one of the nodes or frequencies may be missing and returns the node associated with the maximum frequency. 
@@ -160,10 +186,9 @@ def edge_frequency_node_ensemble(ensemble_file: str): node_ensemble.sort_values('max_freq', ascending= False, inplace = True) return node_ensemble else: - # TODO: figure out how to deal with empty ensemble files return pd.DataFrame(columns = ['Node', 'max_freq']) - def PRC_node_ensemble(node_ensemble:pd.DataFrame, node_table:pd.DataFrame, output_png: str): + def precision_recall_curve_node_ensemble(node_ensemble:pd.DataFrame, node_table:pd.DataFrame, output_png: str): """ Takes in an node ensemble for specific dataset or specific algorithm in a dataset, and an associated gold standard node table. Plots a precision and recall curve for the node ensemble against its associated gold standard node table @@ -190,10 +215,9 @@ def PRC_node_ensemble(node_ensemble:pd.DataFrame, node_table:pd.DataFrame, outpu plt.grid(True) plt.savefig(output_png) else: - # TODO figure out how to deal with empty ensemble files (still will have the header) plt.figure() - plt.text(0.5, 0.5, "empty ensemble file", ha='center', va='center', fontsize=12, color='red') - plt.axis('off') + plt.plot([], []) + plt.title("Empty Ensemble File") plt.savefig(output_png) def pca_chosen_pathway(coordinates_file: str, output_dir:str): From 35074755db2017747a0bce8cf5fd5cde91474131 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 15 Nov 2024 14:42:45 -0600 Subject: [PATCH 16/22] updated all config files --- config/config.yaml | 6 +++++- config/egfr-param-tuning.yaml | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index fd47638b..79b7d086 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -177,4 +177,8 @@ analysis: include: true # adds evaluation per algorithm per dataset-goldstandard pair aggregate_per_algorithm: true - + # TODO: should we decouple parts of eval that involve ml + # it will be good to seperate them otherwise if ml doesn't work then eval won't work at all + # pca_chosen + # ensemble + # precisin and 
recall diff --git a/config/egfr-param-tuning.yaml b/config/egfr-param-tuning.yaml index ecc2a65f..a0a965b7 100644 --- a/config/egfr-param-tuning.yaml +++ b/config/egfr-param-tuning.yaml @@ -3439,7 +3439,7 @@ gold_standards: - tps_egfr reconstruction_settings: locations: - reconstruction_dir: output + reconstruction_dir: output/tps_egfr run: true analysis: summary: From dd0359f58f3674fdffa1ec18433a7335e0824171 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 15 Nov 2024 14:48:08 -0600 Subject: [PATCH 17/22] cleane dup spras/evaluation.py --- spras/evaluation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spras/evaluation.py b/spras/evaluation.py index 36dc59b4..b621daad 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -108,7 +108,6 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a pr_df = pd.DataFrame(results) pr_df.sort_values(by=["Recall", "Pathway"], axis=0, ascending=True, inplace=True) pr_df.to_csv(output_file, sep="\t", index=False) - print(pr_df) num_of_algorithms_used = 0 if output_png is not None: From ef15799f3bf653638b1139557887aa5de49975da Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 18 Nov 2024 12:15:02 -0600 Subject: [PATCH 18/22] updated spacing and added comments to the config files --- config/config.yaml | 2 ++ config/egfr.yaml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/config/config.yaml b/config/config.yaml index 79b7d086..76231276 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -174,8 +174,10 @@ analysis: metric: 'euclidean' evaluation: # evaluation per dataset-goldstandard pair + # evaluation will not run unless ml include is set to true include: true # adds evaluation per algorithm per dataset-goldstandard pair + # evaluation per algorithm will not run unless ml include and ml aggregate_per_algorithm are set to true aggregate_per_algorithm: true # TODO: should we decouple parts of eval that involve ml # it will be good to seperate them otherwise if ml
doesn't work then eval won't work at all diff --git a/config/egfr.yaml b/config/egfr.yaml index 0b41f0a5..93cbccec 100644 --- a/config/egfr.yaml +++ b/config/egfr.yaml @@ -90,4 +90,4 @@ analysis: ml: include: false evaluation: - include: false + include: false From 47dab1a4ea313f7fdaa3d1d7e2666e89ba17678c Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 18 Nov 2024 12:15:16 -0600 Subject: [PATCH 19/22] updated evalution.py code --- spras/evaluation.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/spras/evaluation.py b/spras/evaluation.py index b621daad..3e2b1e0b 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -109,12 +109,11 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a pr_df.sort_values(by=["Recall", "Pathway"], axis=0, ascending=True, inplace=True) pr_df.to_csv(output_file, sep="\t", index=False) - num_of_algorithms_used = 0 if output_png is not None: if not pr_df.empty: plt.figure(figsize=(8, 6)) # plot a line per algorithm - for algorithm in algorithms: #TODO I think there is a better way than doing this; using split on the filepaths doesn't work bc it is not adaptable + for algorithm in algorithms: subset = pr_df[pr_df["Pathway"].str.contains(algorithm)] if not subset.empty: plt.plot( @@ -124,11 +123,10 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a linestyle='-', label=f"{algorithm}" ) - num_of_algorithms_used += 1 - # plot overall precision and recall from all the algorithms - if num_of_algorithms_used > 1: - plt.plot(pr_df["Recall"], pr_df["Precision"], marker='o', linestyle='-', color='b', label="Overall Precision-Recall") + # plot combined precision and recall from all the algorithms + if len(algorithms) > 1: + plt.plot(pr_df["Recall"], pr_df["Precision"], linestyle='--', color='b', label="Overall Precision-Recall", alpha = 0.3) plt.xlabel("Recall") plt.ylabel("Precision") From b3504b52cb3644903ac18c7abba65b2dd2751f2b Mon Sep 17 
00:00:00 2001 From: ntalluri Date: Mon, 18 Nov 2024 12:15:42 -0600 Subject: [PATCH 20/22] cleaned up eval tests and added coupling tests to config --- test/evaluate/test_evaluate.py | 1 - test/test_config.py | 135 ++++++++++++++++++++++++++++++++- 2 files changed, 132 insertions(+), 4 deletions(-) diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py index cfb2e8a2..5dc0b8f3 100644 --- a/test/evaluate/test_evaluate.py +++ b/test/evaluate/test_evaluate.py @@ -69,6 +69,5 @@ def test_precision_recall_pca_chosen_pathway(self): ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt', OUT_DIR + 'pca-coordinates.tsv') pathway = Evaluation.pca_chosen_pathway(OUT_DIR + 'pca-coordinates.tsv', INPUT_DIR) - print(pathway) Evaluation.precision_and_recall(pathway, NODE_TABLE, algorithms, output_file, output_png) assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-pca-chosen.txt', shallow=False) diff --git a/test/test_config.py b/test/test_config.py index 0f0d813e..c89d7123 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -27,7 +27,8 @@ def get_test_config(): "include": False }, "ml": { - "include": False + "include": False, + "aggregate_per_algorithm": False }, "graphspace": { "include": False @@ -36,7 +37,8 @@ def get_test_config(): "include": False }, "evaluation": { - "include": False + "include": False, + "aggregate_per_algorithm": False }, }, } @@ -143,4 +145,131 @@ def test_error_gs_dataset_mismatch(self): with pytest.raises(ValueError): config.init_global(test_config) - # TODO: should I add a test case on the new config eval / ml couple code + def test_eval_ml_coupling(self): + test_config = get_test_config() + include_combos = [(True, True), (True, False), (False, True), (False, False)] + + # ml: True evaluation: True + test_config["analysis"]["ml"]["include"] = include_combos[0][0] + test_config["analysis"]["evaluation"]["include"] = include_combos[0][1] + config.init_global(test_config) + 
assert config.config.analysis_include_ml == True and config.config.analysis_include_evaluation == True + + # ml: True evaluation: False + test_config["analysis"]["ml"]["include"] = include_combos[1][0] + test_config["analysis"]["evaluation"]["include"] = include_combos[1][1] + config.init_global(test_config) + assert config.config.analysis_include_ml == True and config.config.analysis_include_evaluation == False + + # ml: False evaluation: True + test_config["analysis"]["ml"]["include"] = include_combos[2][0] + test_config["analysis"]["evaluation"]["include"] = include_combos[2][1] + config.init_global(test_config) + assert config.config.analysis_include_ml == False and config.config.analysis_include_evaluation == False + + # ml: False evaluation: False + test_config["analysis"]["ml"]["include"] = include_combos[3][0] + test_config["analysis"]["evaluation"]["include"] = include_combos[3][1] + config.init_global(test_config) + assert config.config.analysis_include_ml == False and config.config.analysis_include_evaluation == False + + + def test_ml_agg_algo_coupling(self): + + test_config = get_test_config() + include_combos = [(True, True), (True, False), (False, True), (False, False)] + + test_config["analysis"]["ml"]["include"] = include_combos[0][0] + test_config["analysis"]["ml"]["aggregate_per_algorithm"] = include_combos[0][1] + config.init_global(test_config) + assert config.config.analysis_include_ml == True and config.config.analysis_include_ml_aggregate_algo == True + + + test_config["analysis"]["ml"]["include"] = include_combos[1][0] + test_config["analysis"]["ml"]["aggregate_per_algorithm"] = include_combos[1][1] + config.init_global(test_config) + assert config.config.analysis_include_ml == True and config.config.analysis_include_ml_aggregate_algo == False + + + test_config["analysis"]["ml"]["include"] = include_combos[2][0] + test_config["analysis"]["ml"]["aggregate_per_algorithm"] = include_combos[2][1] + config.init_global(test_config) + assert 
config.config.analysis_include_ml == False and config.config.analysis_include_ml_aggregate_algo == False + + + test_config["analysis"]["ml"]["include"] = include_combos[3][0] + test_config["analysis"]["ml"]["aggregate_per_algorithm"] = include_combos[3][1] + config.init_global(test_config) + assert config.config.analysis_include_ml == False and config.config.analysis_include_ml_aggregate_algo == False + + def test_eval_agg_algo_coupling(self): + + test_config = get_test_config() + test_config["analysis"]["ml"]["include"] = True + test_config["analysis"]["ml"]["aggregate_per_algorithm"] = True + + include_combos = [(True, True), (True, False), (False, True), (False, False)] + + test_config["analysis"]["evaluation"]["include"] = include_combos[0][0] + test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = include_combos[0][1] + config.init_global(test_config) + assert config.config.analysis_include_evaluation == True and config.config.analysis_include_evaluation_aggregate_algo == True + + + test_config["analysis"]["evaluation"]["include"] = include_combos[1][0] + test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = include_combos[1][1] + config.init_global(test_config) + assert config.config.analysis_include_evaluation == True and config.config.analysis_include_evaluation_aggregate_algo == False + + + test_config["analysis"]["evaluation"]["include"] = include_combos[2][0] + test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = include_combos[2][1] + config.init_global(test_config) + assert config.config.analysis_include_evaluation == False and config.config.analysis_include_evaluation_aggregate_algo == False + + + test_config["analysis"]["evaluation"]["include"] = include_combos[3][0] + test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = include_combos[3][1] + config.init_global(test_config) + assert config.config.analysis_include_evaluation == False and config.config.analysis_include_evaluation_aggregate_algo == 
False + + def test_eval_ml_agg_algo_coupling(self): + + # the value of ml include and ml aggregate_per_algorithm can affect the value of evaluation include and evaluation aggregate_per_algorithm + + test_config = get_test_config() + + test_config["analysis"]["ml"]["include"] = False + test_config["analysis"]["ml"]["aggregate_per_algorithm"] = True + test_config["analysis"]["evaluation"]["include"] = True + test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = True + config.init_global(test_config) + assert config.config.analysis_include_evaluation == False and config.config.analysis_include_evaluation_aggregate_algo == False and config.config.analysis_include_ml == False and config.config.analysis_include_ml_aggregate_algo == False + + test_config["analysis"]["ml"]["include"] = True + test_config["analysis"]["ml"]["aggregate_per_algorithm"] = False + test_config["analysis"]["evaluation"]["include"] = True + test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = True + config.init_global(test_config) + assert config.config.analysis_include_evaluation == True and config.config.analysis_include_evaluation_aggregate_algo == False and config.config.analysis_include_ml == True and config.config.analysis_include_ml_aggregate_algo == False + + test_config["analysis"]["ml"]["include"] = False + test_config["analysis"]["ml"]["aggregate_per_algorithm"] = False + test_config["analysis"]["evaluation"]["include"] = True + test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = True + config.init_global(test_config) + assert config.config.analysis_include_evaluation == False and config.config.analysis_include_evaluation_aggregate_algo == False and config.config.analysis_include_ml == False and config.config.analysis_include_ml_aggregate_algo == False + + test_config["analysis"]["ml"]["include"] = True + test_config["analysis"]["ml"]["aggregate_per_algorithm"] = True + test_config["analysis"]["evaluation"]["include"] = True + 
test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = True + config.init_global(test_config) + assert config.config.analysis_include_evaluation == True and config.config.analysis_include_evaluation_aggregate_algo == True and config.config.analysis_include_ml == True and config.config.analysis_include_ml_aggregate_algo == True + + test_config["analysis"]["ml"]["include"] = True + test_config["analysis"]["ml"]["aggregate_per_algorithm"] = False + test_config["analysis"]["evaluation"]["include"] = False + test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = False + config.init_global(test_config) + assert config.config.analysis_include_evaluation == False and config.config.analysis_include_evaluation_aggregate_algo == False and config.config.analysis_include_ml == True and config.config.analysis_include_ml_aggregate_algo == False From dfcd302c554f50a3a2b2f93c9e495debd8ad28a7 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 5 Dec 2024 16:03:05 -0600 Subject: [PATCH 21/22] change how plot is --- spras/evaluation.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/spras/evaluation.py b/spras/evaluation.py index 3e2b1e0b..61628f59 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -120,14 +120,11 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a subset["Recall"], subset["Precision"], marker='o', - linestyle='-', + linestyle='', label=f"{algorithm}" ) - # plot combined precision and recall from all the algorithms - if len(algorithms) > 1: - plt.plot(pr_df["Recall"], pr_df["Precision"], linestyle='--', color='b', label="Overall Precision-Recall", alpha = 0.3) - + plt.xlabel("Recall") plt.ylabel("Precision") plt.title(f"Precision and Recall Plot") From 97a7d7be46ec2516710cb4ca0122a013bc88d897 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 9 Dec 2024 16:01:16 -0600 Subject: [PATCH 22/22] precommit --- spras/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/spras/evaluation.py b/spras/evaluation.py index 61628f59..e6f60c0b 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -124,7 +124,7 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a label=f"{algorithm}" ) - + plt.xlabel("Recall") plt.ylabel("Precision") plt.title(f"Precision and Recall Plot")