diff --git a/Snakefile b/Snakefile index 71a8a6ed..4e00a7c1 100644 --- a/Snakefile +++ b/Snakefile @@ -102,6 +102,9 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) + # if _config.config.evaluation_include: + # final_input.extend(expand('{out_dir}{sep}{dataset}-{goldstandard}.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) + if len(final_input) == 0: # No analysis added yet, so add reconstruction output files if they exist. # (if analysis is specified, these should be implicitly run). @@ -153,6 +156,10 @@ rule merge_input: dataset_dict = get_dataset(_config.config.datasets, wildcards.dataset) runner.merge_input(dataset_dict, output.dataset_file) +# TODO: add a merge input for gold standard data? +# may need to update runner.py to add a merge_gs_input function + + # The checkpoint is like a rule but can be used in dynamic workflows # The workflow directed acyclic graph is re-evaluated after the checkpoint job runs # If the checkpoint has not executed for the provided wildcard values, it will be run and then the rest of the diff --git a/config/config.yaml b/config/config.yaml index 741d8ca9..bcff35f2 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -115,6 +115,13 @@ datasets: other_files: [] # Relative path from the spras directory data_dir: "input" + +gold_standard: + - + label: gs + node_files: ["gs_nodes.txt"] + # edge_files: [] TODO: later iteration + data_dir: "input" # If we want to reconstruct then we should set run to true. 
import os
import pickle as pkl

import pandas as pd


class Evaluation:
    """Container for a gold standard dataset used to evaluate reconstructed pathways.

    Loads the node file(s) named in a gold standard configuration dict (see the
    ``gold_standard`` section of config.yaml: keys ``label``, ``node_files``,
    ``data_dir``) and exposes them as a pandas DataFrame in ``node_table``.
    """

    def __init__(self, gold_standard_dict: dict):
        # label: identifier for this gold standard, taken from the config dict
        self.label = None
        # node_table: DataFrame of gold standard nodes read from node_files
        self.node_table = None
        # self.edge_table = None  # TODO: support edge files in a later iteration
        self.load_files_from_dict(gold_standard_dict)

    def to_file(self, file_name: str) -> None:
        """Serialize this Evaluation object to a pickle file."""
        with open(file_name, "wb") as f:
            pkl.dump(self, f)

    @classmethod
    def from_file(cls, file_name: str) -> "Evaluation":
        """Load an Evaluation object from a pickle file.

        Usage: evaluation = Evaluation.from_file(pickle_file)

        NOTE(review): pickle.load is unsafe on untrusted input; only load
        pickle files this workflow itself produced.
        """
        with open(file_name, "rb") as f:
            return pkl.load(f)

    def load_files_from_dict(self, gold_standard_dict: dict) -> None:
        """Populate ``label`` and ``node_table`` from a gold standard config dict.

        Expects keys: ``label`` (str), ``node_files`` (list of file names), and
        ``data_dir`` (directory containing those files). Every listed node file
        is read with pandas and the results are concatenated into one table.

        Fixes a NameError in the original, which referenced an undefined
        ``node_file`` variable instead of using the ``node_data_files`` list it
        had just read from the dict.
        """
        self.label = gold_standard_dict["label"]
        node_data_files = gold_standard_dict["node_files"]
        data_loc = gold_standard_dict["data_dir"]

        tables = [
            pd.read_table(os.path.join(data_loc, node_file))
            for node_file in node_data_files
        ]
        self.node_table = pd.concat(tables, ignore_index=True)


def precision_recall():
    """Compute precision/recall of reconstructed pathways against a gold standard.

    TODO: not yet implemented; intended to use
    sklearn.metrics.average_precision_score
    (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html)
    """
    return None


# Backward-compatible alias for the original (misspelled) function name.
percision_recall = precision_recall