diff --git a/BLEval/__init__.py b/BLEval/__init__.py index e042b83a..7f46a9e2 100644 --- a/BLEval/__init__.py +++ b/BLEval/__init__.py @@ -58,33 +58,58 @@ class InputSettings(object): :type algorithms: list ''' - def __init__(self, - datadir, datasets, algorithms) -> None: - - self.datadir = datadir + def __init__(self, datadir, datasets, algorithms, use_embeddings=False) -> None: + self.datadir = Path(datadir) self.datasets = datasets self.algorithms = algorithms + self.use_embeddings = use_embeddings - + if use_embeddings: + self.adjust_paths_for_embeddings() + + def adjust_paths_for_embeddings(self): + for dataset in self.datasets: + processed_path = 'processed_ExpressionData' + dataset['exprData'] = str(Path(processed_path) / 'EmbeddingsData.csv') + dataset['refNetwork'] = str(Path(processed_path) / 'refNetwork.csv') + + def get_true_edges_path(self, dataset_name): + dataset = next((d for d in self.datasets if d['name'] == dataset_name), None) + if dataset is None: + raise ValueError(f"Dataset with name {dataset_name} not found in datasets list.") + return self.datadir / dataset_name / 'processed_ExpressionData' / dataset['trueEdges'] + class OutputSettings(object): ''' The class for storing the names of directories that output should - be written to. This initilizes an OutputSettings object based on the + be written to. This initializes an OutputSettings object based on the following two parameters. - :param base_dir: output root directory, typically 'outputs/' :type base_dir: str :param output_prefix: A prefix added to the final output files. :type str: ''' - def __init__(self, base_dir, output_prefix: Path) -> None: - self.base_dir = base_dir + def __init__(self, base_dir, output_prefix: Path, datasets, use_embeddings=False) -> None: + self.base_dir = Path(base_dir) self.output_prefix = output_prefix + self.datasets = datasets + if use_embeddings: + self.adjust_paths_for_embeddings() + def adjust_paths_for_embeddings(self): + self.base_dir = self.base_dir / 'processed_ExpressionData' + def get_output_path(self, dataset_name, file_name): + dataset = next((d for d in self.datasets if d['name'] == dataset_name), None) + if dataset is None: + raise ValueError(f"Dataset with name {dataset_name} not found in datasets list.") + + # Construct the output path based on the required structure + output_path = self.base_dir / dataset_name / 'processed_ExpressionData' / f"{dataset_name}-{file_name}" + return output_path class BLEval(object): ''' @@ -102,7 +127,6 @@ def __init__(self, def computeAUC(self, directed = True): - ''' Computes areas under the precision-recall (PR) and and ROC plots for each algorithm-dataset combination. @@ -122,12 +146,15 @@ def computeAUC(self, directed = True): AUROCDict = {} for dataset in tqdm(self.input_settings.datasets, - total = len(self.input_settings.datasets), unit = " Datasets"): - + total=len(self.input_settings.datasets), unit=" Datasets"): + if self.input_settings.use_embeddings == True: + true_edges_path = self.input_settings.get_true_edges_path(dataset['name']) + AUPRC, AUROC = PRROC(dataset, self.input_settings, - directed = directed, selfEdges = False, plotFlag = False) + directed=directed, selfEdges=False, plotFlag=False) AUPRCDict[dataset['name']] = AUPRC AUROCDict[dataset['name']] = AUROC + AUPRC = pd.DataFrame(AUPRCDict) AUROC = pd.DataFrame(AUROCDict) return AUPRC, AUROC @@ -151,7 +178,6 @@ def parseTime(self): return TimeDict def computeJaccard(self): - ''' Computes Jaccard Index between top-k edge predictions of the same algorithm. 
@@ -252,15 +278,11 @@ def computeEarlyPrec(self): ''' Eprec = {} - outDir = str(self.output_settings.base_dir) + \ - str(self.input_settings.datadir).split("inputs")[1] + "/" for algo in tqdm(self.input_settings.algorithms, unit = " Algorithms"): if algo[1]['should_run'] == True: - Eprec[algo[0]] = EarlyPrec(self, algo[0]) + Eprec[algo[0]] = EarlyPrec(self, self.input_settings, algo[0]) return pd.DataFrame(Eprec).T - - - + def computeSignedEPrec(self): ''' @@ -323,7 +345,7 @@ class ConfigParser(object): used in the BLEval. ''' @staticmethod - def parse(config_file_handle) -> BLEval: + def parse(config_file_handle, use_embeddings=False) -> BLEval: ''' A method for parsing the input .yaml file. @@ -334,48 +356,24 @@ def parse(config_file_handle) -> BLEval: An object of class :class:`BLEval.BLEval`. ''' - config_map = yaml.load(config_file_handle) - return BLEval( - ConfigParser.__parse_input_settings( - config_map['input_settings']), - ConfigParser.__parse_output_settings( - config_map['output_settings'])) + config_map = yaml.load(config_file_handle, Loader=yaml.SafeLoader) + input_settings = ConfigParser.__parse_input_settings(config_map['input_settings'], use_embeddings) + output_settings = ConfigParser.__parse_output_settings(config_map['output_settings'], input_settings.datasets, use_embeddings) + return BLEval(input_settings, output_settings) @staticmethod - def __parse_input_settings(input_settings_map) -> InputSettings: + def __parse_input_settings(input_settings_map, use_embeddings=False) -> InputSettings: ''' A method for parsing and initializing InputSettings object. ''' - input_dir = input_settings_map['input_dir'] - dataset_dir = input_settings_map['dataset_dir'] - # Check if datasets have been specified or not - if 'datasets' in input_settings_map: - datasets_specified = True - else: - datasets_specified = False - - # If no datasets specified, run all datasets in dataset_dir - if datasets_specified is False: - subfolder_dir = glob(os.path.join(input_dir, dataset_dir, "*/"), recursive = True) - datasets = [] - for x in subfolder_dir: - datasets.append({"name": pathlib.Path(x).name, - "exprData": "ExpressionData.csv", - "cellData": "PseudoTime.csv", - "trueEdges": "refNetwork.csv"}) - # If datasets specified, run the corresponding datasets - else: - datasets = input_settings_map['datasets'] - if datasets is None: - print("Please specify input datasets!") - - return InputSettings( - Path(input_dir, dataset_dir), - datasets, - ConfigParser.__parse_algorithms( - input_settings_map['algorithms'])) + input_dir = input_settings_map['input_dir'] # e.g., 'inputs' + dataset_dir = input_settings_map['dataset_dir'] # e.g., 'example' + datasets = input_settings_map['datasets'] + + datadir = Path(input_dir) / dataset_dir # e.g., 'inputs/example' + return InputSettings(datadir, datasets, ConfigParser.__parse_algorithms(input_settings_map['algorithms']), use_embeddings) @staticmethod def __parse_algorithms(algorithms_list): @@ -405,13 +403,11 @@ def __parse_algorithms(algorithms_list): return algorithms @staticmethod - def __parse_output_settings(output_settings_map) -> OutputSettings: + def __parse_output_settings(output_settings_map,datasets,use_embeddings=False) -> OutputSettings: ''' A method for parsing and initializing Output object. 
''' output_dir = Path(output_settings_map['output_dir']) output_prefix = Path(output_settings_map['output_prefix']) - - return OutputSettings(output_dir, - output_prefix) + return OutputSettings(output_dir, output_prefix, datasets, use_embeddings) diff --git a/BLEval/computeDGAUC.py b/BLEval/computeDGAUC.py index 86b34805..e33b4a64 100644 --- a/BLEval/computeDGAUC.py +++ b/BLEval/computeDGAUC.py @@ -12,15 +12,14 @@ from rpy2.robjects import FloatVector -def PRROC(dataDict, inputSettings, directed = True, selfEdges = False, plotFlag = False): +def PRROC(dataDict, inputSettings, directed=True, selfEdges=False, plotFlag=False): ''' Computes areas under the precision-recall and ROC curves for a given dataset for each algorithm. - :param directed: A flag to indicate whether to treat predictions as directed edges (directed = True) or undirected edges (directed = False). :type directed: bool - :param selfEdges: A flag to indicate whether to includeself-edges (selfEdges = True) or exclude self-edges (selfEdges = False) from evaluation. + :param selfEdges: A flag to indicate whether to include self-edges (selfEdges = True) or exclude self-edges (selfEdges = False) from evaluation. :type selfEdges: bool :param plotFlag: A flag to indicate whether or not to save PR and ROC plots. :type plotFlag: bool @@ -30,12 +29,18 @@ def PRROC(dataDict, inputSettings, directed = True, selfEdges = False, plotFlag - AUROC: A dictionary containing AUROC values for each algorithm ''' - # Read file for trueEdges - trueEdgesDF = pd.read_csv(str(inputSettings.datadir)+'/'+ dataDict['name'] + + if inputSettings.use_embeddings == True: + true_edges_path = inputSettings.get_true_edges_path(dataDict['name']) + trueEdgesDF = pd.read_csv(true_edges_path, sep=',', header=0, index_col=None) + outDir = Path("outputs") / inputSettings.datadir.relative_to("inputs") / dataDict['name'] / 'processed_ExpressionData' + else: + trueEdgesDF = pd.read_csv(str(inputSettings.datadir)+'/'+ dataDict['name'] + '/' +dataDict['trueEdges'], sep = ',', header = 0, index_col = None) - + outDir = Path("outputs") / inputSettings.datadir.relative_to("inputs") / dataDict['name'] + + # Initialize data dictionaries precisionDict = {} recallDict = {} @@ -44,176 +49,109 @@ def PRROC(dataDict, inputSettings, directed = True, selfEdges = False, plotFlag AUPRC = {} AUROC = {} - # set-up outDir that stores output directory name - outDir = "outputs/"+str(inputSettings.datadir).split("inputs/")[1]+ '/' +dataDict['name'] if directed: for algo in tqdm(inputSettings.algorithms, - total = len(inputSettings.algorithms), unit = " Algorithms"): + total=len(inputSettings.algorithms), unit=" Algorithms"): # check if the output rankedEdges file exists - if Path(outDir + '/' +algo[0]+'/rankedEdges.csv').exists(): - # Initialize Precsion - - predDF = pd.read_csv(outDir + '/' +algo[0]+'/rankedEdges.csv', \ - sep = '\t', header = 0, index_col = None) + ranked_edges_path = outDir / algo[0] / 'rankedEdges.csv' + if ranked_edges_path.exists(): + # Initialize Precision + predDF = pd.read_csv(ranked_edges_path, sep='\t', header=0, index_col=None) - precisionDict[algo[0]], recallDict[algo[0]], FPRDict[algo[0]], TPRDict[algo[0]], AUPRC[algo[0]], AUROC[algo[0]] = computeScores(trueEdgesDF, predDF, directed = True, selfEdges = selfEdges) + precisionDict[algo[0]], recallDict[algo[0]], FPRDict[algo[0]], TPRDict[algo[0]], AUPRC[algo[0]], AUROC[algo[0]] = computeScores(trueEdgesDF, predDF, directed=True, selfEdges=selfEdges) else: - print(outDir + '/' +algo[0]+'/rankedEdges.csv', \ - ' 
does not exist. Skipping...') - PRName = '/PRplot' - ROCName = '/ROCplot' + print(f'{ranked_edges_path} does not exist. Skipping...') + PRName = 'PRplot' + ROCName = 'ROCplot' else: for algo in tqdm(inputSettings.algorithms, - total = len(inputSettings.algorithms), unit = " Algorithms"): + total=len(inputSettings.algorithms), unit=" Algorithms"): # check if the output rankedEdges file exists - if Path(outDir + '/' +algo[0]+'/rankedEdges.csv').exists(): - # Initialize Precsion - - predDF = pd.read_csv(outDir + '/' +algo[0]+'/rankedEdges.csv', \ - sep = '\t', header = 0, index_col = None) + ranked_edges_path = outDir / algo[0] / 'rankedEdges.csv' + if ranked_edges_path.exists(): + # Initialize Precision + predDF = pd.read_csv(ranked_edges_path, sep='\t', header=0, index_col=None) - precisionDict[algo[0]], recallDict[algo[0]], FPRDict[algo[0]], TPRDict[algo[0]], AUPRC[algo[0]], AUROC[algo[0]] = computeScores(trueEdgesDF, predDF, directed = False, selfEdges = selfEdges) + precisionDict[algo[0]], recallDict[algo[0]], FPRDict[algo[0]], TPRDict[algo[0]], AUPRC[algo[0]], AUROC[algo[0]] = computeScores(trueEdgesDF, predDF, directed=False, selfEdges=selfEdges) else: - print(outDir + '/' +algo[0]+'/rankedEdges.csv', \ - ' does not exist. Skipping...') + print(f'{ranked_edges_path} does not exist. Skipping...') - PRName = '/uPRplot' - ROCName = '/uROCplot' - if (plotFlag): - ## Make PR curves + PRName = 'uPRplot' + ROCName = 'uROCplot' + + if plotFlag: + # Make PR curves legendList = [] for key in recallDict.keys(): - sns.lineplot(recallDict[key],precisionDict[key], ci=None) - legendList.append(key + ' (AUPRC = ' + str("%.2f" % (AUPRC[key]))+')') - plt.xlim(0,1) - plt.ylim(0,1) + sns.lineplot(recallDict[key], precisionDict[key], ci=None) + legendList.append(f'{key} (AUPRC = {AUPRC[key]:.2f})') + plt.xlim(0, 1) + plt.ylim(0, 1) plt.xlabel('Recall') plt.ylabel('Precision') - plt.legend(legendList) - plt.savefig(outDir+PRName+'.pdf') - plt.savefig(outDir+PRName+'.png') + plt.legend(legendList) + plt.savefig(outDir / f'{PRName}.pdf') + plt.savefig(outDir / f'{PRName}.png') plt.clf() - ## Make ROC curves + # Make ROC curves legendList = [] for key in recallDict.keys(): - sns.lineplot(FPRDict[key],TPRDict[key], ci=None) - legendList.append(key + ' (AUROC = ' + str("%.2f" % (AUROC[key]))+')') + sns.lineplot(FPRDict[key], TPRDict[key], ci=None) + legendList.append(f'{key} (AUROC = {AUROC[key]:.2f})') - plt.plot([0, 1], [0, 1], linewidth = 1.5, color = 'k', linestyle = '--') + plt.plot([0, 1], [0, 1], linewidth=1.5, color='k', linestyle='--') - plt.xlim(0,1) - plt.ylim(0,1) + plt.xlim(0, 1) + plt.ylim(0, 1) plt.xlabel('FPR') plt.ylabel('TPR') - plt.legend(legendList) - plt.savefig(outDir+ROCName+'.pdf') - plt.savefig(outDir+ROCName+'.png') + plt.legend(legendList) + plt.savefig(outDir / f'{ROCName}.pdf') + plt.savefig(outDir / f'{ROCName}.png') plt.clf() + return AUPRC, AUROC def computeScores(trueEdgesDF, predEdgeDF, directed = True, selfEdges = True): - ''' - Computes precision-recall and ROC curves - using scikit-learn for a given set of predictions in the - form of a DataFrame. - :param trueEdgesDF: A pandas dataframe containing the true classes.The indices of this dataframe are all possible edgesin a graph formed using the genes in the given dataset. This dataframe only has one column to indicate the classlabel of an edge. If an edge is present in the reference network, it gets a class label of 1, else 0. 
- :type trueEdgesDF: DataFrame - - :param predEdgeDF: A pandas dataframe containing the edge ranks from the prediced network. The indices of this dataframe are all possible edges.This dataframe only has one column to indicate the edge weightsin the predicted network. Higher the weight, higher the edge confidence. - :type predEdgeDF: DataFrame + # Get unique genes + unique_genes = np.unique(trueEdgesDF.loc[:, ['Gene1', 'Gene2']]) - :param directed: A flag to indicate whether to treat predictionsas directed edges (directed = True) or undirected edges (directed = False). - :type directed: bool - - :param selfEdges: A flag to indicate whether to includeself-edges (selfEdges = True) or exclude self-edges (selfEdges = False) from evaluation. - :type selfEdges: bool - - :returns: - - prec: A list of precision values (for PR plot) - - recall: A list of precision values (for PR plot) - - fpr: A list of false positive rates (for ROC plot) - - tpr: A list of true positive rates (for ROC plot) - - AUPRC: Area under the precision-recall curve - - AUROC: Area under the ROC curve - ''' + # Convert DataFrames to dictionaries for faster lookup + true_edges = set(map(tuple, trueEdgesDF[['Gene1', 'Gene2']].values)) + pred_edges = dict(zip(map(tuple, predEdgeDF[['Gene1', 'Gene2']].values), + predEdgeDF['EdgeWeight'])) - if directed: - # Initialize dictionaries with all - # possible edges - if selfEdges: - possibleEdges = list(product(np.unique(trueEdgesDF.loc[:,['Gene1','Gene2']]), - repeat = 2)) - else: - possibleEdges = list(permutations(np.unique(trueEdgesDF.loc[:,['Gene1','Gene2']]), - r = 2)) - - TrueEdgeDict = {'|'.join(p):0 for p in possibleEdges} - PredEdgeDict = {'|'.join(p):0 for p in possibleEdges} - - # Compute TrueEdgeDict Dictionary - # 1 if edge is present in the ground-truth - # 0 if edge is not present in the ground-truth - for key in TrueEdgeDict.keys(): - if len(trueEdgesDF.loc[(trueEdgesDF['Gene1'] == key.split('|')[0]) & - (trueEdgesDF['Gene2'] == key.split('|')[1])])>0: - TrueEdgeDict[key] = 1 - - for key in PredEdgeDict.keys(): - subDF = predEdgeDF.loc[(predEdgeDF['Gene1'] == key.split('|')[0]) & - (predEdgeDF['Gene2'] == key.split('|')[1])] - if len(subDF)>0: - PredEdgeDict[key] = np.abs(subDF.EdgeWeight.values[0]) - - # if undirected + TrueEdgeDict = {} + PredEdgeDict = {} + + if directed: + edge_generator = product(unique_genes, repeat=2) if selfEdges else permutations(unique_genes, r=2) else: + edge_generator = combinations_with_replacement(unique_genes, r=2) if selfEdges else combinations(unique_genes, r=2) + + for edge in edge_generator: + key = '|'.join(edge) - # Initialize dictionaries with all - # possible edges - if selfEdges: - possibleEdges = list(combinations_with_replacement(np.unique(trueEdgesDF.loc[:,['Gene1','Gene2']]), - r = 2)) + # Compute TrueEdgeDict + TrueEdgeDict[key] = int(edge in true_edges or (not directed and edge[::-1] in true_edges)) + + # Compute PredEdgeDict + if directed: + PredEdgeDict[key] = abs(pred_edges.get(edge, 0)) else: - possibleEdges = list(combinations(np.unique(trueEdgesDF.loc[:,['Gene1','Gene2']]), - r = 2)) - TrueEdgeDict = {'|'.join(p):0 for p in possibleEdges} - PredEdgeDict = {'|'.join(p):0 for p in possibleEdges} - - # Compute TrueEdgeDict Dictionary - # 1 if edge is present in the ground-truth - # 0 if edge is not present in the ground-truth - - for key in TrueEdgeDict.keys(): - if len(trueEdgesDF.loc[((trueEdgesDF['Gene1'] == key.split('|')[0]) & - (trueEdgesDF['Gene2'] == key.split('|')[1])) | - ((trueEdgesDF['Gene2'] == 
key.split('|')[0]) & - (trueEdgesDF['Gene1'] == key.split('|')[1]))]) > 0: - TrueEdgeDict[key] = 1 - - # Compute PredEdgeDict Dictionary - # from predEdgeDF - - for key in PredEdgeDict.keys(): - subDF = predEdgeDF.loc[((predEdgeDF['Gene1'] == key.split('|')[0]) & - (predEdgeDF['Gene2'] == key.split('|')[1])) | - ((predEdgeDF['Gene2'] == key.split('|')[0]) & - (predEdgeDF['Gene1'] == key.split('|')[1]))] - if len(subDF)>0: - PredEdgeDict[key] = max(np.abs(subDF.EdgeWeight.values)) - - - + PredEdgeDict[key] = max(abs(pred_edges.get(edge, 0)), abs(pred_edges.get(edge[::-1], 0))) + # Combine into one dataframe - # to pass it to sklearn - outDF = pd.DataFrame([TrueEdgeDict,PredEdgeDict]).T - outDF.columns = ['TrueEdges','PredEdges'] + outDF = pd.DataFrame({'TrueEdges': TrueEdgeDict, 'PredEdges': PredEdgeDict}) + prroc = importr('PRROC') prCurve = prroc.pr_curve(scores_class0 = FloatVector(list(outDF['PredEdges'].values)), weights_class0 = FloatVector(list(outDF['TrueEdges'].values))) diff --git a/BLEval/computeEarlyPrec.py b/BLEval/computeEarlyPrec.py index fc82d1d6..0a8c380d 100644 --- a/BLEval/computeEarlyPrec.py +++ b/BLEval/computeEarlyPrec.py @@ -12,7 +12,7 @@ from multiprocessing import Pool, cpu_count from networkx.convert_matrix import from_pandas_adjacency -def EarlyPrec(evalObject, algorithmName, TFEdges = False): +def EarlyPrec(evalObject, inputSettings, algorithmName, TFEdges = False): ''' Computes early precision for a given algorithm for each dataset. We define early precision as the fraction of true @@ -32,23 +32,31 @@ def EarlyPrec(evalObject, algorithmName, TFEdges = False): for a given algorithm for each dataset. ''' + rankDict = {} for dataset in tqdm(evalObject.input_settings.datasets): - trueEdgesDF = pd.read_csv(str(evalObject.input_settings.datadir)+'/'+ \ - dataset['name'] + '/' +\ - dataset['trueEdges'], sep = ',', - header = 0, index_col = None) + + if inputSettings.use_embeddings == True: + true_edges_path = inputSettings.get_true_edges_path(dataset['name']) + trueEdgesDF = pd.read_csv(true_edges_path, sep=',', header=0, index_col=None) + outDir = Path("outputs") / inputSettings.datadir.relative_to("inputs") / dataset['name'] / 'processed_ExpressionData' / algorithmName + rank_path = outDir / "rankedEdges.csv" + + else: + trueEdgesDF = pd.read_csv(str(evalObject.input_settings.datadir)+'/'+ \ + dataset['name'] + '/' +\ + dataset['trueEdges'], sep = ',', + header = 0, index_col = None) + outDir = str(evalObject.output_settings.base_dir) + \ + str(evalObject.input_settings.datadir).split("inputs")[1] + \ + "/" + dataset["name"] + "/" + algorithmName + rank_path = outDir + "/rankedEdges.csv" + trueEdgesDF = trueEdgesDF.loc[(trueEdgesDF['Gene1'] != trueEdgesDF['Gene2'])] trueEdgesDF.drop_duplicates(keep = 'first', inplace=True) trueEdgesDF.reset_index(drop=True, inplace=True) - - outDir = str(evalObject.output_settings.base_dir) + \ - str(evalObject.input_settings.datadir).split("inputs")[1] + \ - "/" + dataset["name"] + "/" + algorithmName - #algos = evalObject.input_settings.algorithms - rank_path = outDir + "/rankedEdges.csv" if not os.path.isdir(outDir): rankDict[dataset["name"]] = set([]) continue diff --git a/BLEvaluator.py b/BLEvaluator.py index 20f18111..fed6eea8 100644 --- a/BLEvaluator.py +++ b/BLEvaluator.py @@ -62,6 +62,9 @@ def get_parser() -> argparse.ArgumentParser: parser.add_argument('-b','--borda', action="store_true", default=False, help="Compute edge ranked list using the various Borda aggregatio methods.") + + parser.add_argument('--use_embeddings', 
action="store_true", default=False, + help="Use processed expression data embeddings for evaluation.") return parser @@ -82,7 +85,7 @@ def main(): evalConfig = None with open(config_file, 'r') as conf: - evalConfig = ev.ConfigParser.parse(conf) + evalConfig = ev.ConfigParser.parse(conf, use_embeddings=opts.use_embeddings) print('\nPost-run evaluation started...') evalSummarizer = ev.BLEval(evalConfig.input_settings, evalConfig.output_settings) @@ -92,12 +95,28 @@ def main(): str(evalSummarizer.output_settings.output_prefix) + "-" # Compute and plot ROC, PRC and report median AUROC, AUPRC - if (opts.auc): + if opts.auc: print('\n\nComputing areas under ROC and PR curves...') AUPRC, AUROC = evalSummarizer.computeAUC() - AUPRC.to_csv(outDir+'AUPRC.csv') - AUROC.to_csv(outDir+'AUROC.csv') + + # Construct the output directory based on InputSettings + # This ensures that the path mirrors the structure used in InputSettings + dataset_name = str(evalSummarizer.output_settings.output_prefix) # Assuming this holds the dataset name, e.g., 'GSD' + if opts.use_embeddings: + # Use the processed path if embeddings are used + outDir = Path("outputs") / evalSummarizer.input_settings.datadir.relative_to("inputs") / dataset_name / "processed_ExpressionData" + # Ensure that outDir exists + outDir.mkdir(parents=True, exist_ok=True) + else: + # Use the regular path structure + outDir = Path("outputs") / evalSummarizer.input_settings.datadir.relative_to("inputs") / dataset_name + # Ensure that outDir exists + outDir.mkdir(parents=True, exist_ok=True) + # Save the output files in the correct directory + AUPRC.to_csv(outDir / "AUPRC.csv") + AUROC.to_csv(outDir / "AUROC.csv") + # Compute Jaccard index if (opts.jaccard): @@ -124,7 +143,14 @@ def main(): if (opts.epr): print('\n\nComputing early precision values...') ePRDF = evalSummarizer.computeEarlyPrec() - ePRDF.to_csv(outDir + "EPr.csv") + dataset_name = str(evalSummarizer.output_settings.output_prefix) + if opts.use_embeddings: + # Use the processed path if embeddings are used + outDir = Path("outputs") / evalSummarizer.input_settings.datadir.relative_to("inputs") / dataset_name / "processed_ExpressionData" + else: + # Use the regular path structure + outDir = Path("outputs") / evalSummarizer.input_settings.datadir.relative_to("inputs") / dataset_name + ePRDF.to_csv(outDir / "EPr.csv") # Compute early precision for activation and inhibitory edges if (opts.sepr): diff --git a/BLRun/generate_embeds.py b/BLRun/generate_embeds.py new file mode 100644 index 00000000..f796c660 --- /dev/null +++ b/BLRun/generate_embeds.py @@ -0,0 +1,35 @@ +import subprocess +import os +import argparse + +def generate_embeddings(input_file): + input_dir = os.path.dirname(os.path.abspath(input_file)) + input_filename = os.path.basename(input_file) + + output_file = os.path.join(input_dir, "EmbeddingsData.csv") + + cmd = [ + "docker", "run", "--rm", + "-v", f"{input_dir}:/input", + "scgpt_human", + "--input", f"/input/{input_filename}", + "--model_dir", "/app" + ] + + try: + subprocess.run(cmd, check=True) + print(f"Embeddings generated successfully. 
Output saved to {output_file}") + except subprocess.CalledProcessError as e: + print(f"Error generating embeddings: {e}") + raise + +def main(): + parser = argparse.ArgumentParser(description='Generate gene embeddings using scGPT model') + parser.add_argument('--input', required=True, help='Path to input expression data CSV file') + + args = parser.parse_args() + + generate_embeddings(args.input) + +if __name__ == "__main__": + main() diff --git a/BLRun/grnboost2Runner.py b/BLRun/grnboost2Runner.py index b8186e69..8feb0f67 100644 --- a/BLRun/grnboost2Runner.py +++ b/BLRun/grnboost2Runner.py @@ -40,7 +40,6 @@ def run(RunnerObj): print(cmdToRun) os.system(cmdToRun) - def parseOutput(RunnerObj): ''' Function to parse outputs from GRNBOOST2. diff --git a/BLRun/runner.py b/BLRun/runner.py index e28ec0c5..1478f49d 100644 --- a/BLRun/runner.py +++ b/BLRun/runner.py @@ -12,7 +12,9 @@ import BLRun.singeRunner as SINGE import BLRun.scribeRunner as SCRIBE import BLRun.scsglRunner as SCSGL - +import os +import subprocess +import shutil from pathlib import Path InputMapper = {'SCODE':SCODE.generateInputs, @@ -70,21 +72,74 @@ class Runner(object): ''' A runnable analysis to be incorporated into the pipeline ''' - def __init__(self, - params): + def __init__(self, params): self.name = params['name'] self.inputDir = params['inputDir'] self.params = params['params'] self.exprData = params['exprData'] self.cellData = params['cellData'] - self.trueEdges = params['trueEdges'] #used for evaluation + self.trueEdges = params['trueEdges'] # used for evaluation + self.use_embeddings = params.get('use_embeddings', False) + self.embeddings_file = None + + def generate_embeddings(self): + embed_script_path = Path(__file__).resolve().parent / "generate_embeds.py" + + if not embed_script_path.exists(): + raise FileNotFoundError(f"Embeddings script not found at {embed_script_path}") + + expr_filename = Path(self.exprData).stem + new_input_dir = Path(self.inputDir) / f"processed_{expr_filename}" + new_input_dir.mkdir(parents=True, exist_ok=True) + + self.embeddings_file = new_input_dir / "EmbeddingsData.csv" + print(f"Using input file: {self.exprData}") + print(f"Running embedding generation script at: {embed_script_path}") + print(f"Embeddings will be saved to: {self.embeddings_file}") + + command = [ + "python", + str(embed_script_path), + "--input", str(Path(self.inputDir) / self.exprData), + ] + + try: + subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + print("Embeddings generated successfully.") + + generated_file = Path(self.inputDir) / "EmbeddingsData.csv" + if generated_file.exists(): + shutil.move(str(generated_file), str(self.embeddings_file)) + print(f"Embeddings moved to {self.embeddings_file}") + + ref_network_file = Path(self.inputDir) / "refNetwork.csv" + if ref_network_file.exists(): + shutil.copy(str(ref_network_file), str(new_input_dir)) + print(f"refNetwork.csv copied to {new_input_dir}") + else: + print("refNetwork.csv not found in the input directory.") + + self.inputDir = new_input_dir + self.exprData = "EmbeddingsData.csv" + + except subprocess.CalledProcessError as e: + print(f"Error generating embeddings: {e}") + print(f"Script output: {e.output.decode()}") + print(f"Script error: {e.stderr.decode()}") + raise + def generateInputs(self): + if self.use_embeddings: + self.generate_embeddings() InputMapper[self.name](self) - def run(self): + input_path = Path(self.inputDir) + output_path = Path(str(input_path).replace("inputs", "outputs")) + 
output_path.mkdir(parents=True, exist_ok=True) + AlgorithmMapper[self.name](self) def parseOutput(self): - OutputParser[self.name](self) + OutputParser[self.name](self) \ No newline at end of file diff --git a/BLRunner.py b/BLRunner.py index e958d5be..a675ca18 100644 --- a/BLRunner.py +++ b/BLRunner.py @@ -37,6 +37,9 @@ def get_parser() -> argparse.ArgumentParser: parser.add_argument('--config', default='config.yaml', help='Path to config file') + + parser.add_argument('--use_embeddings', action='store_true', + help='Use embeddings for expression data') return parser @@ -54,12 +57,14 @@ def main(): opts = parse_arguments() config_file = opts.config - with open(config_file, 'r') as conf: evaluation = br.ConfigParser.parse(conf) - print(evaluation) - print('Evaluation started') + + for idx in range(len(evaluation.runners)): + evaluation.runners[idx].use_embeddings = opts.use_embeddings + print(evaluation) + print('Evaluation started') for idx in range(len(evaluation.runners)): evaluation.runners[idx].generateInputs() @@ -72,6 +77,5 @@ def main(): print('Evaluation complete') - if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/Models/scGPT_human/dockerfile b/Models/scGPT_human/dockerfile new file mode 100644 index 00000000..c7453034 --- /dev/null +++ b/Models/scGPT_human/dockerfile @@ -0,0 +1,32 @@ +# Use a base image with conda pre-installed +FROM continuumio/miniconda3 + +LABEL Maintainer="Faizal Rahman " + +USER root + +# Install system dependencies +RUN apt-get update && apt-get install -y time + +WORKDIR /app + +# Copy model files, requirements, and environment file +COPY best_model.pt vocab.json args.json req.yaml /app/ + +# Create conda environment and install packages +RUN conda env create -f req.yaml && \ + conda run -n nrnb238 pip install pandas scgpt torch==2.1.2 && \ + conda clean -afy + +# Copy the script for generating embeddings +COPY generate_embeddings.py /app/ + +# Create a shell script to run our Python script with debugging +RUN echo '#!/bin/bash\n\ +echo "Debugging information:"\n\ +conda run -n nrnb238 python -c "import sys; print(sys.path)"\n\ +conda run --no-capture-output -n nrnb238 python generate_embeddings.py "$@"' > /app/run_script.sh && \ + chmod +x /app/run_script.sh + +# Set the entrypoint to run our shell script +ENTRYPOINT ["/app/run_script.sh"] \ No newline at end of file diff --git a/Models/scGPT_human/generate_embeddings.py b/Models/scGPT_human/generate_embeddings.py new file mode 100644 index 00000000..6abb17cf --- /dev/null +++ b/Models/scGPT_human/generate_embeddings.py @@ -0,0 +1,96 @@ +import pandas as pd +import json +import warnings +import torch +from scgpt.tokenizer.gene_tokenizer import GeneVocab +from scgpt.model import TransformerModel +from scgpt.utils import set_seed +import os +import argparse + +warnings.filterwarnings('ignore') +set_seed(42) + +def initialize_model(args_file, model_file, vocab_file): + if not all(os.path.exists(f) for f in [args_file, model_file, vocab_file]): + raise FileNotFoundError(f"Required model files not found: {args_file}, {model_file}, {vocab_file}") + + vocab = GeneVocab.from_file(vocab_file) + special_tokens = ["", "", ""] + for s in special_tokens: + if s not in vocab: + vocab.append_token(s) + + with open(args_file, "r") as f: + model_configs = json.load(f) + + ntokens = len(vocab) + model = TransformerModel( + ntokens, + model_configs["embsize"], + model_configs["nheads"], + model_configs["d_hid"], + model_configs["nlayers"], + vocab=vocab, + 
pad_value=model_configs.get("pad_value", -2), + n_input_bins=model_configs.get("n_bins", 51), + ) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + state_dict = torch.load(model_file, map_location=device) + + model.load_state_dict(state_dict, strict=False) + model.to(device) + model.eval() + + return model, vocab, device + +def generate_and_save_embeddings(file_path, model, vocab, device, output_file): + expression_data = pd.read_csv(file_path, index_col=0) + gene_names = expression_data.index.tolist() + tokenized_genes = [] + for gene in gene_names: + upper_gene = gene.upper() + if upper_gene in vocab: + tokenized_genes.append(vocab[upper_gene]) + else: + tokenized_genes.append(vocab[""]) # Use token for unknown genes + + gene_ids = torch.tensor(tokenized_genes, dtype=torch.long).to(device) + with torch.no_grad(): + gene_embeddings = model.encoder(gene_ids) + gene_embeddings = gene_embeddings.detach().cpu().numpy() + + gene_embeddings_dict = {gene: gene_embeddings[i] for i, gene in enumerate(gene_names)} + embeddings_df = pd.DataFrame(gene_embeddings_dict).T + embeddings_df.to_csv(output_file) + print(f'Saved gene embeddings for {len(gene_embeddings_dict)} genes to {output_file}') + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Generate gene embeddings') + parser.add_argument('--input', required=True, help='Path to input expression data CSV file') + parser.add_argument('--model_dir', default='/app', help='Directory containing model files') + args = parser.parse_args() + + # Generate output path based on input path + input_dir = os.path.dirname(args.input) + input_filename = os.path.basename(args.input) + output_filename = "EmbeddingsData.csv" + output_path = os.path.join(input_dir, output_filename) + + print("Debug Information:") + print(f"Input file path: {args.input}") + print(f"Output file path: {output_path}") + print(f"Model directory: {args.model_dir}") + + args_file = os.path.join(args.model_dir, "args.json") + model_file = os.path.join(args.model_dir, "best_model.pt") + vocab_file = os.path.join(args.model_dir, "vocab.json") + + try: + model, vocab, device = initialize_model(args_file, model_file, vocab_file) + generate_and_save_embeddings(args.input, model, vocab, device, output_path) + except FileNotFoundError as e: + print(f"Error: {e}") + print("Please ensure that the model files (args.json, best_model.pt, vocab.json) are in the correct directory.") + print(f"Current model directory: {args.model_dir}") diff --git a/Models/scGPT_human/req.yaml b/Models/scGPT_human/req.yaml new file mode 100644 index 00000000..c806d524 --- /dev/null +++ b/Models/scGPT_human/req.yaml @@ -0,0 +1,85 @@ +name: nrnb238 +channels: + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - asttokens=2.4.1=pyhd8ed1ab_0 + - bzip2=1.0.8=h4bc722e_7 + - ca-certificates=2024.7.4=hbcca054_0 + - comm=0.2.2=pyhd8ed1ab_0 + - debugpy=1.8.5=py310hea249c9_0 + - decorator=5.1.1=pyhd8ed1ab_0 + - exceptiongroup=1.2.2=pyhd8ed1ab_0 + - executing=2.0.1=pyhd8ed1ab_0 + - importlib-metadata=8.2.0=pyha770c72_0 + - importlib_metadata=8.2.0=hd8ed1ab_0 + - ipykernel=6.29.5=pyh3099207_0 + - ipython=8.26.0=pyh707e725_0 + - jedi=0.19.1=pyhd8ed1ab_0 + - jupyter_client=8.6.2=pyhd8ed1ab_0 + - jupyter_core=5.7.2=py310hff52083_0 + - keyutils=1.6.1=h166bdaf_0 + - krb5=1.21.3=h659f571_0 + - ld_impl_linux-64=2.40=hf3520f5_7 + - libedit=3.1.20191231=he28a2e2_2 + - libffi=3.4.4=h6a678d5_1 + - libgcc-ng=14.1.0=h77fa898_0 + - 
libgomp=14.1.0=h77fa898_0 + - libnsl=2.0.1=hd590300_0 + - libsodium=1.0.18=h36c2ea0_1 + - libsqlite=3.46.0=hde9e2c9_0 + - libstdcxx-ng=14.1.0=hc0a3c3a_0 + - libuuid=2.38.1=h0b41bf4_0 + - libxcrypt=4.4.36=hd590300_1 + - libzlib=1.2.13=h4ab18f5_6 + - matplotlib-inline=0.1.7=pyhd8ed1ab_0 + - ncurses=6.5=h59595ed_0 + - nest-asyncio=1.6.0=pyhd8ed1ab_0 + - openssl=3.3.1=h4bc722e_2 + - packaging=24.1=pyhd8ed1ab_0 + - parso=0.8.4=pyhd8ed1ab_0 + - pexpect=4.9.0=pyhd8ed1ab_0 + - pickleshare=0.7.5=py_1003 + - pip=24.2=pyhd8ed1ab_0 + - platformdirs=4.2.2=pyhd8ed1ab_0 + - prompt-toolkit=3.0.47=pyha770c72_0 + - psutil=6.0.0=py310hc51659f_0 + - ptyprocess=0.7.0=pyhd3deb0d_0 + - pure_eval=0.2.3=pyhd8ed1ab_0 + - pygments=2.18.0=pyhd8ed1ab_0 + - python=3.10.14=hd12c33a_0_cpython + - python-dateutil=2.9.0=pyhd8ed1ab_0 + - python_abi=3.10=4_cp310 + - pyzmq=26.0.3=py310h6883aea_0 + - readline=8.2=h8228510_1 + - setuptools=72.1.0=pyhd8ed1ab_0 + - six=1.16.0=pyh6c4a22f_0 + - sqlite=3.46.0=h6d4b2fc_0 + - stack_data=0.6.2=pyhd8ed1ab_0 + - tk=8.6.14=h39e8969_0 + - tornado=6.4.1=py310hc51659f_0 + - traitlets=5.14.3=pyhd8ed1ab_0 + - typing_extensions=4.12.2=pyha770c72_0 + - tzdata=2024a=h0c530f3_0 + - wcwidth=0.2.13=pyhd8ed1ab_0 + - wheel=0.44.0=pyhd8ed1ab_0 + - xz=5.4.6=h5eee18b_1 + - zeromq=4.3.5=h75354e8_4 + - zipp=3.19.2=pyhd8ed1ab_0 + - zlib=1.2.13=h4ab18f5_6 + - pip: + - certifi==2024.7.4 + - idna==3.7 + - markupsafe==2.1.5 + - pillow==10.4.0 + - protobuf==5.27.3 + - pytz==2024.1 + - pyyaml==6.0.1 + - urllib3==2.2.2 + - pandas + - scgpt + - torch==2.1.2 + + diff --git a/README.md b/README.md index 4a24ae9c..b85e4af7 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,9 @@ Quick setup: We provided an example dataset under inputs/example/GSD/ and a corresponding configuration file necessary for running GRN inference using 12 methods described in BEELINE. - To compute proposed reconstructions on the example dataset, run `python BLRunner.py --config config-files/config.yaml`. Running this script for the first time can be slow as it involves downloading the contianers from Docker hub. +- To compute proposed reconstructions using Embeddings data on the example dataset, run `python BLRunner.py --config config-files/config.yaml --use_embeddings`. - To compute areas under the ROC and PR curves for the proposed reconstructions, run `python BLEvaluator.py --config config-files/config.yaml --auc`. To display the complete list of evalutation options, run `python BLEvaluator.py --help`. +- To compute areas under the ROC and PR curves for the proposed reconstructions generated using Embeddings data, run `python BLEvaluator.py --config config-files/config.yaml --auc --use_embeddings`. If you use BEELINE in your research, please cite: diff --git a/config-files/config.yaml b/config-files/config.yaml index 1a59123e..e32e0d3e 100644 --- a/config-files/config.yaml +++ b/config-files/config.yaml @@ -22,11 +22,11 @@ input_settings: # If this parameter is absent (or commented out), run BEELINE on every sub-directory of dataset_dir. # BEELINE will assume that these files are present under the default names # "ExpressionData.csv", "PseudoTime.csv" and "refNetwork.csv", respectively. - # datasets: - # - name: "GSD" - # exprData: "ExpressionData.csv" - # cellData: "PseudoTime.csv" - # trueEdges: "refNetwork.csv" + datasets: + - name: "GSD" + exprData: "ExpressionData.csv" + cellData: "PseudoTime.csv" + trueEdges: "refNetwork.csv" # Denotes a list of algorithms to run. Each has the following parameters: # name: Name of the algorithm. 
Must be recognized by the pipeline, see @@ -47,9 +47,9 @@ input_settings: - name: "GRNVBEM" params: - should_run: [True] + should_run: [True] + - - name: "GENIE3" params: diff --git a/initialize.sh b/initialize.sh index e3f6ae9e..5c625481 100755 --- a/initialize.sh +++ b/initialize.sh @@ -5,6 +5,24 @@ echo "This may take a while..." BASEDIR=$(pwd) +echo "Building Docker image for scGPT model..." +cd $BASEDIR/Models/scGPT_human +DOCKER_IMAGE_TAG="dostofizky/scgpt_human:latest" +LOCAL_IMAGE_TAG="scgpt_human:latest" + +# Pull the Docker image from Docker Hub +echo "Pulling Docker image for scGPT model..." +docker pull $DOCKER_IMAGE_TAG +if [ $? -eq 0 ]; then + echo "Successfully pulled scGPT image from DockerHub" + echo "Tagging the pulled image as '$LOCAL_IMAGE_TAG'" + docker tag $DOCKER_IMAGE_TAG $LOCAL_IMAGE_TAG + echo "The scGPT Docker image is now ready for use with the local tag '$LOCAL_IMAGE_TAG'" +else + echo "Failed to pull scGPT image from DockerHub" +fi + + # You may remove the -q flag if you want to see the docker build status cd $BASEDIR/Algorithms/ARBORETO docker build -q -t arboreto:base .
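For reference, a minimal standalone sketch of the dictionary-based scoring introduced by the rewritten computeScores() in BLEval/computeDGAUC.py (directed case, selfEdges=False). The gene names and edge weights below are made up purely for illustration; the logic mirrors the patch above.

    # Toy reproduction of the new edge-scoring logic (directed, no self-edges).
    import numpy as np
    import pandas as pd
    from itertools import permutations

    # Hypothetical reference network and predicted ranking.
    trueEdgesDF = pd.DataFrame({'Gene1': ['g1', 'g2'], 'Gene2': ['g2', 'g3']})
    predEdgeDF = pd.DataFrame({'Gene1': ['g1', 'g3'], 'Gene2': ['g2', 'g1'],
                               'EdgeWeight': [0.9, -0.4]})

    unique_genes = np.unique(trueEdgesDF.loc[:, ['Gene1', 'Gene2']])
    true_edges = set(map(tuple, trueEdgesDF[['Gene1', 'Gene2']].values))
    pred_edges = dict(zip(map(tuple, predEdgeDF[['Gene1', 'Gene2']].values),
                          predEdgeDF['EdgeWeight']))

    TrueEdgeDict, PredEdgeDict = {}, {}
    for edge in permutations(unique_genes, r=2):
        key = '|'.join(edge)
        TrueEdgeDict[key] = int(edge in true_edges)        # 1 if edge is in the reference network
        PredEdgeDict[key] = abs(pred_edges.get(edge, 0))   # |weight| if predicted, else 0

    outDF = pd.DataFrame({'TrueEdges': TrueEdgeDict, 'PredEdges': PredEdgeDict})
    print(outDF)   # one row per candidate edge

As in the patched code, the two columns are then handed to R's PRROC package via rpy2 to obtain AUPRC and AUROC.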
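Taken together, the changes above add an embeddings-based path through the benchmark. A minimal end-to-end usage sketch, assuming the bundled GSD example dataset and the default config-files/config.yaml (the exact output prefix is taken from the config's output_settings and is shown here as <output_prefix>):

    # Pull and tag the scGPT image (now handled by initialize.sh)
    ./initialize.sh

    # Generate EmbeddingsData.csv via the scgpt_human container and run the
    # configured algorithms on the embeddings instead of ExpressionData.csv
    python BLRunner.py --config config-files/config.yaml --use_embeddings

    # Evaluate the embeddings-based reconstructions
    python BLEvaluator.py --config config-files/config.yaml --auc --epr --use_embeddings

    # Expected layout for the example dataset:
    #   inputs/example/GSD/processed_ExpressionData/EmbeddingsData.csv
    #   inputs/example/GSD/processed_ExpressionData/refNetwork.csv
    #   outputs/example/GSD/processed_ExpressionData/<algorithm>/rankedEdges.csv
    #   outputs/example/<output_prefix>/processed_ExpressionData/{AUPRC.csv, AUROC.csv, EPr.csv}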