From 16f6c68496564926932dfc15ec848b18086de09f Mon Sep 17 00:00:00 2001 From: Aditya Pratapa Date: Mon, 24 Feb 2020 05:06:32 +0000 Subject: [PATCH] Added Borda analysis to master branch. --- BLEval/__init__.py | 56 ++++++++--- BLEval/computeBorda.py | 78 +++++++++++++++ BLEval/computePathStats.py | 193 +++++++++++++++++++++++++++++++++++++ BLEvaluator.py | 18 +++- 4 files changed, 332 insertions(+), 13 deletions(-) create mode 100644 BLEval/computeBorda.py create mode 100644 BLEval/computePathStats.py diff --git a/BLEval/__init__.py b/BLEval/__init__.py index 1d91c996..94adee08 100644 --- a/BLEval/__init__.py +++ b/BLEval/__init__.py @@ -30,11 +30,12 @@ # local imports from BLEval.parseTime import getTime from BLEval.computeDGAUC import PRROC +from BLEval.computeBorda import Borda from BLEval.computeJaccard import Jaccard from BLEval.computeSpearman import Spearman from BLEval.computeNetMotifs import Motifs from BLEval.computeEarlyPrec import EarlyPrec -#from BLEval.computePathStats import pathAnalysis +from BLEval.computePathStats import pathAnalysis from BLEval.computeSignedEPrec import signedEPrec @@ -223,17 +224,17 @@ def computeNetMotifs(self): return FBL, FFL, MI - # def computePaths(self): - # ''' - # For each algorithm-dataset combination, this function computes path lengths - # through TP edges and FP edges, returns statistics on path lengths. - - # :returns: - # - pathStats: A dataframe path lengths in predicted network - # ''' - # for dataset in tqdm(self.input_settings.datasets, - # total = len(self.input_settings.datasets), unit = " Datasets"): - # pathAnalysis(dataset, self.input_settings) + def computePaths(self): + ''' + For each algorithm-dataset combination, this function computes path lengths + through TP edges and FP edges, returns statistics on path lengths. + + :returns: + - pathStats: A dataframe path lengths in predicted network + ''' + for dataset in tqdm(self.input_settings.datasets, + total = len(self.input_settings.datasets), unit = " Datasets"): + pathAnalysis(dataset, self.input_settings) def computeEarlyPrec(self): @@ -281,6 +282,37 @@ def computeSignedEPrec(self): sEPRDict['EPrec Inhibition'][algo[0]] = sEPrecDF['-'] return(pd.DataFrame(sEPRDict['EPrec Activation']).T, pd.DataFrame(sEPRDict['EPrec Inhibition']).T) + def computeBorda(self, selectedAlgorithms=None, aggregationMethod="average"): + + ''' + Computes edge ranked list using the Borda method for each dataset. + Parameters + ---------- + selectedAlgorithms: [str] + List of algorithm names used to run borda method on selected + algorithms. If nothing is provided, the function runs borda on + all available algorithms. + aggregationMethod: str + Method used to aggregate rank in borda method. Available options are + {‘average’, ‘min’, ‘max’, ‘first’}, default ‘average’ + :returns: + None + ''' + feasibleAlgorithmOptions = [algorithmName for algorithmName, _ in self.input_settings.algorithms] + feasibleaggregationMethodOptions = ['average', 'min', 'max', 'first'] + + selectedAlgorithms = feasibleAlgorithmOptions if selectedAlgorithms is None else selectedAlgorithms + + for a in selectedAlgorithms: + if a not in feasibleAlgorithmOptions: + print("\nERROR: No data available on algorithm %s. Please choose an algorithm from the following options: %s" % (a, feasibleAlgorithmOptions)) + return + + if aggregationMethod not in feasibleaggregationMethodOptions: + print("\nERROR: Please choose an aggregation method algorithm from following options: " % feasibleaggregationMethodOptions) + return + + Borda(self, selectedAlgorithms, aggregationMethod) class ConfigParser(object): ''' diff --git a/BLEval/computeBorda.py b/BLEval/computeBorda.py new file mode 100644 index 00000000..cb569ebe --- /dev/null +++ b/BLEval/computeBorda.py @@ -0,0 +1,78 @@ +import os +import math +import numpy as np +import pandas as pd +import seaborn as sns +from tqdm import tqdm +from pathlib import Path +import concurrent.futures +import matplotlib.pyplot as plt +from itertools import permutations +from sklearn import preprocessing +from sklearn.metrics import precision_recall_curve, roc_curve, auc +sns.set(rc={"lines.linewidth": 2}, palette = "deep", style = "ticks") + + +def Borda(evalObject, selectedAlgorithms=None, aggregationMethod="average"): + """ + A function to compute edge ranked list using the Borda method from + the predicted ranked edges, i.e., the outputs of different datasets + generated from the same reference network, for each dataset. + Parameters + ---------- + evalObject: BLEval + An object of class :class:`BLEval.BLEval`. + selectedAlgorithms: [str] + List of algorithm names used to run borda method on selected + algorithms. If nothing is provided, the function runs borda on + all available algorithms. + aggregationMethod: str + Method used to aggregate rank in borda method. Available options are + {‘average’, ‘min’, ‘max’, ‘first’}, default ‘average’ + :returns: + - None + """ + evaluationDFs = [] + for dataset in tqdm(evalObject.input_settings.datasets): + edges = [] + for algorithmName, _ in tqdm(evalObject.input_settings.algorithms): + outDir = str(evalObject.output_settings.base_dir) + \ + str(evalObject.input_settings.datadir).split("inputs")[1] + \ + "/" + dataset["name"] + inDir = str(evalObject.input_settings.datadir) + "/" + dataset["name"] + rank_path = outDir + "/" + algorithmName + "/rankedEdges.csv" + refNetwork_path = inDir + "/" + dataset["trueEdges"] + + if not os.path.isdir(outDir) or not os.path.isdir(inDir): + continue + try: + df = pd.read_csv(rank_path, sep="\t", header=0, index_col=None) + refNetwork = pd.read_csv(refNetwork_path, header=0, index_col=None) + refNetwork['edge'] = refNetwork.apply(lambda x: '%s-%s' % (x.Gene1, x.Gene2), axis=1) + refNetwork = refNetwork[refNetwork.Gene1!=refNetwork.Gene2] + refNetwork['isReferenceEdge'] = 1 + all_edges_df = pd.DataFrame(list(permutations(np.unique(refNetwork.loc[:,['Gene1','Gene2']]), r = 2)), columns=['Gene1','Gene2']) + ranked_edges = pd.merge(all_edges_df, df, on=['Gene1','Gene2'], how='left') + ranked_edges.EdgeWeight = ranked_edges.EdgeWeight.fillna(0) + ranked_edges['absEdgeWeight'] = ranked_edges['EdgeWeight'].abs() + ranked_edges['normEdgeWeight'] = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(ranked_edges[['absEdgeWeight']]))[0] + ranked_edges['dataset'] = dataset["name"] + ranked_edges['algo'] = algorithmName + ranked_edges['edge'] = ranked_edges.apply(lambda x: '%s-%s' % (x.Gene1, x.Gene2), axis=1) + edges.append(ranked_edges) + except Exception as e: + print("\nSkipping Borda computation for ", algorithmName, "on path", outDir) + selectedAlgorithms.remove(algorithmName) + continue + rank_df = pd.pivot_table(pd.concat(edges), values='normEdgeWeight', index='edge', columns='algo') + rank_df['BORDA'] = __normalize__(rank_df.rank(ascending=True, method=aggregationMethod).mean(axis=1).values) + rank_df['mBORDA'] = __normalize__(rank_df.rank(ascending=False, method=aggregationMethod).apply(lambda x: 1.0/(x*x)).mean(axis=1).values) + rank_df['sBORDA'] = __normalize__(rank_df[selectedAlgorithms].rank(ascending=True, method=aggregationMethod).mean(axis=1).values) + rank_df['smBORDA'] = __normalize__(rank_df[selectedAlgorithms].rank(ascending=False, method=aggregationMethod).apply(lambda x: 1.0/(x*x)).mean(axis=1).values) + rank_df['Gene1'], rank_df['Gene2'] = rank_df.index.str.split('-', 1).str + rank_df[['Gene1','Gene2','BORDA','mBORDA','sBORDA','smBORDA']].to_csv(outDir+"/Borda.csv", index=False) + + + +def __normalize__(arr): + return 1.0*(arr-np.min(arr))/(np.max(arr)-np.min(arr)) diff --git a/BLEval/computePathStats.py b/BLEval/computePathStats.py new file mode 100644 index 00000000..10ac574f --- /dev/null +++ b/BLEval/computePathStats.py @@ -0,0 +1,193 @@ +import pandas as pd +import sys +import numpy as np +import seaborn as sns +from pathlib import Path +import matplotlib.pyplot as plt +import seaborn as sns +sns.set(rc={"lines.linewidth": 2}, palette = "deep", style = "ticks") +from sklearn.metrics import precision_recall_curve, roc_curve, auc +from itertools import product, permutations, combinations, combinations_with_replacement +from tqdm import tqdm +import networkx as nx + +def pathAnalysis(dataDict, inputSettings): + ''' + Computes "directed","feed-forward", + "cascade", and "mutual" motifs. + ''' + + # Read file for trueEdges + trueEdgesDF = pd.read_csv(str(inputSettings.datadir)+'/'+ dataDict['name'] + + '/' +dataDict['trueEdges'], + sep = ',', + header = 0, index_col = None) + + possibleEdges = list(permutations(np.unique(trueEdgesDF.loc[:,['Gene1','Gene2']]), + r = 2)) + EdgeDict = {'|'.join(p):0 for p in possibleEdges} + + refGraph = nx.DiGraph() + + for key in EdgeDict.keys(): + u = key.split('|')[0] + v = key.split('|')[1] + if len(trueEdgesDF.loc[(trueEdgesDF['Gene1'] == u) & + (trueEdgesDF['Gene2'] == v)])>0: + refGraph.add_edge(u,v) + + #refCC, refFB, refFF, refMI = getNetProp(refGraph) + + + # set-up outDir that stores output directory name + outDir = "outputs/"+str(inputSettings.datadir).split("inputs/")[1]+ '/' +dataDict['name'] + #print(dataDict['name']) + + ################################################## + # Get counts of tp,fp with, and fp without paths # + ################################################## + collection = {} + + for algo in inputSettings.algorithms: + # check if the output rankedEdges file exists + if Path(outDir + '/' +algo[0]+'/rankedEdges.csv').exists() and algo[0] not in ['PPCOR','PIDC']: + # Initialize Precsion + predDF = pd.read_csv(outDir + '/' +algo[0]+'/rankedEdges.csv', \ + sep = '\t', header = 0, index_col = None) + + predDF.EdgeWeight = predDF.EdgeWeight.round(6) + predDF.EdgeWeight = predDF.EdgeWeight.abs() + predDF = predDF.loc[(predDF['EdgeWeight'] > 0)] + predDF.drop_duplicates(keep = 'first', inplace=True) + predDF.reset_index(drop = True, inplace= True) + predDF = predDF.loc[(predDF['Gene1'] != predDF['Gene2'])] + if predDF.shape[0] != 0: + maxk = min(predDF.shape[0], len(refGraph.edges())) + edgeWeightTopk = predDF.iloc[maxk-1].EdgeWeight + + newDF = predDF.loc[(predDF['EdgeWeight'] >= edgeWeightTopk)] + + predGraph = nx.DiGraph() + + + for key in EdgeDict.keys(): + u = key.split('|')[0] + v = key.split('|')[1] + if len(newDF.loc[(newDF['Gene1'] == u) & + (newDF['Gene2'] == v)])>0: + predGraph.add_edge(u,v) + + dataDict = pathStats(predGraph, refGraph) + collection[algo[0]] = dataDict + if algo[0] == 'PPCOR' or algo[0] == 'PIDC': + collection[algo[0]] = {} + else: + print(outDir + '/' +algo[0]+'/rankedEdges.csv', \ + ' is an undirected graph. Skipping...') + + hMap = '/pathStats' + largest = max([len(algopaths.keys()) for algo,algopaths in collection.items()]) + allpathsizes = set() + for algo,algopaths in collection.items(): + allpathsizes.update(set(algopaths.keys())) + + for algo, algopaths in collection.items(): + notpresent = allpathsizes.difference(set(algopaths.keys())) + collection[algo].update({np:0 for np in notpresent}) + + dataDF = pd.DataFrame(collection) + dataDF = dataDF.T + dataDF.to_csv(outDir+hMap+'.csv') + + + + +def getNetProp(inGraph): + ''' + Function to compute properties + of a given network. + ''' + + # number of weakly connected components in + # reference network + numCC = len(list(nx.weakly_connected_components(inGraph))) + + # number of feedback loop + # in reference network + allCyc = nx.simple_cycles(inGraph) + cycSet = set() + for cyc in allCyc: + if len(cyc) == 3: + cycSet.add(frozenset(cyc)) + + numFB = len(cycSet) + + # number of feedfwd loops + # in reference network + allPaths = [] + allPathsSet = set() + for u,v in inGraph.edges(): + allPaths = nx.all_simple_paths(inGraph, u, v, cutoff=2) + for p in allPaths: + if len(p) > 2: + allPathsSet.add(frozenset(p)) + + numFF= len(allPathsSet) + + + # number of mutual interactions + numMI = 0.0 + for u,v in inGraph.edges(): + if (v,u) in inGraph.edges(): + numMI += 0.5 + + return numCC, numFB, numFF, numMI + +def getEdgeHistogram(inGraph, refGraph): + falsePositives = set(inGraph.edges()).difference(refGraph.edges()) + edgeHistogramCounts = {0:0} + + for fe in falsePositives: + u,v = fe + try: + path = nx.dijkstra_path(refGraph,u,v) + pathlength = len(path) -1 + if pathlength in edgeHistogramCounts.keys(): + edgeHistogramCounts[pathlength] +=1 + else: + edgeHistogramCounts[pathlength] = 0 + + except nx.exception.NetworkXNoPath: + edgeHistogramCounts[0] +=1 + return edgeHistogramCounts + + + +def pathStats(inGraph, refGraph): + """ + Only returns TP, FP, numPredictions for each networks + """ + falsePositives = set(inGraph.edges()).difference(refGraph.edges()) + truePositives = set(inGraph.edges()).intersection(refGraph.edges()) + numPredictions = len(inGraph.edges()) + nopath = 0 + yespath = 0 + edgeCounts = {0:0,2:0,3:0,4:0,5:0} + for fe in falsePositives: + u,v = fe + try: + path = nx.dijkstra_path(refGraph,u,v) + pathlength = len(path) -1 + yespath +=1 + if pathlength in edgeCounts.keys(): + edgeCounts[pathlength] +=1 + + except nx.exception.NetworkXNoPath: + nopath +=1 + + edgeCounts['numPred'] = numPredictions + edgeCounts['numTP'] = len(truePositives) + edgeCounts['numFP_withPath'] = yespath + edgeCounts['numFP_noPath'] = nopath + return edgeCounts + \ No newline at end of file diff --git a/BLEvaluator.py b/BLEvaluator.py index 7dda6fa7..20f18111 100644 --- a/BLEvaluator.py +++ b/BLEvaluator.py @@ -56,8 +56,13 @@ def get_parser() -> argparse.ArgumentParser: parser.add_argument('-m','--motifs', action="store_true", default=False, help="Compute network motifs in the predicted top-k networks.") + + parser.add_argument('-p','--paths', action="store_true", default=False, + help="Compute path length statistics on the predicted top-k networks.") - + parser.add_argument('-b','--borda', action="store_true", default=False, + help="Compute edge ranked list using the various Borda aggregatio methods.") + return parser def parse_arguments(): @@ -137,6 +142,17 @@ def main(): FBL.to_csv(outDir+'NetworkMotifs-FBL.csv') FFL.to_csv(outDir+'NetworkMotifs-FFL.csv') MI.to_csv(outDir+'NetworkMotifs-MI.csv') + + # Compute path statistics such as number of TP, FP, + # and path lengths among TP in the top-k networks. + if (opts.paths): + print('\n\nComputing path length statistics on predicted networks...') + evalSummarizer.computePaths() + + # Compute edge ranked list using the borda method + if (opts.borda): + print('\n\nComputing edge ranked list using the borda method') + evalSummarizer.computeBorda() print('\n\nEvaluation complete...\n')