Commit
Added Borda analysis to master branch.
Showing 4 changed files with 332 additions and 13 deletions.
@@ -0,0 +1,78 @@
import os
import math
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from pathlib import Path
import concurrent.futures
import matplotlib.pyplot as plt
from itertools import permutations
from sklearn import preprocessing
from sklearn.metrics import precision_recall_curve, roc_curve, auc

sns.set(rc={"lines.linewidth": 2}, palette="deep", style="ticks")


def Borda(evalObject, selectedAlgorithms=None, aggregationMethod="average"):
    """
    Computes an aggregate edge ranking using the Borda method from the
    predicted ranked edges, i.e., the outputs of the different algorithms
    run on datasets generated from the same reference network.

    Parameters
    ----------
    evalObject: BLEval
        An object of class :class:`BLEval.BLEval`.

    selectedAlgorithms: [str]
        List of algorithm names on which to run the Borda method. If nothing
        is provided, the function runs Borda on all available algorithms.

    aggregationMethod: str
        Method used to aggregate ranks in the Borda method. Available options
        are {'average', 'min', 'max', 'first'}; default 'average'.

    :returns:
        - None
    """
    # Default to all available algorithms if no selection is provided.
    if selectedAlgorithms is None:
        selectedAlgorithms = [name for name, _ in evalObject.input_settings.algorithms]

    evaluationDFs = []
    for dataset in tqdm(evalObject.input_settings.datasets):
        edges = []
        for algorithmName, _ in tqdm(evalObject.input_settings.algorithms):
            outDir = str(evalObject.output_settings.base_dir) + \
                     str(evalObject.input_settings.datadir).split("inputs")[1] + \
                     "/" + dataset["name"]
            inDir = str(evalObject.input_settings.datadir) + "/" + dataset["name"]
            rank_path = outDir + "/" + algorithmName + "/rankedEdges.csv"
            refNetwork_path = inDir + "/" + dataset["trueEdges"]

            if not os.path.isdir(outDir) or not os.path.isdir(inDir):
                continue
            try:
                df = pd.read_csv(rank_path, sep="\t", header=0, index_col=None)
                refNetwork = pd.read_csv(refNetwork_path, header=0, index_col=None)
                refNetwork['edge'] = refNetwork.apply(lambda x: '%s-%s' % (x.Gene1, x.Gene2), axis=1)
                refNetwork = refNetwork[refNetwork.Gene1 != refNetwork.Gene2]
                refNetwork['isReferenceEdge'] = 1
                # All possible directed edges (no self-loops) among the genes in the reference network.
                all_edges_df = pd.DataFrame(list(permutations(np.unique(refNetwork.loc[:, ['Gene1', 'Gene2']]), r=2)),
                                            columns=['Gene1', 'Gene2'])
                ranked_edges = pd.merge(all_edges_df, df, on=['Gene1', 'Gene2'], how='left')
                ranked_edges.EdgeWeight = ranked_edges.EdgeWeight.fillna(0)
                ranked_edges['absEdgeWeight'] = ranked_edges['EdgeWeight'].abs()
                # Rescale absolute edge weights to [0, 1] so ranks are comparable across algorithms.
                ranked_edges['normEdgeWeight'] = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(ranked_edges[['absEdgeWeight']]))[0]
                ranked_edges['dataset'] = dataset["name"]
                ranked_edges['algo'] = algorithmName
                ranked_edges['edge'] = ranked_edges.apply(lambda x: '%s-%s' % (x.Gene1, x.Gene2), axis=1)
                edges.append(ranked_edges)
            except Exception as e:
                print("\nSkipping Borda computation for", algorithmName, "on path", outDir)
                if algorithmName in selectedAlgorithms:
                    selectedAlgorithms.remove(algorithmName)
                continue

        # One row per edge, one column per algorithm, holding normalized edge weights.
        rank_df = pd.pivot_table(pd.concat(edges), values='normEdgeWeight', index='edge', columns='algo')
        allAlgorithms = rank_df.columns.tolist()
        # BORDA/sBORDA: mean of ascending ranks; mBORDA/smBORDA: mean of 1/rank^2 of descending
        # ranks. The s* variants are restricted to the selected algorithms.
        rank_df['BORDA'] = __normalize__(rank_df[allAlgorithms].rank(ascending=True, method=aggregationMethod).mean(axis=1).values)
        rank_df['mBORDA'] = __normalize__(rank_df[allAlgorithms].rank(ascending=False, method=aggregationMethod).apply(lambda x: 1.0/(x*x)).mean(axis=1).values)
        rank_df['sBORDA'] = __normalize__(rank_df[selectedAlgorithms].rank(ascending=True, method=aggregationMethod).mean(axis=1).values)
        rank_df['smBORDA'] = __normalize__(rank_df[selectedAlgorithms].rank(ascending=False, method=aggregationMethod).apply(lambda x: 1.0/(x*x)).mean(axis=1).values)
        rank_df['Gene1'], rank_df['Gene2'] = rank_df.index.str.split('-', n=1).str
        rank_df[['Gene1', 'Gene2', 'BORDA', 'mBORDA', 'sBORDA', 'smBORDA']].to_csv(outDir + "/Borda.csv", index=False)


def __normalize__(arr):
    # Min-max normalization to [0, 1].
    return 1.0 * (arr - np.min(arr)) / (np.max(arr) - np.min(arr))
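# --- Illustration (not part of this commit) ---
# A minimal, self-contained sketch of the rank aggregation above on a toy table
# with two hypothetical algorithms "A" and "B": BORDA averages ascending ranks
# of the normalized edge weights, mBORDA averages 1/rank^2 of the descending
# ranks, and the local normalize() mirrors __normalize__ above.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'A': [0.9, 0.5, 0.1],
                    'B': [0.8, 0.2, 0.4]},
                   index=['g1-g2', 'g2-g3', 'g3-g1'])

def normalize(arr):
    return 1.0 * (arr - np.min(arr)) / (np.max(arr) - np.min(arr))

borda = normalize(toy.rank(ascending=True, method='average').mean(axis=1).values)
mborda = normalize(toy.rank(ascending=False, method='average').apply(lambda x: 1.0 / (x * x)).mean(axis=1).values)
print(borda)   # [1. 0. 0.] -> g1-g2 gets the highest aggregate score
print(mborda)  # g1-g2 is ranked first by both algorithms, so it dominates mBORDA too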
@@ -0,0 +1,193 @@
import pandas as pd
import sys
import numpy as np
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt
sns.set(rc={"lines.linewidth": 2}, palette="deep", style="ticks")
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from itertools import product, permutations, combinations, combinations_with_replacement
from tqdm import tqdm
import networkx as nx


def pathAnalysis(dataDict, inputSettings):
    '''
    Computes "directed", "feed-forward",
    "cascade", and "mutual" motifs.
    '''

    # Read the file containing the true (reference) edges.
    trueEdgesDF = pd.read_csv(str(inputSettings.datadir) + '/' + dataDict['name'] +
                              '/' + dataDict['trueEdges'],
                              sep=',',
                              header=0, index_col=None)

    possibleEdges = list(permutations(np.unique(trueEdgesDF.loc[:, ['Gene1', 'Gene2']]),
                                      r=2))
    EdgeDict = {'|'.join(p): 0 for p in possibleEdges}

    refGraph = nx.DiGraph()

    for key in EdgeDict.keys():
        u = key.split('|')[0]
        v = key.split('|')[1]
        if len(trueEdgesDF.loc[(trueEdgesDF['Gene1'] == u) &
                               (trueEdgesDF['Gene2'] == v)]) > 0:
            refGraph.add_edge(u, v)

    # refCC, refFB, refFF, refMI = getNetProp(refGraph)

    # Set up outDir, which stores the output directory name.
    outDir = "outputs/" + str(inputSettings.datadir).split("inputs/")[1] + '/' + dataDict['name']

    ####################################################
    # Get counts of TP, FP with, and FP without paths. #
    ####################################################
    collection = {}

    for algo in inputSettings.algorithms:
        # Check whether the output rankedEdges file exists.
        if Path(outDir + '/' + algo[0] + '/rankedEdges.csv').exists() and algo[0] not in ['PPCOR', 'PIDC']:
            predDF = pd.read_csv(outDir + '/' + algo[0] + '/rankedEdges.csv',
                                 sep='\t', header=0, index_col=None)

            # Clean up predictions: drop zero-weight edges, duplicates, and self-loops.
            predDF.EdgeWeight = predDF.EdgeWeight.round(6)
            predDF.EdgeWeight = predDF.EdgeWeight.abs()
            predDF = predDF.loc[(predDF['EdgeWeight'] > 0)]
            predDF.drop_duplicates(keep='first', inplace=True)
            predDF.reset_index(drop=True, inplace=True)
            predDF = predDF.loc[(predDF['Gene1'] != predDF['Gene2'])]
            if predDF.shape[0] != 0:
                # Keep the top-k predicted edges, where k is the number of reference edges.
                maxk = min(predDF.shape[0], len(refGraph.edges()))
                edgeWeightTopk = predDF.iloc[maxk - 1].EdgeWeight

                newDF = predDF.loc[(predDF['EdgeWeight'] >= edgeWeightTopk)]

                predGraph = nx.DiGraph()

                for key in EdgeDict.keys():
                    u = key.split('|')[0]
                    v = key.split('|')[1]
                    if len(newDF.loc[(newDF['Gene1'] == u) &
                                     (newDF['Gene2'] == v)]) > 0:
                        predGraph.add_edge(u, v)

                collection[algo[0]] = pathStats(predGraph, refGraph)
            if algo[0] == 'PPCOR' or algo[0] == 'PIDC':
                collection[algo[0]] = {}
        else:
            print(outDir + '/' + algo[0] + '/rankedEdges.csv',
                  ' does not exist or is an undirected graph. Skipping...')

    hMap = '/pathStats'
    largest = max([len(algopaths.keys()) for algo, algopaths in collection.items()])
    allpathsizes = set()
    for algo, algopaths in collection.items():
        allpathsizes.update(set(algopaths.keys()))

    # Fill in zero counts for path lengths an algorithm did not produce.
    for algo, algopaths in collection.items():
        notpresent = allpathsizes.difference(set(algopaths.keys()))
        collection[algo].update({pLen: 0 for pLen in notpresent})

    dataDF = pd.DataFrame(collection)
    dataDF = dataDF.T
    dataDF.to_csv(outDir + hMap + '.csv')
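# --- Illustration (not part of this commit) ---
# A minimal sketch of the top-k cut-off used above: predictions (assumed sorted
# by decreasing EdgeWeight, as in rankedEdges.csv) are truncated at k = number
# of reference edges, keeping any ties at the k-th weight. The gene names and
# weights below are made up.
import pandas as pd

toyPred = pd.DataFrame({'Gene1': ['g1', 'g2', 'g3', 'g1'],
                        'Gene2': ['g2', 'g3', 'g1', 'g3'],
                        'EdgeWeight': [0.9, 0.7, 0.7, 0.1]})
numRefEdges = 2
maxk = min(toyPred.shape[0], numRefEdges)
edgeWeightTopk = toyPred.iloc[maxk - 1].EdgeWeight           # weight of the k-th ranked edge
print(toyPred.loc[toyPred['EdgeWeight'] >= edgeWeightTopk])  # keeps the top 2 edges plus the tied g3->g1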
def getNetProp(inGraph):
    '''
    Function to compute properties
    of a given network.
    '''

    # Number of weakly connected components in
    # the reference network.
    numCC = len(list(nx.weakly_connected_components(inGraph)))

    # Number of feedback loops
    # in the reference network.
    allCyc = nx.simple_cycles(inGraph)
    cycSet = set()
    for cyc in allCyc:
        if len(cyc) == 3:
            cycSet.add(frozenset(cyc))

    numFB = len(cycSet)

    # Number of feed-forward loops
    # in the reference network.
    allPaths = []
    allPathsSet = set()
    for u, v in inGraph.edges():
        allPaths = nx.all_simple_paths(inGraph, u, v, cutoff=2)
        for p in allPaths:
            if len(p) > 2:
                allPathsSet.add(frozenset(p))

    numFF = len(allPathsSet)

    # Number of mutual interactions.
    numMI = 0.0
    for u, v in inGraph.edges():
        if (v, u) in inGraph.edges():
            numMI += 0.5

    return numCC, numFB, numFF, numMI
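# --- Illustration (not part of this commit) ---
# A minimal sketch of getNetProp (defined above) on a toy directed graph:
# a->b->c->a is a feedback loop, a->b->c with the shortcut a->c is a
# feed-forward loop, and a->c together with c->a is a mutual interaction.
import networkx as nx

toyGraph = nx.DiGraph([('a', 'b'), ('b', 'c'), ('c', 'a'), ('a', 'c')])
numCC, numFB, numFF, numMI = getNetProp(toyGraph)
print(numCC, numFB, numFF, numMI)  # expected: 1 1 1 1.0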
def getEdgeHistogram(inGraph, refGraph):
    # For every false-positive edge, record the length of the shortest path between
    # its endpoints in the reference network (bucket 0 holds pairs with no path).
    falsePositives = set(inGraph.edges()).difference(refGraph.edges())
    edgeHistogramCounts = {0: 0}

    for fe in falsePositives:
        u, v = fe
        try:
            path = nx.dijkstra_path(refGraph, u, v)
            pathlength = len(path) - 1
            if pathlength in edgeHistogramCounts.keys():
                edgeHistogramCounts[pathlength] += 1
            else:
                edgeHistogramCounts[pathlength] = 1
        except nx.exception.NetworkXNoPath:
            edgeHistogramCounts[0] += 1
    return edgeHistogramCounts
def pathStats(inGraph, refGraph):
    """
    Returns the number of predictions, true positives, and false positives for
    each network, along with a histogram of reference-path lengths between the
    endpoints of false-positive edges.
    """
    falsePositives = set(inGraph.edges()).difference(refGraph.edges())
    truePositives = set(inGraph.edges()).intersection(refGraph.edges())
    numPredictions = len(inGraph.edges())
    nopath = 0
    yespath = 0
    edgeCounts = {0: 0, 2: 0, 3: 0, 4: 0, 5: 0}
    for fe in falsePositives:
        u, v = fe
        try:
            path = nx.dijkstra_path(refGraph, u, v)
            pathlength = len(path) - 1
            yespath += 1
            if pathlength in edgeCounts.keys():
                edgeCounts[pathlength] += 1
        except nx.exception.NetworkXNoPath:
            nopath += 1

    edgeCounts['numPred'] = numPredictions
    edgeCounts['numTP'] = len(truePositives)
    edgeCounts['numFP_withPath'] = yespath
    edgeCounts['numFP_noPath'] = nopath
    return edgeCounts
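# --- Illustration (not part of this commit) ---
# A minimal sketch of pathStats (defined above) on toy predicted and reference
# networks: a->b is a true positive, a->c is a false positive whose endpoints
# are connected by a 2-hop path in the reference, and c->a is a false positive
# with no reference path.
import networkx as nx

toyRef = nx.DiGraph([('a', 'b'), ('b', 'c')])
toyPred = nx.DiGraph([('a', 'b'), ('a', 'c'), ('c', 'a')])
print(pathStats(toyPred, toyRef))
# expected: {0: 0, 2: 1, 3: 0, 4: 0, 5: 0, 'numPred': 3, 'numTP': 1,
#            'numFP_withPath': 1, 'numFP_noPath': 1}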