Skip to content

Commit

Permalink
Added Borda analysis to master branch.
Browse files Browse the repository at this point in the history
  • Loading branch information
adyprat committed Feb 24, 2020
1 parent e5d932c commit 16f6c68
Show file tree
Hide file tree
Showing 4 changed files with 332 additions and 13 deletions.
56 changes: 44 additions & 12 deletions BLEval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,12 @@
# local imports
from BLEval.parseTime import getTime
from BLEval.computeDGAUC import PRROC
from BLEval.computeBorda import Borda
from BLEval.computeJaccard import Jaccard
from BLEval.computeSpearman import Spearman
from BLEval.computeNetMotifs import Motifs
from BLEval.computeEarlyPrec import EarlyPrec
#from BLEval.computePathStats import pathAnalysis
from BLEval.computePathStats import pathAnalysis
from BLEval.computeSignedEPrec import signedEPrec


Expand Down Expand Up @@ -223,17 +224,17 @@ def computeNetMotifs(self):
return FBL, FFL, MI


# def computePaths(self):
# '''
# For each algorithm-dataset combination, this function computes path lengths
# through TP edges and FP edges, returns statistics on path lengths.

# :returns:
# - pathStats: A dataframe path lengths in predicted network
# '''
# for dataset in tqdm(self.input_settings.datasets,
# total = len(self.input_settings.datasets), unit = " Datasets"):
# pathAnalysis(dataset, self.input_settings)
def computePaths(self):
    '''
    Run the path-length analysis on every configured dataset.

    Each dataset in ``self.input_settings.datasets`` is handed, together
    with the input settings, to :func:`pathAnalysis`; a progress bar is
    shown while the datasets are processed.

    :returns:
        None. Nothing is collected or returned by this method.
    '''
    datasets = self.input_settings.datasets
    progress = tqdm(datasets, total = len(datasets), unit = " Datasets")
    for dataset in progress:
        pathAnalysis(dataset, self.input_settings)


def computeEarlyPrec(self):
Expand Down Expand Up @@ -281,6 +282,37 @@ def computeSignedEPrec(self):
sEPRDict['EPrec Inhibition'][algo[0]] = sEPrecDF['-']
return(pd.DataFrame(sEPRDict['EPrec Activation']).T, pd.DataFrame(sEPRDict['EPrec Inhibition']).T)

def computeBorda(self, selectedAlgorithms=None, aggregationMethod="average"):
    '''
    Computes edge ranked list using the Borda method for each dataset.

    The arguments are validated first; on invalid input an error message
    is printed and the method returns without running the analysis.

    Parameters
    ----------
    selectedAlgorithms: [str]
        List of algorithm names used to run borda method on selected
        algorithms. If nothing is provided, the function runs borda on
        all available algorithms.
    aggregationMethod: str
        Method used to aggregate rank in borda method. Available options are
        {'average', 'min', 'max', 'first'}, default 'average'

    :returns:
        None
    '''
    feasibleAlgorithmOptions = [algorithmName for algorithmName, _ in self.input_settings.algorithms]
    feasibleaggregationMethodOptions = ['average', 'min', 'max', 'first']

    # Always work on a private copy so that nothing downstream can modify
    # the list object owned by the caller.
    if selectedAlgorithms is None:
        selectedAlgorithms = list(feasibleAlgorithmOptions)
    else:
        selectedAlgorithms = list(selectedAlgorithms)

    for a in selectedAlgorithms:
        if a not in feasibleAlgorithmOptions:
            print("\nERROR: No data available on algorithm %s. Please choose an algorithm from the following options: %s" % (a, feasibleAlgorithmOptions))
            return

    if aggregationMethod not in feasibleaggregationMethodOptions:
        # The options are wrapped in a 1-tuple so the list is formatted as a
        # single value by the %s placeholder.
        print("\nERROR: Please choose an aggregation method algorithm from following options: %s" % (feasibleaggregationMethodOptions,))
        return

    Borda(self, selectedAlgorithms, aggregationMethod)

class ConfigParser(object):
'''
Expand Down
78 changes: 78 additions & 0 deletions BLEval/computeBorda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import os
import math
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from pathlib import Path
import concurrent.futures
import matplotlib.pyplot as plt
from itertools import permutations
from sklearn import preprocessing
from sklearn.metrics import precision_recall_curve, roc_curve, auc
sns.set(rc={"lines.linewidth": 2}, palette = "deep", style = "ticks")


def Borda(evalObject, selectedAlgorithms=None, aggregationMethod="average"):
    """
    Compute Borda-aggregated edge rankings for each dataset.

    For every dataset the ranked edge lists (``rankedEdges.csv``) of the
    configured algorithms are loaded, restricted to all ordered gene pairs
    of the reference network (self-loops excluded), min-max normalised on
    the absolute edge weight, and aggregated into four scores that are
    written to ``<outDir>/Borda.csv``:

    - ``BORDA``: mean ascending rank over all algorithms with usable output
    - ``mBORDA``: mean of 1/rank^2 (descending rank) over the same algorithms
    - ``sBORDA`` / ``smBORDA``: the same two scores restricted to
      ``selectedAlgorithms``

    Parameters
    ----------
    evalObject: BLEval
      An object of class :class:`BLEval.BLEval`.
    selectedAlgorithms: [str]
      List of algorithm names used to run borda method on selected
      algorithms. If nothing is provided, the function runs borda on
      all available algorithms. The list is never modified.
    aggregationMethod: str
      Method used to aggregate rank in borda method. Available options are
      {'average', 'min', 'max', 'first'}, default 'average'

    :returns:
        - None
    """
    configuredAlgorithms = [algorithmName for algorithmName, _ in evalObject.input_settings.algorithms]
    # Private copy: the caller's list must not be altered, and a failure on
    # one dataset must not remove an algorithm from the following datasets.
    if selectedAlgorithms is None:
        requestedAlgorithms = list(configuredAlgorithms)
    else:
        requestedAlgorithms = list(selectedAlgorithms)
    outputColumns = ['Gene1', 'Gene2', 'BORDA', 'mBORDA', 'sBORDA', 'smBORDA']

    def ascendingRankMean(frame):
        # Higher normalised weight -> higher rank -> higher score.
        return frame.rank(ascending=True, method=aggregationMethod).mean(axis=1).values

    def reciprocalSquareRankMean(frame):
        # Rank 1 is the strongest edge; 1/rank^2 emphasises top-ranked edges.
        ranks = frame.rank(ascending=False, method=aggregationMethod)
        return ranks.apply(lambda x: 1.0/(x*x)).mean(axis=1).values

    for dataset in tqdm(evalObject.input_settings.datasets):
        outDir = str(evalObject.output_settings.base_dir) + \
                 str(evalObject.input_settings.datadir).split("inputs")[1] + \
                 "/" + dataset["name"]
        inDir = str(evalObject.input_settings.datadir) + "/" + dataset["name"]

        # Neither directory depends on the algorithm, so a missing one means
        # there is nothing to aggregate for this dataset.
        if not os.path.isdir(outDir) or not os.path.isdir(inDir):
            continue

        # The reference network is identical for all algorithms: read it once.
        refNetwork_path = inDir + "/" + dataset["trueEdges"]
        try:
            refNetwork = pd.read_csv(refNetwork_path, header=0, index_col=None)
            refNetwork = refNetwork[refNetwork.Gene1 != refNetwork.Gene2]
            genes = np.unique(refNetwork.loc[:, ['Gene1', 'Gene2']])
        except Exception as e:
            print("\nSkipping Borda computation for dataset", dataset["name"], "on path", inDir, ":", e)
            continue
        all_edges_df = pd.DataFrame(list(permutations(genes, r = 2)), columns=['Gene1', 'Gene2'])

        edges = []
        for algorithmName in tqdm(configuredAlgorithms):
            rank_path = outDir + "/" + algorithmName + "/rankedEdges.csv"
            try:
                df = pd.read_csv(rank_path, sep="\t", header=0, index_col=None)
                ranked_edges = pd.merge(all_edges_df, df, on=['Gene1', 'Gene2'], how='left')
                # Edges an algorithm did not report get weight 0.
                ranked_edges['EdgeWeight'] = ranked_edges['EdgeWeight'].fillna(0)
                ranked_edges['absEdgeWeight'] = ranked_edges['EdgeWeight'].abs()
                ranked_edges['normEdgeWeight'] = preprocessing.MinMaxScaler().fit_transform(
                    ranked_edges[['absEdgeWeight']])[:, 0]
                ranked_edges['dataset'] = dataset["name"]
                ranked_edges['algo'] = algorithmName
                edges.append(ranked_edges)
            except Exception as e:
                # Best effort: an algorithm without usable output is left out
                # of this dataset's aggregation only.
                print("\nSkipping Borda computation for ", algorithmName, "on path", outDir, ":", e)
                continue

        if not edges:
            # pd.concat raises on an empty list; nothing to write.
            continue

        # Index on the gene pair itself rather than on a "Gene1-Gene2" string,
        # so gene names that contain '-' are written back unchanged.
        scores = pd.pivot_table(pd.concat(edges), values='normEdgeWeight',
                                index=['Gene1', 'Gene2'], columns='algo')
        selectedColumns = [name for name in requestedAlgorithms if name in scores.columns]

        # Every score is derived from `scores`, which holds algorithm columns
        # only, so one aggregate can never be ranked as if it were an algorithm.
        result = pd.DataFrame(index=scores.index)
        result['BORDA'] = __normalize__(ascendingRankMean(scores))
        result['mBORDA'] = __normalize__(reciprocalSquareRankMean(scores))
        if selectedColumns:
            result['sBORDA'] = __normalize__(ascendingRankMean(scores[selectedColumns]))
            result['smBORDA'] = __normalize__(reciprocalSquareRankMean(scores[selectedColumns]))
        else:
            # None of the selected algorithms produced output for this dataset.
            result['sBORDA'] = np.nan
            result['smBORDA'] = np.nan

        result.reset_index()[outputColumns].to_csv(outDir+"/Borda.csv", index=False)



def __normalize__(arr):
    """
    Min-max scale ``arr`` to the range [0, 1].

    :param arr: numeric array (callers in this module pass a 1-D numpy array).
    :returns: ``(arr - min) / (max - min)`` as floats. When every value is
        identical the range is zero; in that case an all-zero result is
        returned instead of the NaNs that 0/0 would produce.
    """
    lowest = np.min(arr)
    spread = np.max(arr) - lowest
    shifted = 1.0*(arr - lowest)
    if spread == 0:
        # Constant input: `shifted` is already all zeros.
        return shifted
    return shifted/spread
193 changes: 193 additions & 0 deletions BLEval/computePathStats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
import pandas as pd
import sys
import numpy as np
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={"lines.linewidth": 2}, palette = "deep", style = "ticks")
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from itertools import product, permutations, combinations, combinations_with_replacement
from tqdm import tqdm
import networkx as nx

def _directedGraphOnGenes(edgeDF, genes):
    '''
    Build a directed graph from the Gene1 -> Gene2 rows of ``edgeDF``,
    keeping only rows whose endpoints are both in ``genes`` and differ.
    '''
    keep = (edgeDF['Gene1'].isin(genes)
            & edgeDF['Gene2'].isin(genes)
            & (edgeDF['Gene1'] != edgeDF['Gene2']))
    graph = nx.DiGraph()
    graph.add_edges_from(zip(edgeDF.loc[keep, 'Gene1'], edgeDF.loc[keep, 'Gene2']))
    return graph


def pathAnalysis(dataDict, inputSettings):
    '''
    Computes path statistics of the predicted top-k networks of one dataset.

    The reference network is read from the dataset's trueEdges file. For
    every algorithm (except PPCOR and PIDC, which are skipped) whose
    rankedEdges.csv exists, the top-k predicted edges (k = number of edges
    in the reference network, ties on the k-th edge weight included) are
    turned into a directed graph and passed to :func:`pathStats` together
    with the reference graph. The per-algorithm results are written to
    ``<outDir>/pathStats.csv``, one row per algorithm.

    :param dataDict: dataset description; the keys 'name' and 'trueEdges'
        are read.
    :param inputSettings: settings object; ``datadir`` and ``algorithms``
        are read.
    :returns: None
    '''

    # Read file for trueEdges
    trueEdgesDF = pd.read_csv(str(inputSettings.datadir)+'/'+ dataDict['name'] +
                              '/' +dataDict['trueEdges'],
                              sep = ',',
                              header = 0, index_col = None)

    # Only edges between genes of the reference network are considered.
    genes = set(np.unique(trueEdgesDF.loc[:,['Gene1','Gene2']]))
    refGraph = _directedGraphOnGenes(trueEdgesDF, genes)

    # set-up outDir that stores output directory name
    outDir = "outputs/"+str(inputSettings.datadir).split("inputs/")[1]+ '/' +dataDict['name']

    ##################################################
    # Get counts of tp,fp with, and fp without paths #
    ##################################################
    collection = {}

    for algo in inputSettings.algorithms:
        rankedEdgesPath = outDir + '/' +algo[0]+'/rankedEdges.csv'

        if algo[0] in ['PPCOR','PIDC']:
            print(rankedEdgesPath, \
                  ' is an undirected graph. Skipping...')
            continue

        # check if the output rankedEdges file exists
        if not Path(rankedEdgesPath).exists():
            print(rankedEdgesPath, ' does not exist. Skipping...')
            continue

        predDF = pd.read_csv(rankedEdgesPath, \
                             sep = '\t', header = 0, index_col = None)

        predDF.EdgeWeight = predDF.EdgeWeight.round(6)
        predDF.EdgeWeight = predDF.EdgeWeight.abs()
        predDF = predDF.loc[(predDF['EdgeWeight'] > 0)]
        predDF = predDF.drop_duplicates(keep = 'first').reset_index(drop = True)
        predDF = predDF.loc[(predDF['Gene1'] != predDF['Gene2'])]
        if predDF.shape[0] == 0:
            # Nothing was predicted with a non-zero weight.
            continue

        # NOTE(review): relies on rankedEdges.csv being sorted by decreasing
        # edge weight, so that row k-1 holds the k-th largest weight - confirm.
        maxk = min(predDF.shape[0], len(refGraph.edges()))
        edgeWeightTopk = predDF.iloc[maxk-1].EdgeWeight

        newDF = predDF.loc[(predDF['EdgeWeight'] >= edgeWeightTopk)]
        predGraph = _directedGraphOnGenes(newDF, genes)

        collection[algo[0]] = pathStats(predGraph, refGraph)

    if not collection:
        # No algorithm produced usable output; there is nothing to write.
        print('No path statistics computed for', dataDict['name'])
        return

    # Give every algorithm the same set of keys so the table has no gaps.
    allpathsizes = set()
    for algopaths in collection.values():
        allpathsizes.update(algopaths.keys())

    for algopaths in collection.values():
        for missing in allpathsizes.difference(algopaths.keys()):
            algopaths[missing] = 0

    dataDF = pd.DataFrame(collection).T
    dataDF.to_csv(outDir+'/pathStats.csv')




def getNetProp(inGraph):
    '''
    Summarise structural properties of a directed network.

    :param inGraph: directed graph to analyse.
    :returns: tuple ``(numCC, numFB, numFF, numMI)`` with the number of
        weakly connected components, of distinct node sets forming a
        simple cycle over three nodes, of distinct three-node sets that
        form a two-step path between the endpoints of an existing edge,
        and of mutual interactions (0.5 is counted for every edge whose
        reverse edge is also present).
    '''

    # weakly connected components
    numCC = sum(1 for _ in nx.weakly_connected_components(inGraph))

    # feedback loops: simple cycles over exactly three nodes,
    # de-duplicated by their node set
    feedbackSets = {frozenset(cycle)
                    for cycle in nx.simple_cycles(inGraph)
                    if len(cycle) == 3}
    numFB = len(feedbackSets)

    # feed-forward loops: for every edge src -> dst, the node sets of the
    # simple paths from src to dst that pass through one more node
    feedForwardSets = set()
    for src, dst in inGraph.edges():
        for route in nx.all_simple_paths(inGraph, src, dst, cutoff=2):
            if len(route) > 2:
                feedForwardSets.add(frozenset(route))
    numFF = len(feedForwardSets)

    # mutual interactions: each direction of a reciprocal pair adds one half
    reciprocated = sum(1 for src, dst in inGraph.edges()
                       if (dst, src) in inGraph.edges())
    numMI = 0.5 * reciprocated

    return numCC, numFB, numFF, numMI

def getEdgeHistogram(inGraph, refGraph):
    """
    Histogram of reference-network path lengths for false-positive edges.

    A false positive is an edge of ``inGraph`` that is not an edge of
    ``refGraph``. For each of them the shortest path between its endpoints
    in ``refGraph`` is looked up.

    :param inGraph: predicted directed graph.
    :param refGraph: reference directed graph.
    :returns: dict mapping path length (number of edges) to the number of
        false-positive edges with that length. Key 0 also counts the edges
        whose endpoints are not connected in ``refGraph``.
    """
    falsePositives = set(inGraph.edges()).difference(refGraph.edges())
    edgeHistogramCounts = {0:0}

    for u, v in falsePositives:
        try:
            pathlength = len(nx.dijkstra_path(refGraph, u, v)) - 1
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            # No route in the reference network; a source gene that has no
            # edge there at all is treated the same way.
            edgeHistogramCounts[0] += 1
        else:
            # The first edge seen for a length must count as 1, not 0.
            edgeHistogramCounts[pathlength] = edgeHistogramCounts.get(pathlength, 0) + 1
    return edgeHistogramCounts



def pathStats(inGraph, refGraph):
    """
    Count true/false positive edges of a predicted network and relate the
    false positives to paths in the reference network.

    :param inGraph: predicted directed graph.
    :param refGraph: reference directed graph.
    :returns: dict with the keys

        - 0, 2, 3, 4, 5: number of false-positive edges whose shortest path
          in ``refGraph`` has that many edges (other lengths are only
          included in ``numFP_withPath``)
        - 'numPred': number of edges in ``inGraph``
        - 'numTP': number of edges of ``inGraph`` that are in ``refGraph``
        - 'numFP_withPath': false positives whose endpoints are connected
          by a path in ``refGraph``
        - 'numFP_noPath': false positives without such a path
    """
    predictedEdges = set(inGraph.edges())
    falsePositives = predictedEdges.difference(refGraph.edges())
    truePositives = predictedEdges.intersection(refGraph.edges())
    numPredictions = len(inGraph.edges())
    nopath = 0
    yespath = 0
    edgeCounts = {0:0,2:0,3:0,4:0,5:0}
    for u, v in falsePositives:
        try:
            path = nx.dijkstra_path(refGraph,u,v)
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            # A source gene without any edge in the reference network is not
            # a node of refGraph; that is counted as "no path" as well.
            nopath +=1
            continue

        yespath +=1
        pathlength = len(path) -1
        if pathlength in edgeCounts:
            edgeCounts[pathlength] +=1

    edgeCounts['numPred'] = numPredictions
    edgeCounts['numTP'] = len(truePositives)
    edgeCounts['numFP_withPath'] = yespath
    edgeCounts['numFP_noPath'] = nopath
    return edgeCounts

18 changes: 17 additions & 1 deletion BLEvaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,13 @@ def get_parser() -> argparse.ArgumentParser:

parser.add_argument('-m','--motifs', action="store_true", default=False,
help="Compute network motifs in the predicted top-k networks.")

parser.add_argument('-p','--paths', action="store_true", default=False,
help="Compute path length statistics on the predicted top-k networks.")


parser.add_argument('-b','--borda', action="store_true", default=False,
help="Compute edge ranked list using the various Borda aggregatio methods.")

return parser

def parse_arguments():
Expand Down Expand Up @@ -137,6 +142,17 @@ def main():
FBL.to_csv(outDir+'NetworkMotifs-FBL.csv')
FFL.to_csv(outDir+'NetworkMotifs-FFL.csv')
MI.to_csv(outDir+'NetworkMotifs-MI.csv')

# Compute path statistics such as number of TP, FP,
# and path lengths among TP in the top-k networks.
if (opts.paths):
print('\n\nComputing path length statistics on predicted networks...')
evalSummarizer.computePaths()

# Compute edge ranked list using the borda method
if (opts.borda):
print('\n\nComputing edge ranked list using the borda method')
evalSummarizer.computeBorda()


print('\n\nEvaluation complete...\n')
Expand Down

0 comments on commit 16f6c68

Please sign in to comment.