Commit
Added Borda analysis to master branch.
Showing 4 changed files with 332 additions and 13 deletions.
@@ -0,0 +1,78 @@
import os
import math
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from pathlib import Path
import concurrent.futures
import matplotlib.pyplot as plt
from itertools import permutations
from sklearn import preprocessing
from sklearn.metrics import precision_recall_curve, roc_curve, auc

sns.set(rc={"lines.linewidth": 2}, palette="deep", style="ticks")


def Borda(evalObject, selectedAlgorithms=None, aggregationMethod="average"):
    """
    Computes an aggregate edge ranking using the Borda method from the
    predicted ranked edges, i.e., the outputs of the different algorithms
    run on datasets generated from the same reference network.

    Parameters
    ----------
    evalObject: BLEval
        An object of class :class:`BLEval.BLEval`.

    selectedAlgorithms: [str]
        List of algorithm names on which to run the Borda method. If nothing
        is provided, the function runs Borda on all available algorithms.

    aggregationMethod: str
        Method used to aggregate ranks in the Borda method. Available options
        are {'average', 'min', 'max', 'first'}; default 'average'.

    :returns:
        - None
    """
    # Default to all available algorithms if no selection is provided.
    if selectedAlgorithms is None:
        selectedAlgorithms = [name for name, _ in evalObject.input_settings.algorithms]

    evaluationDFs = []
    for dataset in tqdm(evalObject.input_settings.datasets):
        edges = []
        for algorithmName, _ in tqdm(evalObject.input_settings.algorithms):
            outDir = str(evalObject.output_settings.base_dir) + \
                     str(evalObject.input_settings.datadir).split("inputs")[1] + \
                     "/" + dataset["name"]
            inDir = str(evalObject.input_settings.datadir) + "/" + dataset["name"]
            rank_path = outDir + "/" + algorithmName + "/rankedEdges.csv"
            refNetwork_path = inDir + "/" + dataset["trueEdges"]

            if not os.path.isdir(outDir) or not os.path.isdir(inDir):
                continue
            try:
                df = pd.read_csv(rank_path, sep="\t", header=0, index_col=None)
                refNetwork = pd.read_csv(refNetwork_path, header=0, index_col=None)
                refNetwork['edge'] = refNetwork.apply(lambda x: '%s-%s' % (x.Gene1, x.Gene2), axis=1)
                refNetwork = refNetwork[refNetwork.Gene1 != refNetwork.Gene2]
                refNetwork['isReferenceEdge'] = 1
                # All possible directed edges (no self-loops) among the genes in the reference network.
                all_edges_df = pd.DataFrame(list(permutations(np.unique(refNetwork.loc[:, ['Gene1', 'Gene2']]), r=2)),
                                            columns=['Gene1', 'Gene2'])
                ranked_edges = pd.merge(all_edges_df, df, on=['Gene1', 'Gene2'], how='left')
                ranked_edges.EdgeWeight = ranked_edges.EdgeWeight.fillna(0)
                ranked_edges['absEdgeWeight'] = ranked_edges['EdgeWeight'].abs()
                # Rescale absolute edge weights to [0, 1] so ranks are comparable across algorithms.
                ranked_edges['normEdgeWeight'] = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(ranked_edges[['absEdgeWeight']]))[0]
                ranked_edges['dataset'] = dataset["name"]
                ranked_edges['algo'] = algorithmName
                ranked_edges['edge'] = ranked_edges.apply(lambda x: '%s-%s' % (x.Gene1, x.Gene2), axis=1)
                edges.append(ranked_edges)
            except Exception as e:
                print("\nSkipping Borda computation for", algorithmName, "on path", outDir)
                if algorithmName in selectedAlgorithms:
                    selectedAlgorithms.remove(algorithmName)
                continue

        # One row per edge, one column per algorithm, holding normalized edge weights.
        rank_df = pd.pivot_table(pd.concat(edges), values='normEdgeWeight', index='edge', columns='algo')
        allAlgorithms = rank_df.columns.tolist()
        # BORDA/sBORDA: mean of ascending ranks; mBORDA/smBORDA: mean of 1/rank^2 of descending
        # ranks. The s* variants are restricted to the selected algorithms.
        rank_df['BORDA'] = __normalize__(rank_df[allAlgorithms].rank(ascending=True, method=aggregationMethod).mean(axis=1).values)
        rank_df['mBORDA'] = __normalize__(rank_df[allAlgorithms].rank(ascending=False, method=aggregationMethod).apply(lambda x: 1.0/(x*x)).mean(axis=1).values)
        rank_df['sBORDA'] = __normalize__(rank_df[selectedAlgorithms].rank(ascending=True, method=aggregationMethod).mean(axis=1).values)
        rank_df['smBORDA'] = __normalize__(rank_df[selectedAlgorithms].rank(ascending=False, method=aggregationMethod).apply(lambda x: 1.0/(x*x)).mean(axis=1).values)
        rank_df['Gene1'], rank_df['Gene2'] = rank_df.index.str.split('-', n=1).str
        rank_df[['Gene1', 'Gene2', 'BORDA', 'mBORDA', 'sBORDA', 'smBORDA']].to_csv(outDir + "/Borda.csv", index=False)


def __normalize__(arr):
    # Min-max normalization to [0, 1].
    return 1.0 * (arr - np.min(arr)) / (np.max(arr) - np.min(arr))
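# --- Illustration (not part of this commit) ---
# A minimal, self-contained sketch of the rank aggregation above on a toy table
# with two hypothetical algorithms "A" and "B": BORDA averages ascending ranks
# of the normalized edge weights, mBORDA averages 1/rank^2 of the descending
# ranks, and the local normalize() mirrors __normalize__ above.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'A': [0.9, 0.5, 0.1],
                    'B': [0.8, 0.2, 0.4]},
                   index=['g1-g2', 'g2-g3', 'g3-g1'])

def normalize(arr):
    return 1.0 * (arr - np.min(arr)) / (np.max(arr) - np.min(arr))

borda = normalize(toy.rank(ascending=True, method='average').mean(axis=1).values)
mborda = normalize(toy.rank(ascending=False, method='average').apply(lambda x: 1.0 / (x * x)).mean(axis=1).values)
print(borda)   # [1. 0. 0.] -> g1-g2 gets the highest aggregate score
print(mborda)  # g1-g2 is ranked first by both algorithms, so it dominates mBORDA too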
@@ -0,0 +1,193 @@
import pandas as pd
import sys
import numpy as np
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt
sns.set(rc={"lines.linewidth": 2}, palette="deep", style="ticks")
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from itertools import product, permutations, combinations, combinations_with_replacement
from tqdm import tqdm
import networkx as nx


def pathAnalysis(dataDict, inputSettings):
    '''
    Computes "directed", "feed-forward",
    "cascade", and "mutual" motifs.
    '''

    # Read the file containing the true (reference) edges.
    trueEdgesDF = pd.read_csv(str(inputSettings.datadir) + '/' + dataDict['name'] +
                              '/' + dataDict['trueEdges'],
                              sep=',',
                              header=0, index_col=None)

    possibleEdges = list(permutations(np.unique(trueEdgesDF.loc[:, ['Gene1', 'Gene2']]),
                                      r=2))
    EdgeDict = {'|'.join(p): 0 for p in possibleEdges}

    refGraph = nx.DiGraph()

    for key in EdgeDict.keys():
        u = key.split('|')[0]
        v = key.split('|')[1]
        if len(trueEdgesDF.loc[(trueEdgesDF['Gene1'] == u) &
                               (trueEdgesDF['Gene2'] == v)]) > 0:
            refGraph.add_edge(u, v)

    # refCC, refFB, refFF, refMI = getNetProp(refGraph)

    # Set up outDir, which stores the output directory name.
    outDir = "outputs/" + str(inputSettings.datadir).split("inputs/")[1] + '/' + dataDict['name']

    ####################################################
    # Get counts of TP, FP with, and FP without paths. #
    ####################################################
    collection = {}

    for algo in inputSettings.algorithms:
        # Check whether the output rankedEdges file exists.
        if Path(outDir + '/' + algo[0] + '/rankedEdges.csv').exists() and algo[0] not in ['PPCOR', 'PIDC']:
            predDF = pd.read_csv(outDir + '/' + algo[0] + '/rankedEdges.csv',
                                 sep='\t', header=0, index_col=None)

            # Clean up predictions: drop zero-weight edges, duplicates, and self-loops.
            predDF.EdgeWeight = predDF.EdgeWeight.round(6)
            predDF.EdgeWeight = predDF.EdgeWeight.abs()
            predDF = predDF.loc[(predDF['EdgeWeight'] > 0)]
            predDF.drop_duplicates(keep='first', inplace=True)
            predDF.reset_index(drop=True, inplace=True)
            predDF = predDF.loc[(predDF['Gene1'] != predDF['Gene2'])]
            if predDF.shape[0] != 0:
                # Keep the top-k predicted edges, where k is the number of reference edges.
                maxk = min(predDF.shape[0], len(refGraph.edges()))
                edgeWeightTopk = predDF.iloc[maxk - 1].EdgeWeight

                newDF = predDF.loc[(predDF['EdgeWeight'] >= edgeWeightTopk)]

                predGraph = nx.DiGraph()

                for key in EdgeDict.keys():
                    u = key.split('|')[0]
                    v = key.split('|')[1]
                    if len(newDF.loc[(newDF['Gene1'] == u) &
                                     (newDF['Gene2'] == v)]) > 0:
                        predGraph.add_edge(u, v)

                collection[algo[0]] = pathStats(predGraph, refGraph)
            if algo[0] == 'PPCOR' or algo[0] == 'PIDC':
                collection[algo[0]] = {}
        else:
            print(outDir + '/' + algo[0] + '/rankedEdges.csv',
                  ' does not exist or is an undirected graph. Skipping...')

    hMap = '/pathStats'
    largest = max([len(algopaths.keys()) for algo, algopaths in collection.items()])
    allpathsizes = set()
    for algo, algopaths in collection.items():
        allpathsizes.update(set(algopaths.keys()))

    # Fill in zero counts for path lengths an algorithm did not produce.
    for algo, algopaths in collection.items():
        notpresent = allpathsizes.difference(set(algopaths.keys()))
        collection[algo].update({pLen: 0 for pLen in notpresent})

    dataDF = pd.DataFrame(collection)
    dataDF = dataDF.T
    dataDF.to_csv(outDir + hMap + '.csv')
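# --- Illustration (not part of this commit) ---
# A minimal sketch of the top-k cut-off used above: predictions (assumed sorted
# by decreasing EdgeWeight, as in rankedEdges.csv) are truncated at k = number
# of reference edges, keeping any ties at the k-th weight. The gene names and
# weights below are made up.
import pandas as pd

toyPred = pd.DataFrame({'Gene1': ['g1', 'g2', 'g3', 'g1'],
                        'Gene2': ['g2', 'g3', 'g1', 'g3'],
                        'EdgeWeight': [0.9, 0.7, 0.7, 0.1]})
numRefEdges = 2
maxk = min(toyPred.shape[0], numRefEdges)
edgeWeightTopk = toyPred.iloc[maxk - 1].EdgeWeight           # weight of the k-th ranked edge
print(toyPred.loc[toyPred['EdgeWeight'] >= edgeWeightTopk])  # keeps the top 2 edges plus the tied g3->g1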
def getNetProp(inGraph):
    '''
    Function to compute properties
    of a given network.
    '''

    # Number of weakly connected components in
    # the reference network.
    numCC = len(list(nx.weakly_connected_components(inGraph)))

    # Number of feedback loops
    # in the reference network.
    allCyc = nx.simple_cycles(inGraph)
    cycSet = set()
    for cyc in allCyc:
        if len(cyc) == 3:
            cycSet.add(frozenset(cyc))

    numFB = len(cycSet)

    # Number of feed-forward loops
    # in the reference network.
    allPaths = []
    allPathsSet = set()
    for u, v in inGraph.edges():
        allPaths = nx.all_simple_paths(inGraph, u, v, cutoff=2)
        for p in allPaths:
            if len(p) > 2:
                allPathsSet.add(frozenset(p))

    numFF = len(allPathsSet)

    # Number of mutual interactions.
    numMI = 0.0
    for u, v in inGraph.edges():
        if (v, u) in inGraph.edges():
            numMI += 0.5

    return numCC, numFB, numFF, numMI
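# --- Illustration (not part of this commit) ---
# A minimal sketch of getNetProp (defined above) on a toy directed graph:
# a->b->c->a is a feedback loop, a->b->c with the shortcut a->c is a
# feed-forward loop, and a->c together with c->a is a mutual interaction.
import networkx as nx

toyGraph = nx.DiGraph([('a', 'b'), ('b', 'c'), ('c', 'a'), ('a', 'c')])
numCC, numFB, numFF, numMI = getNetProp(toyGraph)
print(numCC, numFB, numFF, numMI)  # expected: 1 1 1 1.0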
def getEdgeHistogram(inGraph, refGraph):
    # For every false-positive edge, record the length of the shortest path between
    # its endpoints in the reference network (bucket 0 holds pairs with no path).
    falsePositives = set(inGraph.edges()).difference(refGraph.edges())
    edgeHistogramCounts = {0: 0}

    for fe in falsePositives:
        u, v = fe
        try:
            path = nx.dijkstra_path(refGraph, u, v)
            pathlength = len(path) - 1
            if pathlength in edgeHistogramCounts.keys():
                edgeHistogramCounts[pathlength] += 1
            else:
                edgeHistogramCounts[pathlength] = 1
        except nx.exception.NetworkXNoPath:
            edgeHistogramCounts[0] += 1
    return edgeHistogramCounts
def pathStats(inGraph, refGraph):
    """
    Returns the number of predictions, true positives, and false positives for
    each network, along with a histogram of reference-path lengths between the
    endpoints of false-positive edges.
    """
    falsePositives = set(inGraph.edges()).difference(refGraph.edges())
    truePositives = set(inGraph.edges()).intersection(refGraph.edges())
    numPredictions = len(inGraph.edges())
    nopath = 0
    yespath = 0
    edgeCounts = {0: 0, 2: 0, 3: 0, 4: 0, 5: 0}
    for fe in falsePositives:
        u, v = fe
        try:
            path = nx.dijkstra_path(refGraph, u, v)
            pathlength = len(path) - 1
            yespath += 1
            if pathlength in edgeCounts.keys():
                edgeCounts[pathlength] += 1
        except nx.exception.NetworkXNoPath:
            nopath += 1

    edgeCounts['numPred'] = numPredictions
    edgeCounts['numTP'] = len(truePositives)
    edgeCounts['numFP_withPath'] = yespath
    edgeCounts['numFP_noPath'] = nopath
    return edgeCounts
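# --- Illustration (not part of this commit) ---
# A minimal sketch of pathStats (defined above) on toy predicted and reference
# networks: a->b is a true positive, a->c is a false positive whose endpoints
# are connected by a 2-hop path in the reference, and c->a is a false positive
# with no reference path.
import networkx as nx

toyRef = nx.DiGraph([('a', 'b'), ('b', 'c')])
toyPred = nx.DiGraph([('a', 'b'), ('a', 'c'), ('c', 'a')])
print(pathStats(toyPred, toyRef))
# expected: {0: 0, 2: 1, 3: 0, 4: 0, 5: 0, 'numPred': 3, 'numTP': 1,
#            'numFP_withPath': 1, 'numFP_noPath': 1}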