Skip to content

Commit

Permalink
added leonie part
Browse files Browse the repository at this point in the history
  • Loading branch information
daniel.domingo.fernandez committed Sep 8, 2017
1 parent a645dac commit 1f77631
Show file tree
Hide file tree
Showing 6 changed files with 391 additions and 0 deletions.
42 changes: 42 additions & 0 deletions CalculateSignificanceUsingCubicAlgorithm1b.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
def calculateSignificanceUsingCubicAlgorithm1b(hypScore, predictionListStats, expDataStats, epsilon):
'''Calculate the p-value of a score given the hypothesis score and the
distribution table (calculated using the cubic algorithm 1b in
Assessing statistical significance in causal graphs - Chindelevitch et al)
'''
import math
# Calculate the log of the factorial for all values in predictionListStats,
#this is to reduce the number of computations done later on.
logOfFactorialOfPredictionListStats = [math.log(math.factorial(n)) for n in predictionListStats]
logDMax = findMaximumDValue(predictionListStats, experimentalDataStats, logOfFactorialOfPredictionListStats, TRUE)

# Number of positive predictions from the network. This will be an upper bound for r+ (r_p)
q_p = predictionListStats[0] #c(up, down, ambig)
# Number of negative predictions from the network. This will be an upper bound for r- (r_m)
q_m = predictionListStats[1]

# Number of positive results. This will be an upper bound for c+ (c_p)
n_p = experimentalDataStats[0] #c(up, down, nochange)
# Number of negative results. This will be an upper bound for c- (c_m)
n_m = experimentalDataStats[1]

# Array for the D values - first element is the total D values which have a score equal to or better than the cut-off, second element is all D
# values (i.e. the ratio is the P value).
logepsDMax = math.log(epsilon) + logDMax # or [n+math.log(epsilon) for n in logDMax] #
weights = [0, 0]

# In algorithm a superfamily is defined by the left 3x2 submatrix of the contingency table. Now we only need to interate over two quantities: r+ and
# r-.
for r_p in range(0, q_p):
# for 0 to the limit of positive predictions
for r_m in range(0, q_m):
# for 0 to the limit of number of negative results Calculate r0
r_z = n_p + n_m - (r_p + r_m)
# Need to check that n00 is postive, to ensure it is a valid superfamily
if r_z <= predictionListStats[2] and r_z >= 0:
# The five values in the array below define the superfamily
weights = getWeightsAboveHypothesisScoreForAThreeByTwoTable(weights, r_p, r_m, r_z, n_p, n_m, predictionListStats, experimentalDataStats,
logOfFactorialOfPredictionListStats, hypothesisScore, logepsDMax, logDMax)

pValue = weights[0]/weights[1]

return(pValue)
213 changes: 213 additions & 0 deletions CausalR_Cubic1br.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@

# coding: utf-8

# ## Implementation of CalculateSignificanceUsingCubicAlgorithm1b.r
#
# ### Adapted from CausalR

def calculateWeightGivenValuesInThreeByThreeContingencyTable(threeByThreeContingencyTable, logOfFactorialOfPredictionListStats, returnlog):
'''Helper Function for getMaximumDValueFromTwoByTwoContingencyTable'''

def populateTheThreeByThreeContingencyTable(numbersOfCorrectandIncorrectPredictions[1], numbersOfCorrectandIncorrectPredictions[2],
numbersOfCorrectandIncorrectPredictions[3], numbersOfCorrectandIncorrectPredictions[4], predictionListStats, experimentalDataStats):
'''Helper Function for getMaximumDValueFromTwoByTwoContingencyTable'''
return

def getAllPossibleRoundingCombinations(twoByTwoContingencyTable):
'''Helper function for getMaximumDValueFromTwoByTwoContingencyTable: get score for numbers of correct and incorrect predictions
Returns all possible rounding combinations of a 2x2 table. Given the values of n++, n+-, n-+ and n-- (stored in twoByTwoContingencyTable) this function will
compute all possibilities of rounding each value up or down.
@param twoByTwoContingencyTable Approximate values of n++, n+-, n-+ and n--, these values are
calculated to optimise the D-value (see page 6 of Assessing statistical significance of causal graphs)
@return a matrix of rounding combinations'''
from math import floor, ceil
import numpy as np

f_n_pp <- floor(twoByTwoContingencyTable[0]) #(floor in R/python --> largest integer value less than or equal to x)
f_n_pm <- floor(twoByTwoContingencyTable[1])
f_n_mp <- floor(twoByTwoContingencyTable[2])
f_n_mm <- floor(twoByTwoContingencyTable[3])

c_n_pp <- ceil(twoByTwoContingencyTable[0]) #(ceil(ing) in R/python --> smallest integer value bigger than or equal to x)
c_n_pm <- ceil(twoByTwoContingencyTable[1])
c_n_mp <- ceil(twoByTwoContingencyTable[2])
c_n_mm <- ceil(twoByTwoContingencyTable[3])

roundingCombinations <- np.matrix([[f_n_pp, f_n_pm, f_n_mp, f_n_mm], [f_n_pp, f_n_pm, f_n_mp, c_n_mm], [f_n_pp, f_n_pm, c_n_mp, f_n_mm], [f_n_pp, f_n_pm,
c_n_mp, c_n_mm], [f_n_pp, c_n_pm, f_n_mp, f_n_mm], [f_n_pp, c_n_pm, f_n_mp, c_n_mm], [f_n_pp, c_n_pm, c_n_mp, f_n_mm], [f_n_pp, c_n_pm, c_n_mp, c_n_mm],
[c_n_pp, f_n_pm, f_n_mp, f_n_mm], [c_n_pp, f_n_pm, f_n_mp, c_n_mm], [c_n_pp, f_n_pm, c_n_mp, f_n_mm], [c_n_pp, f_n_pm, c_n_mp, c_n_mm], [c_n_pp, c_n_pm,
f_n_mp, f_n_mm], [c_n_pp, c_n_pm, f_n_mp, c_n_mm], [c_n_pp, c_n_pm, c_n_mp, f_n_mm], [c_n_pp, c_n_pm, c_n_mp, c_n_mm]] # in R, ncol = 4, byrow = TRUE


# If any of the input values are integers then there will be duplicates in the list Remove the duplicates
keep = [True for _ in range(16)]
if f_n_pp == c_n_pp:
keep[8:16] = [False for _ in range(8)] #keep[9:16] <- FALSE
if f_n_pm == c_n_pm:
keep[4:8] = [False for _ in range(5)]
keep[12:16] = [False for _ in range(5)] #keep[c(5:8, 13:16)] <- FALSE
if f_n_mp == c_n_mp:
keep[2:4] = [False for _ in range(3)]
keep[6:8] = [False for _ in range(3)]
keep[10:12] = [False for _ in range(3)]
keep[14:16] = [False for _ in range(3)] #keep[c(3:4, 7:8, 11:12, 15:16)] <- FALSE
if f_n_mm == c_n_mm:
keep[1] = False
keep[3] = False
keep[5] = False
keep[7] = False
keep[9] = False
keep[11] = False
keep[3] = False
keep[15] = False #keep[c(2, 4, 6, 8, 10, 12, 14, 16)] <- FALSE

return roundingCombinations[keep, ]


def findApproximateValuesThatWillMaximiseDValue(predictionListStats, experimentalDataStats):
'''Helper function for findMaximumDValue: Finds an approximate table values to maximise D.
Given the values of q+, q-, q0, n+, n- and n0 this function will produce the approximate values
of n++, n+-, n-+ and n-- that will maximise the D value. See Assessing statistical significance of
casual graphs, page 6. The values are approximate since they need to be rounded, although the direction
of rounding is not clear at this stage.
@param predictionListStats a vector containing the values q+, q- and q0:
numbers of positive, negative and non-significant/contradictory predictions
@param experimentalDataStats a vector containing the values n+, n- and n0:
numbers of positive, negative and non-significant/contradictory predictions
@return a 2x2 contingency table which approximately maximises D'''

q_p = predictionListStats[0]
q_m = predictionListStats[1]

n_p = experimentalDataStats[0]
n_m = experimentalDataStats[1]

Tval = sum(predictionListStats)
# The values of n++, n+-, n-+ and n-- that give the maximum D-value are given by the formula within the paper - Assessing statistical significance
# in causal graphs, page 6. The formula is n_ab is approximately equal to q_a*n_b/T, where T = q+ + q- + q0 = n+ + n- + n0, and a,b are either + or
# -.
n_pp = q_p * n_p/Tval
n_pm = q_p * n_m/Tval
n_mp = q_m * n_p/Tval
n_mm = q_m * n_m/Tval

twoByTwoContingencyTable = [n_pp, n_pm, n_mp, n_mm]

return twoByTwoContingencyTable

def getMaximumDValueFromTwoByTwoContingencyTable(twoByTwoContingencyTable, predictionListStats, experimentalDataStats, logOfFactorialOfPredictionListStats,
returnlog):
'''Helper function for findMaximumDValue: Get maximum D value from two-by-two contingency table
- computes the maximum D value (or weight) given approximate values of n++, n+-, n-+ and n--.
These values are approximate and in general are non-integer values; they are found by using an
approximation detailed in the paper Assessing statistical significance in causal graphs on page 6 i.e. n_ab is approximately
equal to q_a*n_b/t where a and b are either +, - or 0. The value is an approximation since the direction in which the
number should be rounded is not clear and hence this function runs through all possible combinations of rounding before
concluding the maximum D-value.
@param twoByTwoContingencyTable approximate values of n++, n+-, n-+ and n--, these values arecalculated to optimise the D-value
@param predictionListStats a vector containing the values q+, q- and q0 the number of positive/negative/non-significant (or contradictory) predictions)
@param experimentalDataStats a vector containing the values n+, n- and n0 (the number of positive/negative/non-significant (or contradictory) transcripts in the results)
@param logOfFactorialOfPredictionListStats a vector containing the log of the factorial value for each entry in predictionListStats
@param returnlog whether or not he value should be returned as a log (TRUE) or not (FALSE)
@return the maximal D-value'''

import numpy as np
# We need to find all the combinations (maximum of 16) when applying the rounding to the values in twoByTwoContingencyTable. The exception is if
# all four values are integers. combinationsOfRounding will be a nx4 matrix containing the possible values of n_pp, n_pm, n_mp, n_mm
if np.round(twoByTwoContingencyTable) == twoByTwoContingencyTable:
# All four values are integers --> twoByTwoContingencyTable = [n_pp, n_pm, n_mp, n_mm] (c(...) in R)
combinationsOfRounding = np.matrix(twoByTwoContingencyTable)#, ncol = 4)
else:
combinationsOfRounding = getAllPossibleRoundingCombinations(twoByTwoContingencyTable)

maximumDValue = 0

for i in range(combinationsOfRounding.shape[0]): #nrwo in R (loop over number of rows of the matrix)
numbersOfCorrectandIncorrectPredictions = combinationsOfRounding[i] #get all elements of the i-th row ([i,] in R)
threeByThreeContingencyTable = populateTheThreeByThreeContingencyTable(numbersOfCorrectandIncorrectPredictions[0, 0], numbersOfCorrectandIncorrectPredictions[0, 1],
numbersOfCorrectandIncorrectPredictions[0, 2], numbersOfCorrectandIncorrectPredictions[0,3], predictionListStats, experimentalDataStats)
# Some of the combinationations produce an invalid contingency table - this we can check by seeing if any of the values in the table are negative.
if threeByThreeContingencyTable.min() >= 0: #if 3x3ContTable is matrix or nparray! otherwise change to min(x)
weight = calculateWeightGivenValuesInThreeByThreeContingencyTable(threeByThreeContingencyTable, logOfFactorialOfPredictionListStats, returnlog)
#what type is weight? If not integer/float, max needs to be changed (e.g. for list max(max(weight), maximumDValue))
maximumDValue = max(maximumDValue, weight)
return maximumDValue


def findMaximumDValue(predictionListStats, experimentalDataStats, logOfFactorialOfPredictionListStats, returnlog = FALSE):
'''Helper function for cubic1b: Find maximum D value - computes the maximum possible D-value for given values q+, q-, q0 and n+, n-, n0.
@param predictionListStats a vector containing the predicted values q+, q- and q0:
numbers of positive, negative and non-significant/contradictory predictions
@param experimentalDataStats A vector containing the observed values n+, n- and n0:
numbers of positive, negative and non-significant/contradictory observations
@param logOfFactorialOfPredictionListStats a vector containing the log of the factorial value for
each entry in predictionListStats
@param returnlog should the result be returned as a log; default FALSE
@return the maximum possible D value'''

twoByTwoContingencyTable = findApproximateValuesThatWillMaximiseDValue(predictionListStats, experimentalDataStats)

maximumDValue = getMaximumDValueFromTwoByTwoContingencyTable(twoByTwoContingencyTable, predictionListStats, experimentalDataStats, logOfFactorialOfPredictionListStats,
returnlog)

return maximumDValue


def getWeightsAboveHypothesisScoreForAThreeByTwoTable(weights, r_p, r_m, r_z, n_p, n_m, predictionListStats, experimentalDataStats,
logOfFactorialOfPredictionListStats, hypothesisScore, logepsDMax, logDMax):
'''Helper function for cubic1b: '''
return [0, 0]

def cubic1b(hypScore, predictionListStats, expDataStats, epsilon):
'''Calculate the p-value of a score given the hypothesis score and the
distribution table (calculated using the cubic algorithm 1b in
Assessing statistical significance in causal graphs - Chindelevitch et al)
'''
import math
# Calculate the log of the factorial for all values in predictionListStats,
#this is to reduce the number of computations done later on.
logOfFactorialOfPredictionListStats = [math.log(math.factorial(n)) for n in predictionListStats]
logDMax = findMaximumDValue(predictionListStats, experimentalDataStats, logOfFactorialOfPredictionListStats, TRUE)

# Number of positive predictions from the network. This will be an upper bound for r+ (r_p)
q_p = predictionListStats[0] #c(up, down, ambig)
# Number of negative predictions from the network. This will be an upper bound for r- (r_m)
q_m = predictionListStats[1]

# Number of positive results. This will be an upper bound for c+ (c_p)
n_p = experimentalDataStats[0] #c(up, down, nochange)
# Number of negative results. This will be an upper bound for c- (c_m)
n_m = experimentalDataStats[1]

# Array for the D values - first element is the total D values which have a score equal to or better than the cut-off, second element is all D
# values (i.e. the ratio is the P value).
logepsDMax = math.log(epsilon) + logDMax # or [n+math.log(epsilon) for n in logDMax] #
weights = [0, 0]

# In algorithm a superfamily is defined by the left 3x2 submatrix of the contingency table. Now we only need to interate over two quantities: r+ and
# r-.
for r_p in range(0, q_p):
# for 0 to the limit of positive predictions
for r_m in range(0, q_m):
# for 0 to the limit of number of negative results Calculate r0
r_z = n_p + n_m - (r_p + r_m)
# Need to check that n00 is postive, to ensure it is a valid superfamily
if r_z <= predictionListStats[2] and r_z >= 0:
# The five values in the array below define the superfamily
weights = getWeightsAboveHypothesisScoreForAThreeByTwoTable(weights, r_p, r_m, r_z, n_p, n_m, predictionListStats, experimentalDataStats,
logOfFactorialOfPredictionListStats, hypothesisScore, logepsDMax, logDMax)

pValue = weights[0]/weights[1]

return(pValue)








31 changes: 31 additions & 0 deletions FindApproximateValuesThatWillMaximiseDValue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
def findApproximateValuesThatWillMaximiseDValue(predictionListStats, experimentalDataStats):
'''Helper function for findMaximumDValue: Finds an approximate table values to maximise D.
Given the values of q+, q-, q0, n+, n- and n0 this function will produce the approximate values
of n++, n+-, n-+ and n-- that will maximise the D value. See Assessing statistical significance of
casual graphs, page 6. The values are approximate since they need to be rounded, although the direction
of rounding is not clear at this stage.
@param predictionListStats a vector containing the values q+, q- and q0:
numbers of positive, negative and non-significant/contradictory predictions
@param experimentalDataStats a vector containing the values n+, n- and n0:
numbers of positive, negative and non-significant/contradictory predictions
@return a 2x2 contingency table which approximately maximises D'''

q_p = predictionListStats[0]
q_m = predictionListStats[1]

n_p = experimentalDataStats[0]
n_m = experimentalDataStats[1]

Tval = sum(predictionListStats)
# The values of n++, n+-, n-+ and n-- that give the maximum D-value are given by the formula within the paper - Assessing statistical significance
# in causal graphs, page 6. The formula is n_ab is approximately equal to q_a*n_b/T, where T = q+ + q- + q0 = n+ + n- + n0, and a,b are either + or
# -.
n_pp = q_p * n_p/Tval
n_pm = q_p * n_m/Tval
n_mp = q_m * n_p/Tval
n_mm = q_m * n_m/Tval

twoByTwoContingencyTable = [n_pp, n_pm, n_mp, n_mm]

return twoByTwoContingencyTable
17 changes: 17 additions & 0 deletions FindMaximumDValue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
def findMaximumDValue(predictionListStats, experimentalDataStats, logOfFactorialOfPredictionListStats, returnlog = FALSE):
'''Helper function for cubic1b: Find maximum D value - computes the maximum possible D-value for given values q+, q-, q0 and n+, n-, n0.
@param predictionListStats a vector containing the predicted values q+, q- and q0:
numbers of positive, negative and non-significant/contradictory predictions
@param experimentalDataStats A vector containing the observed values n+, n- and n0:
numbers of positive, negative and non-significant/contradictory observations
@param logOfFactorialOfPredictionListStats a vector containing the log of the factorial value for
each entry in predictionListStats
@param returnlog should the result be returned as a log; default FALSE
@return the maximum possible D value'''

twoByTwoContingencyTable = findApproximateValuesThatWillMaximiseDValue(predictionListStats, experimentalDataStats)

maximumDValue = getMaximumDValueFromTwoByTwoContingencyTable(twoByTwoContingencyTable, predictionListStats, experimentalDataStats, logOfFactorialOfPredictionListStats,
returnlog)

return maximumDValue
Loading

0 comments on commit 1f77631

Please sign in to comment.