-
-
Notifications
You must be signed in to change notification settings - Fork 70
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added a high class for calculating binomial probabilities at unlimite…
…d precision. (#311)
- Loading branch information
Showing
2 changed files
with
246 additions
and
0 deletions.
There are no files selected for viewing
137 changes: 137 additions & 0 deletions
137
src/main/scala/com/fulcrumgenomics/math/BinomialDistribution.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
/* | ||
* The MIT License | ||
* | ||
* Copyright (c) 2017 Fulcrum Genomics LLC | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in | ||
* all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
* THE SOFTWARE. | ||
*/ | ||
|
||
package com.fulcrumgenomics.math | ||
|
||
import java.math.MathContext | ||
|
||
import com.fulcrumgenomics.FgBioDef._ | ||
|
||
import scala.math.{BigDecimal, BigInt} | ||
|
||
/** | ||
* A high-precision implementation of the math for binomial probabilities. This implementation is several | ||
* times slower (though not orders of magnitude slower) than the implementation in Apache Commons Math, but | ||
* retains precision throughout the computation where the Commons implementation underflows. | ||
* | ||
* One implementation choice to be aware of when using this implementation is that each instance of the | ||
* class will calculate and cache factorials up to factorial(n) where n is the highest value of n | ||
* supplied to any call to [[probability()]] or [[cumulativeProbability()]]. Thus to ensure reasonable | ||
* performance it is recommended to create one instance of this class and reuse it for as many | ||
* calculations as is practical. | ||
* | ||
* @param mc the MathContext to use for controlling precision and rounding. Changing from the default | ||
* [[MathContext.UNLIMITED]] can lead to a loss of precision. | ||
*/ | ||
class BinomialDistribution(val mc: MathContext = MathContext.UNLIMITED) { | ||
// This array will get expanded with more factorials any time a higher n is queried | ||
private var factorials = Array(BigInt(0), BigInt(1)) | ||
|
||
// Constant values for 0 and 1 in BigDecimal to avoid making them all the time | ||
private val Zero = BigDecimal(0, mc) | ||
private val One = BigDecimal(1, mc) | ||
|
||
/** Limits a value to the range 0-1. */ | ||
@inline private def limit(d: BigDecimal): BigDecimal = { | ||
if (d < Zero) Zero | ||
else if (d > One) One | ||
else d | ||
} | ||
|
||
/** Get the factorial of n. Retrieves the value from a cached set of factorials. If the cache | ||
* does not contain factorial(n) yet, then all factorials up to factorial(n) are computed | ||
* and cached. | ||
* | ||
* @param n the number to compute the factorial of | ||
* @return the factorial value as a BigInt | ||
*/ | ||
private def factorial(n: Int): BigInt = { | ||
if (n > factorials.length - 1) { | ||
// Expand the array | ||
val oldLength = factorials.length | ||
val tmp = new Array[BigInt](n+1) | ||
System.arraycopy(factorials, 0, tmp, 0, oldLength) | ||
factorials = tmp | ||
|
||
// Fill in the new slots | ||
forloop (from=oldLength, until=factorials.length) { i => | ||
factorials(i) = factorials(i-1) * BigInt(i) | ||
} | ||
} | ||
|
||
factorials(n) | ||
} | ||
|
||
/** Computes the binomial coefficient, i.e. the number of combinations of k items given a total of | ||
* n items, often phrased "n choose k". | ||
* | ||
* @param n The number of items being picked from | ||
* @param k The number of items being picked (must be 0 <= k <= n) | ||
* @return The number of combinations of k items from n | ||
*/ | ||
def coefficient(n: Int, k: Int): BigInt = { | ||
require(k <= n, s"k must be <= n. n=$n, k=$k") | ||
require(k >= 0, s"Can't compute coefficient for picking fewer than 0 values. n=$n, k=$k") | ||
|
||
if (k == 0 || k == n) 1 | ||
else factorial(n) / (factorial(k) * factorial(n - k)) | ||
} | ||
|
||
/** | ||
* Calculates the probability of exactly `k` successes in `n` trials given `p` probability. | ||
* Akin to `dbinom` in R. | ||
* | ||
* @param k The number of successful trials | ||
* @param n The number of trials | ||
* @param p The probability of the success in any one trial | ||
* @return The probability of exactly k successes in n trials of p probability | ||
*/ | ||
def probability(k: Int, n: Int, p: BigDecimal): BigDecimal = { | ||
require(p >= Zero && p <= One, s"Probability p must be between 0 and 1. p=$p.") | ||
val coeff = BigDecimal(coefficient(n, k), mc) | ||
val p1 = p.pow(k) | ||
val p2 = (One-p).pow(n - k) | ||
coeff * p1 * p2 | ||
} | ||
|
||
/** | ||
* Calculates the cumulative probability of `[0, k]` successes from `n` trials with `p` probability | ||
* of success in any individual trial. | ||
* Akin to `pbinom` in R. | ||
* | ||
* @param k the number of successes to compute the cumulative probability up to, inclusive | ||
* @param n the number of trials | ||
* @param p the probability of success in any single trial | ||
* @param lower if true return the cumulative probability of `(0,k)` trials inclusive | ||
* otherwise return the cumulative probability of `(k+1, n)` trials inclusive. | ||
*/ | ||
def cumulativeProbability(k: Int, n: Int, p: BigDecimal, lower: Boolean=true): BigDecimal = { | ||
var result = Zero | ||
forloop (from=0, until=k+1) { ki => | ||
result += probability(ki, n, p) | ||
} | ||
|
||
limit(if (lower) result else One - result) | ||
} | ||
} |
109 changes: 109 additions & 0 deletions
109
src/test/scala/com/fulcrumgenomics/math/BinomialDistributionTest.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
/* | ||
* The MIT License | ||
* | ||
* Copyright (c) 2017 Fulcrum Genomics LLC | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in | ||
* all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
* THE SOFTWARE. | ||
*/ | ||
|
||
package com.fulcrumgenomics.math | ||
|
||
import java.math.MathContext | ||
|
||
import com.fulcrumgenomics.testing.UnitSpec | ||
import com.fulcrumgenomics.commons.CommonsDef._ | ||
import com.fulcrumgenomics.util.NumericTypes.LogProbability | ||
import org.apache.commons.math3.distribution.{BinomialDistribution => CommonsBinomial} | ||
|
||
class BinomialDistributionTest extends UnitSpec { | ||
val binom = new BinomialDistribution | ||
|
||
"BinomialDistribution.coefficient" should "calculate the correct number of combinations given k=0 or k=n" in { | ||
binom.coefficient(100, 0) shouldBe 1 | ||
binom.coefficient(100, 100) shouldBe 1 | ||
} | ||
|
||
it should "fail if n or k is less than 0" in { | ||
an[Exception] shouldBe thrownBy { binom.coefficient(100, -2) } | ||
an[Exception] shouldBe thrownBy { binom.coefficient(-2, 0) } | ||
} | ||
|
||
it should "fail if k > n" in { | ||
an[Exception] shouldBe thrownBy { binom.coefficient(n=100, k=101) } | ||
} | ||
|
||
Seq((20, 5, BigInt(15504)), (75, 25, BigInt("52588547141148893628"))).foreach { case (n, k, coeff) => | ||
it should s"correctly calculate coefficient(n=$n, k=$k) = $coeff" in { | ||
binom.coefficient(n=n, k=k) shouldBe coeff | ||
} | ||
} | ||
|
||
it should "work for small values of n and k" in { | ||
binom.coefficient(20, 5) shouldBe BigInt("15504") | ||
} | ||
|
||
it should "work for moderate values of n and k" in { | ||
binom.coefficient(75, 25) shouldBe BigInt("52588547141148893628") | ||
} | ||
|
||
it should "work for very large values" in { | ||
binom.coefficient(200, 100) shouldBe BigInt("90548514656103281165404177077484163874504589675413336841320") | ||
} | ||
|
||
"BinomialCoefficient.cumulativeProbability" should "match commons math for small numbers" in { | ||
val commons = new CommonsBinomial(20, 0.05) | ||
Range.inclusive(0, 20).foreach { k => | ||
binom.cumulativeProbability(k, n=20, p=0.05).toDouble shouldBe commons.cumulativeProbability(k) +- 1e-10 | ||
} | ||
} | ||
|
||
it should "compute values where commons math runs out of precision" in { | ||
val (k, n, p) = (39, 55, 3/150.0) | ||
|
||
val commons = new CommonsBinomial(n, p) | ||
|
||
// Calculate the cumulative using log probabilities | ||
val range = Range(0,k) | ||
val probs = range.map(i => commons.logProbability(i)).toArray | ||
val sum = LogProbability.or(probs) | ||
val oneMinus = LogProbability.not(sum) | ||
val linear = LogProbability.expProb(oneMinus) | ||
|
||
// Results using three methods | ||
val commonsResult = 1 - commons.cumulativeProbability(k-1) | ||
val commonsLogResult = linear | ||
val result = binom.cumulativeProbability(k=k-1, n=n, p=p, lower=false).toDouble | ||
|
||
commonsResult shouldBe 0.0 // The real answer is not 0.0, here to show this method underflows | ||
commonsLogResult shouldBe 0.0 // The real answer is not 0.0, here to show this method underflows | ||
result shouldBe 1.193E-53 +- 0.001E-53 // Calculated @ https://www.wolframalpha.com/input/?i=cumulative+binomial+probability | ||
} | ||
|
||
it should "fail if given probabilities < 0 or > 1" in { | ||
an[Exception] shouldBe thrownBy { binom.cumulativeProbability(k=5, n=10, p= -0.01) } | ||
an[Exception] shouldBe thrownBy { binom.cumulativeProbability(k=5, n=10, p= 1.00000000000001) } | ||
} | ||
|
||
it should "yield answers from lower=true and lower=false than sum to 1" in { | ||
val (k, n, p) = (5, 10, 0.3) | ||
val lower = binom.cumulativeProbability(k, n, p, lower=true) | ||
val upper = binom.cumulativeProbability(k, n, p, lower=false) | ||
(lower + upper).toDouble shouldBe 1.0 | ||
} | ||
} |