Added a high class for calculating binomial probabilities at unlimite…

…d precision. (#311)
fulcrumgenomics · Nov 15, 2017 · a9445b4 · a9445b4
1 parent e2d1d8d
commit a9445b4
Show file tree

Hide file tree

Showing 2 changed files with 246 additions and 0 deletions.
diff --git a/src/main/scala/com/fulcrumgenomics/math/BinomialDistribution.scala b/src/main/scala/com/fulcrumgenomics/math/BinomialDistribution.scala
@@ -0,0 +1,137 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2017 Fulcrum Genomics LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package com.fulcrumgenomics.math
+
+import java.math.MathContext
+
+import com.fulcrumgenomics.FgBioDef._
+
+import scala.math.{BigDecimal, BigInt}
+
+/**
+  * A high-precision implementation of the math for binomial probabilities.  This implementation is several
+  * times slower (though not orders of magnitude slower) than the implementation in Apache Commons Math, but
+  * retains precision throughout the computation where the Commons implementation underflows.
+  *
+  * One implementation choice to be aware of when using this implementation is that each instance of the
+  * class will calculate and cache factorials up to factorial(n) where n is the highest value of n
+  * supplied to any call to [[probability()]] or [[cumulativeProbability()]].  Thus to ensure reasonable
+  * performance it is recommended to create one instance of this class and reuse it for as many
+  * calculations as is practical.
+  *
+  * @param mc the MathContext to use for controlling precision and rounding. Changing from the default
+  *           [[MathContext.UNLIMITED]] can lead to a loss of precision.
+  */
+class BinomialDistribution(val mc: MathContext = MathContext.UNLIMITED) {
+  // This array will get expanded with more factorials any time a higher n is queried
+  private var factorials = Array(BigInt(0), BigInt(1))
+
+ // Constant values for 0 and 1 in BigDecimal to avoid making them all the time
+  private val Zero = BigDecimal(0, mc)
+  private val One  = BigDecimal(1, mc)
+
+  /** Limits a value to the range 0-1. */
+  @inline private def limit(d: BigDecimal): BigDecimal = {
+    if (d < Zero) Zero
+    else if (d > One) One
+    else d
+  }
+
+  /** Get the factorial of n. Retrieves the value from a cached set of factorials. If the cache
+    * does not contain factorial(n) yet, then all factorials up to factorial(n) are computed
+    * and cached.
+    *
+    * @param n the number to compute the factorial of
+    * @return the factorial value as a BigInt
+    */
+  private def factorial(n: Int): BigInt = {
+    if (n > factorials.length - 1) {
+      // Expand the array
+      val oldLength = factorials.length
+      val tmp = new Array[BigInt](n+1)
+      System.arraycopy(factorials, 0, tmp, 0, oldLength)
+      factorials = tmp
+
+      // Fill in the new slots
+      forloop (from=oldLength, until=factorials.length) { i =>
+        factorials(i) = factorials(i-1) * BigInt(i)
+      }
+    }
+
+    factorials(n)
+  }
+
+  /** Computes the binomial coefficient, i.e. the number of combinations of k items given a total of
+    * n items, often phrased "n choose k".
+    *
+    * @param n The number of items being picked from
+    * @param k The number of items being picked (must be 0 <= k <= n)
+    * @return The number of combinations of k items from n
+   */
+  def coefficient(n: Int, k: Int): BigInt = {
+    require(k <= n, s"k must be <= n. n=$n, k=$k")
+    require(k >= 0, s"Can't compute coefficient for picking fewer than 0 values. n=$n, k=$k")
+
+    if (k == 0 || k == n) 1
+    else factorial(n) / (factorial(k) * factorial(n - k))
+  }
+
+  /**
+    * Calculates the probability of exactly `k` successes in `n` trials given `p` probability.
+    * Akin to `dbinom` in R.
+    *
+    * @param k The number of successful trials
+    * @param n The number of trials
+    * @param p The probability of the success in any one trial
+    * @return The probability of exactly k successes in n trials of p probability
+    */
+  def probability(k: Int, n: Int, p: BigDecimal): BigDecimal = {
+    require(p >= Zero && p <= One, s"Probability p must be between 0 and 1. p=$p.")
+    val coeff = BigDecimal(coefficient(n, k), mc)
+    val p1 = p.pow(k)
+    val p2 = (One-p).pow(n - k)
+    coeff * p1 * p2
+  }
+
+  /**
+    * Calculates the cumulative probability of `[0, k]` successes from `n` trials with `p` probability
+    * of success in any individual trial.
+    * Akin to `pbinom` in R.
+    *
+    * @param k the number of successes to compute the cumulative probability up to, inclusive
+    * @param n the number of trials
+    * @param p the probability of success in any single trial
+    * @param lower if true return the cumulative probability of `(0,k)` trials inclusive
+    *              otherwise return the cumulative probability of `(k+1, n)` trials inclusive.
+    */
+  def cumulativeProbability(k: Int, n: Int, p: BigDecimal, lower: Boolean=true): BigDecimal = {
+    var result = Zero
+    forloop (from=0, until=k+1) { ki =>
+      result += probability(ki, n, p)
+    }
+
+    limit(if (lower) result else One - result)
+  }
+}
diff --git a/src/test/scala/com/fulcrumgenomics/math/BinomialDistributionTest.scala b/src/test/scala/com/fulcrumgenomics/math/BinomialDistributionTest.scala
@@ -0,0 +1,109 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2017 Fulcrum Genomics LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package com.fulcrumgenomics.math
+
+import java.math.MathContext
+
+import com.fulcrumgenomics.testing.UnitSpec
+import com.fulcrumgenomics.commons.CommonsDef._
+import com.fulcrumgenomics.util.NumericTypes.LogProbability
+import org.apache.commons.math3.distribution.{BinomialDistribution => CommonsBinomial}
+
+class BinomialDistributionTest extends UnitSpec {
+  val binom = new BinomialDistribution
+
+  "BinomialDistribution.coefficient" should "calculate the correct number of combinations given k=0 or k=n" in {
+    binom.coefficient(100, 0)   shouldBe 1
+    binom.coefficient(100, 100) shouldBe 1
+  }
+
+  it should "fail if n or k is less than 0" in {
+    an[Exception] shouldBe thrownBy { binom.coefficient(100, -2) }
+    an[Exception] shouldBe thrownBy { binom.coefficient(-2,  0) }
+  }
+
+  it should "fail if k > n" in {
+    an[Exception] shouldBe thrownBy { binom.coefficient(n=100, k=101) }
+  }
+
+  Seq((20, 5, BigInt(15504)), (75, 25, BigInt("52588547141148893628"))).foreach { case (n, k, coeff) =>
+    it should s"correctly calculate coefficient(n=$n, k=$k) = $coeff" in {
+      binom.coefficient(n=n, k=k) shouldBe coeff
+    }
+  }
+
+  it should "work for small values of n and k" in {
+    binom.coefficient(20, 5) shouldBe BigInt("15504")
+  }
+
+  it should "work for moderate values of n and k" in {
+    binom.coefficient(75, 25) shouldBe BigInt("52588547141148893628")
+  }
+
+  it should "work for very large values" in {
+    binom.coefficient(200, 100) shouldBe BigInt("90548514656103281165404177077484163874504589675413336841320")
+  }
+
+  "BinomialCoefficient.cumulativeProbability" should "match commons math for small numbers" in {
+    val commons = new CommonsBinomial(20, 0.05)
+    Range.inclusive(0, 20).foreach { k =>
+      binom.cumulativeProbability(k, n=20, p=0.05).toDouble shouldBe commons.cumulativeProbability(k) +- 1e-10
+    }
+  }
+
+  it should "compute values where commons math runs out of precision" in {
+    val (k, n, p) = (39, 55, 3/150.0)
+
+    val commons = new CommonsBinomial(n, p)
+
+    // Calculate the cumulative using log probabilities
+    val range    = Range(0,k)
+    val probs    = range.map(i => commons.logProbability(i)).toArray
+    val sum      = LogProbability.or(probs)
+    val oneMinus = LogProbability.not(sum)
+    val linear   = LogProbability.expProb(oneMinus)
+
+    // Results using three methods
+    val commonsResult    = 1 - commons.cumulativeProbability(k-1)
+    val commonsLogResult = linear
+    val result           = binom.cumulativeProbability(k=k-1, n=n, p=p, lower=false).toDouble
+
+    commonsResult    shouldBe 0.0 // The real answer is not 0.0, here to show this method underflows
+    commonsLogResult shouldBe 0.0 // The real answer is not 0.0, here to show this method underflows
+    result           shouldBe 1.193E-53 +- 0.001E-53 // Calculated @ https://www.wolframalpha.com/input/?i=cumulative+binomial+probability
+  }
+
+  it should "fail if given probabilities < 0 or > 1" in {
+    an[Exception] shouldBe thrownBy { binom.cumulativeProbability(k=5, n=10, p= -0.01) }
+    an[Exception] shouldBe thrownBy { binom.cumulativeProbability(k=5, n=10, p= 1.00000000000001) }
+  }
+
+  it should "yield answers from lower=true and lower=false than sum to 1" in {
+    val (k, n, p) = (5, 10, 0.3)
+    val lower = binom.cumulativeProbability(k, n, p, lower=true)
+    val upper = binom.cumulativeProbability(k, n, p, lower=false)
+    (lower + upper).toDouble shouldBe 1.0
+  }
+}