diff --git a/src/main/scala/com/fulcrumgenomics/math/BinomialDistribution.scala b/src/main/scala/com/fulcrumgenomics/math/BinomialDistribution.scala new file mode 100644 index 000000000..06aa90c0e --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/math/BinomialDistribution.scala @@ -0,0 +1,137 @@ +/* + * The MIT License + * + * Copyright (c) 2017 Fulcrum Genomics LLC + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package com.fulcrumgenomics.math + +import java.math.MathContext + +import com.fulcrumgenomics.FgBioDef._ + +import scala.math.{BigDecimal, BigInt} + +/** + * A high-precision implementation of the math for binomial probabilities. This implementation is several + * times slower (though not orders of magnitude slower) than the implementation in Apache Commons Math, but + * retains precision throughout the computation where the Commons implementation underflows. + * + * One implementation choice to be aware of when using this implementation is that each instance of the + * class will calculate and cache factorials up to factorial(n) where n is the highest value of n + * supplied to any call to [[probability()]] or [[cumulativeProbability()]]. Thus to ensure reasonable + * performance it is recommended to create one instance of this class and reuse it for as many + * calculations as is practical. + * + * @param mc the MathContext to use for controlling precision and rounding. Changing from the default + * [[MathContext.UNLIMITED]] can lead to a loss of precision. + */ +class BinomialDistribution(val mc: MathContext = MathContext.UNLIMITED) { + // This array will get expanded with more factorials any time a higher n is queried + private var factorials = Array(BigInt(0), BigInt(1)) + + // Constant values for 0 and 1 in BigDecimal to avoid making them all the time + private val Zero = BigDecimal(0, mc) + private val One = BigDecimal(1, mc) + + /** Limits a value to the range 0-1. */ + @inline private def limit(d: BigDecimal): BigDecimal = { + if (d < Zero) Zero + else if (d > One) One + else d + } + + /** Get the factorial of n. Retrieves the value from a cached set of factorials. If the cache + * does not contain factorial(n) yet, then all factorials up to factorial(n) are computed + * and cached. + * + * @param n the number to compute the factorial of + * @return the factorial value as a BigInt + */ + private def factorial(n: Int): BigInt = { + if (n > factorials.length - 1) { + // Expand the array + val oldLength = factorials.length + val tmp = new Array[BigInt](n+1) + System.arraycopy(factorials, 0, tmp, 0, oldLength) + factorials = tmp + + // Fill in the new slots + forloop (from=oldLength, until=factorials.length) { i => + factorials(i) = factorials(i-1) * BigInt(i) + } + } + + factorials(n) + } + + /** Computes the binomial coefficient, i.e. the number of combinations of k items given a total of + * n items, often phrased "n choose k". + * + * @param n The number of items being picked from + * @param k The number of items being picked (must be 0 <= k <= n) + * @return The number of combinations of k items from n + */ + def coefficient(n: Int, k: Int): BigInt = { + require(k <= n, s"k must be <= n. n=$n, k=$k") + require(k >= 0, s"Can't compute coefficient for picking fewer than 0 values. n=$n, k=$k") + + if (k == 0 || k == n) 1 + else factorial(n) / (factorial(k) * factorial(n - k)) + } + + /** + * Calculates the probability of exactly `k` successes in `n` trials given `p` probability. + * Akin to `dbinom` in R. + * + * @param k The number of successful trials + * @param n The number of trials + * @param p The probability of the success in any one trial + * @return The probability of exactly k successes in n trials of p probability + */ + def probability(k: Int, n: Int, p: BigDecimal): BigDecimal = { + require(p >= Zero && p <= One, s"Probability p must be between 0 and 1. p=$p.") + val coeff = BigDecimal(coefficient(n, k), mc) + val p1 = p.pow(k) + val p2 = (One-p).pow(n - k) + coeff * p1 * p2 + } + + /** + * Calculates the cumulative probability of `[0, k]` successes from `n` trials with `p` probability + * of success in any individual trial. + * Akin to `pbinom` in R. + * + * @param k the number of successes to compute the cumulative probability up to, inclusive + * @param n the number of trials + * @param p the probability of success in any single trial + * @param lower if true return the cumulative probability of `(0,k)` trials inclusive + * otherwise return the cumulative probability of `(k+1, n)` trials inclusive. + */ + def cumulativeProbability(k: Int, n: Int, p: BigDecimal, lower: Boolean=true): BigDecimal = { + var result = Zero + forloop (from=0, until=k+1) { ki => + result += probability(ki, n, p) + } + + limit(if (lower) result else One - result) + } +} diff --git a/src/test/scala/com/fulcrumgenomics/math/BinomialDistributionTest.scala b/src/test/scala/com/fulcrumgenomics/math/BinomialDistributionTest.scala new file mode 100644 index 000000000..14c6f8a11 --- /dev/null +++ b/src/test/scala/com/fulcrumgenomics/math/BinomialDistributionTest.scala @@ -0,0 +1,109 @@ +/* + * The MIT License + * + * Copyright (c) 2017 Fulcrum Genomics LLC + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package com.fulcrumgenomics.math + +import java.math.MathContext + +import com.fulcrumgenomics.testing.UnitSpec +import com.fulcrumgenomics.commons.CommonsDef._ +import com.fulcrumgenomics.util.NumericTypes.LogProbability +import org.apache.commons.math3.distribution.{BinomialDistribution => CommonsBinomial} + +class BinomialDistributionTest extends UnitSpec { + val binom = new BinomialDistribution + + "BinomialDistribution.coefficient" should "calculate the correct number of combinations given k=0 or k=n" in { + binom.coefficient(100, 0) shouldBe 1 + binom.coefficient(100, 100) shouldBe 1 + } + + it should "fail if n or k is less than 0" in { + an[Exception] shouldBe thrownBy { binom.coefficient(100, -2) } + an[Exception] shouldBe thrownBy { binom.coefficient(-2, 0) } + } + + it should "fail if k > n" in { + an[Exception] shouldBe thrownBy { binom.coefficient(n=100, k=101) } + } + + Seq((20, 5, BigInt(15504)), (75, 25, BigInt("52588547141148893628"))).foreach { case (n, k, coeff) => + it should s"correctly calculate coefficient(n=$n, k=$k) = $coeff" in { + binom.coefficient(n=n, k=k) shouldBe coeff + } + } + + it should "work for small values of n and k" in { + binom.coefficient(20, 5) shouldBe BigInt("15504") + } + + it should "work for moderate values of n and k" in { + binom.coefficient(75, 25) shouldBe BigInt("52588547141148893628") + } + + it should "work for very large values" in { + binom.coefficient(200, 100) shouldBe BigInt("90548514656103281165404177077484163874504589675413336841320") + } + + "BinomialCoefficient.cumulativeProbability" should "match commons math for small numbers" in { + val commons = new CommonsBinomial(20, 0.05) + Range.inclusive(0, 20).foreach { k => + binom.cumulativeProbability(k, n=20, p=0.05).toDouble shouldBe commons.cumulativeProbability(k) +- 1e-10 + } + } + + it should "compute values where commons math runs out of precision" in { + val (k, n, p) = (39, 55, 3/150.0) + + val commons = new CommonsBinomial(n, p) + + // Calculate the cumulative using log probabilities + val range = Range(0,k) + val probs = range.map(i => commons.logProbability(i)).toArray + val sum = LogProbability.or(probs) + val oneMinus = LogProbability.not(sum) + val linear = LogProbability.expProb(oneMinus) + + // Results using three methods + val commonsResult = 1 - commons.cumulativeProbability(k-1) + val commonsLogResult = linear + val result = binom.cumulativeProbability(k=k-1, n=n, p=p, lower=false).toDouble + + commonsResult shouldBe 0.0 // The real answer is not 0.0, here to show this method underflows + commonsLogResult shouldBe 0.0 // The real answer is not 0.0, here to show this method underflows + result shouldBe 1.193E-53 +- 0.001E-53 // Calculated @ https://www.wolframalpha.com/input/?i=cumulative+binomial+probability + } + + it should "fail if given probabilities < 0 or > 1" in { + an[Exception] shouldBe thrownBy { binom.cumulativeProbability(k=5, n=10, p= -0.01) } + an[Exception] shouldBe thrownBy { binom.cumulativeProbability(k=5, n=10, p= 1.00000000000001) } + } + + it should "yield answers from lower=true and lower=false than sum to 1" in { + val (k, n, p) = (5, 10, 0.3) + val lower = binom.cumulativeProbability(k, n, p, lower=true) + val upper = binom.cumulativeProbability(k, n, p, lower=false) + (lower + upper).toDouble shouldBe 1.0 + } +}