From 7c1637f6d4574bc8ba5757165f9e7bb02e202621 Mon Sep 17 00:00:00 2001
From: Seth Axen <seth@sethaxen.com>
Date: Wed, 21 Feb 2024 22:58:22 +0100
Subject: [PATCH] Refactor ECDF code (#2311)

* Remove redundant call to compute_ecdf

* Sort samples in-place

* Remove dead definition

* Remove dead variable npoints

* Use more descriptive variable names

* Restructure to reduce code redundancy

* Use consistent argument order

* Standardize more variable names

* Split adjust_gamma into several functions

* Define in terms of pointwise prob instead of gamma

* Refactor to use more informative function names

* Use ndraws for consistency

* Add utility function to simulate an ECDF

* Add type annotations

* Rearrange code

* Require CDF take vector inputs

* Add notes for the future

* Define ecdf_confidence_band

* Remove rvs option

* Move ecdf utility functions to stats module

* Make some functions internal

* Use binom.interval

* Forward prob keeyword

* Mark function as private

* Test ecdf helper functions

* Revert "Require CDF take vector inputs"

This reverts commit 780d395a91ae97928da6dcee5f6772b034387bf7.

* Map cdf over array

* Fix formatting issues

* Add changelog entry

* Apply suggestions from code review

Co-authored-by: Oriol Abril-Pla <oriol.abril.pla@gmail.com>

* Use Any as type for `random_state`

* Update documentation of rvs and random_state

* Use `default_rng` in tests

* Document requirements of cdf function

* Assume cdf accepts array inputs

* try making CI happy

---------

Co-authored-by: Oriol Abril-Pla <oriol.abril.pla@gmail.com>
---
 CHANGELOG.md                                  |   2 +
 arviz/plots/ecdfplot.py                       | 149 ++++------------
 arviz/stats/ecdf_utils.py                     | 165 ++++++++++++++++++
 .../tests/base_tests/test_stats_ecdf_utils.py | 153 ++++++++++++++++
 4 files changed, 357 insertions(+), 112 deletions(-)
 create mode 100644 arviz/stats/ecdf_utils.py
 create mode 100644 arviz/tests/base_tests/test_stats_ecdf_utils.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8092beb43f..1a449ccb26 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,8 @@
 
 ### Maintenance and fixes
 
+-   Refactor ECDF code ([2311](https://github.com/arviz-devs/arviz/pull/2311))
+
 ### Deprecation
 
 ### Documentation
diff --git a/arviz/plots/ecdfplot.py b/arviz/plots/ecdfplot.py
index e6ad27ba3f..83b6a11c0c 100644
--- a/arviz/plots/ecdfplot.py
+++ b/arviz/plots/ecdfplot.py
@@ -1,8 +1,9 @@
 """Plot ecdf or ecdf-difference plot with confidence bands."""
 import numpy as np
-from scipy.stats import uniform, binom
+from scipy.stats import uniform
 
 from ..rcparams import rcParams
+from ..stats.ecdf_utils import compute_ecdf, ecdf_confidence_band, _get_ecdf_points
 from .plot_utils import get_plotting_function
 
 
@@ -26,7 +27,7 @@ def plot_ecdf(
     show=None,
     backend=None,
     backend_kwargs=None,
-    **kwargs
+    **kwargs,
 ):
     r"""Plot ECDF or ECDF-Difference Plot with Confidence bands.
 
@@ -48,6 +49,7 @@ def plot_ecdf(
         Values to compare to the original sample.
     cdf : callable, optional
         Cumulative distribution function of the distribution to compare the original sample.
+        The function must take as input a numpy array of draws from the distribution.
     difference : bool, default False
         If True then plot ECDF-difference plot otherwise ECDF plot.
     pit : bool, default False
@@ -180,75 +182,47 @@ def plot_ecdf(
     values = np.ravel(values)
     values.sort()
 
-    ## This block computes gamma and uses it to get the upper and lower confidence bands
-    ## Here we check if we want confidence bands or not
-    if confidence_bands:
-        ## If plotting PIT then we find the PIT values of sample.
-        ## Basically here we generate the evaluation points(x) and find the PIT values.
-        ## z is the evaluation point for our uniform distribution in compute_gamma()
-        if pit:
-            x = np.linspace(1 / npoints, 1, npoints)
-            z = x
-            ## Finding PIT for our sample
-            probs = cdf(values) if cdf else compute_ecdf(values2, values) / len(values2)
-        else:
-            ## If not PIT use sample for plots and for evaluation points(x) use equally spaced
-            ## points between minimum and maximum of sample
-            ## For z we have used cdf(x)
-            x = np.linspace(values[0], values[-1], npoints)
-            z = cdf(x) if cdf else compute_ecdf(values2, x)
-            probs = values
-
-        n = len(values)  # number of samples
-        ## Computing gamma
-        gamma = fpr if pointwise else compute_gamma(n, z, npoints, num_trials, fpr)
-        ## Using gamma to get the confidence intervals
-        lower, higher = get_lims(gamma, n, z)
-
-        ## This block is for whether to plot ECDF or ECDF-difference
-        if not difference:
-            ## We store the coordinates of our ecdf in x_coord, y_coord
-            x_coord, y_coord = get_ecdf_points(x, probs, difference)
+    if pit:
+        eval_points = np.linspace(1 / npoints, 1, npoints)
+        if cdf:
+            sample = cdf(values)
         else:
-            ## Here we subtract the ecdf value as here we are plotting the ECDF-difference
-            x_coord, y_coord = get_ecdf_points(x, probs, difference)
-            for i, x_i in enumerate(x):
-                y_coord[i] = y_coord[i] - (
-                    x_i if pit else cdf(x_i) if cdf else compute_ecdf(values2, x_i)
-                )
-
-            ## Similarly we subtract from the upper and lower bounds
-            if pit:
-                lower = lower - x
-                higher = higher - x
-            else:
-                lower = lower - (cdf(x) if cdf else compute_ecdf(values2, x))
-                higher = higher - (cdf(x) if cdf else compute_ecdf(values2, x))
-
+            sample = compute_ecdf(values2, values) / len(values2)
+        cdf_at_eval_points = eval_points
+        rvs = uniform(0, 1).rvs
     else:
-        if pit:
-            x = np.linspace(1 / npoints, 1, npoints)
-            probs = cdf(values)
+        eval_points = np.linspace(values[0], values[-1], npoints)
+        sample = values
+        if confidence_bands or difference:
+            if cdf:
+                cdf_at_eval_points = cdf(eval_points)
+            else:
+                cdf_at_eval_points = compute_ecdf(values2, eval_points)
         else:
-            x = np.linspace(values[0], values[-1], npoints)
-            probs = values
+            cdf_at_eval_points = np.zeros_like(eval_points)
+        rvs = None
+
+    x_coord, y_coord = _get_ecdf_points(sample, eval_points, difference)
 
+    if difference:
+        y_coord -= cdf_at_eval_points
+
+    if confidence_bands:
+        ndraws = len(values)
+        band_kwargs = {"prob": 1 - fpr, "num_trials": num_trials, "rvs": rvs, "random_state": None}
+        band_kwargs["method"] = "pointwise" if pointwise else "simulated"
+        lower, higher = ecdf_confidence_band(ndraws, eval_points, cdf_at_eval_points, **band_kwargs)
+
+        if difference:
+            lower -= cdf_at_eval_points
+            higher -= cdf_at_eval_points
+    else:
         lower, higher = None, None
-        ## This block is for whether to plot ECDF or ECDF-difference
-        if not difference:
-            x_coord, y_coord = get_ecdf_points(x, probs, difference)
-        else:
-            ## Here we subtract the ecdf value as here we are plotting the ECDF-difference
-            x_coord, y_coord = get_ecdf_points(x, probs, difference)
-            for i, x_i in enumerate(x):
-                y_coord[i] = y_coord[i] - (
-                    x_i if pit else cdf(x_i) if cdf else compute_ecdf(values2, x_i)
-                )
 
     ecdf_plot_args = dict(
         x_coord=x_coord,
         y_coord=y_coord,
-        x_bands=x,
+        x_bands=eval_points,
         lower=lower,
         higher=higher,
         confidence_bands=confidence_bands,
@@ -260,7 +234,7 @@ def plot_ecdf(
         ax=ax,
         show=show,
         backend_kwargs=backend_kwargs,
-        **kwargs
+        **kwargs,
     )
 
     if backend is None:
@@ -271,52 +245,3 @@ def plot_ecdf(
     ax = plot(**ecdf_plot_args)
 
     return ax
-
-
-def compute_ecdf(sample, z):
-    """Compute ECDF.
-
-    This function computes the ecdf value at the evaluation point
-        or a sorted set of evaluation points.
-    """
-    return np.searchsorted(sample, z, side="right") / len(sample)
-
-
-def get_ecdf_points(x, probs, difference):
-    """Compute the coordinates for the ecdf points using compute_ecdf."""
-    y = compute_ecdf(probs, x)
-
-    if not difference:
-        x = np.insert(x, 0, x[0])
-        y = np.insert(y, 0, 0)
-    return x, y
-
-
-def compute_gamma(n, z, npoints=None, num_trials=1000, fpr=0.05):
-    """Compute gamma for confidence interval calculation.
-
-    This function simulates an adjusted value of gamma to account for multiplicity
-    when forming an 1-fpr level confidence envelope for the ECDF of a sample.
-    """
-    if npoints is None:
-        npoints = n
-    gamma = []
-    for _ in range(num_trials):
-        unif_samples = uniform.rvs(0, 1, n)
-        unif_samples = np.sort(unif_samples)
-        gamma_m = 1000
-        ## Can compute ecdf for all the z together or one at a time.
-        f_z = compute_ecdf(unif_samples, z)
-        f_z = compute_ecdf(unif_samples, z)
-        gamma_m = 2 * min(
-            np.amin(binom.cdf(n * f_z, n, z)), np.amin(1 - binom.cdf(n * f_z - 1, n, z))
-        )
-        gamma.append(gamma_m)
-    return np.quantile(gamma, fpr)
-
-
-def get_lims(gamma, n, z):
-    """Compute the simultaneous 1 - fpr level confidence bands."""
-    lower = binom.ppf(gamma / 2, n, z)
-    upper = binom.ppf(1 - gamma / 2, n, z)
-    return lower / n, upper / n
diff --git a/arviz/stats/ecdf_utils.py b/arviz/stats/ecdf_utils.py
new file mode 100644
index 0000000000..53481657a6
--- /dev/null
+++ b/arviz/stats/ecdf_utils.py
@@ -0,0 +1,165 @@
+"""Functions for evaluating ECDFs and their confidence bands."""
+from typing import Any, Callable, Optional, Tuple
+import warnings
+
+import numpy as np
+from scipy.stats import uniform, binom
+
+
+def compute_ecdf(sample: np.ndarray, eval_points: np.ndarray) -> np.ndarray:
+    """Compute ECDF of the sorted `sample` at the evaluation points."""
+    return np.searchsorted(sample, eval_points, side="right") / len(sample)
+
+
+def _get_ecdf_points(
+    sample: np.ndarray, eval_points: np.ndarray, difference: bool
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Compute the coordinates for the ecdf points using compute_ecdf."""
+    x = eval_points
+    y = compute_ecdf(sample, eval_points)
+
+    if not difference and y[0] > 0:
+        x = np.insert(x, 0, x[0])
+        y = np.insert(y, 0, 0)
+    return x, y
+
+
+def _simulate_ecdf(
+    ndraws: int,
+    eval_points: np.ndarray,
+    rvs: Callable[[int, Optional[Any]], np.ndarray],
+    random_state: Optional[Any] = None,
+) -> np.ndarray:
+    """Simulate ECDF at the `eval_points` using the given random variable sampler"""
+    sample = rvs(ndraws, random_state=random_state)
+    sample.sort()
+    return compute_ecdf(sample, eval_points)
+
+
+def _fit_pointwise_band_probability(
+    ndraws: int,
+    ecdf_at_eval_points: np.ndarray,
+    cdf_at_eval_points: np.ndarray,
+) -> float:
+    """Compute the smallest marginal probability of a pointwise confidence band that
+    contains the ECDF."""
+    ecdf_scaled = (ndraws * ecdf_at_eval_points).astype(int)
+    prob_lower_tail = np.amin(binom.cdf(ecdf_scaled, ndraws, cdf_at_eval_points))
+    prob_upper_tail = np.amin(binom.sf(ecdf_scaled - 1, ndraws, cdf_at_eval_points))
+    prob_pointwise = 1 - 2 * min(prob_lower_tail, prob_upper_tail)
+    return prob_pointwise
+
+
+def _get_pointwise_confidence_band(
+    prob: float, ndraws: int, cdf_at_eval_points: np.ndarray
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Compute the `prob`-level pointwise confidence band."""
+    count_lower, count_upper = binom.interval(prob, ndraws, cdf_at_eval_points)
+    prob_lower = count_lower / ndraws
+    prob_upper = count_upper / ndraws
+    return prob_lower, prob_upper
+
+
+def ecdf_confidence_band(
+    ndraws: int,
+    eval_points: np.ndarray,
+    cdf_at_eval_points: np.ndarray,
+    prob: float = 0.95,
+    method="simulated",
+    **kwargs,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Compute the `prob`-level confidence band for the ECDF.
+
+    Arguments
+    ---------
+    ndraws : int
+        Number of samples in the original dataset.
+    eval_points : np.ndarray
+        Points at which the ECDF is evaluated. If these are dependent on the sample
+        values, simultaneous confidence bands may not be correctly calibrated.
+    cdf_at_eval_points : np.ndarray
+        CDF values at the evaluation points.
+    prob : float, default 0.95
+        The target probability that a true ECDF lies within the confidence band.
+    method : string, default "simulated"
+        The method used to compute the confidence band. Valid options are:
+        - "pointwise": Compute the pointwise (i.e. marginal) confidence band.
+        - "simulated": Use Monte Carlo simulation to estimate a simultaneous confidence band.
+          `rvs` must be provided.
+    rvs: callable, optional
+        A function that takes an integer `ndraws` and optionally the object passed to
+        `random_state` and returns an array of `ndraws` samples from the same distribution
+        as the original dataset. Required if `method` is "simulated" and variable is discrete.
+    num_trials : int, default 1000
+        The number of random ECDFs to generate for constructing simultaneous confidence bands
+        (if `method` is "simulated").
+    random_state : {None, int, `numpy.random.Generator`,
+                    `numpy.random.RandomState`}, optional
+        If `None`, the `numpy.random.RandomState` singleton is used. If an `int`, a new
+        ``numpy.random.RandomState`` instance is used, seeded with seed. If a `RandomState` or
+        `Generator` instance, the instance is used.
+
+    Returns
+    -------
+    prob_lower : np.ndarray
+        Lower confidence band for the ECDF at the evaluation points.
+    prob_upper : np.ndarray
+        Upper confidence band for the ECDF at the evaluation points.
+    """
+    if not 0 < prob < 1:
+        raise ValueError(f"Invalid value for `prob`. Expected 0 < prob < 1, but got {prob}.")
+
+    if method == "pointwise":
+        prob_pointwise = prob
+    elif method == "simulated":
+        prob_pointwise = _simulate_simultaneous_ecdf_band_probability(
+            ndraws, eval_points, cdf_at_eval_points, prob=prob, **kwargs
+        )
+    else:
+        raise ValueError(f"Unknown method {method}. Valid options are 'pointwise' or 'simulated'.")
+
+    prob_lower, prob_upper = _get_pointwise_confidence_band(
+        prob_pointwise, ndraws, cdf_at_eval_points
+    )
+
+    return prob_lower, prob_upper
+
+
+def _simulate_simultaneous_ecdf_band_probability(
+    ndraws: int,
+    eval_points: np.ndarray,
+    cdf_at_eval_points: np.ndarray,
+    prob: float = 0.95,
+    rvs: Optional[Callable[[int, Optional[Any]], np.ndarray]] = None,
+    num_trials: int = 1000,
+    random_state: Optional[Any] = None,
+) -> float:
+    """Estimate probability for simultaneous confidence band using simulation.
+
+    This function simulates the pointwise probability needed to construct pointwise
+    confidence bands that form a `prob`-level confidence envelope for the ECDF
+    of a sample.
+    """
+    if rvs is None:
+        warnings.warn(
+            "Assuming variable is continuous for calibration of pointwise bands. "
+            "If the variable is discrete, specify random variable sampler `rvs`.",
+            UserWarning,
+        )
+        # if variable continuous, we can calibrate the confidence band using a uniform
+        # distribution
+        rvs = uniform(0, 1).rvs
+        eval_points_sim = cdf_at_eval_points
+    else:
+        eval_points_sim = eval_points
+
+    probs_pointwise = np.empty(num_trials)
+    for i in range(num_trials):
+        ecdf_at_eval_points = _simulate_ecdf(
+            ndraws, eval_points_sim, rvs, random_state=random_state
+        )
+        prob_pointwise = _fit_pointwise_band_probability(
+            ndraws, ecdf_at_eval_points, cdf_at_eval_points
+        )
+        probs_pointwise[i] = prob_pointwise
+    return np.quantile(probs_pointwise, prob)
diff --git a/arviz/tests/base_tests/test_stats_ecdf_utils.py b/arviz/tests/base_tests/test_stats_ecdf_utils.py
new file mode 100644
index 0000000000..b6d5563464
--- /dev/null
+++ b/arviz/tests/base_tests/test_stats_ecdf_utils.py
@@ -0,0 +1,153 @@
+import pytest
+
+import numpy as np
+import scipy.stats
+from ...stats.ecdf_utils import (
+    compute_ecdf,
+    ecdf_confidence_band,
+    _get_ecdf_points,
+    _simulate_ecdf,
+    _get_pointwise_confidence_band,
+)
+
+
+def test_compute_ecdf():
+    """Test compute_ecdf function."""
+    sample = np.array([1, 2, 3, 3, 4, 5])
+    eval_points = np.arange(0, 7, 0.1)
+    ecdf_expected = (sample[:, None] <= eval_points).mean(axis=0)
+    assert np.allclose(compute_ecdf(sample, eval_points), ecdf_expected)
+    assert np.allclose(compute_ecdf(sample / 2 + 10, eval_points / 2 + 10), ecdf_expected)
+
+
+@pytest.mark.parametrize("difference", [True, False])
+def test_get_ecdf_points(difference):
+    """Test _get_ecdf_points."""
+    # if first point already outside support, no need to insert it
+    sample = np.array([1, 2, 3, 3, 4, 5, 5])
+    eval_points = np.arange(-1, 7, 0.1)
+    x, y = _get_ecdf_points(sample, eval_points, difference)
+    assert np.array_equal(x, eval_points)
+    assert np.array_equal(y, compute_ecdf(sample, eval_points))
+
+    # if first point is inside support, insert it if not in difference mode
+    eval_points = np.arange(1, 6, 0.1)
+    x, y = _get_ecdf_points(sample, eval_points, difference)
+    assert len(x) == len(eval_points) + 1 - difference
+    assert len(y) == len(eval_points) + 1 - difference
+
+    # if not in difference mode, first point should be (eval_points[0], 0)
+    if not difference:
+        assert x[0] == eval_points[0]
+        assert y[0] == 0
+        assert np.allclose(x[1:], eval_points)
+        assert np.allclose(y[1:], compute_ecdf(sample, eval_points))
+        assert x[-1] == eval_points[-1]
+        assert y[-1] == 1
+
+
+@pytest.mark.parametrize(
+    "dist", [scipy.stats.norm(3, 10), scipy.stats.binom(10, 0.5)], ids=["continuous", "discrete"]
+)
+@pytest.mark.parametrize("seed", [32, 87])
+def test_simulate_ecdf(dist, seed):
+    """Test _simulate_ecdf."""
+    ndraws = 1000
+    eval_points = np.arange(0, 1, 0.1)
+
+    rvs = dist.rvs
+
+    random_state = np.random.default_rng(seed)
+    ecdf = _simulate_ecdf(ndraws, eval_points, rvs, random_state=random_state)
+    random_state = np.random.default_rng(seed)
+    ecdf_expected = compute_ecdf(np.sort(rvs(ndraws, random_state=random_state)), eval_points)
+
+    assert np.allclose(ecdf, ecdf_expected)
+
+
+@pytest.mark.parametrize("prob", [0.8, 0.9])
+@pytest.mark.parametrize(
+    "dist", [scipy.stats.norm(3, 10), scipy.stats.poisson(100)], ids=["continuous", "discrete"]
+)
+@pytest.mark.parametrize("ndraws", [10_000])
+def test_get_pointwise_confidence_band(dist, prob, ndraws, num_trials=1_000, seed=57):
+    """Test _get_pointwise_confidence_band."""
+    eval_points = np.linspace(*dist.interval(0.99), 10)
+    cdf_at_eval_points = dist.cdf(eval_points)
+
+    ecdf_lower, ecdf_upper = _get_pointwise_confidence_band(prob, ndraws, cdf_at_eval_points)
+
+    # check basic properties
+    assert np.all(ecdf_lower >= 0)
+    assert np.all(ecdf_upper <= 1)
+    assert np.all(ecdf_lower <= ecdf_upper)
+
+    # use simulation to estimate lower and upper bounds on pointwise probability
+    in_interval = []
+    random_state = np.random.default_rng(seed)
+    for _ in range(num_trials):
+        ecdf = _simulate_ecdf(ndraws, eval_points, dist.rvs, random_state=random_state)
+        in_interval.append((ecdf_lower <= ecdf) & (ecdf < ecdf_upper))
+    asymptotic_dist = scipy.stats.norm(
+        np.mean(in_interval, axis=0), scipy.stats.sem(in_interval, axis=0)
+    )
+    prob_lower, prob_upper = asymptotic_dist.interval(0.999)
+
+    # check target probability within all bounds
+    assert np.all(prob_lower <= prob)
+    assert np.all(prob <= prob_upper)
+
+
+@pytest.mark.parametrize("prob", [0.8, 0.9])
+@pytest.mark.parametrize(
+    "dist, rvs",
+    [
+        (scipy.stats.norm(3, 10), scipy.stats.norm(3, 10).rvs),
+        (scipy.stats.norm(3, 10), None),
+        (scipy.stats.poisson(100), scipy.stats.poisson(100).rvs),
+    ],
+    ids=["continuous", "continuous default rvs", "discrete"],
+)
+@pytest.mark.parametrize("ndraws", [10_000])
+@pytest.mark.parametrize("method", ["pointwise", "simulated"])
+def test_ecdf_confidence_band(dist, rvs, prob, ndraws, method, num_trials=1_000, seed=57):
+    """Test test_ecdf_confidence_band."""
+    eval_points = np.linspace(*dist.interval(0.99), 10)
+    cdf_at_eval_points = dist.cdf(eval_points)
+    random_state = np.random.default_rng(seed)
+
+    ecdf_lower, ecdf_upper = ecdf_confidence_band(
+        ndraws,
+        eval_points,
+        cdf_at_eval_points,
+        prob=prob,
+        rvs=rvs,
+        random_state=random_state,
+        method=method,
+    )
+
+    if method == "pointwise":
+        # these values tested elsewhere, we just make sure they're the same
+        ecdf_lower_pointwise, ecdf_upper_pointwise = _get_pointwise_confidence_band(
+            prob, ndraws, cdf_at_eval_points
+        )
+        assert np.array_equal(ecdf_lower, ecdf_lower_pointwise)
+        assert np.array_equal(ecdf_upper, ecdf_upper_pointwise)
+        return
+
+    # check basic properties
+    assert np.all(ecdf_lower >= 0)
+    assert np.all(ecdf_upper <= 1)
+    assert np.all(ecdf_lower <= ecdf_upper)
+
+    # use simulation to estimate lower and upper bounds on simultaneous probability
+    in_envelope = []
+    random_state = np.random.default_rng(seed)
+    for _ in range(num_trials):
+        ecdf = _simulate_ecdf(ndraws, eval_points, dist.rvs, random_state=random_state)
+        in_envelope.append(np.all(ecdf_lower <= ecdf) & np.all(ecdf < ecdf_upper))
+    asymptotic_dist = scipy.stats.norm(np.mean(in_envelope), scipy.stats.sem(in_envelope))
+    prob_lower, prob_upper = asymptotic_dist.interval(0.999)
+
+    # check target probability within bounds
+    assert prob_lower <= prob <= prob_upper