Refactor ECDF code (#2311)

* Remove redundant call to compute_ecdf * Sort samples in-place * Remove dead definition * Remove dead variable npoints * Use more descriptive variable names * Restructure to reduce code redundancy * Use consistent argument order * Standardize more variable names * Split adjust_gamma into several functions * Define in terms of pointwise prob instead of gamma * Refactor to use more informative function names * Use ndraws for consistency * Add utility function to simulate an ECDF * Add type annotations * Rearrange code * Require CDF take vector inputs * Add notes for the future * Define ecdf_confidence_band * Remove rvs option * Move ecdf utility functions to stats module * Make some functions internal * Use binom.interval * Forward prob keeyword * Mark function as private * Test ecdf helper functions * Revert "Require CDF take vector inputs" This reverts commit 780d395. * Map cdf over array * Fix formatting issues * Add changelog entry * Apply suggestions from code review Co-authored-by: Oriol Abril-Pla <[email protected]> * Use Any as type for `random_state` * Update documentation of rvs and random_state * Use `default_rng` in tests * Document requirements of cdf function * Assume cdf accepts array inputs * try making CI happy --------- Co-authored-by: Oriol Abril-Pla <[email protected]>
arviz-devs · Feb 21, 2024 · 7c1637f · 7c1637f
1 parent d0adde3
commit 7c1637f
Show file tree

Hide file tree

Showing 4 changed files with 357 additions and 112 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,8 @@
 
 ### Maintenance and fixes
 
+-   Refactor ECDF code ([2311](https://github.com/arviz-devs/arviz/pull/2311))
+
 ### Deprecation
 
 ### Documentation

diff --git a/arviz/plots/ecdfplot.py b/arviz/plots/ecdfplot.py
@@ -1,8 +1,9 @@
 """Plot ecdf or ecdf-difference plot with confidence bands."""
 import numpy as np
-from scipy.stats import uniform, binom
+from scipy.stats import uniform
 
 from ..rcparams import rcParams
+from ..stats.ecdf_utils import compute_ecdf, ecdf_confidence_band, _get_ecdf_points
 from .plot_utils import get_plotting_function
 
 
@@ -26,7 +27,7 @@ def plot_ecdf(
     show=None,
     backend=None,
     backend_kwargs=None,
-    **kwargs
+    **kwargs,
 ):
     r"""Plot ECDF or ECDF-Difference Plot with Confidence bands.
 
@@ -48,6 +49,7 @@ def plot_ecdf(
         Values to compare to the original sample.
     cdf : callable, optional
         Cumulative distribution function of the distribution to compare the original sample.
+        The function must take as input a numpy array of draws from the distribution.
     difference : bool, default False
         If True then plot ECDF-difference plot otherwise ECDF plot.
     pit : bool, default False
@@ -180,75 +182,47 @@ def plot_ecdf(
     values = np.ravel(values)
     values.sort()
 
-    ## This block computes gamma and uses it to get the upper and lower confidence bands
-    ## Here we check if we want confidence bands or not
-    if confidence_bands:
-        ## If plotting PIT then we find the PIT values of sample.
-        ## Basically here we generate the evaluation points(x) and find the PIT values.
-        ## z is the evaluation point for our uniform distribution in compute_gamma()
-        if pit:
-            x = np.linspace(1 / npoints, 1, npoints)
-            z = x
-            ## Finding PIT for our sample
-            probs = cdf(values) if cdf else compute_ecdf(values2, values) / len(values2)
-        else:
-            ## If not PIT use sample for plots and for evaluation points(x) use equally spaced
-            ## points between minimum and maximum of sample
-            ## For z we have used cdf(x)
-            x = np.linspace(values[0], values[-1], npoints)
-            z = cdf(x) if cdf else compute_ecdf(values2, x)
-            probs = values
-
-        n = len(values)  # number of samples
-        ## Computing gamma
-        gamma = fpr if pointwise else compute_gamma(n, z, npoints, num_trials, fpr)
-        ## Using gamma to get the confidence intervals
-        lower, higher = get_lims(gamma, n, z)
-
-        ## This block is for whether to plot ECDF or ECDF-difference
-        if not difference:
-            ## We store the coordinates of our ecdf in x_coord, y_coord
-            x_coord, y_coord = get_ecdf_points(x, probs, difference)
+    if pit:
+        eval_points = np.linspace(1 / npoints, 1, npoints)
+        if cdf:
+            sample = cdf(values)
         else:
-            ## Here we subtract the ecdf value as here we are plotting the ECDF-difference
-            x_coord, y_coord = get_ecdf_points(x, probs, difference)
-            for i, x_i in enumerate(x):
-                y_coord[i] = y_coord[i] - (
-                    x_i if pit else cdf(x_i) if cdf else compute_ecdf(values2, x_i)
-                )
-
-            ## Similarly we subtract from the upper and lower bounds
-            if pit:
-                lower = lower - x
-                higher = higher - x
-            else:
-                lower = lower - (cdf(x) if cdf else compute_ecdf(values2, x))
-                higher = higher - (cdf(x) if cdf else compute_ecdf(values2, x))
-
+            sample = compute_ecdf(values2, values) / len(values2)
+        cdf_at_eval_points = eval_points
+        rvs = uniform(0, 1).rvs
     else:
-        if pit:
-            x = np.linspace(1 / npoints, 1, npoints)
-            probs = cdf(values)
+        eval_points = np.linspace(values[0], values[-1], npoints)
+        sample = values
+        if confidence_bands or difference:
+            if cdf:
+                cdf_at_eval_points = cdf(eval_points)
+            else:
+                cdf_at_eval_points = compute_ecdf(values2, eval_points)
         else:
-            x = np.linspace(values[0], values[-1], npoints)
-            probs = values
+            cdf_at_eval_points = np.zeros_like(eval_points)
+        rvs = None
+
+    x_coord, y_coord = _get_ecdf_points(sample, eval_points, difference)
 
+    if difference:
+        y_coord -= cdf_at_eval_points
+
+    if confidence_bands:
+        ndraws = len(values)
+        band_kwargs = {"prob": 1 - fpr, "num_trials": num_trials, "rvs": rvs, "random_state": None}
+        band_kwargs["method"] = "pointwise" if pointwise else "simulated"
+        lower, higher = ecdf_confidence_band(ndraws, eval_points, cdf_at_eval_points, **band_kwargs)
+
+        if difference:
+            lower -= cdf_at_eval_points
+            higher -= cdf_at_eval_points
+    else:
         lower, higher = None, None
-        ## This block is for whether to plot ECDF or ECDF-difference
-        if not difference:
-            x_coord, y_coord = get_ecdf_points(x, probs, difference)
-        else:
-            ## Here we subtract the ecdf value as here we are plotting the ECDF-difference
-            x_coord, y_coord = get_ecdf_points(x, probs, difference)
-            for i, x_i in enumerate(x):
-                y_coord[i] = y_coord[i] - (
-                    x_i if pit else cdf(x_i) if cdf else compute_ecdf(values2, x_i)
-                )
 
     ecdf_plot_args = dict(
         x_coord=x_coord,
         y_coord=y_coord,
-        x_bands=x,
+        x_bands=eval_points,
         lower=lower,
         higher=higher,
         confidence_bands=confidence_bands,
@@ -260,7 +234,7 @@ def plot_ecdf(
         ax=ax,
         show=show,
         backend_kwargs=backend_kwargs,
-        **kwargs
+        **kwargs,
     )
 
     if backend is None:
@@ -271,52 +245,3 @@ def plot_ecdf(
     ax = plot(**ecdf_plot_args)
 
     return ax
-
-
-def compute_ecdf(sample, z):
-    """Compute ECDF.
-
-    This function computes the ecdf value at the evaluation point
-        or a sorted set of evaluation points.
-    """
-    return np.searchsorted(sample, z, side="right") / len(sample)
-
-
-def get_ecdf_points(x, probs, difference):
-    """Compute the coordinates for the ecdf points using compute_ecdf."""
-    y = compute_ecdf(probs, x)
-
-    if not difference:
-        x = np.insert(x, 0, x[0])
-        y = np.insert(y, 0, 0)
-    return x, y
-
-
-def compute_gamma(n, z, npoints=None, num_trials=1000, fpr=0.05):
-    """Compute gamma for confidence interval calculation.
-
-    This function simulates an adjusted value of gamma to account for multiplicity
-    when forming an 1-fpr level confidence envelope for the ECDF of a sample.
-    """
-    if npoints is None:
-        npoints = n
-    gamma = []
-    for _ in range(num_trials):
-        unif_samples = uniform.rvs(0, 1, n)
-        unif_samples = np.sort(unif_samples)
-        gamma_m = 1000
-        ## Can compute ecdf for all the z together or one at a time.
-        f_z = compute_ecdf(unif_samples, z)
-        f_z = compute_ecdf(unif_samples, z)
-        gamma_m = 2 * min(
-            np.amin(binom.cdf(n * f_z, n, z)), np.amin(1 - binom.cdf(n * f_z - 1, n, z))
-        )
-        gamma.append(gamma_m)
-    return np.quantile(gamma, fpr)
-
-
-def get_lims(gamma, n, z):
-    """Compute the simultaneous 1 - fpr level confidence bands."""
-    lower = binom.ppf(gamma / 2, n, z)
-    upper = binom.ppf(1 - gamma / 2, n, z)
-    return lower / n, upper / n
diff --git a/arviz/stats/ecdf_utils.py b/arviz/stats/ecdf_utils.py
@@ -0,0 +1,165 @@
+"""Functions for evaluating ECDFs and their confidence bands."""
+from typing import Any, Callable, Optional, Tuple
+import warnings
+
+import numpy as np
+from scipy.stats import uniform, binom
+
+
+def compute_ecdf(sample: np.ndarray, eval_points: np.ndarray) -> np.ndarray:
+    """Compute ECDF of the sorted `sample` at the evaluation points."""
+    return np.searchsorted(sample, eval_points, side="right") / len(sample)
+
+
+def _get_ecdf_points(
+    sample: np.ndarray, eval_points: np.ndarray, difference: bool
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Compute the coordinates for the ecdf points using compute_ecdf."""
+    x = eval_points
+    y = compute_ecdf(sample, eval_points)
+
+    if not difference and y[0] > 0:
+        x = np.insert(x, 0, x[0])
+        y = np.insert(y, 0, 0)
+    return x, y
+
+
+def _simulate_ecdf(
+    ndraws: int,
+    eval_points: np.ndarray,
+    rvs: Callable[[int, Optional[Any]], np.ndarray],
+    random_state: Optional[Any] = None,
+) -> np.ndarray:
+    """Simulate ECDF at the `eval_points` using the given random variable sampler"""
+    sample = rvs(ndraws, random_state=random_state)
+    sample.sort()
+    return compute_ecdf(sample, eval_points)
+
+
+def _fit_pointwise_band_probability(
+    ndraws: int,
+    ecdf_at_eval_points: np.ndarray,
+    cdf_at_eval_points: np.ndarray,
+) -> float:
+    """Compute the smallest marginal probability of a pointwise confidence band that
+    contains the ECDF."""
+    ecdf_scaled = (ndraws * ecdf_at_eval_points).astype(int)
+    prob_lower_tail = np.amin(binom.cdf(ecdf_scaled, ndraws, cdf_at_eval_points))
+    prob_upper_tail = np.amin(binom.sf(ecdf_scaled - 1, ndraws, cdf_at_eval_points))
+    prob_pointwise = 1 - 2 * min(prob_lower_tail, prob_upper_tail)
+    return prob_pointwise
+
+
+def _get_pointwise_confidence_band(
+    prob: float, ndraws: int, cdf_at_eval_points: np.ndarray
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Compute the `prob`-level pointwise confidence band."""
+    count_lower, count_upper = binom.interval(prob, ndraws, cdf_at_eval_points)
+    prob_lower = count_lower / ndraws
+    prob_upper = count_upper / ndraws
+    return prob_lower, prob_upper
+
+
+def ecdf_confidence_band(
+    ndraws: int,
+    eval_points: np.ndarray,
+    cdf_at_eval_points: np.ndarray,
+    prob: float = 0.95,
+    method="simulated",
+    **kwargs,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Compute the `prob`-level confidence band for the ECDF.
+
+    Arguments
+    ---------
+    ndraws : int
+        Number of samples in the original dataset.
+    eval_points : np.ndarray
+        Points at which the ECDF is evaluated. If these are dependent on the sample
+        values, simultaneous confidence bands may not be correctly calibrated.
+    cdf_at_eval_points : np.ndarray
+        CDF values at the evaluation points.
+    prob : float, default 0.95
+        The target probability that a true ECDF lies within the confidence band.
+    method : string, default "simulated"
+        The method used to compute the confidence band. Valid options are:
+        - "pointwise": Compute the pointwise (i.e. marginal) confidence band.
+        - "simulated": Use Monte Carlo simulation to estimate a simultaneous confidence band.
+          `rvs` must be provided.
+    rvs: callable, optional
+        A function that takes an integer `ndraws` and optionally the object passed to
+        `random_state` and returns an array of `ndraws` samples from the same distribution
+        as the original dataset. Required if `method` is "simulated" and variable is discrete.
+    num_trials : int, default 1000
+        The number of random ECDFs to generate for constructing simultaneous confidence bands
+        (if `method` is "simulated").
+    random_state : {None, int, `numpy.random.Generator`,
+                    `numpy.random.RandomState`}, optional
+        If `None`, the `numpy.random.RandomState` singleton is used. If an `int`, a new
+        ``numpy.random.RandomState`` instance is used, seeded with seed. If a `RandomState` or
+        `Generator` instance, the instance is used.
+
+    Returns
+    -------
+    prob_lower : np.ndarray
+        Lower confidence band for the ECDF at the evaluation points.
+    prob_upper : np.ndarray
+        Upper confidence band for the ECDF at the evaluation points.
+    """
+    if not 0 < prob < 1:
+        raise ValueError(f"Invalid value for `prob`. Expected 0 < prob < 1, but got {prob}.")
+
+    if method == "pointwise":
+        prob_pointwise = prob
+    elif method == "simulated":
+        prob_pointwise = _simulate_simultaneous_ecdf_band_probability(
+            ndraws, eval_points, cdf_at_eval_points, prob=prob, **kwargs
+        )
+    else:
+        raise ValueError(f"Unknown method {method}. Valid options are 'pointwise' or 'simulated'.")
+
+    prob_lower, prob_upper = _get_pointwise_confidence_band(
+        prob_pointwise, ndraws, cdf_at_eval_points
+    )
+
+    return prob_lower, prob_upper
+
+
+def _simulate_simultaneous_ecdf_band_probability(
+    ndraws: int,
+    eval_points: np.ndarray,
+    cdf_at_eval_points: np.ndarray,
+    prob: float = 0.95,
+    rvs: Optional[Callable[[int, Optional[Any]], np.ndarray]] = None,
+    num_trials: int = 1000,
+    random_state: Optional[Any] = None,
+) -> float:
+    """Estimate probability for simultaneous confidence band using simulation.
+
+    This function simulates the pointwise probability needed to construct pointwise
+    confidence bands that form a `prob`-level confidence envelope for the ECDF
+    of a sample.
+    """
+    if rvs is None:
+        warnings.warn(
+            "Assuming variable is continuous for calibration of pointwise bands. "
+            "If the variable is discrete, specify random variable sampler `rvs`.",
+            UserWarning,
+        )
+        # if variable continuous, we can calibrate the confidence band using a uniform
+        # distribution
+        rvs = uniform(0, 1).rvs
+        eval_points_sim = cdf_at_eval_points
+    else:
+        eval_points_sim = eval_points
+
+    probs_pointwise = np.empty(num_trials)
+    for i in range(num_trials):
+        ecdf_at_eval_points = _simulate_ecdf(
+            ndraws, eval_points_sim, rvs, random_state=random_state
+        )
+        prob_pointwise = _fit_pointwise_band_probability(
+            ndraws, ecdf_at_eval_points, cdf_at_eval_points
+        )
+        probs_pointwise[i] = prob_pointwise
+    return np.quantile(probs_pointwise, prob)