From 7c1637f6d4574bc8ba5757165f9e7bb02e202621 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Wed, 21 Feb 2024 22:58:22 +0100 Subject: [PATCH] Refactor ECDF code (#2311) * Remove redundant call to compute_ecdf * Sort samples in-place * Remove dead definition * Remove dead variable npoints * Use more descriptive variable names * Restructure to reduce code redundancy * Use consistent argument order * Standardize more variable names * Split adjust_gamma into several functions * Define in terms of pointwise prob instead of gamma * Refactor to use more informative function names * Use ndraws for consistency * Add utility function to simulate an ECDF * Add type annotations * Rearrange code * Require CDF take vector inputs * Add notes for the future * Define ecdf_confidence_band * Remove rvs option * Move ecdf utility functions to stats module * Make some functions internal * Use binom.interval * Forward prob keeyword * Mark function as private * Test ecdf helper functions * Revert "Require CDF take vector inputs" This reverts commit 780d395a91ae97928da6dcee5f6772b034387bf7. * Map cdf over array * Fix formatting issues * Add changelog entry * Apply suggestions from code review Co-authored-by: Oriol Abril-Pla * Use Any as type for `random_state` * Update documentation of rvs and random_state * Use `default_rng` in tests * Document requirements of cdf function * Assume cdf accepts array inputs * try making CI happy --------- Co-authored-by: Oriol Abril-Pla --- CHANGELOG.md | 2 + arviz/plots/ecdfplot.py | 149 ++++------------ arviz/stats/ecdf_utils.py | 165 ++++++++++++++++++ .../tests/base_tests/test_stats_ecdf_utils.py | 153 ++++++++++++++++ 4 files changed, 357 insertions(+), 112 deletions(-) create mode 100644 arviz/stats/ecdf_utils.py create mode 100644 arviz/tests/base_tests/test_stats_ecdf_utils.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 8092beb43f..1a449ccb26 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ ### Maintenance and fixes +- Refactor ECDF code ([2311](https://github.com/arviz-devs/arviz/pull/2311)) + ### Deprecation ### Documentation diff --git a/arviz/plots/ecdfplot.py b/arviz/plots/ecdfplot.py index e6ad27ba3f..83b6a11c0c 100644 --- a/arviz/plots/ecdfplot.py +++ b/arviz/plots/ecdfplot.py @@ -1,8 +1,9 @@ """Plot ecdf or ecdf-difference plot with confidence bands.""" import numpy as np -from scipy.stats import uniform, binom +from scipy.stats import uniform from ..rcparams import rcParams +from ..stats.ecdf_utils import compute_ecdf, ecdf_confidence_band, _get_ecdf_points from .plot_utils import get_plotting_function @@ -26,7 +27,7 @@ def plot_ecdf( show=None, backend=None, backend_kwargs=None, - **kwargs + **kwargs, ): r"""Plot ECDF or ECDF-Difference Plot with Confidence bands. @@ -48,6 +49,7 @@ def plot_ecdf( Values to compare to the original sample. cdf : callable, optional Cumulative distribution function of the distribution to compare the original sample. + The function must take as input a numpy array of draws from the distribution. difference : bool, default False If True then plot ECDF-difference plot otherwise ECDF plot. pit : bool, default False @@ -180,75 +182,47 @@ def plot_ecdf( values = np.ravel(values) values.sort() - ## This block computes gamma and uses it to get the upper and lower confidence bands - ## Here we check if we want confidence bands or not - if confidence_bands: - ## If plotting PIT then we find the PIT values of sample. - ## Basically here we generate the evaluation points(x) and find the PIT values. - ## z is the evaluation point for our uniform distribution in compute_gamma() - if pit: - x = np.linspace(1 / npoints, 1, npoints) - z = x - ## Finding PIT for our sample - probs = cdf(values) if cdf else compute_ecdf(values2, values) / len(values2) - else: - ## If not PIT use sample for plots and for evaluation points(x) use equally spaced - ## points between minimum and maximum of sample - ## For z we have used cdf(x) - x = np.linspace(values[0], values[-1], npoints) - z = cdf(x) if cdf else compute_ecdf(values2, x) - probs = values - - n = len(values) # number of samples - ## Computing gamma - gamma = fpr if pointwise else compute_gamma(n, z, npoints, num_trials, fpr) - ## Using gamma to get the confidence intervals - lower, higher = get_lims(gamma, n, z) - - ## This block is for whether to plot ECDF or ECDF-difference - if not difference: - ## We store the coordinates of our ecdf in x_coord, y_coord - x_coord, y_coord = get_ecdf_points(x, probs, difference) + if pit: + eval_points = np.linspace(1 / npoints, 1, npoints) + if cdf: + sample = cdf(values) else: - ## Here we subtract the ecdf value as here we are plotting the ECDF-difference - x_coord, y_coord = get_ecdf_points(x, probs, difference) - for i, x_i in enumerate(x): - y_coord[i] = y_coord[i] - ( - x_i if pit else cdf(x_i) if cdf else compute_ecdf(values2, x_i) - ) - - ## Similarly we subtract from the upper and lower bounds - if pit: - lower = lower - x - higher = higher - x - else: - lower = lower - (cdf(x) if cdf else compute_ecdf(values2, x)) - higher = higher - (cdf(x) if cdf else compute_ecdf(values2, x)) - + sample = compute_ecdf(values2, values) / len(values2) + cdf_at_eval_points = eval_points + rvs = uniform(0, 1).rvs else: - if pit: - x = np.linspace(1 / npoints, 1, npoints) - probs = cdf(values) + eval_points = np.linspace(values[0], values[-1], npoints) + sample = values + if confidence_bands or difference: + if cdf: + cdf_at_eval_points = cdf(eval_points) + else: + cdf_at_eval_points = compute_ecdf(values2, eval_points) else: - x = np.linspace(values[0], values[-1], npoints) - probs = values + cdf_at_eval_points = np.zeros_like(eval_points) + rvs = None + + x_coord, y_coord = _get_ecdf_points(sample, eval_points, difference) + if difference: + y_coord -= cdf_at_eval_points + + if confidence_bands: + ndraws = len(values) + band_kwargs = {"prob": 1 - fpr, "num_trials": num_trials, "rvs": rvs, "random_state": None} + band_kwargs["method"] = "pointwise" if pointwise else "simulated" + lower, higher = ecdf_confidence_band(ndraws, eval_points, cdf_at_eval_points, **band_kwargs) + + if difference: + lower -= cdf_at_eval_points + higher -= cdf_at_eval_points + else: lower, higher = None, None - ## This block is for whether to plot ECDF or ECDF-difference - if not difference: - x_coord, y_coord = get_ecdf_points(x, probs, difference) - else: - ## Here we subtract the ecdf value as here we are plotting the ECDF-difference - x_coord, y_coord = get_ecdf_points(x, probs, difference) - for i, x_i in enumerate(x): - y_coord[i] = y_coord[i] - ( - x_i if pit else cdf(x_i) if cdf else compute_ecdf(values2, x_i) - ) ecdf_plot_args = dict( x_coord=x_coord, y_coord=y_coord, - x_bands=x, + x_bands=eval_points, lower=lower, higher=higher, confidence_bands=confidence_bands, @@ -260,7 +234,7 @@ def plot_ecdf( ax=ax, show=show, backend_kwargs=backend_kwargs, - **kwargs + **kwargs, ) if backend is None: @@ -271,52 +245,3 @@ def plot_ecdf( ax = plot(**ecdf_plot_args) return ax - - -def compute_ecdf(sample, z): - """Compute ECDF. - - This function computes the ecdf value at the evaluation point - or a sorted set of evaluation points. - """ - return np.searchsorted(sample, z, side="right") / len(sample) - - -def get_ecdf_points(x, probs, difference): - """Compute the coordinates for the ecdf points using compute_ecdf.""" - y = compute_ecdf(probs, x) - - if not difference: - x = np.insert(x, 0, x[0]) - y = np.insert(y, 0, 0) - return x, y - - -def compute_gamma(n, z, npoints=None, num_trials=1000, fpr=0.05): - """Compute gamma for confidence interval calculation. - - This function simulates an adjusted value of gamma to account for multiplicity - when forming an 1-fpr level confidence envelope for the ECDF of a sample. - """ - if npoints is None: - npoints = n - gamma = [] - for _ in range(num_trials): - unif_samples = uniform.rvs(0, 1, n) - unif_samples = np.sort(unif_samples) - gamma_m = 1000 - ## Can compute ecdf for all the z together or one at a time. - f_z = compute_ecdf(unif_samples, z) - f_z = compute_ecdf(unif_samples, z) - gamma_m = 2 * min( - np.amin(binom.cdf(n * f_z, n, z)), np.amin(1 - binom.cdf(n * f_z - 1, n, z)) - ) - gamma.append(gamma_m) - return np.quantile(gamma, fpr) - - -def get_lims(gamma, n, z): - """Compute the simultaneous 1 - fpr level confidence bands.""" - lower = binom.ppf(gamma / 2, n, z) - upper = binom.ppf(1 - gamma / 2, n, z) - return lower / n, upper / n diff --git a/arviz/stats/ecdf_utils.py b/arviz/stats/ecdf_utils.py new file mode 100644 index 0000000000..53481657a6 --- /dev/null +++ b/arviz/stats/ecdf_utils.py @@ -0,0 +1,165 @@ +"""Functions for evaluating ECDFs and their confidence bands.""" +from typing import Any, Callable, Optional, Tuple +import warnings + +import numpy as np +from scipy.stats import uniform, binom + + +def compute_ecdf(sample: np.ndarray, eval_points: np.ndarray) -> np.ndarray: + """Compute ECDF of the sorted `sample` at the evaluation points.""" + return np.searchsorted(sample, eval_points, side="right") / len(sample) + + +def _get_ecdf_points( + sample: np.ndarray, eval_points: np.ndarray, difference: bool +) -> Tuple[np.ndarray, np.ndarray]: + """Compute the coordinates for the ecdf points using compute_ecdf.""" + x = eval_points + y = compute_ecdf(sample, eval_points) + + if not difference and y[0] > 0: + x = np.insert(x, 0, x[0]) + y = np.insert(y, 0, 0) + return x, y + + +def _simulate_ecdf( + ndraws: int, + eval_points: np.ndarray, + rvs: Callable[[int, Optional[Any]], np.ndarray], + random_state: Optional[Any] = None, +) -> np.ndarray: + """Simulate ECDF at the `eval_points` using the given random variable sampler""" + sample = rvs(ndraws, random_state=random_state) + sample.sort() + return compute_ecdf(sample, eval_points) + + +def _fit_pointwise_band_probability( + ndraws: int, + ecdf_at_eval_points: np.ndarray, + cdf_at_eval_points: np.ndarray, +) -> float: + """Compute the smallest marginal probability of a pointwise confidence band that + contains the ECDF.""" + ecdf_scaled = (ndraws * ecdf_at_eval_points).astype(int) + prob_lower_tail = np.amin(binom.cdf(ecdf_scaled, ndraws, cdf_at_eval_points)) + prob_upper_tail = np.amin(binom.sf(ecdf_scaled - 1, ndraws, cdf_at_eval_points)) + prob_pointwise = 1 - 2 * min(prob_lower_tail, prob_upper_tail) + return prob_pointwise + + +def _get_pointwise_confidence_band( + prob: float, ndraws: int, cdf_at_eval_points: np.ndarray +) -> Tuple[np.ndarray, np.ndarray]: + """Compute the `prob`-level pointwise confidence band.""" + count_lower, count_upper = binom.interval(prob, ndraws, cdf_at_eval_points) + prob_lower = count_lower / ndraws + prob_upper = count_upper / ndraws + return prob_lower, prob_upper + + +def ecdf_confidence_band( + ndraws: int, + eval_points: np.ndarray, + cdf_at_eval_points: np.ndarray, + prob: float = 0.95, + method="simulated", + **kwargs, +) -> Tuple[np.ndarray, np.ndarray]: + """Compute the `prob`-level confidence band for the ECDF. + + Arguments + --------- + ndraws : int + Number of samples in the original dataset. + eval_points : np.ndarray + Points at which the ECDF is evaluated. If these are dependent on the sample + values, simultaneous confidence bands may not be correctly calibrated. + cdf_at_eval_points : np.ndarray + CDF values at the evaluation points. + prob : float, default 0.95 + The target probability that a true ECDF lies within the confidence band. + method : string, default "simulated" + The method used to compute the confidence band. Valid options are: + - "pointwise": Compute the pointwise (i.e. marginal) confidence band. + - "simulated": Use Monte Carlo simulation to estimate a simultaneous confidence band. + `rvs` must be provided. + rvs: callable, optional + A function that takes an integer `ndraws` and optionally the object passed to + `random_state` and returns an array of `ndraws` samples from the same distribution + as the original dataset. Required if `method` is "simulated" and variable is discrete. + num_trials : int, default 1000 + The number of random ECDFs to generate for constructing simultaneous confidence bands + (if `method` is "simulated"). + random_state : {None, int, `numpy.random.Generator`, + `numpy.random.RandomState`}, optional + If `None`, the `numpy.random.RandomState` singleton is used. If an `int`, a new + ``numpy.random.RandomState`` instance is used, seeded with seed. If a `RandomState` or + `Generator` instance, the instance is used. + + Returns + ------- + prob_lower : np.ndarray + Lower confidence band for the ECDF at the evaluation points. + prob_upper : np.ndarray + Upper confidence band for the ECDF at the evaluation points. + """ + if not 0 < prob < 1: + raise ValueError(f"Invalid value for `prob`. Expected 0 < prob < 1, but got {prob}.") + + if method == "pointwise": + prob_pointwise = prob + elif method == "simulated": + prob_pointwise = _simulate_simultaneous_ecdf_band_probability( + ndraws, eval_points, cdf_at_eval_points, prob=prob, **kwargs + ) + else: + raise ValueError(f"Unknown method {method}. Valid options are 'pointwise' or 'simulated'.") + + prob_lower, prob_upper = _get_pointwise_confidence_band( + prob_pointwise, ndraws, cdf_at_eval_points + ) + + return prob_lower, prob_upper + + +def _simulate_simultaneous_ecdf_band_probability( + ndraws: int, + eval_points: np.ndarray, + cdf_at_eval_points: np.ndarray, + prob: float = 0.95, + rvs: Optional[Callable[[int, Optional[Any]], np.ndarray]] = None, + num_trials: int = 1000, + random_state: Optional[Any] = None, +) -> float: + """Estimate probability for simultaneous confidence band using simulation. + + This function simulates the pointwise probability needed to construct pointwise + confidence bands that form a `prob`-level confidence envelope for the ECDF + of a sample. + """ + if rvs is None: + warnings.warn( + "Assuming variable is continuous for calibration of pointwise bands. " + "If the variable is discrete, specify random variable sampler `rvs`.", + UserWarning, + ) + # if variable continuous, we can calibrate the confidence band using a uniform + # distribution + rvs = uniform(0, 1).rvs + eval_points_sim = cdf_at_eval_points + else: + eval_points_sim = eval_points + + probs_pointwise = np.empty(num_trials) + for i in range(num_trials): + ecdf_at_eval_points = _simulate_ecdf( + ndraws, eval_points_sim, rvs, random_state=random_state + ) + prob_pointwise = _fit_pointwise_band_probability( + ndraws, ecdf_at_eval_points, cdf_at_eval_points + ) + probs_pointwise[i] = prob_pointwise + return np.quantile(probs_pointwise, prob) diff --git a/arviz/tests/base_tests/test_stats_ecdf_utils.py b/arviz/tests/base_tests/test_stats_ecdf_utils.py new file mode 100644 index 0000000000..b6d5563464 --- /dev/null +++ b/arviz/tests/base_tests/test_stats_ecdf_utils.py @@ -0,0 +1,153 @@ +import pytest + +import numpy as np +import scipy.stats +from ...stats.ecdf_utils import ( + compute_ecdf, + ecdf_confidence_band, + _get_ecdf_points, + _simulate_ecdf, + _get_pointwise_confidence_band, +) + + +def test_compute_ecdf(): + """Test compute_ecdf function.""" + sample = np.array([1, 2, 3, 3, 4, 5]) + eval_points = np.arange(0, 7, 0.1) + ecdf_expected = (sample[:, None] <= eval_points).mean(axis=0) + assert np.allclose(compute_ecdf(sample, eval_points), ecdf_expected) + assert np.allclose(compute_ecdf(sample / 2 + 10, eval_points / 2 + 10), ecdf_expected) + + +@pytest.mark.parametrize("difference", [True, False]) +def test_get_ecdf_points(difference): + """Test _get_ecdf_points.""" + # if first point already outside support, no need to insert it + sample = np.array([1, 2, 3, 3, 4, 5, 5]) + eval_points = np.arange(-1, 7, 0.1) + x, y = _get_ecdf_points(sample, eval_points, difference) + assert np.array_equal(x, eval_points) + assert np.array_equal(y, compute_ecdf(sample, eval_points)) + + # if first point is inside support, insert it if not in difference mode + eval_points = np.arange(1, 6, 0.1) + x, y = _get_ecdf_points(sample, eval_points, difference) + assert len(x) == len(eval_points) + 1 - difference + assert len(y) == len(eval_points) + 1 - difference + + # if not in difference mode, first point should be (eval_points[0], 0) + if not difference: + assert x[0] == eval_points[0] + assert y[0] == 0 + assert np.allclose(x[1:], eval_points) + assert np.allclose(y[1:], compute_ecdf(sample, eval_points)) + assert x[-1] == eval_points[-1] + assert y[-1] == 1 + + +@pytest.mark.parametrize( + "dist", [scipy.stats.norm(3, 10), scipy.stats.binom(10, 0.5)], ids=["continuous", "discrete"] +) +@pytest.mark.parametrize("seed", [32, 87]) +def test_simulate_ecdf(dist, seed): + """Test _simulate_ecdf.""" + ndraws = 1000 + eval_points = np.arange(0, 1, 0.1) + + rvs = dist.rvs + + random_state = np.random.default_rng(seed) + ecdf = _simulate_ecdf(ndraws, eval_points, rvs, random_state=random_state) + random_state = np.random.default_rng(seed) + ecdf_expected = compute_ecdf(np.sort(rvs(ndraws, random_state=random_state)), eval_points) + + assert np.allclose(ecdf, ecdf_expected) + + +@pytest.mark.parametrize("prob", [0.8, 0.9]) +@pytest.mark.parametrize( + "dist", [scipy.stats.norm(3, 10), scipy.stats.poisson(100)], ids=["continuous", "discrete"] +) +@pytest.mark.parametrize("ndraws", [10_000]) +def test_get_pointwise_confidence_band(dist, prob, ndraws, num_trials=1_000, seed=57): + """Test _get_pointwise_confidence_band.""" + eval_points = np.linspace(*dist.interval(0.99), 10) + cdf_at_eval_points = dist.cdf(eval_points) + + ecdf_lower, ecdf_upper = _get_pointwise_confidence_band(prob, ndraws, cdf_at_eval_points) + + # check basic properties + assert np.all(ecdf_lower >= 0) + assert np.all(ecdf_upper <= 1) + assert np.all(ecdf_lower <= ecdf_upper) + + # use simulation to estimate lower and upper bounds on pointwise probability + in_interval = [] + random_state = np.random.default_rng(seed) + for _ in range(num_trials): + ecdf = _simulate_ecdf(ndraws, eval_points, dist.rvs, random_state=random_state) + in_interval.append((ecdf_lower <= ecdf) & (ecdf < ecdf_upper)) + asymptotic_dist = scipy.stats.norm( + np.mean(in_interval, axis=0), scipy.stats.sem(in_interval, axis=0) + ) + prob_lower, prob_upper = asymptotic_dist.interval(0.999) + + # check target probability within all bounds + assert np.all(prob_lower <= prob) + assert np.all(prob <= prob_upper) + + +@pytest.mark.parametrize("prob", [0.8, 0.9]) +@pytest.mark.parametrize( + "dist, rvs", + [ + (scipy.stats.norm(3, 10), scipy.stats.norm(3, 10).rvs), + (scipy.stats.norm(3, 10), None), + (scipy.stats.poisson(100), scipy.stats.poisson(100).rvs), + ], + ids=["continuous", "continuous default rvs", "discrete"], +) +@pytest.mark.parametrize("ndraws", [10_000]) +@pytest.mark.parametrize("method", ["pointwise", "simulated"]) +def test_ecdf_confidence_band(dist, rvs, prob, ndraws, method, num_trials=1_000, seed=57): + """Test test_ecdf_confidence_band.""" + eval_points = np.linspace(*dist.interval(0.99), 10) + cdf_at_eval_points = dist.cdf(eval_points) + random_state = np.random.default_rng(seed) + + ecdf_lower, ecdf_upper = ecdf_confidence_band( + ndraws, + eval_points, + cdf_at_eval_points, + prob=prob, + rvs=rvs, + random_state=random_state, + method=method, + ) + + if method == "pointwise": + # these values tested elsewhere, we just make sure they're the same + ecdf_lower_pointwise, ecdf_upper_pointwise = _get_pointwise_confidence_band( + prob, ndraws, cdf_at_eval_points + ) + assert np.array_equal(ecdf_lower, ecdf_lower_pointwise) + assert np.array_equal(ecdf_upper, ecdf_upper_pointwise) + return + + # check basic properties + assert np.all(ecdf_lower >= 0) + assert np.all(ecdf_upper <= 1) + assert np.all(ecdf_lower <= ecdf_upper) + + # use simulation to estimate lower and upper bounds on simultaneous probability + in_envelope = [] + random_state = np.random.default_rng(seed) + for _ in range(num_trials): + ecdf = _simulate_ecdf(ndraws, eval_points, dist.rvs, random_state=random_state) + in_envelope.append(np.all(ecdf_lower <= ecdf) & np.all(ecdf < ecdf_upper)) + asymptotic_dist = scipy.stats.norm(np.mean(in_envelope), scipy.stats.sem(in_envelope)) + prob_lower, prob_upper = asymptotic_dist.interval(0.999) + + # check target probability within bounds + assert prob_lower <= prob <= prob_upper