Skip to content

Commit

Permalink
Refactor ECDF code (#2311)
Browse files Browse the repository at this point in the history
* Remove redundant call to compute_ecdf

* Sort samples in-place

* Remove dead definition

* Remove dead variable npoints

* Use more descriptive variable names

* Restructure to reduce code redundancy

* Use consistent argument order

* Standardize more variable names

* Split adjust_gamma into several functions

* Define in terms of pointwise prob instead of gamma

* Refactor to use more informative function names

* Use ndraws for consistency

* Add utility function to simulate an ECDF

* Add type annotations

* Rearrange code

* Require CDF take vector inputs

* Add notes for the future

* Define ecdf_confidence_band

* Remove rvs option

* Move ecdf utility functions to stats module

* Make some functions internal

* Use binom.interval

* Forward prob keeyword

* Mark function as private

* Test ecdf helper functions

* Revert "Require CDF take vector inputs"

This reverts commit 780d395.

* Map cdf over array

* Fix formatting issues

* Add changelog entry

* Apply suggestions from code review

Co-authored-by: Oriol Abril-Pla <[email protected]>

* Use Any as type for `random_state`

* Update documentation of rvs and random_state

* Use `default_rng` in tests

* Document requirements of cdf function

* Assume cdf accepts array inputs

* try making CI happy

---------

Co-authored-by: Oriol Abril-Pla <[email protected]>
  • Loading branch information
sethaxen and OriolAbril authored Feb 21, 2024
1 parent d0adde3 commit 7c1637f
Show file tree
Hide file tree
Showing 4 changed files with 357 additions and 112 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

### Maintenance and fixes

- Refactor ECDF code ([2311](https://github.com/arviz-devs/arviz/pull/2311))

### Deprecation

### Documentation
Expand Down
149 changes: 37 additions & 112 deletions arviz/plots/ecdfplot.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""Plot ecdf or ecdf-difference plot with confidence bands."""
import numpy as np
from scipy.stats import uniform, binom
from scipy.stats import uniform

from ..rcparams import rcParams
from ..stats.ecdf_utils import compute_ecdf, ecdf_confidence_band, _get_ecdf_points
from .plot_utils import get_plotting_function


Expand All @@ -26,7 +27,7 @@ def plot_ecdf(
show=None,
backend=None,
backend_kwargs=None,
**kwargs
**kwargs,
):
r"""Plot ECDF or ECDF-Difference Plot with Confidence bands.
Expand All @@ -48,6 +49,7 @@ def plot_ecdf(
Values to compare to the original sample.
cdf : callable, optional
Cumulative distribution function of the distribution to compare the original sample.
The function must take as input a numpy array of draws from the distribution.
difference : bool, default False
If True then plot ECDF-difference plot otherwise ECDF plot.
pit : bool, default False
Expand Down Expand Up @@ -180,75 +182,47 @@ def plot_ecdf(
values = np.ravel(values)
values.sort()

## This block computes gamma and uses it to get the upper and lower confidence bands
## Here we check if we want confidence bands or not
if confidence_bands:
## If plotting PIT then we find the PIT values of sample.
## Basically here we generate the evaluation points(x) and find the PIT values.
## z is the evaluation point for our uniform distribution in compute_gamma()
if pit:
x = np.linspace(1 / npoints, 1, npoints)
z = x
## Finding PIT for our sample
probs = cdf(values) if cdf else compute_ecdf(values2, values) / len(values2)
else:
## If not PIT use sample for plots and for evaluation points(x) use equally spaced
## points between minimum and maximum of sample
## For z we have used cdf(x)
x = np.linspace(values[0], values[-1], npoints)
z = cdf(x) if cdf else compute_ecdf(values2, x)
probs = values

n = len(values) # number of samples
## Computing gamma
gamma = fpr if pointwise else compute_gamma(n, z, npoints, num_trials, fpr)
## Using gamma to get the confidence intervals
lower, higher = get_lims(gamma, n, z)

## This block is for whether to plot ECDF or ECDF-difference
if not difference:
## We store the coordinates of our ecdf in x_coord, y_coord
x_coord, y_coord = get_ecdf_points(x, probs, difference)
if pit:
eval_points = np.linspace(1 / npoints, 1, npoints)
if cdf:
sample = cdf(values)
else:
## Here we subtract the ecdf value as here we are plotting the ECDF-difference
x_coord, y_coord = get_ecdf_points(x, probs, difference)
for i, x_i in enumerate(x):
y_coord[i] = y_coord[i] - (
x_i if pit else cdf(x_i) if cdf else compute_ecdf(values2, x_i)
)

## Similarly we subtract from the upper and lower bounds
if pit:
lower = lower - x
higher = higher - x
else:
lower = lower - (cdf(x) if cdf else compute_ecdf(values2, x))
higher = higher - (cdf(x) if cdf else compute_ecdf(values2, x))

sample = compute_ecdf(values2, values) / len(values2)
cdf_at_eval_points = eval_points
rvs = uniform(0, 1).rvs
else:
if pit:
x = np.linspace(1 / npoints, 1, npoints)
probs = cdf(values)
eval_points = np.linspace(values[0], values[-1], npoints)
sample = values
if confidence_bands or difference:
if cdf:
cdf_at_eval_points = cdf(eval_points)
else:
cdf_at_eval_points = compute_ecdf(values2, eval_points)
else:
x = np.linspace(values[0], values[-1], npoints)
probs = values
cdf_at_eval_points = np.zeros_like(eval_points)
rvs = None

x_coord, y_coord = _get_ecdf_points(sample, eval_points, difference)

if difference:
y_coord -= cdf_at_eval_points

if confidence_bands:
ndraws = len(values)
band_kwargs = {"prob": 1 - fpr, "num_trials": num_trials, "rvs": rvs, "random_state": None}
band_kwargs["method"] = "pointwise" if pointwise else "simulated"
lower, higher = ecdf_confidence_band(ndraws, eval_points, cdf_at_eval_points, **band_kwargs)

if difference:
lower -= cdf_at_eval_points
higher -= cdf_at_eval_points
else:
lower, higher = None, None
## This block is for whether to plot ECDF or ECDF-difference
if not difference:
x_coord, y_coord = get_ecdf_points(x, probs, difference)
else:
## Here we subtract the ecdf value as here we are plotting the ECDF-difference
x_coord, y_coord = get_ecdf_points(x, probs, difference)
for i, x_i in enumerate(x):
y_coord[i] = y_coord[i] - (
x_i if pit else cdf(x_i) if cdf else compute_ecdf(values2, x_i)
)

ecdf_plot_args = dict(
x_coord=x_coord,
y_coord=y_coord,
x_bands=x,
x_bands=eval_points,
lower=lower,
higher=higher,
confidence_bands=confidence_bands,
Expand All @@ -260,7 +234,7 @@ def plot_ecdf(
ax=ax,
show=show,
backend_kwargs=backend_kwargs,
**kwargs
**kwargs,
)

if backend is None:
Expand All @@ -271,52 +245,3 @@ def plot_ecdf(
ax = plot(**ecdf_plot_args)

return ax


def compute_ecdf(sample, z):
"""Compute ECDF.
This function computes the ecdf value at the evaluation point
or a sorted set of evaluation points.
"""
return np.searchsorted(sample, z, side="right") / len(sample)


def get_ecdf_points(x, probs, difference):
"""Compute the coordinates for the ecdf points using compute_ecdf."""
y = compute_ecdf(probs, x)

if not difference:
x = np.insert(x, 0, x[0])
y = np.insert(y, 0, 0)
return x, y


def compute_gamma(n, z, npoints=None, num_trials=1000, fpr=0.05):
"""Compute gamma for confidence interval calculation.
This function simulates an adjusted value of gamma to account for multiplicity
when forming an 1-fpr level confidence envelope for the ECDF of a sample.
"""
if npoints is None:
npoints = n
gamma = []
for _ in range(num_trials):
unif_samples = uniform.rvs(0, 1, n)
unif_samples = np.sort(unif_samples)
gamma_m = 1000
## Can compute ecdf for all the z together or one at a time.
f_z = compute_ecdf(unif_samples, z)
f_z = compute_ecdf(unif_samples, z)
gamma_m = 2 * min(
np.amin(binom.cdf(n * f_z, n, z)), np.amin(1 - binom.cdf(n * f_z - 1, n, z))
)
gamma.append(gamma_m)
return np.quantile(gamma, fpr)


def get_lims(gamma, n, z):
"""Compute the simultaneous 1 - fpr level confidence bands."""
lower = binom.ppf(gamma / 2, n, z)
upper = binom.ppf(1 - gamma / 2, n, z)
return lower / n, upper / n
165 changes: 165 additions & 0 deletions arviz/stats/ecdf_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
"""Functions for evaluating ECDFs and their confidence bands."""
from typing import Any, Callable, Optional, Tuple
import warnings

import numpy as np
from scipy.stats import uniform, binom


def compute_ecdf(sample: np.ndarray, eval_points: np.ndarray) -> np.ndarray:
"""Compute ECDF of the sorted `sample` at the evaluation points."""
return np.searchsorted(sample, eval_points, side="right") / len(sample)


def _get_ecdf_points(
sample: np.ndarray, eval_points: np.ndarray, difference: bool
) -> Tuple[np.ndarray, np.ndarray]:
"""Compute the coordinates for the ecdf points using compute_ecdf."""
x = eval_points
y = compute_ecdf(sample, eval_points)

if not difference and y[0] > 0:
x = np.insert(x, 0, x[0])
y = np.insert(y, 0, 0)
return x, y


def _simulate_ecdf(
ndraws: int,
eval_points: np.ndarray,
rvs: Callable[[int, Optional[Any]], np.ndarray],
random_state: Optional[Any] = None,
) -> np.ndarray:
"""Simulate ECDF at the `eval_points` using the given random variable sampler"""
sample = rvs(ndraws, random_state=random_state)
sample.sort()
return compute_ecdf(sample, eval_points)


def _fit_pointwise_band_probability(
ndraws: int,
ecdf_at_eval_points: np.ndarray,
cdf_at_eval_points: np.ndarray,
) -> float:
"""Compute the smallest marginal probability of a pointwise confidence band that
contains the ECDF."""
ecdf_scaled = (ndraws * ecdf_at_eval_points).astype(int)
prob_lower_tail = np.amin(binom.cdf(ecdf_scaled, ndraws, cdf_at_eval_points))
prob_upper_tail = np.amin(binom.sf(ecdf_scaled - 1, ndraws, cdf_at_eval_points))
prob_pointwise = 1 - 2 * min(prob_lower_tail, prob_upper_tail)
return prob_pointwise


def _get_pointwise_confidence_band(
prob: float, ndraws: int, cdf_at_eval_points: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
"""Compute the `prob`-level pointwise confidence band."""
count_lower, count_upper = binom.interval(prob, ndraws, cdf_at_eval_points)
prob_lower = count_lower / ndraws
prob_upper = count_upper / ndraws
return prob_lower, prob_upper


def ecdf_confidence_band(
ndraws: int,
eval_points: np.ndarray,
cdf_at_eval_points: np.ndarray,
prob: float = 0.95,
method="simulated",
**kwargs,
) -> Tuple[np.ndarray, np.ndarray]:
"""Compute the `prob`-level confidence band for the ECDF.
Arguments
---------
ndraws : int
Number of samples in the original dataset.
eval_points : np.ndarray
Points at which the ECDF is evaluated. If these are dependent on the sample
values, simultaneous confidence bands may not be correctly calibrated.
cdf_at_eval_points : np.ndarray
CDF values at the evaluation points.
prob : float, default 0.95
The target probability that a true ECDF lies within the confidence band.
method : string, default "simulated"
The method used to compute the confidence band. Valid options are:
- "pointwise": Compute the pointwise (i.e. marginal) confidence band.
- "simulated": Use Monte Carlo simulation to estimate a simultaneous confidence band.
`rvs` must be provided.
rvs: callable, optional
A function that takes an integer `ndraws` and optionally the object passed to
`random_state` and returns an array of `ndraws` samples from the same distribution
as the original dataset. Required if `method` is "simulated" and variable is discrete.
num_trials : int, default 1000
The number of random ECDFs to generate for constructing simultaneous confidence bands
(if `method` is "simulated").
random_state : {None, int, `numpy.random.Generator`,
`numpy.random.RandomState`}, optional
If `None`, the `numpy.random.RandomState` singleton is used. If an `int`, a new
``numpy.random.RandomState`` instance is used, seeded with seed. If a `RandomState` or
`Generator` instance, the instance is used.
Returns
-------
prob_lower : np.ndarray
Lower confidence band for the ECDF at the evaluation points.
prob_upper : np.ndarray
Upper confidence band for the ECDF at the evaluation points.
"""
if not 0 < prob < 1:
raise ValueError(f"Invalid value for `prob`. Expected 0 < prob < 1, but got {prob}.")

if method == "pointwise":
prob_pointwise = prob
elif method == "simulated":
prob_pointwise = _simulate_simultaneous_ecdf_band_probability(
ndraws, eval_points, cdf_at_eval_points, prob=prob, **kwargs
)
else:
raise ValueError(f"Unknown method {method}. Valid options are 'pointwise' or 'simulated'.")

prob_lower, prob_upper = _get_pointwise_confidence_band(
prob_pointwise, ndraws, cdf_at_eval_points
)

return prob_lower, prob_upper


def _simulate_simultaneous_ecdf_band_probability(
ndraws: int,
eval_points: np.ndarray,
cdf_at_eval_points: np.ndarray,
prob: float = 0.95,
rvs: Optional[Callable[[int, Optional[Any]], np.ndarray]] = None,
num_trials: int = 1000,
random_state: Optional[Any] = None,
) -> float:
"""Estimate probability for simultaneous confidence band using simulation.
This function simulates the pointwise probability needed to construct pointwise
confidence bands that form a `prob`-level confidence envelope for the ECDF
of a sample.
"""
if rvs is None:
warnings.warn(
"Assuming variable is continuous for calibration of pointwise bands. "
"If the variable is discrete, specify random variable sampler `rvs`.",
UserWarning,
)
# if variable continuous, we can calibrate the confidence band using a uniform
# distribution
rvs = uniform(0, 1).rvs
eval_points_sim = cdf_at_eval_points
else:
eval_points_sim = eval_points

probs_pointwise = np.empty(num_trials)
for i in range(num_trials):
ecdf_at_eval_points = _simulate_ecdf(
ndraws, eval_points_sim, rvs, random_state=random_state
)
prob_pointwise = _fit_pointwise_band_probability(
ndraws, ecdf_at_eval_points, cdf_at_eval_points
)
probs_pointwise[i] = prob_pointwise
return np.quantile(probs_pointwise, prob)
Loading

0 comments on commit 7c1637f

Please sign in to comment.