Source code for credible.frequentist.utils

# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Frequentist confidence interval estimation.

(Frequentist) confidence interval interpretation, with 95% coverage: **If we
were to take many independent random samples from the population and
construct a confidence interval from each sample, then about 95 out of every
100 such intervals would contain the true value (here, the true
proportion)**.

See a discussion in `Five Confidence Intervals for Proportions That You
Should Know About <ci-evaluation_>`_.

.. include:: ../links.rst
"""

import functools
import typing

import numpy
import numpy.typing
import scipy.optimize
import scipy.stats
from tqdm import tqdm

from ..utils import as_int_arrays


def _clopper_pearson_ndarray(
    successes: numpy.typing.NDArray[numpy.integer],
    failures: numpy.typing.NDArray[numpy.integer],
    coverage: float,
) -> tuple[
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
]:
    """Vectorized core of :py:func:`clopper_pearson`.

    The bounds are quantiles of Beta distributions parameterized by the
    observed counts.  Wherever the quantile evaluates to NaN, the lower bound
    is replaced by 0.0 and the upper bound by 1.0.

    Parameters
    ----------
    successes
        Number of successes observed, one entry per system.
    failures
        Number of failures observed, one entry per system.
    coverage
        A floating-point number between 0 and 1.0 indicating the desired
        coverage.  A value of 0.95 requests 95% coverage.

    Returns
    -------
        Three arrays, in this order: the ratio of successes over total trials
        (successes plus failures), the lower bound, and the upper bound of the
        confidence interval.
    """
    # Probability mass left outside the interval on each side.
    tail = (1.0 - coverage) / 2
    ratio = successes / (successes + failures)

    beta = scipy.stats.beta
    lower = numpy.nan_to_num(beta.ppf(tail, successes, failures + 1), nan=0.0)
    upper = numpy.nan_to_num(
        beta.ppf(1 - tail, successes + 1, failures), nan=1.0
    )
    return ratio, lower, upper



[docs]
def clopper_pearson_array(
    successes: typing.Iterable[int],
    failures: typing.Iterable[int],
    coverage: float,
) -> tuple[
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
]:
    """Compute :py:func:`clopper_pearson` intervals for several systems at once.

    Both inputs are passed through ``as_int_arrays`` and the resulting arrays
    are handed to the vectorized implementation.

    Parameters
    ----------
    successes
        Number of successes observed, one entry per system.
    failures
        Number of failures observed, one entry per system.
    coverage
        A floating-point number between 0 and 1.0 indicating the desired
        coverage.  A value of 0.95 requests 95% coverage.

    Returns
    -------
        Three arrays, in this order: the ratio of successes over total trials
        (successes plus failures), the lower bound, and the upper bound of the
        confidence interval.

    Raises
    ------
    TypeError
        If the dimensions of ``successes`` and ``failures`` do not match, or in
        case the input types are unsupported (presumably raised by
        ``as_int_arrays``; its implementation is not visible from this module).
    """
    n_success, n_failure = as_int_arrays((successes, failures))
    return _clopper_pearson_ndarray(n_success, n_failure, coverage)




[docs]
def clopper_pearson(
    successes: int, failures: int, coverage: float = 0.95
) -> tuple[float, float, float]:
    """Calculate the "exact" confidence interval for a proportion estimate.

    Intervals are estimated with the Clopper-Pearson method, following
    [CLOPPER-1934]_.  The technique is **very** conservative: in most cases
    the actual coverage exceeds the requested value, which may lead to
    intervals that are wider than necessary.

    Parameters
    ----------
    successes
        Number of successes observed on the experiment.
    failures
        Number of failures observed on the experiment.
    coverage
        A floating-point number between 0 and 1.0 indicating the desired
        coverage.  A value of 0.95 requests 95% coverage.

    Returns
    -------
        The estimated ratio between successes and total trials (successes plus
        failures), then the lower and upper bounds of the confidence interval,
        in this order.

    Raises
    ------
    TypeError
        In case the input types are unsupported (presumably propagated from
        :py:func:`clopper_pearson_array`).
    """
    # Reuse the array implementation on one-element inputs, then unwrap the
    # single value of each returned array into a Python scalar.
    ratio, lower, upper = clopper_pearson_array([successes], [failures], coverage)
    return (ratio.item(), lower.item(), upper.item())



def _agresti_coull_ndarray(
    successes: numpy.typing.NDArray[numpy.integer],
    failures: numpy.typing.NDArray[numpy.integer],
    coverage: float,
) -> tuple[
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
]:
    """Vectorized core of :py:func:`agresti_coull`.

    The interval is a normal-approximation interval built around an adjusted
    proportion, obtained by adding the squared critical value to the trial
    count and half of it to the success count.  The bounds are not clipped, so
    they may fall outside ``[0, 1]``.  NaN bounds are replaced by 0.0 (lower)
    and 1.0 (upper).

    Parameters
    ----------
    successes
        Number of successes observed, one entry per system.
    failures
        Number of failures observed, one entry per system.
    coverage
        A floating-point number between 0 and 1.0 indicating the desired
        coverage.  A value of 0.95 requests 95% coverage.

    Returns
    -------
        Three arrays, in this order: the ratio of successes over total trials
        (successes plus failures), the lower bound, and the upper bound of the
        confidence interval.
    """
    # Probability mass left outside the interval on each side.
    tail = (1.0 - coverage) / 2
    z = scipy.stats.norm.isf(tail)
    z_squared = z**2

    adjusted_n = (successes + failures) + z_squared
    adjusted_p = (successes + z_squared / 2.0) / adjusted_n
    half_width = z * numpy.sqrt(adjusted_p * (1.0 - adjusted_p) / adjusted_n)

    lower = numpy.nan_to_num(adjusted_p - half_width, nan=0.0)
    upper = numpy.nan_to_num(adjusted_p + half_width, nan=1.0)

    return successes / (successes + failures), lower, upper



[docs]
def agresti_coull_array(
    successes: typing.Iterable[int],
    failures: typing.Iterable[int],
    coverage: float,
) -> tuple[
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
]:
    """Compute :py:func:`agresti_coull` intervals for several systems at once.

    Both inputs are passed through ``as_int_arrays`` and the resulting arrays
    are handed to the vectorized implementation.

    Parameters
    ----------
    successes
        Number of successes observed, one entry per system.
    failures
        Number of failures observed, one entry per system.
    coverage
        A floating-point number between 0 and 1.0 indicating the desired
        coverage.  A value of 0.95 requests 95% coverage.

    Returns
    -------
        Three arrays, in this order: the ratio of successes over total trials
        (successes plus failures), the lower bound, and the upper bound of the
        confidence interval.

    Raises
    ------
    TypeError
        If the dimensions of ``successes`` and ``failures`` do not match, or in
        case the input types are unsupported (presumably raised by
        ``as_int_arrays``; its implementation is not visible from this module).
    """
    n_success, n_failure = as_int_arrays((successes, failures))
    return _agresti_coull_ndarray(n_success, n_failure, coverage)




[docs]
def agresti_coull(
    successes: int, failures: int, coverage: float = 0.95
) -> tuple[float, float, float]:
    """Calculate the confidence interval for a proportion estimate.

    Intervals are estimated with the Agresti-Coull method, following
    [AGRESTI-1998]_.  The technique is conservative: in most cases the actual
    coverage exceeds the requested value, which may lead to an interval that
    is larger than required.

    This function is considered a good choice for the frequentist approach, if
    you cannot use :py:func:`clopper_pearson`.

    Parameters
    ----------
    successes
        Number of successes observed on the experiment.
    failures
        Number of failures observed on the experiment.
    coverage
        A floating-point number between 0 and 1.0 indicating the desired
        coverage.  A value of 0.95 requests 95% coverage.

    Returns
    -------
        The estimated ratio between successes and total trials (successes plus
        failures), then the lower and upper bounds of the confidence interval,
        in this order.

    Raises
    ------
    TypeError
        In case the input types are unsupported (presumably propagated from
        :py:func:`agresti_coull_array`).
    """
    # Reuse the array implementation on one-element inputs, then unwrap the
    # single value of each returned array into a Python scalar.
    ratio, lower, upper = agresti_coull_array([successes], [failures], coverage)
    return (ratio.item(), lower.item(), upper.item())



def _wilson_ndarray(
    successes: numpy.typing.NDArray[numpy.integer],
    failures: numpy.typing.NDArray[numpy.integer],
    coverage: float,
) -> tuple[
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
]:
    """Vectorized core of :py:func:`wilson`.

    Computes the Wilson score interval, without continuity correction.  NaN
    bounds are replaced by 0.0 (lower) and 1.0 (upper).

    Parameters
    ----------
    successes
        Number of successes observed, one entry per system.
    failures
        Number of failures observed, one entry per system.
    coverage
        A floating-point number between 0 and 1.0 indicating the desired
        coverage.  A value of 0.95 requests 95% coverage.

    Returns
    -------
        Three arrays, in this order: the ratio of successes over total trials
        (successes plus failures), the lower bound, and the upper bound of the
        confidence interval.
    """
    # Probability mass left outside the interval on each side.
    tail = (1.0 - coverage) / 2
    z = scipy.stats.norm.isf(tail)
    z_squared = z**2

    total = successes + failures
    ratio = successes / total

    # Common denominator that rescales both the midpoint and the half-width.
    shrink = 1 + (z_squared / total)
    midpoint = (ratio + z_squared / (2 * total)) / shrink
    spread = z * numpy.sqrt(
        ratio * (1.0 - ratio) / total + z_squared / (4.0 * total**2)
    )
    half_width = spread / shrink

    lower = numpy.nan_to_num(midpoint - half_width, nan=0.0)
    upper = numpy.nan_to_num(midpoint + half_width, nan=1.0)

    return ratio, lower, upper



[docs]
def wilson_array(
    successes: typing.Iterable[int],
    failures: typing.Iterable[int],
    coverage: float,
) -> tuple[
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
]:
    """Compute :py:func:`wilson` intervals for several systems at once.

    Both inputs are passed through ``as_int_arrays`` and the resulting arrays
    are handed to the vectorized implementation.

    Parameters
    ----------
    successes
        Number of successes observed, one entry per system.
    failures
        Number of failures observed, one entry per system.
    coverage
        A floating-point number between 0 and 1.0 indicating the desired
        coverage.  A value of 0.95 requests 95% coverage.

    Returns
    -------
        Three arrays, in this order: the ratio of successes over total trials
        (successes plus failures), the lower bound, and the upper bound of the
        confidence interval.

    Raises
    ------
    TypeError
        If the dimensions of ``successes`` and ``failures`` do not match, or in
        case the input types are unsupported (presumably raised by
        ``as_int_arrays``; its implementation is not visible from this module).
    """
    n_success, n_failure = as_int_arrays((successes, failures))
    return _wilson_ndarray(n_success, n_failure, coverage)




[docs]
def wilson(
    successes: int, failures: int, coverage: float = 0.95
) -> tuple[float, float, float]:
    """Calculate the confidence interval for a proportion estimate.

    Intervals are estimated with the Wilson method, following [WILSON-1927]_.
    This implementation does **not** include the continuity correction.  It is
    as conservative in the extremes of the domain as the bayesian approach and
    can be a good default, if :py:func:`clopper_pearson` cannot be used.

    This function is considered the best "default" for the frequentist
    approach, as it is not too conservative and assumes a reasonable value
    throughout the range.

    Parameters
    ----------
    successes
        Number of successes observed on the experiment.
    failures
        Number of failures observed on the experiment.
    coverage
        A floating-point number between 0 and 1.0 indicating the desired
        coverage.  A value of 0.95 requests 95% coverage.

    Returns
    -------
        The estimated ratio between successes and total trials (successes plus
        failures), then the lower and upper bounds of the confidence interval,
        in this order.

    Raises
    ------
    TypeError
        In case the input types are unsupported (presumably propagated from
        :py:func:`wilson_array`).
    """
    # Reuse the array implementation on one-element inputs, then unwrap the
    # single value of each returned array into a Python scalar.
    ratio, lower, upper = wilson_array([successes], [failures], coverage)
    return (ratio.item(), lower.item(), upper.item())




[docs]
def percentile_interval(
    distribution: typing.Sequence[float], coverage: float = 0.95
) -> tuple[float, float]:
    """Derive a two-sided percentile confidence interval from a sample.

    The interval is bounded by the ``alpha / 2`` and ``1 - alpha / 2``
    percentiles of ``distribution``, where ``alpha = 1 - coverage``.

    Parameters
    ----------
    distribution
        Values (e.g. bootstrap replicates) from which to compute the confidence
        interval.  They are converted to a floating-point numpy array.
    coverage
        A floating-point number between 0 and 1.0 indicating the
        coverage you're expecting.  A value of 0.95 will ensure 95% coverage.

    Returns
    -------
    tuple[float, float]
        Tuple with 2 floating-point numbers:

        * The lower value of the confidence interval
        * The higher value of the confidence interval
    """
    # NOTE: ``distribution`` used to be declared as ``distribution=list[float]``,
    # which made the *type* the default value instead of an annotation.  It is
    # now a properly annotated, required argument.
    alpha = 1.0 - coverage
    arr = numpy.asarray(distribution, dtype=float)

    # Split the excluded probability mass evenly between both tails.
    lower = float(numpy.percentile(arr, 100 * (alpha / 2)))
    upper = float(numpy.percentile(arr, 100 * (1 - alpha / 2)))

    return (lower, upper)




[docs]
def bootstrap_metric_distribution(
    y_true: numpy.typing.NDArray[numpy.integer],
    y_score_or_pred: numpy.typing.NDArray[numpy.floating]
    | numpy.typing.NDArray[numpy.integer],
    metric_func: typing.Callable,
    rng: numpy.random.Generator,
    n_bootstraps: int = 1000,
    require_all_classes: bool = True,
    max_resample_attempts: int = 100,
    **kwargs,
):
    """Build the empirical bootstrap distribution of a metric.

    The dataset is resampled with replacement ``n_bootstraps`` times, and
    ``metric_func`` is evaluated on each accepted resample.  The collected
    values can then be used to estimate confidence intervals for the metric.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_score_or_pred
        Scores or predicted labels, as returned by a classifier.
    metric_func
        Metric to evaluate, called as
        ``metric_func(labels, scores_or_pred, **kwargs)`` (e.g. an sklearn
        metric function).
    rng
        An initialized numpy random number generator.
    n_bootstraps
        Number of bootstrap replicates to attempt.
    require_all_classes
        If set to True, a resample is only accepted when it contains as many
        distinct labels as ``y_true``.  This is required for metrics such as
        ROC AUC, which need at least one sample of each class.
    max_resample_attempts
        Maximum number of draws for a given bootstrap replicate when
        ``require_all_classes=True``.  A replicate for which no draw is
        accepted is silently dropped.
    **kwargs
        Parameters forwarded to ``metric_func``.

    Returns
    -------
    list[float]
        The values of ``metric_func`` on the accepted bootstrap resamples.  If
        ``require_all_classes`` is ``True``, the list may contain fewer than
        ``n_bootstraps`` values.
    """
    n_samples = len(y_true)
    n_classes = len(numpy.unique(y_true))

    # Without the class constraint the first draw is always accepted, so a
    # single attempt per replicate is enough.
    attempt_limit = max_resample_attempts if require_all_classes else 1

    replicates = []
    for _ in tqdm(range(n_bootstraps)):
        attempt = 0
        # Bounded retry loop, so we never spin indefinitely.
        while attempt < attempt_limit:
            attempt += 1

            # Bootstrap with replacement.
            picks = rng.choice(n_samples, size=n_samples, replace=True)
            labels = y_true[picks]

            if require_all_classes and len(numpy.unique(labels)) != n_classes:
                # At least one class is missing from this draw: retry.
                continue

            replicates.append(
                metric_func(labels, y_score_or_pred[picks], **kwargs)
            )
            break

    return replicates




[docs]
def bootstrap_metric(
    y_true: numpy.typing.NDArray[numpy.integer],
    y_score_or_pred: numpy.typing.NDArray[numpy.floating]
    | numpy.typing.NDArray[numpy.integer],
    metric_func: typing.Callable,
    rng: numpy.random.Generator,
    n_bootstraps: int = 1000,
    coverage: float = 0.95,
    require_all_classes: bool = True,
    max_resample_attempts: int = 100,
    **kwargs,
):
    """Compute the confidence interval of ``metric_func`` via non-parametric bootstrapping.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_score_or_pred
        Scores or predicted labels, as returned by a classifier.
    metric_func
        The sklearn metric function for which to compute the confidence interval.
    rng
        An initialized numpy random number generator.
    n_bootstraps
        Number of bootstrapping steps to evaluate.
    coverage
        A floating-point number between 0 and 1.0 indicating the
        coverage you're expecting.  A value of 0.95 will ensure 95% coverage.
    require_all_classes
        If set to True, each accepted bootstrap sample must contain all classes
        present in y_true. This is required for metrics such as ROC AUC.
    max_resample_attempts
        Maximum number of redraw attempts for a given bootstrap replicate when
        `require_all_classes=True`.
    **kwargs
        Parameters for the given metric_func.

    Returns
    -------
    tuple[float, float, float, float]
        Tuple with 4 floating-point numbers:

        * The output of the metric function
        * The mode of the bootstrapped distribution
        * The lower value of the confidence interval
        * The higher value of the confidence interval
    """

    if (
        y_true.ndim != 1
        or y_score_or_pred.ndim != 1
        or y_true.shape[0] != y_score_or_pred.shape[0]
    ):
        raise ValueError("y_true and y_pred must be 1D arrays of the same length.")

    # Point estimated on the full data (recommended for reporting)
    point = metric_func(y_true, y_score_or_pred, **kwargs)

    boots = bootstrap_metric_distribution(
        y_true=y_true,
        y_score_or_pred=y_score_or_pred,
        metric_func=metric_func,
        rng=rng,
        n_bootstraps=n_bootstraps,
        require_all_classes=require_all_classes,
        max_resample_attempts=max_resample_attempts,
        **kwargs,
    )

    if not boots:
        raise ValueError(
            "Bootstrapping did not produce any value. \
            Try changing `n_bootstraps` to increase the number of runs or supply more values as input."
        )

    # Compute the mode of the bootstrapped distribution
    boots_array = numpy.array(boots)

    # Handle degenerate / near-constant bootstrap distributions:
    # If all values are (almost) identical, KDE becomes numerically unstable, returning
    # numpy.linalg.LinAlgError (because the covariance matrix is singular or ill-conditioned)
    # and mode estimation fails.
    # In this case, the distribution effectively collapses to a point mass (Dirac delta),
    # where mean ≈ median ≈ mode. We therefore return the mean as a stable and
    # statistically equivalent estimate of the mode.
    tol = 1e-10
    if numpy.ptp(boots_array) < tol:
        mode = float(numpy.mean(boots_array))
    else:
        pdf = scipy.stats.gaussian_kde(boots_array)

        def neg_kde(x):
            return -pdf.evaluate(x)[
                0
            ]  # kde.evaluate returns a list; take first element

        solution = scipy.optimize.minimize_scalar(
            neg_kde, bounds=(boots_array.min(), boots_array.max()), method="bounded"
        )
        mode = solution.x

    lower, upper = percentile_interval(boots, coverage)

    return (float(point), float(mode), lower, upper)




[docs]
def compare_systems(
    y_true: typing.Sequence[int],
    output_a: typing.Sequence[float],
    output_b: typing.Sequence[float],
    metric_func: typing.Callable,
    rng: numpy.random.Generator,
    n_resamples: int = 9999,
    **kwargs,
):
    r"""Compare the outputs of 2 systems using a paired permutation test.

    Returns the observed performance difference between system A and system
    B, together with the p-value of a paired permutation test computed by
    :py:func:`scipy.stats.permutation_test` with
    ``permutation_type="samples"``.

    Both systems are assumed to have been evaluated on the same samples, in
    the same order.  Under the null hypothesis the two systems are
    exchangeable within each sample, so permutations are generated by swapping
    the system outputs pairwise.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels. These are the same for both systems.
    output_a
        Outputs produced by the first system. These may be predicted labels,
        scores, or any other per-sample output accepted by ``metric_func`` as
        its second argument.
    output_b
        Outputs produced by the second system, with the same requirements as
        ``output_a``.
    metric_func
        Metric function used to compare the two systems. It must accept
        ``y_true`` as its first argument and a system output as its second
        argument. Usually this is a sklearn metric function.
    rng
        An initialized numpy random number generator.
    n_resamples
        Number of random permutations used to approximate the null
        distribution.
    **kwargs
        Additional parameters for the given ``metric_func``.

    Returns
    -------
        tuple[float, float]
            Tuple with 2 floating-point numbers:

            * The observed difference ``metric_func(y_true, output_a) -
              metric_func(y_true, output_b)``.
            * The p-value from the paired permutation test.

    Raises
    ------
    ValueError
        If ``y_true``, ``output_a`` and ``output_b`` do not all have the same
        length.
    """
    n_expected = len(y_true)
    if len(output_a) != n_expected or len(output_b) != n_expected:
        raise ValueError(
            "y_true, output_a from system A, and output_b from system B must have the same length."
        )

    # The ground truth never changes across permutations, so bind it once.
    score = functools.partial(metric_func, y_true)

    def paired_difference(*systems):
        # NOTE: the variadic signature is what the permutation test calls;
        # only the first two entries (system A and system B) are compared.
        scores = [score(outputs, **kwargs) for outputs in systems]
        return scores[0] - scores[1]

    outcome = scipy.stats.permutation_test(
        data=(output_a, output_b),
        statistic=paired_difference,
        permutation_type="samples",
        n_resamples=n_resamples,
        rng=rng,
    )

    return float(outcome.statistic.item()), float(outcome.pvalue.item())