Source code for credible.bayesian.utils

# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Functions to evalute the (Bayesian) credible region of measures.

(Bayesian) credible region interpretation, with 95% coverage: The probability
that the true proportion will lie within the 95% credible interval is 0.95.

Contrary to frequentist approaches, in which one can only say that if the test
were repeated an infinite number of times, and one constructed a confidence
interval each time, then X% of the confidence intervals would contain the true
rate, here we can say that given our observed data, there is a X% probability
that the true value of :math:`k/n` falls within the provided interval.

See a discussion in `Five Confidence Intervals for Proportions That You
Should Know About <ci-evaluation_>`_ for a study on coverage for most common
methods.

.. note::

   For a disambiguation with `Confidence Interval <confidence-interval_>`_ (the
   frequentist approach), read `Credible Regions or Intervals
   <credible-interval_>`_.

.. include:: ../links.rst
"""

import typing

import numpy
import numpy.random
import numpy.typing
import scipy.special

from ..utils import as_int_arrays


def _beta_ndarray(
    successes: numpy.typing.NDArray[numpy.integer],
    failures: numpy.typing.NDArray[numpy.integer],
    lambda_: float,
    coverage: float,
) -> tuple[
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
]:
    r""":py:func:`beta`, for multiple systems.

    Parameters
    ----------
    successes
        Number of successes (``k``) observed on the experiment.
    failures
        Number of failures (``l``) observed on the experiment.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  If unsure, use 0.5.
    coverage
        A floating-point number between 0 and 1.0 indicating the
        coverage you're expecting.  A value of 0.95 will ensure 95%
        of the area under the probability density of the posterior
        is covered by the returned equal-tailed interval.

    Returns
    -------
        A tuple containing 4 floating point numbers representing:

        * mean: The mean of the posterior distribution
        * mode: The mode of the posterior distribution
        * lower: The lower bounds of the credible region
        * upper: The upper bounds of the credible region

        If the input was a 1-D array with multiple experiments, then the return
        value of this function is also composed of 1-D arrays, representing the
        mean, mode, lower and upper bounds of the various credible regions
        defined for each of the success/failure tuples on the input.

    Raises
    ------
    TypeError
        If the dimensions of ``successes`` and ``failures`` do not match, or in
        case the input types are unsupported.
    """

    # we return the equally-tailed range
    right = (1.0 - coverage) / 2  # half-width in each side
    lower = scipy.special.betaincinv(successes + lambda_, failures + lambda_, right)
    upper = scipy.special.betaincinv(
        successes + lambda_, failures + lambda_, 1.0 - right
    )

    # evaluate mean and mode (https://en.wikipedia.org/wiki/Beta_distribution)
    alpha = successes + lambda_
    beta = failures + lambda_

    mean = numpy.nan_to_num(alpha / (alpha + beta))

    # the mode of a beta distribution is a bit tricky
    mode = numpy.zeros_like(lower)
    cond = (alpha > 1) & (beta > 1)
    mode[cond] = (alpha[cond] - 1) / (alpha[cond] + beta[cond] - 2)
    # In the case of precision, if the threshold is close to 1.0, both TP
    # and FP can be zero, which may cause this condition to be reached, if
    # the prior is exactly 1 (flat prior).  This is a weird situation,
    # because effectively we are trying to compute the posterior when the
    # total number of experiments is zero.  So, only the prior counts - but
    # the prior is flat, so we should just pick a value.  We choose the
    # middle of the range.
    # conda = alpha == 1 and beta == 1
    # mode[cond] = 0.0
    # conda = alpha <= 1 and beta > 1
    # mode[cond] = 0.0
    mode[(alpha > 1) & (beta <= 1)] = 1.0
    # else: #elif alpha < 1 and beta < 1:
    # in the case of precision, if the threshold is close to 1.0, both TP
    # and FP can be zero, which may cause this condition to be reached, if
    # the prior is smaller than 1.  This is a weird situation, because
    # effectively we are trying to compute the posterior when the total
    # number of experiments is zero.  So, only the prior counts - but the
    # prior is bimodal, so we should just pick a value.  We choose the
    # left of the range.
    # n.b.: could also be 1.0 as the prior is bimodal
    # mode[alpha < 1 and beta < 1] = 0.0

    return mean, mode, lower, upper



[docs]
def beta_array(
    successes: typing.Iterable[int],
    failures: typing.Iterable[int],
    lambda_: float,
    coverage: float,
) -> tuple[
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
    numpy.typing.NDArray[numpy.double],
]:
    r""":py:func:`beta`, for multiple systems.

    Parameters
    ----------
    successes
        Number of successes (``k``) observed on the experiment.
    failures
        Number of failures (``l``) observed on the experiment.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  If unsure, use 0.5.
    coverage
        A floating-point number between 0 and 1.0 indicating the
        coverage you're expecting.  A value of 0.95 will ensure 95%
        of the area under the probability density of the posterior
        is covered by the returned equal-tailed interval.

    Returns
    -------
        A tuple containing 4 floating point numbers representing:

        * mean: The mean of the posterior distribution
        * mode: The mode of the posterior distribution
        * lower: The lower bounds of the credible region
        * upper: The upper bounds of the credible region

        If the input was a 1-D array with multiple experiments, then the return
        value of this function is also composed of 1-D arrays, representing the
        mean, mode, lower and upper bounds of the various credible regions
        defined for each of the success/failure tuples on the input.

    Raises
    ------
    TypeError
        If the dimensions of ``successes`` and ``failures`` do not match, or in
        case the input types are unsupported.
    """
    successes_array, failures_array = as_int_arrays((successes, failures))
    return _beta_ndarray(successes_array, failures_array, lambda_, coverage)




[docs]
def beta(
    successes: int, failures: int, lambda_: float, coverage: float
) -> tuple[float, float, float, float]:
    r"""Return the mode, upper and lower bounds of the equal-tailed credible
    region of a probability estimate following Bernoulli trials.

    This technique is (not) very conservative - in most of the cases, coverage
    closer to the extremes (0 or 1) is lower than expected (but still greater
    than 85%).

    This implementation is based on [GOUTTE-2005]_.  It assumes :math:`k`
    successes and :math:`l` failures (:math:`n = k+l` total trials) are issued
    from a series of Bernoulli trials (likelihood is binomial).  The posterior
    is derivated using the Bayes Theorem with a beta prior.  As there is no
    reason to favour high vs.  low precision, we use a symmetric Beta prior
    (:math:`\alpha=\beta`):

    .. math::

       P(p|k,n) &= \frac{P(k,n|p)P(p)}{P(k,n)} \\
       P(p|k,n) &= \frac{\frac{n!}{k!(n-k)!}p^{k}(1-p)^{n-k}P(p)}{P(k)} \\
       P(p|k,n) &= \frac{1}{B(k+\alpha, n-k+\beta)}p^{k+\alpha-1}(1-p)^{n-k+\beta-1} \\
       P(p|k,n) &= \frac{1}{B(k+\alpha, n-k+\alpha)}p^{k+\alpha-1}(1-p)^{n-k+\alpha-1}

    The mode for this posterior (also the maximum a posteriori) is:

    .. math::

       \text{mode}(p) = \frac{k+\lambda-1}{n+2\lambda-2}

    Concretely, the prior may be flat (all rates are equally likely,
    :math:`\lambda=1`) or we may use Jeoffrey's prior (:math:`\lambda=0.5`),
    that is invariant through re-parameterisation.  Jeffrey's prior indicate
    that rates close to zero or one are more likely.

    The mode above works if :math:`k+{\alpha},n-k+{\alpha} > 1`, which is
    usually the case for a resonably well tunned system, with more than a few
    samples for analysis.  In the limit of the system performance, :math:`k`
    may be 0, which will make the mode become zero.

    For our purposes, it may be more suitable to represent :math:`n = k + l`,
    with :math:`k`, the number of successes and :math:`l`, the number of
    failures in the binomial experiment, and find this more suitable
    representation:

    .. math::

       P(p|k,l) &= \frac{1}{B(k+\alpha, l+\alpha)}p^{k+\alpha-1}(1-p)^{l+\alpha-1} \\
       \text{mode}(p) &= \frac{k+\lambda-1}{k+l+2\lambda-2}

    This can be mapped to most rates calculated in the context of binary
    classification this way:

    * Precision or Positive-Predictive Value (PPV): p = TP/(TP+FP), so k=TP, l=FP
    * Recall, Sensitivity, or True Positive Rate: r = TP/(TP+FN), so k=TP, l=FN
    * Specificity or True Negative Rage: s = TN/(TN+FP), so k=TN, l=FP
    * Accuracy: acc = TP+TN/(TP+TN+FP+FN), so k=TP+TN, l=FP+FN
    * Jaccard: j = TP/(TP+FP+FN), so k=TP, l=FP+FN

    .. note:: **Important**

       To calculate the limits given the required coverage, we use the
       incomplete **inverse** (regularized, or normalized) beta function,
       :any:`scipy.special.betaincinv` instead of :any:`scipy.special.betainc`.
       The latter requires we provide the bounds and returns the coverage,
       whereas here we are interested in the *inverse* behaviour.

    Parameters
    ----------
    successes
        Number of successes (``k``) observed on the experiment.
    failures
        Number of failures (``l``) observed on the experiment.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  If unsure, use 0.5.
    coverage
        A floating-point number between 0 and 1.0 indicating the
        coverage you're expecting.  A value of 0.95 will ensure 95%
        of the area under the probability density of the posterior
        is covered by the returned equal-tailed interval.

    Returns
    -------
        A tuple containing 4 floating point numbers representing:

        * mean: The mean of the posterior distribution
        * mode: The mode of the posterior distribution
        * lower: The lower bounds of the credible region
        * upper: The upper bounds of the credible region

        If the input was a 1-D array with multiple experiments, then the return
        value of this function is also composed of 1-D arrays, representing the
        mean, mode, lower and upper bounds of the various credible regions
        defined for each of the success/failure tuples on the input.

    Raises
    ------
    TypeError
        If the dimensions of ``successes`` and ``failures`` do not match, or in
        case the input types are unsupported.
    """
    retval = beta_array([successes], [failures], lambda_, coverage)
    return (
        retval[0].item(),
        retval[1].item(),
        retval[2].item(),
        retval[3].item(),
    )




[docs]
def beta_posterior(
    successes: int,
    failures: int,
    lambda_: float,
    nb_samples: int,
    rng: numpy.random.Generator,
) -> numpy.typing.NDArray[numpy.double]:
    r"""Simulate the beta posterior of a system with the provided markings.

    This implementation is based on [GOUTTE-2005]_, Equation 7.

    Figures of merit that are supported by this procedure are those which have
    the form :math:`v = k / (k + l)` (successes over successes plus failures):

    * Precision or Positive-Predictive Value (PPV): :math:`p = TP/(TP+FP)`, so
      :math:`k=TP`, :math:`l=FP`
    * Recall, Sensitivity, or True Positive Rate: :math:`r = TP/(TP+FN)`, so
      :math:`k=TP`, :math:`l=FN`
    * Specificity or True Negative Rage: :math:`s = TN/(TN+FP)`, so :math:`k=TN`,
      :math:`l=FP`
    * Accuracy: :math:`acc = TP+TN/(TP+TN+FP+FN)`, so :math:`k=TP+TN`,
      :math:`l=FP+FN`
    * Jaccard Index: :math:`j = TP/(TP+FP+FN)`, so :math:`k=TP`, :math:`l=FP+FN`

    Parameters
    ----------
    successes
        Number of successes (``k``) observed on the experiment.  May be a
        scalar of any integral type, or an array of integral values indicating
        a series of experiments to calculate the credible regions for.
    failures
        Number of failures (``l``) observed on the experiment.  May be a scalar
        of any integral type, or an array of integral values indicating a
        series of experiments to calculate the credible regions for.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.
    nb_samples
        Number of generated gamma distribution values.
    rng
        An initialized numpy random number generator.

    Returns
    -------
        Variates: An array with size ``nb_samples`` containing a realization of
        Equation 7 from [GOUTTE-2005]_.  If ``successes`` and ``failures`` are
        sequences of arrays, then returns a 2D array where each row represents
        one set of realisations.

    Raises
    ------
    TypeError
        If the dimensions of ``successes`` and ``failures`` do not match.
    """
    return rng.beta(a=successes + lambda_, b=failures + lambda_, size=nb_samples)




[docs]
def average_beta_posterior(
    successes: typing.Iterable[int],
    failures: typing.Iterable[int],
    lambda_: float,
    nb_samples: int,
    rng: numpy.random.Generator,
) -> numpy.typing.NDArray[numpy.double]:
    r"""Simulate the average beta posterior of many systems.

    This implementation is based on [GOUTTE-2005]_, Equation 7.

    Figures of merit that are supported by this procedure are those which have
    the form :math:`v = k / (k + l)` (successes over successes plus failures):

    * Precision or Positive-Predictive Value (PPV): :math:`p = TP/(TP+FP)`, so
      :math:`k=TP`, :math:`l=FP`
    * Recall, Sensitivity, or True Positive Rate: :math:`r = TP/(TP+FN)`, so
      :math:`k=TP`, :math:`l=FN`
    * Specificity or True Negative Rate: :math:`s = TN/(TN+FP)`, so :math:`k=TN`,
      :math:`l=FP`
    * Accuracy: :math:`acc = TP+TN/(TP+TN+FP+FN)`, so :math:`k=TP+TN`,
      :math:`l=FP+FN`
    * Jaccard Index: :math:`j = TP/(TP+FP+FN)`, so :math:`k=TP`, :math:`l=FP+FN`

    Parameters
    ----------
    successes
        Sequence of integers expressing the number of successes (``k``)
        observed on the various experiments whose metric is to be averaged.
    failures
        Sequence of integers expressing the number of failures (``l``)
        observed on the various experiments whose metric is to be averaged.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.
    nb_samples
        Number of generated variates for the M-C simulation.
    rng
        An initialized numpy random number generator.

    Returns
    -------
        Variates: An array with size ``nb_samples`` containing a realization of
        Equation 7 from [GOUTTE-2005]_, considering the averaging of all input
        systems.
    """
    successes_array, failures_array = as_int_arrays((successes, failures))

    if successes_array.ndim == 0:
        successes_array = numpy.atleast_1d(successes_array)
        failures_array = numpy.atleast_1d(failures_array)

    return numpy.mean(
        [
            beta_posterior(kk, ll, lambda_, nb_samples, rng)
            for kk, ll in zip(successes_array, failures_array)
        ],
        axis=0,
    )




[docs]
def evaluate_statistics(
    variates: typing.Iterable[float],
    coverage: float,
    bins: int | str,
) -> tuple[float, float, float, float]:
    """Evaluate the left and right margins for a given M-C distribution.

    Parameters
    ----------
    variates
        A 1-D array containing the simulated variates.
    coverage
        A number, between 0 and 1 to indicate the desired coverage.  Typically,
        this number is set to 0.95 (95% coverage).
    bins
        Histogram binning for defining the mode of the original distribution
        where variates where sampled from.  A larger number of bins will make
        it unlikely to discover the true mode.  You may also pass any of the
        string method names defined at :py:func:`numpy.histogram_bin_edges`.  A
        method such as ``doane`` should work fine in most cases.

    Returns
    -------
        statistics: mean, (a rough estimated of the) mode and lower and upper
        bounds of the credible intervals for the input simulation.

        .. note::

           Mode estimation depends on the number of input ``variates``, and
           histogram binning (``bins``).  Using :py:func:`scipy.stats.mode` can
           only evaluate most common value, which is **not reliable** for
           continuous variables.
    """

    left_half = (1 - coverage) / 2  # size of excluded (half) area
    variates_array = numpy.asarray(variates, dtype=numpy.double)
    sorted_variates = numpy.sort(variates_array)

    # n.b.: we return the equally tailed range

    # calculates position of score which would exclude the left_half (left)
    lower_index = int(round(len(variates_array) * left_half))

    # calculates position of score which would exclude the right_half (right)
    upper_index = int(round(len(variates_array) * (1 - left_half)))

    lower = sorted_variates[lower_index - 1]
    upper = sorted_variates[upper_index - 1]

    # we evaluate the histogram and get the maximum value from there
    hist, ranges = numpy.histogram(variates_array, bins=bins)
    idx = numpy.argmax(hist)
    mode = (ranges[idx] + ranges[idx + 1]) / 2

    return (
        numpy.mean(variates_array).item(),
        mode,
        lower,
        upper,
    )




[docs]
def average_beta(
    successes: typing.Iterable[int],
    failures: typing.Iterable[int],
    lambda_: float,
    coverage: float,
    nb_samples: int,
    rng: numpy.random.Generator,
) -> tuple[float, float, float, float]:
    r"""Return mean, mode, upper and lower bounds of the credible region of an
    average of measures with beta posteriors.

    This implementation is based on [GOUTTE-2005]_.

    Parameters
    ----------
    successes
        Sequence of integers expressing the number of successes (``k``)
        observed on the various experiments whose metric is to be averaged.
    failures
        Sequence of integers expressing the number of failures (``l``)
        observed on the various experiments whose metric is to be averaged.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you're expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.
    nb_samples
        Number of generated variates for the M-C simulation.
    rng
        An initialized numpy random number generator.

    Returns
    -------
        statistics: mean, mode, lower and upper bounds of the credible interval
        for the provided coverage.
    """

    variates = average_beta_posterior(successes, failures, lambda_, nb_samples, rng)
    return evaluate_statistics(variates, coverage, "auto")




[docs]
def compare_beta_posteriors(
    k1: int,
    l1: int,
    k2: int,
    l2: int,
    lambda_: float,
    nb_samples: int,
    rng: numpy.random.Generator,
) -> float:
    r"""Return the probability that system 1 is better than system 2 for a
    given figure of merit.

    This implementation is based on [GOUTTE-2005]_.

    Figures of merit that are supported by this procedure are those which have
    the form :math:`v = k / (k + l)`:

    * Precision or Positive-Predictive Value (PPV): :math:`p = TP/(TP+FP)`, so
      :math:`k=TP`, :math:`l=FP`
    * Recall, Sensitivity, or True Positive Rate: :math:`r = TP/(TP+FN)`, so
      :math:`k=TP`, :math:`l=FN`
    * Specificity or True Negative Rage: :math:`s = TN/(TN+FP)`, so :math:`k=TN`,
      :math:`l=FP`
    * Accuracy: :math:`acc = TP+TN/(TP+TN+FP+FN)`, so :math:`k=TP+TN`,
      :math:`l=FP+FN`
    * Jaccard Index: :math:`j = TP/(TP+FP+FN)`, so :math:`k=TP`, :math:`l=FP+FN`

    Parameters
    ----------
    k1
        Number of successes for the first system.  See various definitions
        above.
    l1
        Number of failures for the first system.  See various definitions
        above.
    k2
        Number of successes for the second system.  See various definitions
        above.  Must be syntatically equal to ``k1``.
    l2
        Number of failures for the second system.  See various definitions
        above.  Must be syntatically equal to ``l1``.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  If unsure, use 0.5.
    nb_samples
        Number of generated variates for the M-C simulation.
    rng
        An initialized numpy random number generator.

    Returns
    -------
        A number between 0.0 and 1.0 that describes the probability that the
        first system has a bigger measurement than the second.
    """

    v1 = beta_posterior(k1, l1, lambda_, nb_samples, rng)
    v2 = beta_posterior(k2, l2, lambda_, nb_samples, rng)
    return numpy.count_nonzero(v1 > v2) / nb_samples




[docs]
def f1_posterior(
    tp: int,
    fp: int,
    fn: int,
    lambda_: float,
    nb_samples: int,
    rng: numpy.random.Generator,
) -> numpy.typing.NDArray[numpy.double]:
    r"""Simulate the F1-score posterior of a system with the provided markings.

    This implementation is based on [GOUTTE-2005]_, Equation 11.

    Parameters
    ----------
    tp
        True positive count, AKA "hit", as an integer scalar.
    fp
        False positive count, AKA "false alarm", or "Type I error".
    fn
        False Negative count, AKA "miss", or "Type II error", as a scalar.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  If unsure, use 0.5.
    nb_samples
        Number of generated variates for the M-C simulation.
    rng
        An initialized numpy random number generator.

    Returns
    -------
        variates: An array with size ``nb_samples`` containing a realization of
        Equation 11 of [GOUTTE-2005]_.
    """
    u = rng.gamma(shape=(tp + lambda_), scale=2.0, size=nb_samples)
    v = rng.gamma(shape=(fp + fn + (2 * lambda_)), scale=1.0, size=nb_samples)
    return u / (u + v)




[docs]
def average_f1_posterior(
    tp: typing.Iterable[int],
    fp: typing.Iterable[int],
    fn: typing.Iterable[int],
    lambda_: float,
    nb_samples: int,
    rng: numpy.random.Generator,
) -> numpy.typing.NDArray[numpy.double]:
    r"""Simulate the F1-score posterior of an average system with the provided
    markings.

    This implementation is based on [GOUTTE-2005]_, Equation 11.

    Parameters
    ----------
    tp
        Arrays containing true positive counts, AKA "hit", for all systems to
        be considered on the average.
    fp
        Arrays containing false positive counts, AKA "false alarm", or "Type I
        error" for all systems to be considered on the average.
    fn
        Arrays containing false Negative counts, AKA "miss", or "Type II
        error" for all systems to be considered on the average..
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  If unsure, use 0.5.
    nb_samples
        Number of generated gamma distribution values.
    rng
        An initialized numpy random number generator.

    Returns
    -------
        variates: An array with size ``nb_samples`` containing a realization of
        Equation 11 from [GOUTTE-2005]_.
    """
    tp_array, fp_array, fn_array = as_int_arrays((tp, fp, fn))
    return numpy.mean(
        [
            numpy.asarray(
                f1_posterior(tp_, fp_, fn_, lambda_, nb_samples, rng),
                dtype=numpy.double,
            )
            for tp_, fp_, fn_ in zip(tp_array, fp_array, fn_array)
        ],
        axis=0,
    )




[docs]
def compare_f1_scores(
    tp1: int,
    fp1: int,
    fn1: int,
    tp2: int,
    fp2: int,
    fn2: int,
    lambda_: float,
    nb_samples: int,
    rng: numpy.random.Generator,
) -> float:
    r"""Return the probability that the F1-score from 1 system is bigger than
    the F1-score of a second system.

    This implementation is based on [GOUTTE-2005]_.

    Parameters
    ----------
    tp1
        True positive count, AKA "hit" for the first system.
    fp1
        False positive count, AKA "false alarm", or "Type I error" for the
        first system.
    fn1
        False Negative count, AKA "miss", or "Type II error", for the first
        system.
    tp2
        True positive count, AKA "hit" for the second system.
    fp2
        False positive count, AKA "false alarm", or "Type I error" for the
        second system.
    fn2
        False Negative count, AKA "miss", or "Type II error", for the second
        system.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.
    nb_samples
        Number of generated variates for the M-C simulation.
    rng
        An initialized numpy random number generator.

    Returns
    -------
        prob: A number between 0.0 and 1.0 that describes the probability that
        the first system has a bigger F1-score than the second.
    """

    f1 = f1_posterior(tp1, fp1, fn1, lambda_, nb_samples, rng)
    f2 = f1_posterior(tp2, fp2, fn2, lambda_, nb_samples, rng)
    return numpy.count_nonzero(f1 > f2) / nb_samples




[docs]
def compare_systems(
    n: typing.Sequence[int],
    lambda_: typing.Sequence[float],
    nb_samples: int,
    rng: numpy.random.Generator,
) -> float:
    r"""Compare 2 system (binary) outputs using a Dirichlet posterior.

    This function returns the empyrical probability that a system (1) is better
    another system (2), based on their binary outputs.  The comparison is
    carried-out as described in [GOUTTE-2005]_, equations 16 and 19, via a
    Monte-Carlo simulation, since the integral of the probability cannot be
    resolved analytically.

    To do so, we compute the probability that :math:`P(\pi_1 > \pi_2)`, i.e.,
    the probability that system 1 gives the expected output while system 2 does
    not, is greater than the probability that system 1 is incorrect while
    system 2 gives the correct answer.  It assumes, therefore, systems 1 and 2
    are tuned (thresholded), and provide binary outputs that can be compared to
    generate 3 numbers:

    * :math:`n_1`: The measured number of times system 1 provides the correct
      answer, whereas system 2 does not
    * :math:`n_2`: The measured number of times system 2 provides the correct
      answer, whereas system 1 does not
    * :math:`n_3`: The measured number of times system 1 and 2 agree, giving
      the same answer (wrong or write, it does not matter)

    Notice that :math:`\pi_1 = \frac{n_1}{n_1 + n_2 + n_3}`, and so,
    analogously, you may calculate :math:`\pi_2` and :math:`\pi_3`.

    We then plug these numbers to simulate a Dirichlet (generalisation of the
    Beta distribution for multiple variables) by setting:

    * :math:`\alpha_1 = n_1 + \lambda_1`
    * :math:`\alpha_2 = n_2 + \lambda_2`
    * :math:`\alpha_3 = n_3 + \lambda_3`

    Where each :math:`\lambda_i` correspond to the prior to be imputed to that
    particular variable.  We typically select :math:`\lambda_1 = \lambda_2`,

    Parameters
    ----------
    n
        A triple with 3 integers representing :math:`n_1`, :math:`n_2` and
        :math:`n_3`.
    lambda_
        A tuple with length 3, containing floating point numbers describing the
        parameterisation of the Dirichlet priors to consider.  Use
        :math:`\lambda_i=0.5` for Jeffrey's prior.
    nb_samples
        Number of generated dirichlet distribution values (make this high, for
        a higher precision on the simulation).
    rng
        An initialized numpy random number generator.

    Returns
    -------
        probability: A number between 0.0 and 1.0 that describes the
        probability that the first system is better than the second one.
    """
    assert len(n) == 3
    assert len(lambda_) == 3
    samples = rng.dirichlet(numpy.array(n) + numpy.array(lambda_), size=nb_samples)
    return numpy.count_nonzero(samples[:, 0] > samples[:, 1]) / nb_samples