# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Functions to evalute the (Bayesian) credible region of measures.
(Bayesian) credible region interpretation, with 95% coverage: The probability
that the true proportion will lie within the 95% credible interval is 0.95.
Contrary to frequentist approaches, in which one can only say that if the test
were repeated an infinite number of times, and one constructed a confidence
interval each time, then X% of the confidence intervals would contain the true
rate, here we can say that given our observed data, there is a X% probability
that the true value of :math:`k/n` falls within the provided interval.
See a discussion in `Five Confidence Intervals for Proportions That You
Should Know About <ci-evaluation_>`_ for a study on coverage for most common
methods.
.. note::
For a disambiguation with `Confidence Interval <confidence-interval_>`_ (the
frequentist approach), read `Credible Regions or Intervals
<credible-interval_>`_.
.. include:: ../links.rst
"""
import typing
import numpy
import numpy.random
import numpy.typing
import scipy.special
from ..utils import as_int_arrays
def _beta_ndarray(
successes: numpy.typing.NDArray[numpy.integer],
failures: numpy.typing.NDArray[numpy.integer],
lambda_: float,
coverage: float,
) -> tuple[
numpy.typing.NDArray[numpy.double],
numpy.typing.NDArray[numpy.double],
numpy.typing.NDArray[numpy.double],
numpy.typing.NDArray[numpy.double],
]:
r""":py:func:`beta`, for multiple systems.
Parameters
----------
successes
Number of successes (``k``) observed on the experiment.
failures
Number of failures (``l``) observed on the experiment.
lambda_
The parameterisation of the Beta prior to consider. Use
:math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
Jeffrey's prior. If unsure, use 0.5.
coverage
A floating-point number between 0 and 1.0 indicating the
coverage you're expecting. A value of 0.95 will ensure 95%
of the area under the probability density of the posterior
is covered by the returned equal-tailed interval.
Returns
-------
A tuple containing 4 floating point numbers representing:
* mean: The mean of the posterior distribution
* mode: The mode of the posterior distribution
* lower: The lower bounds of the credible region
* upper: The upper bounds of the credible region
If the input was a 1-D array with multiple experiments, then the return
value of this function is also composed of 1-D arrays, representing the
mean, mode, lower and upper bounds of the various credible regions
defined for each of the success/failure tuples on the input.
Raises
------
TypeError
If the dimensions of ``successes`` and ``failures`` do not match, or in
case the input types are unsupported.
"""
# we return the equally-tailed range
right = (1.0 - coverage) / 2 # half-width in each side
lower = scipy.special.betaincinv(
successes + lambda_, failures + lambda_, right
)
upper = scipy.special.betaincinv(
successes + lambda_, failures + lambda_, 1.0 - right
)
# evaluate mean and mode (https://en.wikipedia.org/wiki/Beta_distribution)
alpha = successes + lambda_
beta = failures + lambda_
mean = numpy.nan_to_num(alpha / (alpha + beta))
# the mode of a beta distribution is a bit tricky
mode = numpy.zeros_like(lower)
cond = (alpha > 1) & (beta > 1)
mode[cond] = (alpha[cond] - 1) / (alpha[cond] + beta[cond] - 2)
# In the case of precision, if the threshold is close to 1.0, both TP
# and FP can be zero, which may cause this condition to be reached, if
# the prior is exactly 1 (flat prior). This is a weird situation,
# because effectively we are trying to compute the posterior when the
# total number of experiments is zero. So, only the prior counts - but
# the prior is flat, so we should just pick a value. We choose the
# middle of the range.
# conda = alpha == 1 and beta == 1
# mode[cond] = 0.0
# conda = alpha <= 1 and beta > 1
# mode[cond] = 0.0
mode[(alpha > 1) & (beta <= 1)] = 1.0
# else: #elif alpha < 1 and beta < 1:
# in the case of precision, if the threshold is close to 1.0, both TP
# and FP can be zero, which may cause this condition to be reached, if
# the prior is smaller than 1. This is a weird situation, because
# effectively we are trying to compute the posterior when the total
# number of experiments is zero. So, only the prior counts - but the
# prior is bimodal, so we should just pick a value. We choose the
# left of the range.
# n.b.: could also be 1.0 as the prior is bimodal
# mode[alpha < 1 and beta < 1] = 0.0
return mean, mode, lower, upper
[docs]
def beta_array(
successes: typing.Iterable[int],
failures: typing.Iterable[int],
lambda_: float,
coverage: float,
) -> tuple[
numpy.typing.NDArray[numpy.double],
numpy.typing.NDArray[numpy.double],
numpy.typing.NDArray[numpy.double],
numpy.typing.NDArray[numpy.double],
]:
r""":py:func:`beta`, for multiple systems.
Parameters
----------
successes
Number of successes (``k``) observed on the experiment.
failures
Number of failures (``l``) observed on the experiment.
lambda_
The parameterisation of the Beta prior to consider. Use
:math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
Jeffrey's prior. If unsure, use 0.5.
coverage
A floating-point number between 0 and 1.0 indicating the
coverage you're expecting. A value of 0.95 will ensure 95%
of the area under the probability density of the posterior
is covered by the returned equal-tailed interval.
Returns
-------
A tuple containing 4 floating point numbers representing:
* mean: The mean of the posterior distribution
* mode: The mode of the posterior distribution
* lower: The lower bounds of the credible region
* upper: The upper bounds of the credible region
If the input was a 1-D array with multiple experiments, then the return
value of this function is also composed of 1-D arrays, representing the
mean, mode, lower and upper bounds of the various credible regions
defined for each of the success/failure tuples on the input.
Raises
------
TypeError
If the dimensions of ``successes`` and ``failures`` do not match, or in
case the input types are unsupported.
"""
successes_array, failures_array = as_int_arrays((successes, failures))
return _beta_ndarray(successes_array, failures_array, lambda_, coverage)
[docs]
def beta(
successes: int, failures: int, lambda_: float, coverage: float
) -> tuple[float, float, float, float]:
r"""Return the mode, upper and lower bounds of the equal-tailed credible
region of a probability estimate following Bernoulli trials.
This technique is (not) very conservative - in most of the cases, coverage
closer to the extremes (0 or 1) is lower than expected (but still greater
than 85%).
This implementation is based on [GOUTTE-2005]_. It assumes :math:`k`
successes and :math:`l` failures (:math:`n = k+l` total trials) are issued
from a series of Bernoulli trials (likelihood is binomial). The posterior
is derivated using the Bayes Theorem with a beta prior. As there is no
reason to favour high vs. low precision, we use a symmetric Beta prior
(:math:`\alpha=\beta`):
.. math::
P(p|k,n) &= \frac{P(k,n|p)P(p)}{P(k,n)} \\
P(p|k,n) &= \frac{\frac{n!}{k!(n-k)!}p^{k}(1-p)^{n-k}P(p)}{P(k)} \\
P(p|k,n) &= \frac{1}{B(k+\alpha, n-k+\beta)}p^{k+\alpha-1}(1-p)^{n-k+\beta-1} \\
P(p|k,n) &= \frac{1}{B(k+\alpha, n-k+\alpha)}p^{k+\alpha-1}(1-p)^{n-k+\alpha-1}
The mode for this posterior (also the maximum a posteriori) is:
.. math::
\text{mode}(p) = \frac{k+\lambda-1}{n+2\lambda-2}
Concretely, the prior may be flat (all rates are equally likely,
:math:`\lambda=1`) or we may use Jeoffrey's prior (:math:`\lambda=0.5`),
that is invariant through re-parameterisation. Jeffrey's prior indicate
that rates close to zero or one are more likely.
The mode above works if :math:`k+{\alpha},n-k+{\alpha} > 1`, which is
usually the case for a resonably well tunned system, with more than a few
samples for analysis. In the limit of the system performance, :math:`k`
may be 0, which will make the mode become zero.
For our purposes, it may be more suitable to represent :math:`n = k + l`,
with :math:`k`, the number of successes and :math:`l`, the number of
failures in the binomial experiment, and find this more suitable
representation:
.. math::
P(p|k,l) &= \frac{1}{B(k+\alpha, l+\alpha)}p^{k+\alpha-1}(1-p)^{l+\alpha-1} \\
\text{mode}(p) &= \frac{k+\lambda-1}{k+l+2\lambda-2}
This can be mapped to most rates calculated in the context of binary
classification this way:
* Precision or Positive-Predictive Value (PPV): p = TP/(TP+FP), so k=TP, l=FP
* Recall, Sensitivity, or True Positive Rate: r = TP/(TP+FN), so k=TP, l=FN
* Specificity or True Negative Rage: s = TN/(TN+FP), so k=TN, l=FP
* Accuracy: acc = TP+TN/(TP+TN+FP+FN), so k=TP+TN, l=FP+FN
* Jaccard: j = TP/(TP+FP+FN), so k=TP, l=FP+FN
.. note:: **Important**
To calculate the limits given the required coverage, we use the
incomplete **inverse** (regularized, or normalized) beta function,
:any:`scipy.special.betaincinv` instead of :any:`scipy.special.betainc`.
The latter requires we provide the bounds and returns the coverage,
whereas here we are interested in the *inverse* behaviour.
Parameters
----------
successes
Number of successes (``k``) observed on the experiment.
failures
Number of failures (``l``) observed on the experiment.
lambda_
The parameterisation of the Beta prior to consider. Use
:math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
Jeffrey's prior. If unsure, use 0.5.
coverage
A floating-point number between 0 and 1.0 indicating the
coverage you're expecting. A value of 0.95 will ensure 95%
of the area under the probability density of the posterior
is covered by the returned equal-tailed interval.
Returns
-------
A tuple containing 4 floating point numbers representing:
* mean: The mean of the posterior distribution
* mode: The mode of the posterior distribution
* lower: The lower bounds of the credible region
* upper: The upper bounds of the credible region
If the input was a 1-D array with multiple experiments, then the return
value of this function is also composed of 1-D arrays, representing the
mean, mode, lower and upper bounds of the various credible regions
defined for each of the success/failure tuples on the input.
Raises
------
TypeError
If the dimensions of ``successes`` and ``failures`` do not match, or in
case the input types are unsupported.
"""
retval = beta_array([successes], [failures], lambda_, coverage)
return (
retval[0].item(),
retval[1].item(),
retval[2].item(),
retval[3].item(),
)
[docs]
def beta_posterior(
successes: int,
failures: int,
lambda_: float,
nb_samples: int,
rng: numpy.random.Generator,
) -> numpy.typing.NDArray[numpy.double]:
r"""Simulate the beta posterior of a system with the provided markings.
This implementation is based on [GOUTTE-2005]_, Equation 7.
Figures of merit that are supported by this procedure are those which have
the form :math:`v = k / (k + l)` (successes over successes plus failures):
* Precision or Positive-Predictive Value (PPV): :math:`p = TP/(TP+FP)`, so
:math:`k=TP`, :math:`l=FP`
* Recall, Sensitivity, or True Positive Rate: :math:`r = TP/(TP+FN)`, so
:math:`k=TP`, :math:`l=FN`
* Specificity or True Negative Rage: :math:`s = TN/(TN+FP)`, so :math:`k=TN`,
:math:`l=FP`
* Accuracy: :math:`acc = TP+TN/(TP+TN+FP+FN)`, so :math:`k=TP+TN`,
:math:`l=FP+FN`
* Jaccard Index: :math:`j = TP/(TP+FP+FN)`, so :math:`k=TP`, :math:`l=FP+FN`
Parameters
----------
successes
Number of successes (``k``) observed on the experiment. May be a
scalar of any integral type, or an array of integral values indicating
a series of experiments to calculate the credible regions for.
failures
Number of failures (``l``) observed on the experiment. May be a scalar
of any integral type, or an array of integral values indicating a
series of experiments to calculate the credible regions for.
lambda_
The parameterisation of the Beta prior to consider. Use
:math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
Jeffrey's prior.
nb_samples
Number of generated gamma distribution values.
rng
An initialized numpy random number generator.
Returns
-------
Variates: An array with size ``nb_samples`` containing a realization of
Equation 7 from [GOUTTE-2005]_. If ``successes`` and ``failures`` are
sequences of arrays, then returns a 2D array where each row represents
one set of realisations.
Raises
------
TypeError
If the dimensions of ``successes`` and ``failures`` do not match.
"""
return rng.beta(
a=successes + lambda_, b=failures + lambda_, size=nb_samples
)
[docs]
def average_beta_posterior(
successes: typing.Iterable[int],
failures: typing.Iterable[int],
lambda_: float,
nb_samples: int,
rng: numpy.random.Generator,
) -> numpy.typing.NDArray[numpy.double]:
r"""Simulate the average beta posterior of many systems.
This implementation is based on [GOUTTE-2005]_, Equation 7.
Figures of merit that are supported by this procedure are those which have
the form :math:`v = k / (k + l)` (successes over successes plus failures):
* Precision or Positive-Predictive Value (PPV): :math:`p = TP/(TP+FP)`, so
:math:`k=TP`, :math:`l=FP`
* Recall, Sensitivity, or True Positive Rate: :math:`r = TP/(TP+FN)`, so
:math:`k=TP`, :math:`l=FN`
* Specificity or True Negative Rate: :math:`s = TN/(TN+FP)`, so :math:`k=TN`,
:math:`l=FP`
* Accuracy: :math:`acc = TP+TN/(TP+TN+FP+FN)`, so :math:`k=TP+TN`,
:math:`l=FP+FN`
* Jaccard Index: :math:`j = TP/(TP+FP+FN)`, so :math:`k=TP`, :math:`l=FP+FN`
Parameters
----------
successes
Sequence of integers expressing the number of successes (``k``)
observed on the various experiments whose metric is to be averaged.
failures
Sequence of integers expressing the number of failures (``l``)
observed on the various experiments whose metric is to be averaged.
lambda_
The parameterisation of the Beta prior to consider. Use
:math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
Jeffrey's prior.
nb_samples
Number of generated variates for the M-C simulation.
rng
An initialized numpy random number generator.
Returns
-------
Variates: An array with size ``nb_samples`` containing a realization of
Equation 7 from [GOUTTE-2005]_, considering the averaging of all input
systems.
"""
successes_array, failures_array = as_int_arrays((successes, failures))
if successes_array.ndim == 0:
successes_array = numpy.atleast_1d(successes_array)
failures_array = numpy.atleast_1d(failures_array)
return numpy.mean(
[
beta_posterior(kk, ll, lambda_, nb_samples, rng)
for kk, ll in zip(successes_array, failures_array)
],
axis=0,
)
[docs]
def evaluate_statistics(
variates: typing.Iterable[float],
coverage: float,
bins: int | str,
) -> tuple[float, float, float, float]:
"""Evaluate the left and right margins for a given M-C distribution.
Parameters
----------
variates
A 1-D array containing the simulated variates.
coverage
A number, between 0 and 1 to indicate the desired coverage. Typically,
this number is set to 0.95 (95% coverage).
bins
Histogram binning for defining the mode of the original distribution
where variates where sampled from. A larger number of bins will make
it unlikely to discover the true mode. You may also pass any of the
string method names defined at :py:func:`numpy.histogram_bin_edges`. A
method such as ``doane`` should work fine in most cases.
Returns
-------
statistics: mean, (a rough estimated of the) mode and lower and upper
bounds of the credible intervals for the input simulation.
.. note::
Mode estimation depends on the number of input ``variates``, and
histogram binning (``bins``). Using :py:func:`scipy.stats.mode` can
only evaluate most common value, which is **not reliable** for
continuous variables.
"""
left_half = (1 - coverage) / 2 # size of excluded (half) area
variates_array = numpy.asarray(variates, dtype=numpy.double)
sorted_variates = numpy.sort(variates_array)
# n.b.: we return the equally tailed range
# calculates position of score which would exclude the left_half (left)
lower_index = int(round(len(variates_array) * left_half))
# calculates position of score which would exclude the right_half (right)
upper_index = int(round(len(variates_array) * (1 - left_half)))
lower = sorted_variates[lower_index - 1]
upper = sorted_variates[upper_index - 1]
# we evaluate the histogram and get the maximum value from there
hist, ranges = numpy.histogram(variates_array, bins=bins)
idx = numpy.argmax(hist)
mode = (ranges[idx] + ranges[idx + 1]) / 2
return (
numpy.mean(variates_array).item(),
mode,
lower,
upper,
)
[docs]
def average_beta(
successes: typing.Iterable[int],
failures: typing.Iterable[int],
lambda_: float,
coverage: float,
nb_samples: int,
rng: numpy.random.Generator,
) -> tuple[float, float, float, float]:
r"""Return mean, mode, upper and lower bounds of the credible region of an
average of measures with beta posteriors.
This implementation is based on [GOUTTE-2005]_.
Parameters
----------
successes
Sequence of integers expressing the number of successes (``k``)
observed on the various experiments whose metric is to be averaged.
failures
Sequence of integers expressing the number of failures (``l``)
observed on the various experiments whose metric is to be averaged.
lambda_
The parameterisation of the Beta prior to consider. Use
:math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
Jeffrey's prior.
coverage
A floating-point number between 0 and 1.0 indicating the coverage
you're expecting. A value of 0.95 will ensure 95% of the area under
the probability density of the posterior is covered by the returned
equal-tailed interval.
nb_samples
Number of generated variates for the M-C simulation.
rng
An initialized numpy random number generator.
Returns
-------
statistics: mean, mode, lower and upper bounds of the credible interval
for the provided coverage.
"""
variates = average_beta_posterior(
successes, failures, lambda_, nb_samples, rng
)
return evaluate_statistics(variates, coverage, "auto")
[docs]
def compare_beta_posteriors(
k1: int,
l1: int,
k2: int,
l2: int,
lambda_: float,
nb_samples: int,
rng: numpy.random.Generator,
) -> float:
r"""Return the probability that system 1 is better than system 2 for a
given figure of merit.
This implementation is based on [GOUTTE-2005]_.
Figures of merit that are supported by this procedure are those which have
the form :math:`v = k / (k + l)`:
* Precision or Positive-Predictive Value (PPV): :math:`p = TP/(TP+FP)`, so
:math:`k=TP`, :math:`l=FP`
* Recall, Sensitivity, or True Positive Rate: :math:`r = TP/(TP+FN)`, so
:math:`k=TP`, :math:`l=FN`
* Specificity or True Negative Rage: :math:`s = TN/(TN+FP)`, so :math:`k=TN`,
:math:`l=FP`
* Accuracy: :math:`acc = TP+TN/(TP+TN+FP+FN)`, so :math:`k=TP+TN`,
:math:`l=FP+FN`
* Jaccard Index: :math:`j = TP/(TP+FP+FN)`, so :math:`k=TP`, :math:`l=FP+FN`
Parameters
----------
k1
Number of successes for the first system. See various definitions
above.
l1
Number of failures for the first system. See various definitions
above.
k2
Number of successes for the second system. See various definitions
above. Must be syntatically equal to ``k1``.
l2
Number of failures for the second system. See various definitions
above. Must be syntatically equal to ``l1``.
lambda_
The parameterisation of the Beta prior to consider. Use
:math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
Jeffrey's prior. If unsure, use 0.5.
nb_samples
Number of generated variates for the M-C simulation.
rng
An initialized numpy random number generator.
Returns
-------
A number between 0.0 and 1.0 that describes the probability that the
first system has a bigger measurement than the second.
"""
v1 = beta_posterior(k1, l1, lambda_, nb_samples, rng)
v2 = beta_posterior(k2, l2, lambda_, nb_samples, rng)
return numpy.count_nonzero(v1 > v2) / nb_samples
[docs]
def f1_posterior(
tp: int,
fp: int,
fn: int,
lambda_: float,
nb_samples: int,
rng: numpy.random.Generator,
) -> numpy.typing.NDArray[numpy.double]:
r"""Simulate the F1-score posterior of a system with the provided markings.
This implementation is based on [GOUTTE-2005]_, Equation 11.
Parameters
----------
tp
True positive count, AKA "hit", as an integer scalar.
fp
False positive count, AKA "false alarm", or "Type I error".
fn
False Negative count, AKA "miss", or "Type II error", as a scalar.
lambda_
The parameterisation of the Beta prior to consider. Use
:math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
Jeffrey's prior. If unsure, use 0.5.
nb_samples
Number of generated variates for the M-C simulation.
rng
An initialized numpy random number generator.
Returns
-------
variates: An array with size ``nb_samples`` containing a realization of
Equation 11 of [GOUTTE-2005]_.
"""
u = rng.gamma(shape=(tp + lambda_), scale=2.0, size=nb_samples)
v = rng.gamma(shape=(fp + fn + (2 * lambda_)), scale=1.0, size=nb_samples)
return u / (u + v)
[docs]
def average_f1_posterior(
tp: typing.Iterable[int],
fp: typing.Iterable[int],
fn: typing.Iterable[int],
lambda_: float,
nb_samples: int,
rng: numpy.random.Generator,
) -> numpy.typing.NDArray[numpy.double]:
r"""Simulate the F1-score posterior of an average system with the provided
markings.
This implementation is based on [GOUTTE-2005]_, Equation 11.
Parameters
----------
tp
Arrays containing true positive counts, AKA "hit", for all systems to
be considered on the average.
fp
Arrays containing false positive counts, AKA "false alarm", or "Type I
error" for all systems to be considered on the average.
fn
Arrays containing false Negative counts, AKA "miss", or "Type II
error" for all systems to be considered on the average..
lambda_
The parameterisation of the Beta prior to consider. Use
:math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
Jeffrey's prior. If unsure, use 0.5.
nb_samples
Number of generated gamma distribution values.
rng
An initialized numpy random number generator.
Returns
-------
variates: An array with size ``nb_samples`` containing a realization of
Equation 11 from [GOUTTE-2005]_.
"""
tp_array, fp_array, fn_array = as_int_arrays((tp, fp, fn))
return numpy.mean(
[
numpy.asarray(
f1_posterior(tp_, fp_, fn_, lambda_, nb_samples, rng),
dtype=numpy.double,
)
for tp_, fp_, fn_ in zip(tp_array, fp_array, fn_array)
],
axis=0,
)
[docs]
def compare_f1_scores(
tp1: int,
fp1: int,
fn1: int,
tp2: int,
fp2: int,
fn2: int,
lambda_: float,
nb_samples: int,
rng: numpy.random.Generator,
) -> float:
r"""Return the probability that the F1-score from 1 system is bigger than
the F1-score of a second system.
This implementation is based on [GOUTTE-2005]_.
Parameters
----------
tp1
True positive count, AKA "hit" for the first system.
fp1
False positive count, AKA "false alarm", or "Type I error" for the
first system.
fn1
False Negative count, AKA "miss", or "Type II error", for the first
system.
tp2
True positive count, AKA "hit" for the second system.
fp2
False positive count, AKA "false alarm", or "Type I error" for the
second system.
fn2
False Negative count, AKA "miss", or "Type II error", for the second
system.
lambda_
The parameterisation of the Beta prior to consider. Use
:math:`\lambda=1` for a flat prior. Use :math:`\lambda=0.5` for
Jeffrey's prior.
nb_samples
Number of generated variates for the M-C simulation.
rng
An initialized numpy random number generator.
Returns
-------
prob: A number between 0.0 and 1.0 that describes the probability that
the first system has a bigger F1-score than the second.
"""
f1 = f1_posterior(tp1, fp1, fn1, lambda_, nb_samples, rng)
f2 = f1_posterior(tp2, fp2, fn2, lambda_, nb_samples, rng)
return numpy.count_nonzero(f1 > f2) / nb_samples
[docs]
def compare_systems(
n: typing.Sequence[int],
lambda_: typing.Sequence[float],
nb_samples: int,
rng: numpy.random.Generator,
) -> float:
r"""Compare 2 system (binary) outputs using a Dirichlet posterior.
This function returns the empyrical probability that a system (1) is better
another system (2), based on their binary outputs. The comparison is
carried-out as described in [GOUTTE-2005]_, equations 16 and 19, via a
Monte-Carlo simulation, since the integral of the probability cannot be
resolved analytically.
To do so, we compute the probability that :math:`P(\pi_1 > \pi_2)`, i.e.,
the probability that system 1 gives the expected output while system 2 does
not, is greater than the probability that system 1 is incorrect while
system 2 gives the correct answer. It assumes, therefore, systems 1 and 2
are tuned (thresholded), and provide binary outputs that can be compared to
generate 3 numbers:
* :math:`n_1`: The measured number of times system 1 provides the correct
answer, whereas system 2 does not
* :math:`n_2`: The measured number of times system 2 provides the correct
answer, whereas system 1 does not
* :math:`n_3`: The measured number of times system 1 and 2 agree, giving
the same answer (wrong or write, it does not matter)
Notice that :math:`\pi_1 = \frac{n_1}{n_1 + n_2 + n_3}`, and so,
analogously, you may calculate :math:`\pi_2` and :math:`\pi_3`.
We then plug these numbers to simulate a Dirichlet (generalisation of the
Beta distribution for multiple variables) by setting:
* :math:`\alpha_1 = n_1 + \lambda_1`
* :math:`\alpha_2 = n_2 + \lambda_2`
* :math:`\alpha_3 = n_2 + \lambda_3`
Where each :math:`\lambda_i` correspond to the prior to be imputed to that
particular variable. We typically select :math:`\lambda_1 = \lambda_2`,
Parameters
----------
n
A triple with 3 integers representing :math:`n_1`, :math:`n_2` and
:math:`n_3`.
lambda_
A tuple with length 3, containing floating point numbers describing the
parameterisation of the Dirichlet priors to consider. Use
:math:`\lambda_i=0.5` for Jeffrey's prior.
nb_samples
Number of generated dirichlet distribution values (make this high, for
a higher precision on the simulation).
rng
An initialized numpy random number generator.
Returns
-------
probability: A number between 0.0 and 1.0 that describes the
probability that the first system is better than the second one.
"""
assert len(n) == 3
assert len(lambda_) == 3
samples = rng.dirichlet(
numpy.array(n) + numpy.array(lambda_), size=nb_samples
)
return numpy.count_nonzero(samples[:, 0] > samples[:, 1]) / nb_samples