Source code for credible.bayesian.kfold

# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Implementation of :py:mod:`Scikit-Learn compatible measures
<sklearn.metrics>` with bayesian credible regions for k-folding experiments.
"""

import typing

import numpy
import numpy.typing
import sklearn.metrics

from . import utils
from .metrics import NUMBER_MC_SAMPLES



[docs]
def precision_score(
    y_true: typing.Iterable[typing.Iterable[int]],
    y_pred: typing.Iterable[typing.Iterable[int]],
    rng: numpy.random.Generator,
    lambda_: float = 1.0,
    coverage: float = 0.95,
    nb_samples: int = NUMBER_MC_SAMPLES,
) -> tuple[float, float, float, float]:
    r"""Compute k-fold precision for a **binary** classifier.

    Precision, also called positive predictive value (PPV), is
    ``tp/(tp+fp)``.  One confusion matrix is built per fold; the per-fold
    counts are then used both for the plain average and for the posterior
    statistics (mode and credible interval).  Only **binary** problems are
    supported.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels, one iterable of labels per fold.
    y_pred
        Predicted labels, as returned by a classifier, one iterable per fold,
        paired in order with ``y_true``.
    rng
        An initialized numpy random number generator.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  Changes in this value do not significantly affect the
        outcome, unless ``tp`` or ``fp`` are very small (close to 1).
    coverage
        A floating-point number between 0 and 1.0 indicating the expected
        coverage.  A value of 0.95 requests that 95% of the area under the
        probability density of the posterior is covered by the returned
        equal-tailed interval.
    nb_samples
        Number of generated variates for the M-C simulation.

    Returns
    -------
        A tuple with 4 floating-point numbers:

        * The mean over folds of the per-fold precision ``tp/(tp+fp)``
        * The mode of the posterior distribution
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    # One flattened confusion matrix per (truth, prediction) fold pair.
    per_fold = []
    for truth, guess in zip(y_true, y_pred):
        per_fold.append(sklearn.metrics.confusion_matrix(truth, guess).ravel())
    counts = numpy.asarray(per_fold, dtype=numpy.int_)

    # NOTE(review): indexing column 3 presumes every fold yields a 4-entry
    # (2x2) confusion matrix -- confirm callers always provide both classes.
    false_pos = counts[:, 1]
    true_pos = counts[:, 3]

    # The first value returned by the helper is discarded; the mean reported
    # to the caller is the plain average of the per-fold ratios instead.
    _, mode, lower, upper = utils.average_beta(
        true_pos, false_pos, lambda_, coverage, nb_samples, rng
    )
    fold_precision = true_pos / (true_pos + false_pos)
    return (numpy.mean(fold_precision).item(), mode, lower, upper)




[docs]
def recall_score(
    y_true: typing.Iterable[typing.Iterable[int]],
    y_pred: typing.Iterable[typing.Iterable[int]],
    rng: numpy.random.Generator,
    lambda_: float = 1.0,
    coverage: float = 0.95,
    nb_samples: int = NUMBER_MC_SAMPLES,
) -> tuple[float, float, float, float]:
    r"""Compute k-fold recall for a **binary** classifier.

    Recall, also called sensitivity, hit rate, or true positive rate (TPR),
    is ``tp/(tp+fn)``.  One confusion matrix is built per fold; the per-fold
    counts are then used both for the plain average and for the posterior
    statistics (mode and credible interval).

    Parameters
    ----------
    y_true
        Ground truth (correct) labels, one iterable of labels per fold.
    y_pred
        Predicted labels, as returned by a classifier, one iterable per fold,
        paired in order with ``y_true``.
    rng
        An initialized numpy random number generator.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  Changes in this value do not significantly affect the
        outcome, unless ``tp`` or ``fn`` are very small (close to 1).
    coverage
        A floating-point number between 0 and 1.0 indicating the expected
        coverage.  A value of 0.95 requests that 95% of the area under the
        probability density of the posterior is covered by the returned
        equal-tailed interval.
    nb_samples
        Number of generated variates for the M-C simulation.

    Returns
    -------
        A tuple with 4 floating-point numbers:

        * The mean over folds of the per-fold recall ``tp/(tp+fn)``
        * The mode of the posterior distribution
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    # One flattened confusion matrix per (truth, prediction) fold pair.
    per_fold = []
    for truth, guess in zip(y_true, y_pred):
        per_fold.append(sklearn.metrics.confusion_matrix(truth, guess).ravel())
    counts = numpy.asarray(per_fold, dtype=numpy.int_)

    # NOTE(review): indexing column 3 presumes every fold yields a 4-entry
    # (2x2) confusion matrix -- confirm callers always provide both classes.
    false_neg = counts[:, 2]
    true_pos = counts[:, 3]

    # The first value returned by the helper is discarded; the mean reported
    # to the caller is the plain average of the per-fold ratios instead.
    _, mode, lower, upper = utils.average_beta(
        true_pos, false_neg, lambda_, coverage, nb_samples, rng
    )
    fold_recall = true_pos / (true_pos + false_neg)
    return (numpy.mean(fold_recall).item(), mode, lower, upper)




[docs]
def specificity_score(
    y_true: typing.Iterable[typing.Iterable[int]],
    y_pred: typing.Iterable[typing.Iterable[int]],
    rng: numpy.random.Generator,
    lambda_: float = 1.0,
    coverage: float = 0.95,
    nb_samples: int = NUMBER_MC_SAMPLES,
) -> tuple[float, float, float, float]:
    r"""Compute k-fold specificity for a **binary** classifier.

    Specificity, also called selectivity or true negative rate (TNR), is
    ``tn/(tn+fp)``.  One confusion matrix is built per fold; the per-fold
    counts are then used both for the plain average and for the posterior
    statistics (mode and credible interval).

    Parameters
    ----------
    y_true
        Ground truth (correct) labels, one iterable of labels per fold.
    y_pred
        Predicted labels, as returned by a classifier, one iterable per fold,
        paired in order with ``y_true``.
    rng
        An initialized numpy random number generator.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  Changes in this value do not significantly affect the
        outcome, unless ``tn`` or ``fp`` are very small (close to 1).
    coverage
        A floating-point number between 0 and 1.0 indicating the expected
        coverage.  A value of 0.95 requests that 95% of the area under the
        probability density of the posterior is covered by the returned
        equal-tailed interval.
    nb_samples
        Number of generated variates for the M-C simulation.

    Returns
    -------
        A tuple with 4 floating-point numbers:

        * The mean over folds of the per-fold specificity ``tn/(tn+fp)``
        * The mode of the posterior distribution
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    # One flattened confusion matrix per (truth, prediction) fold pair.
    per_fold = []
    for truth, guess in zip(y_true, y_pred):
        per_fold.append(sklearn.metrics.confusion_matrix(truth, guess).ravel())
    counts = numpy.asarray(per_fold, dtype=numpy.int_)

    # Columns 0 and 1 of each flattened matrix hold tn and fp respectively.
    true_neg = counts[:, 0]
    false_pos = counts[:, 1]

    # The first value returned by the helper is discarded; the mean reported
    # to the caller is the plain average of the per-fold ratios instead.
    _, mode, lower, upper = utils.average_beta(
        true_neg, false_pos, lambda_, coverage, nb_samples, rng
    )
    fold_specificity = true_neg / (true_neg + false_pos)
    return (numpy.mean(fold_specificity).item(), mode, lower, upper)




[docs]
def accuracy_score(
    y_true: typing.Iterable[typing.Iterable[int]],
    y_pred: typing.Iterable[typing.Iterable[int]],
    rng: numpy.random.Generator,
    lambda_: float = 1.0,
    coverage: float = 0.95,
    nb_samples: int = NUMBER_MC_SAMPLES,
) -> tuple[float, float, float, float]:
    r"""Compute k-fold accuracy for a **binary** classifier.

    See `Accuracy
    <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_.
    Accuracy is the proportion of correct predictions (both true positives
    and true negatives) among the total number of samples examined.  It
    corresponds arithmetically to ``(tp+tn)/(tp+tn+fp+fn)``.  Because true
    negatives are part of the numerator, this measure is sensitive to data or
    regions without annotations.

    One confusion matrix is built per fold; the per-fold counts are then used
    both for the plain average and for the posterior statistics (mode and
    credible interval).

    Parameters
    ----------
    y_true
        Ground truth (correct) labels, one iterable of labels per fold.
    y_pred
        Predicted labels, as returned by a classifier, one iterable per fold,
        paired in order with ``y_true``.
    rng
        An initialized numpy random number generator.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  Changes in this value do not significantly affect the
        outcome, unless the counts of correct (``tp+tn``) or incorrect
        (``fp+fn``) predictions are very small (close to 1).
    coverage
        A floating-point number between 0 and 1.0 indicating the expected
        coverage.  A value of 0.95 requests that 95% of the area under the
        probability density of the posterior is covered by the returned
        equal-tailed interval.
    nb_samples
        Number of generated variates for the M-C simulation.

    Returns
    -------
        A tuple with 4 floating-point numbers:

        * The mean over folds of the per-fold accuracy
          ``(tp+tn)/(tp+tn+fp+fn)``
        * The mode of the posterior distribution
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    # One flattened confusion matrix per (truth, prediction) fold pair.
    per_fold = []
    for truth, guess in zip(y_true, y_pred):
        per_fold.append(sklearn.metrics.confusion_matrix(truth, guess).ravel())
    counts = numpy.asarray(per_fold, dtype=numpy.int_)

    # NOTE(review): indexing column 3 presumes every fold yields a 4-entry
    # (2x2) confusion matrix -- confirm callers always provide both classes.
    true_neg = counts[:, 0]
    false_pos = counts[:, 1]
    false_neg = counts[:, 2]
    true_pos = counts[:, 3]

    # Correct predictions play the role of "successes" and incorrect ones the
    # role of "failures" for the helper; its first return value is discarded.
    _, mode, lower, upper = utils.average_beta(
        true_pos + true_neg,
        false_neg + false_pos,
        lambda_,
        coverage,
        nb_samples,
        rng,
    )

    fold_total = true_neg + false_pos + false_neg + true_pos
    fold_accuracy = (true_pos + true_neg) / fold_total
    return (numpy.mean(fold_accuracy).item(), mode, lower, upper)




[docs]
def jaccard_score(
    y_true: typing.Iterable[typing.Iterable[int]],
    y_pred: typing.Iterable[typing.Iterable[int]],
    rng: numpy.random.Generator,
    lambda_: float = 1.0,
    coverage: float = 0.95,
    nb_samples: int = NUMBER_MC_SAMPLES,
) -> tuple[float, float, float, float]:
    r"""Compute the k-fold Jaccard index for a **binary** classifier.

    See `Jaccard Index or Similarity
    <https://en.wikipedia.org/wiki/Jaccard_index>`_.  It corresponds
    arithmetically to ``tp/(tp+fp+fn)``.  Like the F1 score, the Jaccard index
    has a TP-only numerator, so for regions where there are no annotations it
    is always zero, irrespective of the model output.  Accuracy may be a
    better proxy if the true absence of annotations in a region must count as
    part of the measure.

    One confusion matrix is built per fold; the per-fold counts are then used
    both for the plain average and for the posterior statistics (mode and
    credible interval).

    Parameters
    ----------
    y_true
        Ground truth (correct) labels, one iterable of labels per fold.
    y_pred
        Predicted labels, as returned by a classifier, one iterable per fold,
        paired in order with ``y_true``.
    rng
        An initialized numpy random number generator.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  Changes in this value do not significantly affect the
        outcome, unless ``tp`` or ``fp+fn`` are very small (close to 1).
    coverage
        A floating-point number between 0 and 1.0 indicating the expected
        coverage.  A value of 0.95 requests that 95% of the area under the
        probability density of the posterior is covered by the returned
        equal-tailed interval.
    nb_samples
        Number of generated variates for the M-C simulation.

    Returns
    -------
        A tuple with 4 floating-point numbers:

        * The mean over folds of the per-fold Jaccard index
          ``tp/(tp+fp+fn)``
        * The mode of the posterior distribution
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    # One flattened confusion matrix per (truth, prediction) fold pair.
    per_fold = []
    for truth, guess in zip(y_true, y_pred):
        per_fold.append(sklearn.metrics.confusion_matrix(truth, guess).ravel())
    counts = numpy.asarray(per_fold, dtype=numpy.int_)

    # NOTE(review): indexing column 3 presumes every fold yields a 4-entry
    # (2x2) confusion matrix -- confirm callers always provide both classes.
    false_pos = counts[:, 1]
    false_neg = counts[:, 2]
    true_pos = counts[:, 3]

    # All errors (fp and fn) are pooled as the "failure" count for the helper;
    # its first return value is discarded.
    _, mode, lower, upper = utils.average_beta(
        true_pos,
        false_pos + false_neg,
        lambda_,
        coverage,
        nb_samples,
        rng,
    )

    fold_jaccard = true_pos / (true_pos + false_neg + false_pos)
    return (numpy.mean(fold_jaccard).item(), mode, lower, upper)




[docs]
def f1_score(
    y_true: typing.Iterable[typing.Iterable[int]],
    y_pred: typing.Iterable[typing.Iterable[int]],
    rng: numpy.random.Generator,
    lambda_: float = 1.0,
    coverage: float = 0.95,
    nb_samples: int = NUMBER_MC_SAMPLES,
) -> tuple[float, float, float, float]:
    r"""Compute the k-fold F1 score for a **binary** classifier.

    Returns the mean, the mode, and the lower and upper bounds of the credible
    region of the F1 score.

    See `F1-score <https://en.wikipedia.org/wiki/F1_score>`_.  It corresponds
    arithmetically to ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``.  Like the
    Jaccard index, the F1 (or Dice) score has a TP-only numerator, so for
    regions where there are no annotations it is always zero, irrespective of
    the model output.  Accuracy may be a better proxy if the true absence of
    annotations in a region must count as part of the measure.

    This implementation is based on [GOUTTE-2005]_.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels, one iterable of labels per fold.
    y_pred
        Predicted labels, as returned by a classifier, one iterable per fold,
        paired in order with ``y_true``.
    rng
        An initialized numpy random number generator.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.
    coverage
        A floating-point number between 0 and 1.0 indicating the expected
        coverage.  A value of 0.95 requests that 95% of the area under the
        probability density of the posterior is covered by the returned
        equal-tailed interval.
    nb_samples
        Number of generated variates for the M-C simulation.

    Returns
    -------
        A tuple with 4 floating-point numbers:

        * The mean over folds of the per-fold F1-score
          ``2*tp/(2*tp+fp+fn)``
        * The mode of the posterior distribution
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    # One flattened confusion matrix per (truth, prediction) fold pair.
    per_fold = []
    for truth, guess in zip(y_true, y_pred):
        per_fold.append(sklearn.metrics.confusion_matrix(truth, guess).ravel())
    counts = numpy.asarray(per_fold, dtype=numpy.int_)

    # NOTE(review): indexing column 3 presumes every fold yields a 4-entry
    # (2x2) confusion matrix -- confirm callers always provide both classes.
    false_pos = counts[:, 1]
    false_neg = counts[:, 2]
    true_pos = counts[:, 3]

    # Unlike the other scores in this module, the posterior is first sampled
    # and the statistics are then extracted from the variates in a second
    # step; the first statistic returned is discarded.
    samples = utils.average_f1_posterior(
        true_pos, false_pos, false_neg, lambda_, nb_samples, rng
    )
    _, mode, lower, upper = utils.evaluate_statistics(
        samples, coverage, bins="auto"
    )

    doubled_tp = 2 * true_pos
    fold_f1 = doubled_tp / (doubled_tp + false_neg + false_pos)
    return (numpy.mean(fold_f1).item(), mode, lower, upper)