Source code for credible.bayesian.metrics

# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Implementation of :py:mod:`Scikit-Learn compatible measures
<sklearn.metrics>` with bayesian credible regions.
"""

import typing

import numpy
import numpy.typing
import sklearn.metrics

from .. import curves
from ..utils import safe_divide
from . import functors, utils

NUMBER_MC_SAMPLES = 100000
"""Suggested number of samples to use for Monte Carlo simulations in this
package."""



[docs]
def precision_score(
    y_true: typing.Iterable[int],
    y_pred: typing.Iterable[int],
    coverage: float = 0.95,
    lambda_: float = 1.0,
    **kwargs,
) -> tuple[float, float, float, float]:
    r"""Precision **binary** classification score.

    AKA positive predictive value (PPV), mean, mode and credible intervals.  It
    corresponds arithmetically to ``tp/(tp+fp)``.  This function only supports
    **binary** classification problems.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_pred
        Predicted labels, as returned by a classifier.
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you're expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  Changes in this value do not significantly affect the
        outcome, unless ``tp`` or ``fp`` are very small (close to 1).
    **kwargs
        Additional parameters for ``sklearn.metrics.confusion_matrix``.

    Returns
    -------
    tuple[float, float, float, float]
        Tuple with 4 floating-point numbers:

        * The actual precision, as would be returned by scikit-learn
        * The mode of the posterior distribution: It is typically close to
          the value estimated by scikit-learn.
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    _, fp, _, tp = sklearn.metrics.confusion_matrix(y_true, y_pred, **kwargs).ravel()
    _, mode, lower, upper = utils.beta(tp, fp, lambda_, coverage)
    return safe_divide(tp, tp + fp), mode, lower, upper




[docs]
def recall_score(
    y_true: typing.Iterable[int],
    y_pred: typing.Iterable[int],
    coverage: float = 0.95,
    lambda_: float = 1.0,
    **kwargs,
) -> tuple[float, float, float, float]:
    r"""Recall **binary** classification score.

    AKA sensitivity, hit rate, or true positive rate (TPR), mean, mode and
    credible intervals.  It corresponds arithmetically to ``tp/(tp+fn)``.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_pred
        Predicted labels, as returned by a classifier.
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you're expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  Changes in this value do not significantly affect the
        outcome, unless ``tp`` or ``fp`` are very small (close to 1).
    **kwargs
        Additional parameters for ``sklearn.metrics.confusion_matrix``.

    Returns
    -------
    tuple[float, float, float, float]
        Tuple with 4 floating-point numbers:

        * The actual recall, as would be returned by scikit-learn
        * The mode of the posterior distribution: this represents the best
          estimate of the recall *a posteriori*.  It is typically close to
          the value estimated by scikit-learn.
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    _, _, fn, tp = sklearn.metrics.confusion_matrix(y_true, y_pred, **kwargs).ravel()
    _, mode, lower, upper = utils.beta(tp, fn, lambda_, coverage)
    return safe_divide(tp, tp + fn), mode, lower, upper




[docs]
def specificity_score(
    y_true: typing.Iterable[int],
    y_pred: typing.Iterable[int],
    coverage: float = 0.95,
    lambda_: float = 1.0,
    **kwargs,
) -> tuple[float, float, float, float]:
    r"""Specificity **binary** classification score.

    AKA selectivity or true negative rate (TNR), mean, mode and credible
    intervals.  It corresponds arithmetically to ``tn/(tn+fp)``.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_pred
        Predicted labels, as returned by a classifier.
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you're expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  Changes in this value do not significantly affect the
        outcome, unless ``tp`` or ``fp`` are very small (close to 1).
    **kwargs
        Additional parameters for ``sklearn.metrics.confusion_matrix``.

    Returns
    -------
    tuple[float, float, float, float]
        Tuple with 4 floating-point numbers:

        * The actual specificity, as would be returned by scikit-learn
        * The mode of the posterior distribution: this represents the best
          estimate of the specificity *a posteriori*.  It is typically close to
          the value estimated by scikit-learn.
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    tn, fp, _, _ = sklearn.metrics.confusion_matrix(y_true, y_pred, **kwargs).ravel()
    _, mode, lower, upper = utils.beta(tn, fp, lambda_, coverage)
    return safe_divide(tn, tn + fp), mode, lower, upper




[docs]
def accuracy_score(
    y_true: typing.Iterable[int],
    y_pred: typing.Iterable[int],
    coverage: float = 0.95,
    lambda_: float = 1.0,
    **kwargs,
) -> tuple[float, float, float, float]:
    r"""Accuracy **binary** classification score.

    See `Accuracy
    <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_. is the
    proportion of correct predictions (both true positives and true negatives)
    among the total number of pixels examined.  It corresponds arithmetically
    to ``(tp+tn)/(tp+tn+fp+fn)``.  This measure includes both true-negatives
    and positives in the numerator, what makes it sensitive to data or regions
    without annotations. AKA selectivity or true negative rate (TNR), mean,
    mode and credible intervals.  It corresponds arithmetically to
    ``tn/(tn+fp)``.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_pred
        Predicted labels, as returned by a classifier.
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you're expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  Changes in this value do not significantly affect the
        outcome, unless ``tp`` or ``fp`` are very small (close to 1).
    **kwargs
        Additional parameters for ``sklearn.metrics.confusion_matrix``.

    Returns
    -------
    tuple[float, float, float, float]
        Tuple with 4 floating-point numbers:

        * The actual accuracy, as would be returned by scikit-learn
        * The mode of the posterior distribution: this represents the best
          estimate of the accuracy *a posteriori*.  It is typically close to
          the value estimated by scikit-learn.
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y_true, y_pred, **kwargs).ravel()
    _, mode, lower, upper = utils.beta(tn + tp, fp + fn, lambda_, coverage)
    return safe_divide(tn + tp, tn + tp + fp + fn), mode, lower, upper




[docs]
def jaccard_score(
    y_true: typing.Iterable[int],
    y_pred: typing.Iterable[int],
    coverage: float = 0.95,
    lambda_: float = 1.0,
    **kwargs,
) -> tuple[float, float, float, float]:
    r"""Jaccard **binary** classification score.

    See `Jaccard Index or Similarity
    <https://en.wikipedia.org/wiki/Jaccard_index>`_.  It corresponds
    arithmetically to ``tp/(tp+fp+fn)``.  The Jaccard index depends on a
    TP-only numerator, similarly to the F1 score.  For regions where there are
    no annotations, the Jaccard index will always be zero, irrespective of the
    model output.  Accuracy may be a better proxy if one needs to consider the
    true abscence of annotations in a region as part of the measure.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_pred
        Predicted labels, as returned by a classifier.
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you're expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  Changes in this value do not significantly affect the
        outcome, unless ``tp`` or ``fp`` are very small (close to 1).
    **kwargs
        Additional parameters for ``sklearn.metrics.confusion_matrix``.

    Returns
    -------
    tuple[float, float, float, float]
        Tuple with 4 floating-point numbers:

        * The actual jaccard score, as would be returned by scikit-learn
        * The mode of the posterior distribution: this represents the best
          estimate of the jaccard score *a posteriori*.  It is typically close
          to the value estimated by scikit-learn.
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    _, fp, fn, tp = sklearn.metrics.confusion_matrix(y_true, y_pred, **kwargs).ravel()
    _, mode, lower, upper = utils.beta(tp, fp + fn, lambda_, coverage)
    return safe_divide(tp, tp + fp + fn), mode, lower, upper




[docs]
def f1_score(
    y_true: typing.Iterable[int],
    y_pred: typing.Iterable[int],
    coverage: float = 0.95,
    lambda_: float = 1.0,
    rng: numpy.random.Generator = numpy.random.default_rng(),
    nb_samples: int = NUMBER_MC_SAMPLES,
    **kwargs,
) -> tuple[float, float, float, float]:
    r"""Return the mean, mode, upper and lower bounds of the credible region of
    the F1 score.

    See `F1-score <https://en.wikipedia.org/wiki/F1_score>`_.  It corresponds
    arithmetically to ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``.  The F1 or Dice
    score depends on a TP-only numerator, similarly to the Jaccard index.  For
    regions where there are no annotations, the F1-score will always be zero,
    irrespective of the model output. Accuracy may be a better proxy if one
    needs to consider the true abscence of annotations in a region as part of
    the measure.

    This implementation is based on [GOUTTE-2005]_.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_pred
        Predicted labels, as returned by a classifier.
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.
    rng
        An initialized numpy random number generator.
    nb_samples
        Number of generated variates for the M-C simulation.
    **kwargs
        Additional parameters for ``sklearn.metrics.confusion_matrix``.

    Returns
    -------
    tuple[float, float, float, float]
        Tuple with 4 floating-point numbers:

        * The actual F1 score, as would be returned by scikit-learn
        * The mode of the posterior distribution: this represents the best
          estimate of the F1 score *a posteriori*.  It is typically close
          to the value estimated by scikit-learn.
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """

    _, fp, fn, tp = sklearn.metrics.confusion_matrix(y_true, y_pred, **kwargs).ravel()
    # builds a Monte Carlo simulation of the F1 posterior distribution
    variates = utils.f1_posterior(tp, fp, fn, lambda_, nb_samples, rng)
    # evaluates mean, mode, lower and upper bounds for the confidence interval
    # of interest.
    _, mode, lower, upper = utils.evaluate_statistics(variates, coverage, "auto")

    # from matplotlib import pyplot as plt
    # plt.hist(variates, bins="auto", alpha=0.3, label="Variates")
    # plt.axvline(_, color="red", label="mean")
    # plt.axvline(mode, color="blue", label="mode")
    # plt.axvline(
    #     safe_divide(2 * tp, (2 * tp) + fp + fn), color="green", label="est."
    # )
    # plt.legend(loc="best")
    # plt.title("F1-Score variates (i.i.d. samples)")
    # plt.show()

    return safe_divide(2 * tp, (2 * tp) + fp + fn), mode, lower, upper




[docs]
def roc_curve(
    y_true: typing.Iterable[int],
    y_score: typing.Iterable[float],
    coverage: float = 0.95,
    lambda_: float = 1.0,
) -> tuple[
    numpy.typing.NDArray[numpy.double],  # fpr
    numpy.typing.NDArray[numpy.double],  # tpr
    numpy.typing.NDArray[numpy.double],  # thresholds
    numpy.typing.NDArray[numpy.double],  # fpr lower ci
    numpy.typing.NDArray[numpy.double],  # tpr lower ci
    numpy.typing.NDArray[numpy.double],  # fpr upper ci
    numpy.typing.NDArray[numpy.double],  # tpr upper ci
]:
    r"""Compute Receiver operating characteristic (ROC).

    Approximately follows API of :py:func:`sklearn.metrics.roc_curve`.

    .. important::

       The returned credible regions are not immediately usable for plots or
       the evaluation of the area under the curve, only as point estimates for
       individual thresholds.  To plot, feed the output of this funtion to
       :py:func:`.curves.curve_ci_hull` and use the lower and upper estimates
       provided by that function instead.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_score
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions (as
        returned by “decision_function” on some classifiers).
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.

    Returns
    -------
        Seven 1-D floating point arrays corresponding to:

        * FPR (false positive rates)
        * TPR (true positive rates)
        * The thresholds used to evaluated the selected metrics
        * The lower confidence interval for the FPR
        * The lower confidence interval for the TPR
        * The upper confidence interval for the FPR
        * The upper confidence interval for the TPR
    """
    return curves.curve_ci(
        numpy.asarray(y_true, dtype=numpy.int_),
        numpy.asarray(y_score, dtype=numpy.double),
        (curves.AxisType.FPR, curves.AxisType.TPR),
        functors.make_functor(utils.beta, lambda_=lambda_, coverage=coverage),
        "roc",
    )




[docs]
def roc_auc_score(
    y_true: typing.Iterable[int],
    y_score: typing.Iterable[float],
    coverage: float = 0.95,
    lambda_: float = 1.0,
) -> tuple[float, float, float]:
    r"""Calculate the area under the ROC (FPR vs TPR) curve.

    This function mimics the scikit-learn API, except it also returns lower and
    upper bounds considering the credible regions defined in each threshold.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_score
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions (as
        returned by “decision_function” on some classifiers).
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.

    Returns
    -------
        A tuple with 3 floats:

        * the area under the ROC (FPR vs. TPR) curve
        * the lower bound considering the credible region defined by
          ``lambda_`` and ``coverage`` parameters.
        * the upper bound considering the credible region defined by
          ``lambda_`` and ``coverage`` parameters.
    """
    data = roc_curve(y_true, y_score, coverage, lambda_)
    # calculates lower and upper bounds based on CIs for each point (threshold)
    # of the curve
    lower, upper = curves.curve_ci_hull(
        curve=data,
        extrapolate_from_origin=False,  # should extrapolate from (x=1, y=0)
    )
    return (
        curves.area_under_the_curve(data[:2]),
        curves.area_under_the_curve(lower),
        curves.area_under_the_curve(upper),
    )




[docs]
def det_curve(
    y_true: typing.Iterable[int],
    y_score: typing.Iterable[float],
    coverage: float = 0.95,
    lambda_: float = 1.0,
) -> tuple[
    numpy.typing.NDArray[numpy.double],  # fpr
    numpy.typing.NDArray[numpy.double],  # fnr
    numpy.typing.NDArray[numpy.double],  # thresholds
    numpy.typing.NDArray[numpy.double],  # fpr lower ci
    numpy.typing.NDArray[numpy.double],  # fnr lower ci
    numpy.typing.NDArray[numpy.double],  # fpr upper ci
    numpy.typing.NDArray[numpy.double],  # fnr upper ci
]:
    r"""Compute the Detection Error-Tradeoff (DET) curve.

    Approximately follows API of :py:func:`sklearn.metrics.det_curve`.

    .. important::

       The returned credible regions are not immediately usable for plots or
       the evaluation of the area under the curve, only as point estimates for
       individual thresholds.  To plot, feed the output of this funtion to
       :py:func:`.curves.curve_ci_hull` and use the lower and upper estimates
       provided by that function instead.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_score
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions (as
        returned by “decision_function” on some classifiers).
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.

    Returns
    -------
        Seven 1-D floating point arrays corresponding to:

        * FPR (false positive rates)
        * FNR (false negative rates)
        * The thresholds used to evaluated the selected metrics
        * The lower confidence interval for the FPR
        * The lower confidence interval for the FNR
        * The upper confidence interval for the FPR
        * The upper confidence interval for the FNR
    """

    return curves.curve_ci(
        numpy.asarray(y_true, dtype=numpy.int_),
        numpy.asarray(y_score, dtype=numpy.double),
        (curves.AxisType.FPR, curves.AxisType.FNR),
        functors.make_functor(utils.beta, lambda_=lambda_, coverage=coverage),
        "det",
    )




[docs]
def precision_recall_curve(
    y_true: typing.Iterable[int],
    y_score: typing.Iterable[float],
    coverage: float = 0.95,
    lambda_: float = 1.0,
) -> tuple[
    numpy.typing.NDArray[numpy.double],  # precision
    numpy.typing.NDArray[numpy.double],  # redcall
    numpy.typing.NDArray[numpy.double],  # thresholds
    numpy.typing.NDArray[numpy.double],  # precision lower ci
    numpy.typing.NDArray[numpy.double],  # recall lower ci
    numpy.typing.NDArray[numpy.double],  # precision upper ci
    numpy.typing.NDArray[numpy.double],  # recall upper ci
]:
    r"""Compute Precision-Recall (PR) curve.

    Approximately follows API of
    :py:func:`sklearn.metrics.precision_recall_curve`.

    .. note::

        This package computes the precision-recall curve in a similar, but
        slightly different way than scikit-learn.  It does not add an extra
        (1.0, 0.0) at the end of the PR curve.  (c.f.: documentation for
        :py:func:`sklearn.metrics.precision_recall_curve`).

    .. important::

       The returned credible regions are not immediately usable for plots or
       the evaluation of the area under the curve, only as point estimates for
       individual thresholds.  To plot, feed the output of this funtion to
       :py:func:`.curves.curve_ci_hull` and use the lower and upper estimates
       provided by that function instead.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_score
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions (as
        returned by “decision_function” on some classifiers).
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.

    Returns
    -------
        Seven 1-D floating point arrays corresponding to:

        * Precision
        * Recall
        * The thresholds used to evaluated the selected metrics
        * The lower confidence interval for the Precision
        * The lower confidence interval for the Recall
        * The upper confidence interval for the Precision
        * The upper confidence interval for the Recall
    """

    return curves.curve_ci(
        numpy.asarray(y_true, dtype=numpy.int_),
        numpy.asarray(y_score, dtype=numpy.double),
        (curves.AxisType.PREC, curves.AxisType.REC),
        functors.make_functor(utils.beta, lambda_=lambda_, coverage=coverage),
        "pr",
    )




[docs]
def average_precision_score(
    y_true: typing.Iterable[int],
    y_score: typing.Iterable[float],
    coverage: float = 0.95,
    lambda_: float = 1.0,
) -> tuple[float, float, float]:
    r"""Compute average precision (AP) from prediction scores.

    This function mimics the scikit-learn API, except it also returns lower and
    upper bounds considering the credible regions defined in each threshold.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_score
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions (as
        returned by “decision_function” on some classifiers).
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.

    Returns
    -------
        A tuple with 3 floats:

        * the area under the ROC (FPR vs. TPR) curve
        * the lower bound considering the credible region defined by
          ``lambda_`` and ``coverage`` parameters.
        * the upper bound considering the credible region defined by
          ``lambda_`` and ``coverage`` parameters.
    """
    data = precision_recall_curve(y_true, y_score, coverage, lambda_)
    # calculates lower and upper bounds based on CIs for each point (threshold)
    # of the curve
    lower, upper = curves.curve_ci_hull(
        curve=data,
        extrapolate_from_origin=True,  # should extrapolate from (x=0, y=0)
    )
    return (
        curves.average_metric(data[:2]),
        curves.average_metric(lower),
        curves.average_metric(upper),
    )