# Source code for credible.bayesian.metrics

# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Implementation of :py:mod:`Scikit-Learn compatible measures
<sklearn.metrics>` with bayesian credible regions.
"""

import typing

import numpy
import numpy.typing
import sklearn.metrics

from .. import curves
from ..utils import safe_divide
from . import functors, utils

NUMBER_MC_SAMPLES = 100_000
"""Number of variates suggested for the Monte Carlo simulations run by this
package."""


def precision_score(
    y_true: typing.Iterable[int],
    y_pred: typing.Iterable[int],
    lambda_: float = 1.0,
    coverage: float = 0.95,
) -> tuple[float, float, float, float]:
    r"""Estimate the precision of a **binary** classifier with its credible
    interval.

    Precision, also known as positive predictive value (PPV), corresponds
    arithmetically to ``tp/(tp+fp)``.

    Only **binary** classification problems are supported: the flattened
    confusion matrix is unpacked into exactly four counts.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_pred
        Predicted labels, as returned by a classifier.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  Changes in this value do not significantly affect
        the outcome, unless ``tp`` or ``fp`` are very small (close to 1).
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.

    Returns
    -------
    tuple[float, float, float, float]
        Tuple with 4 floating-point numbers:

        * The actual precision, ``tp/(tp+fp)``, as would be returned by
          scikit-learn
        * The mode of the posterior distribution, typically close to the
          value estimated by scikit-learn
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    confusion = sklearn.metrics.confusion_matrix(y_true, y_pred)
    _, false_pos, _, true_pos = confusion.ravel()

    # the posterior mean reported by utils.beta is not part of the output;
    # the plain ratio below takes its place
    _mean, mode, lower_bound, upper_bound = utils.beta(
        true_pos, false_pos, lambda_, coverage
    )

    estimate = safe_divide(true_pos, true_pos + false_pos)
    return estimate, mode, lower_bound, upper_bound
def recall_score(
    y_true: typing.Iterable[int],
    y_pred: typing.Iterable[int],
    lambda_: float = 1.0,
    coverage: float = 0.95,
) -> tuple[float, float, float, float]:
    r"""Estimate the recall of a **binary** classifier with its credible
    interval.

    Recall, also known as sensitivity, hit rate, or true positive rate (TPR),
    corresponds arithmetically to ``tp/(tp+fn)``.

    Only **binary** classification problems are supported: the flattened
    confusion matrix is unpacked into exactly four counts.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_pred
        Predicted labels, as returned by a classifier.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  Changes in this value do not significantly affect
        the outcome, unless ``tp`` or ``fn`` are very small (close to 1).
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.

    Returns
    -------
    tuple[float, float, float, float]
        Tuple with 4 floating-point numbers:

        * The actual recall, ``tp/(tp+fn)``, as would be returned by
          scikit-learn
        * The mode of the posterior distribution: this represents the best
          estimate of the recall *a posteriori*.  It is typically close to
          the value estimated by scikit-learn.
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    confusion = sklearn.metrics.confusion_matrix(y_true, y_pred)
    _, _, false_neg, true_pos = confusion.ravel()

    # the posterior mean reported by utils.beta is not part of the output;
    # the plain ratio below takes its place
    _mean, mode, lower_bound, upper_bound = utils.beta(
        true_pos, false_neg, lambda_, coverage
    )

    estimate = safe_divide(true_pos, true_pos + false_neg)
    return estimate, mode, lower_bound, upper_bound
def specificity_score(
    y_true: typing.Iterable[int],
    y_pred: typing.Iterable[int],
    lambda_: float = 1.0,
    coverage: float = 0.95,
) -> tuple[float, float, float, float]:
    r"""Estimate the specificity of a **binary** classifier with its credible
    interval.

    Specificity, also known as selectivity or true negative rate (TNR),
    corresponds arithmetically to ``tn/(tn+fp)``.

    Only **binary** classification problems are supported: the flattened
    confusion matrix is unpacked into exactly four counts.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_pred
        Predicted labels, as returned by a classifier.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  Changes in this value do not significantly affect
        the outcome, unless ``tn`` or ``fp`` are very small (close to 1).
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.

    Returns
    -------
    tuple[float, float, float, float]
        Tuple with 4 floating-point numbers:

        * The actual specificity, ``tn/(tn+fp)``, as would be returned by
          scikit-learn
        * The mode of the posterior distribution: this represents the best
          estimate of the specificity *a posteriori*.  It is typically close
          to the value estimated by scikit-learn.
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    confusion = sklearn.metrics.confusion_matrix(y_true, y_pred)
    true_neg, false_pos, _, _ = confusion.ravel()

    # the posterior mean reported by utils.beta is not part of the output;
    # the plain ratio below takes its place
    _mean, mode, lower_bound, upper_bound = utils.beta(
        true_neg, false_pos, lambda_, coverage
    )

    estimate = safe_divide(true_neg, true_neg + false_pos)
    return estimate, mode, lower_bound, upper_bound
def accuracy_score(
    y_true: typing.Iterable[int],
    y_pred: typing.Iterable[int],
    lambda_: float = 1.0,
    coverage: float = 0.95,
) -> tuple[float, float, float, float]:
    r"""Estimate the accuracy of a **binary** classifier with its credible
    interval.

    See `Accuracy
    <https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers>`_.
    Accuracy is the proportion of correct predictions (both true positives
    and true negatives) among the total number of samples examined.  It
    corresponds arithmetically to ``(tp+tn)/(tp+tn+fp+fn)``.

    This measure includes both true negatives and true positives in the
    numerator, which makes it sensitive to data or regions without
    annotations.

    Only **binary** classification problems are supported: the flattened
    confusion matrix is unpacked into exactly four counts.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_pred
        Predicted labels, as returned by a classifier.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  Changes in this value do not significantly affect
        the outcome, unless the number of correct (``tp+tn``) or incorrect
        (``fp+fn``) predictions is very small (close to 1).
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.

    Returns
    -------
    tuple[float, float, float, float]
        Tuple with 4 floating-point numbers:

        * The actual accuracy, ``(tp+tn)/(tp+tn+fp+fn)``, as would be
          returned by scikit-learn
        * The mode of the posterior distribution: this represents the best
          estimate of the accuracy *a posteriori*.  It is typically close to
          the value estimated by scikit-learn.
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    confusion = sklearn.metrics.confusion_matrix(y_true, y_pred)
    true_neg, false_pos, false_neg, true_pos = confusion.ravel()

    # the posterior mean reported by utils.beta is not part of the output;
    # the plain ratio below takes its place
    _mean, mode, lower_bound, upper_bound = utils.beta(
        true_neg + true_pos, false_pos + false_neg, lambda_, coverage
    )

    estimate = safe_divide(
        true_neg + true_pos, true_neg + true_pos + false_pos + false_neg
    )
    return estimate, mode, lower_bound, upper_bound
def jaccard_score(
    y_true: typing.Iterable[int],
    y_pred: typing.Iterable[int],
    lambda_: float = 1.0,
    coverage: float = 0.95,
) -> tuple[float, float, float, float]:
    r"""Estimate the Jaccard index of a **binary** classifier with its
    credible interval.

    See `Jaccard Index or Similarity
    <https://en.wikipedia.org/wiki/Jaccard_index>`_.  It corresponds
    arithmetically to ``tp/(tp+fp+fn)``.

    The Jaccard index depends on a TP-only numerator, similarly to the F1
    score.  For regions where there are no annotations, the Jaccard index
    will always be zero, irrespective of the model output.  Accuracy may be a
    better proxy if one needs to consider the true absence of annotations in
    a region as part of the measure.

    Only **binary** classification problems are supported: the flattened
    confusion matrix is unpacked into exactly four counts.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_pred
        Predicted labels, as returned by a classifier.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.  Changes in this value do not significantly affect
        the outcome, unless ``tp`` or ``fp+fn`` are very small (close to 1).
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.

    Returns
    -------
    tuple[float, float, float, float]
        Tuple with 4 floating-point numbers:

        * The actual Jaccard score, ``tp/(tp+fp+fn)``, as would be returned
          by scikit-learn
        * The mode of the posterior distribution: this represents the best
          estimate of the Jaccard score *a posteriori*.  It is typically
          close to the value estimated by scikit-learn.
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    confusion = sklearn.metrics.confusion_matrix(y_true, y_pred)
    _, false_pos, false_neg, true_pos = confusion.ravel()

    # the posterior mean reported by utils.beta is not part of the output;
    # the plain ratio below takes its place
    _mean, mode, lower_bound, upper_bound = utils.beta(
        true_pos, false_pos + false_neg, lambda_, coverage
    )

    estimate = safe_divide(true_pos, true_pos + false_pos + false_neg)
    return estimate, mode, lower_bound, upper_bound
def f1_score(
    y_true: typing.Iterable[int],
    y_pred: typing.Iterable[int],
    rng: numpy.random.Generator,
    lambda_: float = 1.0,
    coverage: float = 0.95,
    nb_samples: int = NUMBER_MC_SAMPLES,
) -> tuple[float, float, float, float]:
    r"""Estimate the F1 score of a **binary** classifier with its credible
    interval.

    See `F1-score <https://en.wikipedia.org/wiki/F1_score>`_.  It corresponds
    arithmetically to ``2*P*R/(P+R)`` or ``2*tp/(2*tp+fp+fn)``.

    The F1 or Dice score depends on a TP-only numerator, similarly to the
    Jaccard index.  For regions where there are no annotations, the F1-score
    will always be zero, irrespective of the model output.  Accuracy may be a
    better proxy if one needs to consider the true absence of annotations in
    a region as part of the measure.

    The posterior is explored through a Monte Carlo simulation.  This
    implementation is based on [GOUTTE-2005]_.

    Only **binary** classification problems are supported: the flattened
    confusion matrix is unpacked into exactly four counts.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_pred
        Predicted labels, as returned by a classifier.
    rng
        An initialized numpy random number generator.
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.
    nb_samples
        Number of generated variates for the M-C simulation.

    Returns
    -------
    tuple[float, float, float, float]
        Tuple with 4 floating-point numbers:

        * The actual F1 score, ``2*tp/(2*tp+fp+fn)``, as would be returned by
          scikit-learn
        * The mode of the posterior distribution: this represents the best
          estimate of the F1 score *a posteriori*.  It is typically close to
          the value estimated by scikit-learn.
        * The lower value of the credible region/confidence interval
        * The upper value of the credible region/confidence interval
    """
    confusion = sklearn.metrics.confusion_matrix(y_true, y_pred)
    _, false_pos, false_neg, true_pos = confusion.ravel()

    # draws the Monte Carlo variates that stand for the F1 posterior
    samples = utils.f1_posterior(
        true_pos, false_pos, false_neg, lambda_, nb_samples, rng
    )

    # summarises the variates into mean, mode and interval bounds; the mean
    # is not part of the output, the plain ratio below takes its place
    _mean, mode, lower_bound, upper_bound = utils.evaluate_statistics(
        samples, coverage, "auto"
    )

    estimate = safe_divide(
        2 * true_pos, (2 * true_pos) + false_pos + false_neg
    )
    return estimate, mode, lower_bound, upper_bound
def roc_curve(
    y_true: typing.Iterable[int],
    y_score: typing.Iterable[float],
    lambda_: float = 1.0,
    coverage: float = 0.95,
) -> tuple[
    numpy.typing.NDArray[numpy.double],  # fpr
    numpy.typing.NDArray[numpy.double],  # tpr
    numpy.typing.NDArray[numpy.double],  # thresholds
    numpy.typing.NDArray[numpy.double],  # fpr lower ci
    numpy.typing.NDArray[numpy.double],  # tpr lower ci
    numpy.typing.NDArray[numpy.double],  # fpr upper ci
    numpy.typing.NDArray[numpy.double],  # tpr upper ci
]:
    r"""Compute the Receiver Operating Characteristic (ROC) curve with
    credible regions.

    Approximately follows the API of :py:func:`sklearn.metrics.roc_curve`.

    .. important::

       The returned credible regions are not immediately usable for plots or
       the evaluation of the area under the curve, only as point estimates
       for individual thresholds.  To plot, feed the output of this function
       to :py:func:`.curves.curve_ci_hull` and use the lower and upper
       estimates provided by that function instead.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_score
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions (as
        returned by ``decision_function`` on some classifiers).
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.

    Returns
    -------
        Seven 1-D floating point arrays corresponding to:

        * FPR (false positive rates)
        * TPR (true positive rates)
        * The thresholds used to evaluate the selected metrics
        * The lower confidence interval for the FPR
        * The lower confidence interval for the TPR
        * The upper confidence interval for the FPR
        * The upper confidence interval for the TPR
    """
    labels = numpy.asarray(y_true, dtype=numpy.int_)
    scores = numpy.asarray(y_score, dtype=numpy.double)
    axes = (curves.AxisType.FPR, curves.AxisType.TPR)

    # binds the prior and coverage settings to the Beta estimator
    estimator = functors.make_functor(
        utils.beta, lambda_=lambda_, coverage=coverage
    )

    return curves.curve_ci(labels, scores, axes, estimator, "roc")
def roc_auc_score(
    y_true: typing.Iterable[int],
    y_score: typing.Iterable[float],
    lambda_: float = 1.0,
    coverage: float = 0.95,
) -> tuple[float, float, float]:
    r"""Calculate the area under the ROC (FPR vs TPR) curve with lower and
    upper bounds.

    This function mimics the scikit-learn API, except it also returns lower
    and upper bounds considering the credible regions defined in each
    threshold.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_score
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions (as
        returned by ``decision_function`` on some classifiers).
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.

    Returns
    -------
        A tuple with 3 floats:

        * the area under the ROC (FPR vs. TPR) curve
        * the lower bound considering the credible region defined by
          ``lambda_`` and ``coverage`` parameters.
        * the upper bound considering the credible region defined by
          ``lambda_`` and ``coverage`` parameters.
    """
    roc = roc_curve(y_true, y_score, lambda_, coverage)

    # derives the lower and upper bounding curves from the credible regions
    # attached to each point (threshold) of the ROC
    lower_curve, upper_curve = curves.curve_ci_hull(
        curve=roc,
        extrapolate_from_origin=False,  # should extrapolate from (x=1, y=0)
    )

    # the first two entries of the curve data are the nominal FPR and TPR
    nominal_area = curves.area_under_the_curve(roc[:2])
    lower_area = curves.area_under_the_curve(lower_curve)
    upper_area = curves.area_under_the_curve(upper_curve)
    return (nominal_area, lower_area, upper_area)
def det_curve(
    y_true: typing.Iterable[int],
    y_score: typing.Iterable[float],
    lambda_: float = 1.0,
    coverage: float = 0.95,
) -> tuple[
    numpy.typing.NDArray[numpy.double],  # fpr
    numpy.typing.NDArray[numpy.double],  # fnr
    numpy.typing.NDArray[numpy.double],  # thresholds
    numpy.typing.NDArray[numpy.double],  # fpr lower ci
    numpy.typing.NDArray[numpy.double],  # fnr lower ci
    numpy.typing.NDArray[numpy.double],  # fpr upper ci
    numpy.typing.NDArray[numpy.double],  # fnr upper ci
]:
    r"""Compute the Detection Error-Tradeoff (DET) curve with credible
    regions.

    Approximately follows the API of :py:func:`sklearn.metrics.det_curve`.

    .. important::

       The returned credible regions are not immediately usable for plots or
       the evaluation of the area under the curve, only as point estimates
       for individual thresholds.  To plot, feed the output of this function
       to :py:func:`.curves.curve_ci_hull` and use the lower and upper
       estimates provided by that function instead.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_score
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions (as
        returned by ``decision_function`` on some classifiers).
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.

    Returns
    -------
        Seven 1-D floating point arrays corresponding to:

        * FPR (false positive rates)
        * FNR (false negative rates)
        * The thresholds used to evaluate the selected metrics
        * The lower confidence interval for the FPR
        * The lower confidence interval for the FNR
        * The upper confidence interval for the FPR
        * The upper confidence interval for the FNR
    """
    labels = numpy.asarray(y_true, dtype=numpy.int_)
    scores = numpy.asarray(y_score, dtype=numpy.double)
    axes = (curves.AxisType.FPR, curves.AxisType.FNR)

    # binds the prior and coverage settings to the Beta estimator
    estimator = functors.make_functor(
        utils.beta, lambda_=lambda_, coverage=coverage
    )

    return curves.curve_ci(labels, scores, axes, estimator, "det")
def precision_recall_curve(
    y_true: typing.Iterable[int],
    y_score: typing.Iterable[float],
    lambda_: float = 1.0,
    coverage: float = 0.95,
) -> tuple[
    numpy.typing.NDArray[numpy.double],  # precision
    numpy.typing.NDArray[numpy.double],  # recall
    numpy.typing.NDArray[numpy.double],  # thresholds
    numpy.typing.NDArray[numpy.double],  # precision lower ci
    numpy.typing.NDArray[numpy.double],  # recall lower ci
    numpy.typing.NDArray[numpy.double],  # precision upper ci
    numpy.typing.NDArray[numpy.double],  # recall upper ci
]:
    r"""Compute the Precision-Recall (PR) curve with credible regions.

    Approximately follows the API of
    :py:func:`sklearn.metrics.precision_recall_curve`.

    .. note::

       This package computes the precision-recall curve in a similar, but
       slightly different way than scikit-learn.  It does not add an extra
       (1.0, 0.0) at the end of the PR curve.  (c.f.: documentation for
       :py:func:`sklearn.metrics.precision_recall_curve`).

    .. important::

       The returned credible regions are not immediately usable for plots or
       the evaluation of the area under the curve, only as point estimates
       for individual thresholds.  To plot, feed the output of this function
       to :py:func:`.curves.curve_ci_hull` and use the lower and upper
       estimates provided by that function instead.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_score
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions (as
        returned by ``decision_function`` on some classifiers).
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.

    Returns
    -------
        Seven 1-D floating point arrays corresponding to:

        * Precision
        * Recall
        * The thresholds used to evaluate the selected metrics
        * The lower confidence interval for the Precision
        * The lower confidence interval for the Recall
        * The upper confidence interval for the Precision
        * The upper confidence interval for the Recall
    """
    labels = numpy.asarray(y_true, dtype=numpy.int_)
    scores = numpy.asarray(y_score, dtype=numpy.double)
    axes = (curves.AxisType.PREC, curves.AxisType.REC)

    # binds the prior and coverage settings to the Beta estimator
    estimator = functors.make_functor(
        utils.beta, lambda_=lambda_, coverage=coverage
    )

    return curves.curve_ci(labels, scores, axes, estimator, "pr")
def average_precision_score(
    y_true: typing.Iterable[int],
    y_score: typing.Iterable[float],
    lambda_: float = 1.0,
    coverage: float = 0.95,
) -> tuple[float, float, float]:
    r"""Compute the average precision (AP) from prediction scores, with lower
    and upper bounds.

    This function mimics the scikit-learn API, except it also returns lower
    and upper bounds considering the credible regions defined in each
    threshold.

    Parameters
    ----------
    y_true
        Ground truth (correct) labels.
    y_score
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions (as
        returned by ``decision_function`` on some classifiers).
    lambda_
        The parameterisation of the Beta prior to consider. Use
        :math:`\lambda=1` for a flat prior.  Use :math:`\lambda=0.5` for
        Jeffrey's prior.
    coverage
        A floating-point number between 0 and 1.0 indicating the coverage
        you are expecting.  A value of 0.95 will ensure 95% of the area under
        the probability density of the posterior is covered by the returned
        equal-tailed interval.

    Returns
    -------
        A tuple with 3 floats:

        * the average precision evaluated on the precision-recall curve
        * the lower bound considering the credible region defined by
          ``lambda_`` and ``coverage`` parameters.
        * the upper bound considering the credible region defined by
          ``lambda_`` and ``coverage`` parameters.
    """
    pr_data = precision_recall_curve(y_true, y_score, lambda_, coverage)

    # derives the lower and upper bounding curves from the credible regions
    # attached to each point (threshold) of the PR curve
    lower_curve, upper_curve = curves.curve_ci_hull(
        curve=pr_data,
        extrapolate_from_origin=True,  # should extrapolate from (x=0, y=0)
    )

    # the first two entries of the curve data are the nominal precision and
    # recall
    nominal_ap = curves.average_metric(pr_data[:2])
    lower_ap = curves.average_metric(lower_curve)
    upper_ap = curves.average_metric(upper_curve)
    return (nominal_ap, lower_ap, upper_ap)