# SPDX-FileCopyrightText: Copyright © 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Curve plotting support.
Code in this module expresses classic performance curves for system performance
evaluation as sets of coordinates. Use the module :py:mod:`.plot` to make
graphical representations.
.. include:: ../links.rst
"""
import enum
import typing
import numpy
import numpy.linalg
import numpy.typing
import sklearn.metrics
from .utils import CIArrayFunctor, CIFunctor
[docs]
class AxisType(enum.Enum):
r"""Supported types for metrics that have binomial distributions.
This enumeration contains a list of metrics (rates) that have binomial
distributions. For each available entry, we define the successes (``k``)
and failures (``l``) such that the metric can be calculated as such:
.. math::
metric = \frac{k}{k+l}
Keys with the same (integer) value represent synonyms.
Definitions are taken from
https://en.wikipedia.org/wiki/Confusion_matrix.
Parameters
----------
key
One of the integer keys for supported measures:
* True positive rate, recall, sensitivity: 1
* True negative rate, specificity, selectivity: 2
* False negative rate: 3
* False positive rate: 4
* Precision, positive preditive value: 5
* Negative predictive value: 6
fullname
Full name of the measure.
abbreviation
The abbreviation for the measure (three-letter, lower-case).
"""
TPR = (1, "true positive rate", "tpr")
REC = (1, "recall", "rec")
SEN = (1, "sensitivity", "sen")
TNR = (2, "true negative rate", "tnr")
SPEC = (2, "specificity", "spec")
SEL = (2, "selectivity", "sel")
FNR = (3, "false negative rate", "fnr")
FPR = (4, "false positive rate", "fpr")
PREC = (5, "precision", "prec")
PPV = (5, "positive predictive value", "ppv")
NPV = (6, "negative predictive value", "npv")
def __init__(self, key: int, fullname: str, abbreviation: str):
self.key = key
self.fullname = fullname
self.abbreviation = abbreviation
[docs]
def make_cm_functor(
self, ci_functor: CIFunctor
) -> typing.Callable[
[numpy.typing.NDArray[numpy.int_]], tuple[float, float, float]
]:
"""Return callable to treat binary confusion matrices and produce curve
and confidence intervals.
This method will take a confidence interval functor and will wrap it
over a complete confusion-matrix functor that produces the metric
estimates, lower and upper confidence intervals, in this order.
Parameters
----------
ci_functor
Functor to evaluate the rate, lower and upper confidence intervals
from (binomial) successes and failures.
Returns
-------
A functor that can operate directly from the confusion matrix
outputs (instead of success and failures), with the same return
values as the input functor.
"""
successes = failures = -1
match self.key:
# Notice scikit-learn order:
# tn, fp, fn, tp = confusion_matrix(...).ravel()
# From https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
case 1: # tpr
successes = 3 # tp
failures = 2 # fn
case 2: # tnr
successes = 0 # tn
failures = 1 # fp
case 3: # fnr
successes = 2 # fn
failures = 3 # tp
case 4: # fpr
successes = 1 # fp
failures = 0 # tn
case 5: # precision
successes = 3 # tp
failures = 1 # fp
case 6: # npv
successes = 0 # tn
failures = 2 # fn
return lambda x: ci_functor(x[successes], x[failures])
[docs]
def curve_ci(
y_true: typing.Iterable[int],
y_score: typing.Iterable[float],
axes: tuple[AxisType, AxisType],
ci_functor: CIFunctor,
skl_functor_name: typing.Literal["roc", "det", "pr"],
) -> tuple[
numpy.typing.NDArray[numpy.double], # axes[0] rate
numpy.typing.NDArray[numpy.double], # axes[1] rate
numpy.typing.NDArray[numpy.double], # thresholds
numpy.typing.NDArray[numpy.double], # axes[0] lower ci
numpy.typing.NDArray[numpy.double], # axes[1] lower ci
numpy.typing.NDArray[numpy.double], # axes[0] upper ci
numpy.typing.NDArray[numpy.double], # axes[1] upper ci
]:
"""Calculate points and confidence intervals of an arbitrary performance
curve.
This function can calculate the rates and confidence intervals of an
user-configurable ROC-style curve.
Parameters
----------
y_true
Ground truth (correct) labels.
y_score
Target scores, can either be probability estimates of the positive
class, confidence values, or non-thresholded measure of decisions (as
returned by “decision_function” on some classifiers).
axes
Which axes to calculate the curve for. Note not all combinations make
sense (no checks are performed). You should avoid computing a curve,
for example, of TPR against FNR as these rates are complementary to
1.0.
ci_functor
A callable to be used to calculate the estimate, lower, and upper
bounds for the measures of interest on each of the axes. This callable
will receive (binomial, integer) successes and failures, and produce
the rate, lower and upper bounds respecctively.
skl_functor_name
The name of a callable from scikit-learn to calculate basic thresholds
for the target curve. Use one of:
* ``roc``: :py:func:`sklearn.metrics.roc_curve`
* ``det``: :py:func:`sklearn.metrics.det_curve`
* ``pr``: :py:func:`sklearn.metrics.precision_recall_curve`
Returns
-------
tuple
Seven 1-D floating point arrays corresponding to:
* The selected metric for the first axis
* The selected metric for the second axis
* The thresholds used to evaluated the selected metrics, in decreasing
order.
* The lower confidence interval for the selected metric for the first
axis
* The lower confidence interval selected metric for the second axis
* The upper confidence interval for the selected metric for the first
axis
* The upper confidence interval selected metric for the second axis
"""
# finally, we do the computing of the curve
y_true_array = numpy.asarray(y_true, dtype=numpy.int_)
y_score_array = numpy.asarray(y_score, dtype=numpy.double)
skl_functor = sklearn.metrics.roc_curve
match skl_functor_name:
case "roc":
skl_functor = sklearn.metrics.roc_curve
case "det":
skl_functor = sklearn.metrics.det_curve
case "pr":
skl_functor = sklearn.metrics.precision_recall_curve
_, _, thresholds = skl_functor(y_true_array, y_score_array)
cm_per_threshold = [
sklearn.metrics.confusion_matrix(y_true_array, (y_score_array >= t))
.ravel()
.astype(numpy.int_)
for t in thresholds
]
# convert confusion-matrices per threshold into rate, lower, and upper
# values per threshold
op0 = axes[0].make_cm_functor(ci_functor)
ax0 = numpy.asarray([op0(k) for k in cm_per_threshold], dtype=numpy.double)
op1 = axes[1].make_cm_functor(ci_functor)
ax1 = numpy.asarray([op1(k) for k in cm_per_threshold], dtype=numpy.double)
# ensure we have no lower that is higher than the values and vice-versa
ax0[:, 1] = numpy.min((ax0[:, 0], ax0[:, 1]), axis=0)
ax0[:, 2] = numpy.max((ax0[:, 0], ax0[:, 2]), axis=0)
ax1[:, 1] = numpy.min((ax1[:, 0], ax1[:, 1]), axis=0)
ax1[:, 2] = numpy.max((ax1[:, 0], ax1[:, 2]), axis=0)
return (
ax0[:, 0],
ax1[:, 0],
thresholds,
ax0[:, 1],
ax1[:, 1],
ax0[:, 2],
ax1[:, 2],
)
[docs]
def curve_ci_hull(
curve: tuple[
typing.Iterable[float], # axes[0] rate
typing.Iterable[float], # axes[1] rate
typing.Iterable[float], # thresholds (ignored)
typing.Iterable[float], # axes[0] lower ci
typing.Iterable[float], # axes[1] lower ci
typing.Iterable[float], # axes[0] upper ci
typing.Iterable[float], # axes[1] upper ci
],
extrapolate_from_origin: bool = True,
) -> tuple[
tuple[
numpy.typing.NDArray[numpy.double], numpy.typing.NDArray[numpy.double]
],
tuple[
numpy.typing.NDArray[numpy.double], numpy.typing.NDArray[numpy.double]
],
]:
"""Calculate lower and upper confidence intervals of a curve.
This function calculates the hulls for 2 curves that are formed from points
defining the lower and upper bounds of the curve's credible/confidence
intervals for each measured threshold.
It returns the curve (no changes), as well as the lower and upper bounds of
the (central) curve.
To calculate the upper and lower curves, we do not consider the extremities
of the upper and lower bounds, as those points would translate to
pessimistic estimations of the true confidence interval bounds. Instead,
we simply find the intersection of a straight line from the origin (0,0)
and the ellipse 90-degree sector inscribed in the appropriate quarter of a
rectangle centered at the ROC point, and its lower and upper bound CI
estimates on both directions (horizontal and vertical). If
``extrapolate_from_origin`` is set to ``False``, then intersections are
created from ``(x, y) = (1, 0)``.
Parameters
----------
curve
Seven 1-D floating point arrays corresponding to:
* The selected metric for the first axis
* The selected metric for the second axis
* The thresholds (in decreasing order, as produced by scikit-learn)
* The lower confidence interval for the selected metric for the first
axis
* The lower confidence interval selected metric for the second axis
* The upper confidence interval for the selected metric for the first
axis
* The upper confidence interval selected metric for the second axis
extrapolate_from_origin
To calculate the upper hull, we consider two distinct cases: if
``extrapolate_from_origin`` is ``True`` (default), then we consider the
curve starts (or finishes) at coordinate ``(x,y) = (1,0)`` and finishes
(or starts) at ``(x,y) = (0,1)``. This is the case if the user is
plotting TPR against TNR or FPR against FNR. If
``extrapolate_from_origin`` is ``False``, then we consider the curve
starts (or finishes) at ``(x,y) = (0,0)``, and finishes (or starts) at
``(x,y) = (1,1)``.
If ``extrapolate_from_origin`` is ``True`` (default), each point of the
curve to extrapolates to the right and upper points defined by the
upper bounds of the credible/confidence intervals, and to the left and
lower points defined by the lower bounds of the intervals.
If ``extrapolate_from_origin`` is ``False``, each point of the curve to
extrapolates to the left and upper points defined by the upper bounds
of the credible/confidence intervals, and to the right and lower points
defined by the lower bounds of the intervals.
Returns
-------
tuple
Two curves as follows:
* lower: Two 1D arrays of floats that expresses the lower-bound of
curve, for the first and second coordinates respectively.
* upper: Two 1D arrays of floats that expresses the upper-bound of
curve, for the first and second coordinates respectively.
"""
def _ellipse_intersect(
a: numpy.typing.NDArray[numpy.double],
b: numpy.typing.NDArray[numpy.double],
i: numpy.typing.NDArray[numpy.double],
j: numpy.typing.NDArray[numpy.double],
quadrant: typing.Literal["tl", "tr", "bl", "br"],
) -> tuple[
numpy.typing.NDArray[numpy.double], numpy.typing.NDArray[numpy.double]
]:
r"""Calculate the intersection of a line, passing at the center of the
ellipse, and the ellipse itself.
See: https://mathworld.wolfram.com/Ellipse-LineIntersection.html
.. math::
x &= \frac{1}{\sqrt{\frac{1}{a^2}+\frac{j^2}{b^2 i^2}}} \\
y &= x \frac{j}{i}
Parameters
----------
a
Width of the ellipse.
b
Height of the ellipse.
i
Width of the reference vector (to compute angle).
j
Height of the reference vector (to compute angle).
quadrant
The quadrant applicable to the vector direction: either "tl"
(top-left), "tr" (top-right), "bl" (bottom-left) or "br"
(bottom-right).
Returns
-------
A tuple containing the x and y coordinates of the curve.
"""
# radius calculation, without direction
eps = 1e-8
num = (a**2) * (b**2) * (i**2)
den = ((b**2) * (i**2)) + ((a**2) * (j**2))
x = numpy.sqrt(
numpy.divide(num, den, out=numpy.zeros_like(a), where=(den > eps))
)
y = numpy.divide(x * j, i, out=numpy.zeros_like(a), where=(i > eps))
# when close to the x-axis, just re-use the ellipse width
x = numpy.where(j < eps, a, x)
# when close to the (1,0)-(1,1)-axis, just re-use the ellipse height
y = numpy.where(i < eps, b, y)
match quadrant:
case "tr": # add on both directions
return (i + x, j + y)
case "tl": # add on y direction, subtract on x
# n.b.: we use (1-i) to recover the original spatial
# coordinates (see below on the call to this function)
return ((1 - i) - x, j + y)
case "bl": # subtract on both directions
return (i - x, j - y)
case "br": # subtract on y direction, add on x
# n.b.: we use (1-i) to recover the original spatial
# coordinates (see below on the call to this function)
return ((1 - i) + x, j - y)
(x, y, _, x_low, y_low, x_high, y_high) = tuple(
numpy.asarray(k, dtype=numpy.double) for k in curve
)
if extrapolate_from_origin: # sectors are lower left or upper right
# N.B.: distance to origin is approximately symmetric considering the
# whole curve
# (x, y) -> (x_low, y_low)
lower_x, lower_y = _ellipse_intersect(
numpy.abs(x_low - x), # width of ellipse
numpy.abs(y_low - y), # height of ellipse
x, # width of reference vector (only angle matters)
y, # height of reference vector (only angle matters)
"bl",
)
# (x, y) -> (x_high, y_high)
upper_x, upper_y = _ellipse_intersect(
numpy.abs(x_high - x), # width of ellipse
numpy.abs(y_high - y), # height of ellipse
x, # width of reference vector (only angle matters)
y, # height of reference vector (only angle matters)
"tr",
)
else: # sectors are lower right or upper left
# N.B.: angles must be taken with respect to (1,0) and not (0,0) as the
# curve is facing the other direction.
# (x, y) -> (x_high, y_low)
lower_x, lower_y = _ellipse_intersect(
numpy.abs(x_high - x), # width of ellipse
numpy.abs(y_low - y), # height of ellipse
1 - x, # width of reference vector (only angle matters)
y, # height of reference vector (only angle matters)
"br",
)
# (x, y) -> (x_low, y_high)
upper_x, upper_y = _ellipse_intersect(
numpy.abs(x_low - x), # width of ellipse
numpy.abs(y_high - y), # height of ellipse
1 - x, # width of reference vector (only angle matters)
y, # height of reference vector (only angle matters)
"tl",
)
return (lower_x, lower_y), (upper_x, upper_y)
[docs]
def area_under_the_curve(
curve: tuple[
typing.Sequence[float] | numpy.typing.NDArray[numpy.double],
typing.Sequence[float] | numpy.typing.NDArray[numpy.double],
],
) -> float:
"""Calculate the area under a curve using a trapezoidal rule.
Parameters
----------
curve
A tuple with 2 1D sequences of floating point numbers representing the
first and second coordinate of the curve whose you want to evaluate AUC
for.
Returns
-------
The area under the curve (floating point scalar).
"""
return numpy.abs(numpy.trapezoid(curve[1], curve[0])) # type: ignore[reportReturnType]
[docs]
def average_metric(
curve: tuple[
typing.Sequence[float] | numpy.typing.NDArray[numpy.double],
typing.Sequence[float] | numpy.typing.NDArray[numpy.double],
],
) -> float:
r"""Calculate the area under a curve using a rectangle rule.
Typically used to calculate the average precision (AP) as in:
https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Average_precision
.. math::
\text{AP} &= \sum_n (R_n - R_{n-1}) P_n \\
\text{AP} &= \sum_n (curve1[n] - curve1[n-1]) curve0[n]
According to the scikit-learn documentation for
:py:func:`sklearn.metrics.average_precision_score`, this implementation
*"is different from computing the area under the precision-recall curve
with the trapezoidal rule, which uses linear interpolation and can be too
optimistic"*.
.. note::
Due to differences in the way this package computes the
precision-recall curve (does not add an extra (1.0, 0.0) at the end of
the PR curve like sklearn does, see documentation for
:py:func:`sklearn.metrics.precision_recall_curve`), we compensate this
here.
Parameters
----------
curve
A tuple with 2 1D sequences of floating point numbers representing the
first and second coordinate of the curve whose you want to evaluate AUC
for.
Returns
-------
The area under the curve (floating point scalar).
"""
return -sum(numpy.diff(numpy.append(curve[1], 0.0)) * curve[0])
[docs]
def estimated_ci_coverage(
ci_functor: CIArrayFunctor,
rng: numpy.random.Generator,
n: int = 100,
) -> numpy.typing.NDArray[numpy.double]:
"""Return the approximate coverage of a credible region or confidence
interval estimator.
Reference: `This blog post <ci-evaluation_>`_.
Parameters
----------
ci_functor : object
A callable that accepts ``k``, the number of successes (1D integer
numpy.ndarray), ``l`` (1D integer numpy.ndarray), the number of
failures to account for in the estimation of the interval/region. This
function must return two float parameters only corresponding to the
lower and upper bounds of the credible region or confidence interval
being estimated.
rng
An initialized numpy random number generator.
n
The number of bernoulli trials to consider on the binomial
distribution. This represents the total number of samples you'd have
for your experiment.
Returns
-------
The actual coverage curve, you can expect. The first row corresponds
to the values of ``p`` that were probed. The second row, the actual
coverage considering a simulated binomial distribution with size ``n``.
"""
coverage = []
size = 10000 # how many experiments to do at each try
r = numpy.arange(1 / n, 1.0, step=1 / n)
for p in r:
k = rng.binomial(n=n, p=p, size=size)
_, lower, upper = ci_functor(k, n - k)
covered = numpy.asarray((lower < p) & (p < upper), dtype=float)
coverage.append(covered.mean())
return numpy.vstack((r, numpy.asarray(coverage, dtype=float)))