from typing import List, NamedTuple, Tuple
import numpy as np
from numpy import ndarray
from sklearn import metrics
[docs]class BootstrappedROCCICurves(NamedTuple):
fpr_vals: ndarray
mean_tpr_vals: ndarray
low_tpr_vals: ndarray
high_tpr_vals: ndarray
low_az_val: ndarray
high_az_val: ndarray
[docs]def get_bootstrapped_roc_ci_curves(
y_pred: ndarray,
y_true: ndarray,
num_bootstraps: int = 100,
ci_to_use: float = 0.95,
) -> BootstrappedROCCICurves:
"""
Produces Confidence-Interval Curves to go alongside a regular ROC curve
This is done by using boostrapping.
Bootstrapping is done by selecting len(y_pred) samples randomly
(with replacement) from y_pred and y_true.
This is done num_boostraps times.
Parameters
----------
y_pred
The predictions (scores) produced by the system being evaluated
y_true
The true labels (1 or 0) which are the reference standard being used
num_bootstraps
How many times to make a random sample with replacement
ci_to_use
Which confidence interval is required.
Returns
-------
fpr_vals
An equally spaced set of fpr vals between 0 and 1
mean_tpr_vals
The mean tpr vals (one per fpr_val) obtained by boostrapping
low_tpr_vals
The tpr vals (one per fpr_val) representing lower curve for CI
high_tpr_vals
The tpr vals (one per fpr_val) representing the upper curve for CI
low_Az_val
The lower Az (AUC) val for the given CI_to_use
high_Az_val
The higher Az (AUC) val for the given CI_to_use
"""
rng_seed = 40 # control reproducibility
bootstrapped_az_scores: List[float] = []
tprs_list: List[ndarray] = []
base_fpr = np.linspace(0, 1, 101)
rng = np.random.RandomState(rng_seed)
while len(bootstrapped_az_scores) < num_bootstraps:
# bootstrap by sampling with replacement on the prediction indices
indices = rng.randint(0, len(y_pred) - 1, len(y_pred))
if len(np.unique(y_true[indices])) < 2:
# We need at least one positive and one negative sample for ROC AUC
# to be defined: reject the sample
continue
# get the fpr and tpr for this bootstrap
fpr, tpr, thresholds = metrics.roc_curve(
y_true[indices], y_pred[indices]
)
# get values at fixed fpr locations
tpr_b = np.interp(base_fpr, fpr, tpr)
tpr_b[0] = 0.0
# append to list for all bootstraps
tprs_list.append(tpr_b)
# Get the Az score
az_score = metrics.auc(fpr, tpr)
bootstrapped_az_scores.append(az_score)
# Get the mean of the boostrapped tprs (at each fpr location)
tprs_array = np.array(tprs_list)
mean_tprs = tprs_array.mean(axis=0)
# half the error margin allowed
one_sided_ci = (1 - ci_to_use) / 2
tprs_lower, tprs_upper = _get_confidence_intervals(
n_bootstraps=len(base_fpr),
one_sided_ci=one_sided_ci,
points_array=tprs_array,
)
sorted_az_scores = np.array(bootstrapped_az_scores)
sorted_az_scores.sort()
az_ci_lower = sorted_az_scores[int(one_sided_ci * len(sorted_az_scores))]
az_ci_upper = sorted_az_scores[
int((1 - one_sided_ci) * len(sorted_az_scores))
]
return BootstrappedROCCICurves(
fpr_vals=base_fpr,
mean_tpr_vals=mean_tprs,
low_tpr_vals=tprs_lower,
high_tpr_vals=tprs_upper,
low_az_val=az_ci_lower,
high_az_val=az_ci_upper,
)
[docs]def average_roc_curves(
roc_curves: List[BootstrappedROCCICurves], bins: int = 200
) -> BootstrappedROCCICurves:
"""
Averages ROC curves using vertical averaging (fixed FP rates),
which gives a 1D measure of variability.
Parameters
----------
curves
List of BootstrappedROCCICurves to be averaged
bins (optional)
Number of false-positives to iterate over. (Default: 200)
Returns
-------
BootstrappedROCCICurves
ROC class containing the average over all ROCs.
"""
tprs = []
low_tprs = []
high_tprs = []
low_azs = []
high_azs = []
mean_fpr = np.linspace(0, 1, bins)
for roc in roc_curves:
# get values at fixed fpr locations
interp_tpr = np.interp(mean_fpr, roc.fpr_vals, roc.mean_tpr_vals)
interp_tpr[0] = 0.0
interp_low_tpr = np.interp(mean_fpr, roc.fpr_vals, roc.low_tpr_vals)
interp_high_tpr = np.interp(mean_fpr, roc.fpr_vals, roc.high_tpr_vals)
tprs.append(interp_tpr)
low_tprs.append(interp_low_tpr)
high_tprs.append(interp_high_tpr)
low_azs.append(roc.low_az_val)
high_azs.append(roc.high_az_val)
# get the mean tpr of all ROCs
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_low_tpr = np.mean(low_tprs, axis=0)
mean_high_tpr = np.mean(high_tprs, axis=0)
mean_low_az = np.mean(low_azs, axis=0)
mean_high_az = np.mean(high_azs, axis=0)
return BootstrappedROCCICurves(
fpr_vals=mean_fpr,
mean_tpr_vals=mean_tpr,
low_tpr_vals=mean_low_tpr,
high_tpr_vals=mean_high_tpr,
low_az_val=mean_low_az,
high_az_val=mean_high_az,
)
[docs]class BootstrappedCIPointError(NamedTuple):
mean_fprs: ndarray
mean_tprs: ndarray
low_tpr_vals: ndarray
high_tpr_vals: ndarray
low_fpr_vals: ndarray
high_fpr_vals: ndarray
[docs]def get_bootstrapped_ci_point_error(
y_score: ndarray,
y_true: ndarray,
num_bootstraps: int = 100,
ci_to_use: float = 0.95,
exclude_first_last: bool = True,
) -> BootstrappedCIPointError:
"""
Produces Confidence-Interval errors for individual points from ROC
Useful when only few ROC points exist so they will be plotted individually
e.g. when range of score values in y_score is very small
(e.g. manual observer scores)
Note that this method only works by analysing the cloud of boostrapped
points generatedfor a particular threshold value. A fixed number of
threshold values is essential. Therefore the scores in y_score must be
from a fixed discrete set of values, eg. [1,2,3,4,5]
Bootstrapping is done by selecting len(y_score) samples randomly
(with replacement) from y_score and y_true.
This is done num_boostraps times.
Parameters
----------
y_score
The scores produced by the system being evaluated. A discrete set of
possible scores must be used.
y_true
The true labels (1 or 0) which are the reference standard being used
num_bootstraps: integer
How many times to make a random sample with replacement
ci_to_use
Which confidence interval is required.
exclude_first_last
The first and last ROC point (0,0 and 1,1) are usually irrelevant
in these scenarios where only a few ROC points will be
individually plotted.
Set this to true to ignore these first and last points.
Returns
-------
mean_fprs
The array of mean fpr values (1 per possible ROC point)
mean_tprs
The array of mean tpr values (1 per possible ROC point)
low_tpr_vals
The tpr vals (one per ROC point) representing lowest val in CI
high_tpr_vals
The tpr vals (one per ROC point) representing the highest val in CI
low_fpr_vals
The fpr vals (one per ROC point) representing lowest val in CI_to_use
high_fpr_vals
The fpr vals (one per ROC point) representing the highest val in CI
"""
rng_seed = 40 # control reproducibility
tprs_list: List[ndarray] = []
fprs_list: List[ndarray] = []
rng = np.random.RandomState(rng_seed)
num_possible_scores = len(np.unique(y_score))
while len(tprs_list) < num_bootstraps:
# bootstrap by sampling with replacement on the prediction indices
indices = rng.randint(0, len(y_score) - 1, len(y_score))
if len(np.unique(y_true[indices])) < 2:
# We need at least one positive and one negative sample for ROC AUC
# to be defined: reject the sample
continue
# get ROC data this boostrap
fpr, tpr, thresholds = metrics.roc_curve(
y_true[indices], y_score[indices]
)
if len(fpr) < num_possible_scores + 1:
# if all scores are not represented in this selection then a
# different number of ROC thresholds will be defined.
# This causes problems.
continue
# remove first and last items - these are just end points of the ROC
if exclude_first_last:
fpr = fpr[1:-1]
tpr = tpr[1:-1]
# append these boostrap values to the list
tprs_list.append(tpr)
fprs_list.append(fpr)
# Get the mean values for fpr and tpr at each ROC location
tprs_array = np.array(tprs_list)
fprs_array = np.array(fprs_list)
mean_tprs = tprs_array.mean(axis=0)
mean_fprs = fprs_array.mean(axis=0)
# half the error margin allowed
one_sided_ci = (1 - ci_to_use) / 2
tprs_lower, tprs_upper = _get_confidence_intervals(
n_bootstraps=tprs_array.shape[1],
one_sided_ci=one_sided_ci,
points_array=tprs_array,
)
fprs_lower, fprs_upper = _get_confidence_intervals(
n_bootstraps=fprs_array.shape[1],
one_sided_ci=one_sided_ci,
points_array=fprs_array,
)
return BootstrappedCIPointError(
mean_fprs=mean_fprs,
mean_tprs=mean_tprs,
low_tpr_vals=tprs_lower,
high_tpr_vals=tprs_upper,
low_fpr_vals=fprs_lower,
high_fpr_vals=fprs_upper,
)
def _get_confidence_intervals(
*, n_bootstraps: int, one_sided_ci: float, points_array
) -> Tuple[ndarray, ndarray]:
ci_upper = []
ci_lower = []
for bootstrap_point in range(n_bootstraps):
points = points_array[:, bootstrap_point]
points.sort()
tpr_upper = points[int((1 - one_sided_ci) * len(points))]
ci_upper.append(tpr_upper)
tpr_lower = points[int(one_sided_ci * len(points))]
ci_lower.append(tpr_lower)
return np.asarray(ci_lower), np.asarray(ci_upper)