# Source code for bfbrain.AL_Metrics

"""This module contains code for various performance metrics
which BFBrain can track over the course of active learning.
"""

from abc import ABC, abstractmethod
from os import sys
from bfbrain.Score_Functions import *
from bfbrain.False_Proximity_Test import combined_false_score
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from functools import partial

# Registry translating the string names accepted for acquisition scoring
# functions into the objects that implement them. The values are expected to
# be supplied by the star import from bfbrain.Score_Functions. Insertion order
# is kept as-is because the key list is echoed in error messages.
scoring_funcs = dict(
    BALD=BALD,
    QBDC=QBDC,
    random=Random_AL,
    MaxEntropy=Max_Entropy,
    variation_ratios=Variation_Ratios,
    predictive_variance=Predictive_Variance,
)

# The only values ALMetric.__init__ accepts for its sc_type argument.
valid_sc_types = ['val', 'train', 'pool', 'model']

# Registry translating reduction names into the NumPy functions that
# implement them.
metric_reductions = dict(
    mean=np.mean,
    max=np.max,
    min=np.min,
    std=np.std,
)

def process_score_fn(score_fn, name=None):
    """A utility function which translates a string specifying one of the
    predefined acquisition scoring functions into the corresponding
    numerical method.

    Parameters
    ----------
    score_fn : {'BALD', 'QBDC', 'random', 'MaxEntropy', 'variation_ratios', 'predictive_variance'} or callable.
        If this function is a callable, it must have the signature
        (tf.keras.model, tf.tensor(tf.float32, tf.float32))-> tf.tensor(tf.float32)
        or
        (tf.keras.model, tf.tensor(tf.float32, tf.float32), int)-> tf.tensor(tf.float32)

    name : str, optional
        If specified, this name is returned unaltered. Otherwise, a name
        will be automatically generated based on score_fn: the registry key
        for a predefined function (or for a string score_fn), and 'score'
        for any other callable.

    Returns
    -------
    callable
        A valid score_fn to be used in various performance metrics.

    str
        A string which will be used to generate a name for an ALMetric
        object.

    Raises
    ------
    Exception
        If score_fn is neither a callable nor a string, or if it is a string
        that is not a key of scoring_funcs.
    """
    if callable(score_fn):
        if name is None:
            # Reverse lookup in the registry so that predefined functions
            # passed as callables still get their conventional name.
            name = next(
                (key for key, fn in scoring_funcs.items() if fn == score_fn),
                'score',
            )
        return score_fn, name

    if not isinstance(score_fn, str):
        raise Exception('score_fn must be a string which acts as a key in the dict scoring_funcs in score_functions.py, or a callable.')

    if name is None:
        name = score_fn
    try:
        score_fn = scoring_funcs[score_fn]
    except KeyError as error:
        raise Exception('score_fn was a string, but was not recognized as corresponding to a known metric. Valid string inputs are {}'.format(list(scoring_funcs.keys()))) from error
    return score_fn, name
def _get_reduction(red_name): """A utility method to connect a string or list of strings specifying certain reductions of a 1-D NumPy array to the corresponding functions. Parameters ---------- red_name : {'mean', 'max', 'min', 'std'} or list of these strings. Returns ------- list of callables A list of callables (possibly of length 1) which correspond to the reduction(s) named in red_name list of str red_name. If red_name was a single str object, it is returned as a list of str with length 1. """ try: if isinstance(red_name, str): reduction = [metric_reductions[red_name]] red_name = [red_name] else: reduction = [metric_reductions[red] for red in red_name] return reduction, red_name except KeyError as error: raise Exception('Unrecognized value for argument "reduction". Must be one of {} or a list of those values.'.format(list(metric_reductions.keys()))) def _check_reduction(reduction, red_name): """A utility function which checks that an input string is in a list of strings (specifically keys in metric_reductions), and raises an error if it's not. Parameters ---------- reduction : str red_name : list of str """ if reduction not in red_name: raise Exception('Please specify a reduction that the metric has recorded. Options are {}'.format(red_name))
class ALMetric(ABC):
    """A generic abstract class for computing and recording performance
    metrics for active learning. All performance metrics in BFBrain inherit
    from this class.

    Attributes
    ----------
    status_history : list
        A list which contains a record, for each round of active learning,
        for whichever metric the subclass will measure. The entries of
        status_history may be, depending on the subclass, virtually any kind
        of data or data structure, as long as the elements are picklable.

    sc_type : {'val', 'train', 'pool', 'model'}
        A string which denotes what type of metric the ALMetric object is,
        since different metrics are recorded at different points in the
        active learning loop.

        If sc_type is 'val', this metric is computed using a validation data
        set immediately after each active learning round completes.

        If sc_type is 'train', this metric is computed immediately after new
        training data is generated in the active learning loop, but before
        the neural network's weights are reset and training commences. It is
        evaluated using the newly-generated training data.

        If sc_type is 'pool', this metric is computed using the pool of
        candidate points from which new training samples are drawn at each
        iteration. It is computed immediately after the new training data is
        selected from the pool.

        If sc_type is 'model', the metric is computed without reference to
        any data set (validation, training, or pool) present in the active
        learning loop, at the end of each active learning iteration. The
        only implemented metrics which have sc_type 'model' measure
        predictive stability on some specified unlabelled set of points,
        but the possibility remains that different sorts of metrics in this
        class, for example the one based on error stability computed
        directly from the neural network weights discussed in
        arXiv:2104.01836, may be desirable for a user to implement.

    name : str
        A string which denotes a name for this metric, stored as
        sc_type + '_' + name. In a list of metrics passed to a BFBLearner
        class, the names of each member of the list should be unique.

    Parameters
    ----------
    sc_type : {'val', 'train', 'pool', 'model'}

    name : str

    Raises
    ------
    Exception
        If sc_type is not one of the entries of valid_sc_types.
    """

    def __init__(self, sc_type, name):
        self.status_history = []
        if sc_type not in valid_sc_types:
            raise Exception('sc_type must be a string, and must be one of {}'.format(valid_sc_types))
        self.sc_type = sc_type
        self.name = sc_type + '_' + name

    def record_score(self, *args):
        """Appends the latest value for the performance metric to the
        status_history object.

        All arguments are forwarded to the abstract method
        performance_check, so the accepted arguments depend on the specific
        subclass of ALMetric.

        Returns
        -------
        The value that was just appended to status_history.
        """
        self.status_history.append(self.performance_check(*args))
        return self.status_history[-1]

    def print_status(self, file = sys.stdout):
        """Prints the last entry in status_history to a file (or the
        console).

        Uses perf_message (which is often overwritten in the child class) to
        label the output. When the last entry is a tuple, each element is
        printed on its own line underneath its own label.

        Parameters
        ----------
        file : file-like, default=sys.stdout
            Destination handed to print().
        """
        last_status = self.status_history[-1]
        out_message = self.perf_message()
        # isinstance (rather than an exact type comparison) keeps this branch
        # in step with perf_message, which returns a tuple of labels for any
        # tuple instance, including tuple subclasses such as namedtuples.
        if isinstance(last_status, tuple):
            for label, stat in zip(out_message, last_status):
                print(label, file = file)
                print(stat, file = file)
        else:
            print(out_message, file = file)
            print(last_status, file = file)

    def perf_message(self):
        """Returns the label(s) print_status writes before the metric value.
        Often overwritten in a child class.

        Returns
        -------
        str or tuple of str
            name + ':' when the last status_history entry is not a tuple;
            otherwise a tuple holding one such label per element of that
            entry.
        """
        if isinstance(self.status_history[-1], tuple):
            return tuple(self.name + ':' for _ in range(len(self.status_history[-1])))
        return self.name + ':'

    @abstractmethod
    def performance_check(self, *args):
        """An abstract method which takes some arguments (depending on the
        type of performance metric) and computes the quantity or quantities
        that the performance metric is supposed to track. This method is
        called by record_score and its results are appended to the
        status_history attribute.
        """
        pass

    def get_metric(self, *args):
        """Reduces status_history to a column of single numbers for
        plotting.

        If the last entry of status_history is a tuple, only the first
        element of every entry is kept; otherwise the entries are used
        unchanged. Subclasses whose entries need a different reduction must
        overwrite this method.

        Parameters
        ----------
        *args : Any
            Some overwritten versions of this method can accept optional
            arguments, although the method does not use them in the parent
            class.

        Returns
        -------
        np.array
            A 2-D array with one row per status_history entry and a single
            column. An empty history yields an array of shape (0, 1).
        """
        # Guard the [-1] lookup so an empty history gives an empty array
        # instead of an IndexError.
        if self.status_history and isinstance(self.status_history[-1], tuple):
            return np.transpose(np.array([[stat[0] for stat in self.status_history]]))
        return np.transpose(np.array([self.status_history]))

    def reset_data(self):
        """Resets the metric data entirely. In some subclasses, this must be
        overloaded to properly reset the class.
        """
        self.status_history = []

    def get_legend(self, *args):
        """Returns a legend for a plot of the metric given by plot_metric.
        Often must be overwritten in subclasses.

        Parameters
        ----------
        *args : Any
            Some overwritten versions of this method can accept optional
            arguments, although the method does not use them in the parent
            class. Must take the same arguments as get_metric.

        Returns
        -------
        list of strings
            A list of strings which are usable to specify a legend in
            matplotlib.
        """
        return [self.name]

    def plot_metric(self, filepath = None, **kwargs):
        """Plots the performance metric as a function of the number of
        active learning iterations.

        Parameters
        ----------
        filepath : str, optional
            If this argument is specified, the plot is saved as
            filepath + '/' + name + '.png' and the figure is closed. If it
            is omitted, the plot is shown instead.

        **kwargs : dict, optional
            Forwarded unchanged to both get_metric and get_legend, since
            many subclasses accept keyword arguments there.
        """
        metrics = self.get_metric(**kwargs)
        legend = self.get_legend(**kwargs)
        plt.figure()
        plt.plot(metrics)
        plt.legend(legend)
        plt.xlabel("AL Iterations")
        plt.ylabel(self.name)
        if filepath is None:
            plt.show()
        else:
            plt.savefig(filepath + '/' + self.name + '.png')
            plt.close()
class ModelMetric(ALMetric):
    """Abstract base for metrics computed from the model alone.

    Metrics of this kind are registered with sc_type 'model'. The class is
    intended as a "catch-all" for metrics that need only the BFBLearner's
    model plus whatever state the metric keeps internally -- for example,
    tracking the model's predictions on a fixed unlabelled set of inputs.

    Parameters
    ----------
    name : str
        Identifier for the metric; the stored name becomes 'model_' + name.
    """

    def __init__(self, name):
        super().__init__('model', name)

    @abstractmethod
    def performance_check(self, model):
        """Fixes the argument list that every class inheriting from
        ModelMetric must use for performance_check.

        Parameters
        ----------
        model : tf.keras.Model
            The model in a BFBLearner object.
        """
class UnlabelledPredsMetric(ModelMetric):
    """An abstract class for handling metrics which go by the predictions of
    the model on some unlabelled set of quartic coefficients.

    Attributes
    ----------
    lams : np.array(np.float32, np.float32)
        A 2-D NumPy array representing sets of quartic potential
        coefficients. This will be an unlabelled set of points the model
        will make predictions on.

    ds : tf.data.Dataset
        A Tensorflow dataset generated from lams. It is not pickled; it is
        rebuilt from lams and batch_size when the metric is unpickled.

    batch_size : int, default=200000
        The maximum size of batches of lams that will be transferred to the
        GPU and computed with at one time.

    name : str
        The unique identifier for the metric in the list of metrics traced
        by BFBLearner.

    Parameters
    ----------
    lams : np.array(np.float32, np.float32)
        A 2-D NumPy array representing sets of quartic potential
        coefficients. This will be an unlabelled set of points the model
        will make predictions on.

    name : str
        The name will provide a unique identifier for the metric in the list
        of metrics tracked by BFBLearner-- this identifier will be
        'model_'+name.

    batch_size : int, default=200000
        The maximum size of batches of lams that will be transferred to the
        GPU and computed with at one time.
    """

    def __init__(self, lams, name, batch_size = 200000):
        super().__init__(name = name)
        self.lams = lams
        # Store the requested value (not a fixed constant) so that the
        # dataset rebuilt in __setstate__ uses the same batching as the one
        # built here.
        self.batch_size = batch_size
        self.ds = self._build_dataset()

    def _build_dataset(self):
        """Builds the batched Tensorflow dataset over lams."""
        return (
            tf.data.Dataset.from_tensor_slices(self.lams)
            .batch(self.batch_size, num_parallel_calls=tf.data.AUTOTUNE)
            .prefetch(tf.data.AUTOTUNE)
            .cache()
        )

    @abstractmethod
    def performance_check(self, model):
        pass

    def __getstate__(self):
        """Used to pickle the metric. The dataset is left out of the
        pickled state.
        """
        state = self.__dict__.copy()
        del state['ds']
        return state

    def __setstate__(self, state):
        """Used to unpickle the metric. Restores the saved attributes and
        rebuilds the dataset from lams and batch_size.
        """
        self.__dict__.update(state)
        self.ds = self._build_dataset()
class UnlabelledAgreement(UnlabelledPredsMetric):
    """A metric which computes agreement (Cohen's kappa) among the model
    between successive iterations of active learning on a specified set of
    unlabelled points.

    Attributes
    ----------
    status_history : list of floats
        The entries of this status_history object will be Cohen's kappa
        between successive iterations of active learning on lams. The first
        entry is a placeholder 0., because there is no earlier model to
        compare with.

    old_preds : np.array(np.float32)
        The previous model's predictions on lams. Preserved to compare to
        the current model.

    lams : np.array(np.float32, np.float32)
        A 2-D NumPy array representing sets of quartic potential
        coefficients. This will be an unlabelled set of points the model
        will make predictions on.

    ds : tf.data.Dataset
        A Tensorflow dataset generated from lams.

    batch_size : int, default=200000
        The maximum size of batches of lams that will be transferred to the
        GPU and computed with at one time.

    name : str
        The unique identifier for the metric in the list of metrics traced
        by BFBLearner. By default this will be 'model_agreement'

    n_trials : int, default=100
        The number of forward passes through the network to get the
        predictions from Monte Carlo dropout.

    Parameters
    ----------
    lams : np.array(np.float32, np.float32)
        A 2-D NumPy array representing sets of quartic potential
        coefficients. This will be an unlabelled set of points the model
        will make predictions on.

    name : str, default='agreement'
        The name will provide a unique identifier for the metric in the list
        of metrics tracked by BFBLearner-- this identifier will be
        'model_'+name.

    batch_size : int, default=200000
        The maximum size of batches of lams that will be transferred to the
        GPU and computed with at one time.

    n_trials : int, default=100
        The number of forward passes through the network to get the
        predictions from Monte Carlo dropout.
    """

    def __init__(self, lams, name = 'agreement', batch_size = 200000, n_trials = 100):
        # The parent constructor accepts (lams, name, batch_size) only;
        # n_trials is state of this class and is stored below.
        super().__init__(lams, name, batch_size)
        self.n_trials = n_trials
        self.old_preds = None

    def _predict(self, model):
        """Returns the model's flattened predictions on every batch of ds,
        concatenated into one 1-D NumPy array.
        """
        # MC_call_fast is presumably provided by the star import from
        # bfbrain.Score_Functions -- TODO confirm.
        batches = [
            tf.reshape(MC_call_fast(model, x, self.n_trials), shape = [-1]).numpy()
            for x in self.ds
        ]
        return np.concatenate(batches, axis = 0)

    def performance_check(self, model):
        """Computes Cohen's kappa for the classifier between successive
        active learning iterations, on the unlabelled quartic coefficients
        lams. Predictions are binarised at a threshold of 0.5.

        If there is no previous model (i.e., no active learning has been
        done), returns 0. and saves the current model's predictions as
        old_preds.

        Returns
        -------
        float
            Cohen's kappa between the previous and the current predictions.
            When the expected agreement equals 1 (both models assign every
            point to one and the same class) kappa is formally 0/0; 1. is
            returned in that case instead of raising ZeroDivisionError.
        """
        out_preds = self._predict(model)
        if self.old_preds is None:
            self.old_preds = out_preds
            return 0.

        # Observed agreement: fraction of points on which both models fall
        # on the same side of the 0.5 threshold.
        Ao = np.count_nonzero(
            np.logical_or(
                np.logical_and(out_preds >= 0.5, self.old_preds >= 0.5),
                np.logical_and(out_preds < 0.5, self.old_preds < 0.5),
            )
        ) / len(out_preds)
        p_pos_new = np.count_nonzero(out_preds >= 0.5) / len(out_preds)
        p_pos_old = np.count_nonzero(self.old_preds >= 0.5) / len(self.old_preds)
        # Expected chance agreement: p_new*p_old + (1 - p_new)*(1 - p_old).
        Ae = 2.*p_pos_new*p_pos_old - p_pos_new - p_pos_old + 1.
        self.old_preds = out_preds

        if Ae == 1.:
            return 1.
        return (Ao - Ae) / (1. - Ae)

    def get_metric(self):
        """Returns Cohen's kappa as a function of the number of active
        learning iterations, as a single-column 2-D array. The first
        status_history entry (the placeholder recorded before any
        comparison was possible) is dropped.
        """
        return np.transpose(np.array([[stat for stat in self.status_history][1:]]))

    def reset_data(self):
        """Resets the data in the metric, including the stored
        predictions.
        """
        super().reset_data()
        self.old_preds = None
class UnlabelledDeltaF(UnlabelledPredsMetric):
    """A metric which computes the estimated change in F score for the model
    between successive iterations of active learning on a specified set of
    unlabelled points, based on the methodology of arXiv:cs/1901.09118.

    Attributes
    ----------
    status_history : list of floats
        The entries of this status_history object will be the estimated
        change in F score between successive iterations of active learning
        on lams. The first entry is a placeholder np.inf, because there is
        no earlier model to compare with.

    old_preds : np.array(np.float32)
        The previous model's predictions on lams. Preserved to compare to
        the current model.

    lams : np.array(np.float32, np.float32)
        A 2-D NumPy array representing sets of quartic potential
        coefficients. This will be an unlabelled set of points the model
        will make predictions on.

    ds : tf.data.Dataset
        A Tensorflow dataset generated from lams.

    batch_size : int, default=200000
        The maximum size of batches of lams that will be transferred to the
        GPU and computed with at one time.

    name : str
        The unique identifier for the metric in the list of metrics traced
        by BFBLearner. By default this will be 'model_delta_F'

    n_trials : int, default=100
        The number of forward passes through the network to get the
        predictions from Monte Carlo dropout.

    Parameters
    ----------
    lams : np.array(np.float32, np.float32)
        A 2-D NumPy array representing sets of quartic potential
        coefficients. This will be an unlabelled set of points the model
        will make predictions on.

    name : str, default='delta_F'
        The name will provide a unique identifier for the metric in the list
        of metrics tracked by BFBLearner-- this identifier will be
        'model_'+name.

    batch_size : int, default=200000
        The maximum size of batches of lams that will be transferred to the
        GPU and computed with at one time.

    n_trials : int, default=100
        The number of forward passes through the network to get the
        predictions from Monte Carlo dropout.
    """

    def __init__(self, lams, name = 'delta_F', batch_size = 200000, n_trials = 100):
        super().__init__(lams, name, batch_size)
        self.old_preds = None
        self.n_trials = n_trials

    def _predict(self, model):
        """Returns the model's flattened predictions on every batch of ds,
        concatenated into one 1-D NumPy array.
        """
        # MC_call_fast is presumably provided by the star import from
        # bfbrain.Score_Functions -- TODO confirm.
        batches = [
            tf.reshape(MC_call_fast(model, x, self.n_trials), shape = [-1]).numpy()
            for x in self.ds
        ]
        return np.concatenate(batches, axis = 0)

    def performance_check(self, model):
        """Computes the estimated change in F score for the classifier
        between successive active learning iterations, on the unlabelled
        quartic coefficients lams. Predictions are binarised at a threshold
        of 0.5.

        If there is no previous model (i.e., no active learning has been
        done), returns np.inf and saves the current model's predictions as
        old_preds.

        Returns
        -------
        float
            1 - 2a / (2a + b + c), where a counts points both models call
            positive, b counts points only the previous model calls
            positive, and c counts points only the current model calls
            positive. When a = b = c = 0 (neither model predicts any
            positive) 0. is returned instead of raising ZeroDivisionError.
        """
        out_preds = self._predict(model)
        if self.old_preds is None:
            self.old_preds = out_preds
            return np.inf

        a = np.count_nonzero(np.logical_and(out_preds >= 0.5, self.old_preds >= 0.5))
        b = np.count_nonzero(np.logical_and(out_preds < 0.5, self.old_preds >= 0.5))
        c = np.count_nonzero(np.logical_and(out_preds >= 0.5, self.old_preds < 0.5))
        self.old_preds = out_preds

        denominator = 2*a + b + c
        if denominator == 0:
            return 0.
        return 1. - ((2*a) / denominator)

    def get_metric(self):
        """Returns the change in F score as a function of the number of
        active learning iterations, as a single-column 2-D array. The first
        status_history entry (the placeholder recorded before any
        comparison was possible) is dropped.
        """
        return np.transpose(np.array([[stat for stat in self.status_history][1:]]))

    def reset_data(self):
        """Resets the data in the metric, including the stored
        predictions.
        """
        super().reset_data()
        self.old_preds = None
class ValidationMetric(ALMetric):
    """Abstract base for metrics evaluated on a validation set.

    Metrics of this kind are registered with sc_type 'val'. The class mainly
    serves to pin down that performance_check of any validation-set-based
    metric takes the inputs (tf.keras.Model, tf.data.Dataset).

    Parameters
    ----------
    name : str
        Identifier for the metric; the stored name becomes 'val_' + name.
    """

    def __init__(self, name):
        super().__init__('val', name)

    @abstractmethod
    def performance_check(self, model, ds):
        """Fixes the argument list that every class inheriting from
        ValidationMetric must use for performance_check.

        Parameters
        ----------
        model : tf.keras.Model
            The model in a BFBLearner object.

        ds : tf.data.Dataset
            A Tensorflow dataset representing the labelled validation set.
        """
class TrainMetric(ALMetric):
    """Abstract base for metrics evaluated on newly-added training data.

    Metrics of this kind are registered with sc_type 'train'. The class
    mainly serves to pin down that performance_check of any
    training-set-based metric takes the inputs (model, lams, labels).

    Parameters
    ----------
    name : str
        Identifier for the metric; the stored name becomes 'train_' + name.
    """

    def __init__(self, name):
        super().__init__('train', name)

    @abstractmethod
    def performance_check(self, model, lams, labels):
        """Fixes the argument list that every class inheriting from
        TrainMetric must use for performance_check.

        Parameters
        ----------
        model : tf.keras.Model
            The model in a BFBLearner object.

        lams : tf.Tensor(tf.float32, tf.float32)
            A 2-D Tensorflow tensor representing sets of quartic potential
            coefficients. This will be all of the points that have been
            newly added to the training set in a given round of active
            learning.

        labels : tf.Tensor(bool)
            A 1-D Tensorflow tensor of booleans representing the labels of
            the new training data points. The ith element of this tensor is
            True if the ith row of lams represents a set of quartic
            potential coefficients that the oracle has labelled as
            bounded-from-below, False otherwise.
        """
class PoolMetric(ALMetric):
    """Abstract base for metrics evaluated on the pools of candidate points
    from which new training data is drawn.

    Metrics of this kind are registered with sc_type 'pool'. Compared with
    ALMetric, this class declares additional abstract methods which must be
    implemented by any concrete pool metric.

    Attributes
    ----------
    batch_scores : list
        Because the pool of candidate points is generated in discrete
        manageable batches, the metrics are computed over each individual
        batch and then combined, in a manner that depends on the specific
        metric. Every pool metric has this attribute as temporary storage
        for the per-batch results before they are combined.

    Parameters
    ----------
    name : str
        Identifier for the metric; the stored name becomes 'pool_' + name.
    """

    def __init__(self, name):
        super().__init__('pool', name)
        self.batch_scores = []

    @abstractmethod
    def record_batch(self, *args):
        """Abstract hook which records the result for one individual batch
        in batch_scores, in a manner that must be specified in a subclass.
        """

    @abstractmethod
    def record_score(self):
        """Abstract replacement for the concrete record_score of ALMetric;
        subclasses must specify how the per-batch results are combined.
        """
class PoolMetricReduction(PoolMetric):
    """An abstract class which inherits from PoolMetric, but features
    methods to automatically take the mean/min/max of the scores determined
    in record_batch. This is an abstract class which shouldn't be
    instantiated directly, but allows for rapid prototyping of a variety of
    pool metrics.

    Attributes
    ----------
    reduction : a list containing elements of {np.mean, np.min, np.max}
        This is the reduction that is performed among the results for all
        batches to produce the entries in status_history. Each entry of
        status_history is a list with one element per reduction.

    red_name : a list containing elements of {'mean', 'min', 'max'}
        This list of strings will contain the same information as
        reduction, but is used for labelling purposes. It is a list owned
        by the instance.

    Parameters
    ----------
    name : str
        A unique identifier for the metric in the list of metrics in a
        BFBLearner object.

    red : {'mean', 'min', 'max'} or a list of those values
        This argument specifies the reduction(s) that are applied to the
        per-batch scores to produce the entries in status_history.

    Raises
    ------
    Exception
        If red names an unknown reduction, or if it requests 'std'.
    """

    def __init__(self, name, red = ('mean', 'min', 'max')):
        # Copy the request into a fresh list so that red_name is never
        # shared between instances or aliased to the caller's object.
        requested = [red] if isinstance(red, str) else list(red)
        self.reduction, self.red_name = _get_reduction(requested)
        if 'std' in requested:
            raise Exception('Do not request standard deviation for scores found from the pool of candidate points-- insufficient data is stored to compute this quantity.')
        super().__init__(name = name)
        self.batch_scores = []

    def record_batch(self, *args):
        """Records a score generated by performance_check (which must be
        specified in a subclass) for an individual batch in the pool of
        candidate points to the batch_scores list.
        """
        self.batch_scores.append(self.performance_check(*args))

    def record_score(self):
        """Combines the metrics computed for each batch into a single
        status_history entry, and appends that entry to status_history.

        The ith reduction is applied to the ith element of every batch
        score, so each batch score must be indexable with one element per
        reduction. batch_scores is emptied afterwards.

        Returns
        -------
        list
            The entry that was just appended to status_history.
        """
        entry = [
            reduce_fn([b_score[i] for b_score in self.batch_scores])
            for i, reduce_fn in enumerate(self.reduction)
        ]
        self.status_history.append(entry)
        self.batch_scores = []
        return self.status_history[-1]

    def perf_message(self):
        """The perf_message method labels any printed output of the
        metric.
        """
        return self.name + ' (unlabelled pool) {}:'.format(list(self.red_name))

    def get_metric(self, reduction = None):
        """In this subclass, get_metric can take a keyword argument.

        Parameters
        ----------
        reduction : str in self.red_name, optional
            If specified, get_metric will only return the values
            corresponding to the specified reduction. If not specified,
            get_metric will return the status_history object in its
            entirety.

        Returns
        -------
        np.array
            Represents some plottable set of values from status_history.

        Raises
        ------
        Exception
            If reduction is given but is not in self.red_name.
        """
        if reduction is None:
            return np.array(self.status_history)
        _check_reduction(reduction, self.red_name)
        column = self.red_name.index(reduction)
        return np.transpose(np.array([[stat[column] for stat in self.status_history]]))

    def reset_data(self):
        """Reset the data in status_history and batch_scores."""
        super().reset_data()
        self.batch_scores = []

    def get_legend(self, reduction = None):
        """In this subclass, get_legend can take a keyword argument.

        Parameters
        ----------
        reduction : str in self.red_name, optional
            get_legend will return a legend consistent with the get_metric
            result with the same reduction argument passed.

        Returns
        -------
        list of str
            An argument to specify a legend in matplotlib.
        """
        if reduction is not None:
            return [reduction + '_' + self.name]
        return [red + '_' + self.name for red in self.red_name]
# A metric for keeping track of the estimated change in F score for successive iterations of the active learning algorithm on unlabelled data, based on arXiv:cs/1901.09118
[docs] class PoolDeltaF(PoolMetric): """A metric for keeping track of the estimated change in F score for successive iterations of the active learning algorithm on unlabelled data, based on arXiv:cs/1901.09118. Each time active learning produces a new unlabelled pool of candidate points to draw training examples from, this metric computes the model's predicted labels for all of these points, and stores both the pool of points and the predictions. Then, after another round of active learning, the metric computes the NEW model's predicted labels on the stored pool of points. Then, the two sets of predictions are compared and the estimated change in F score over the pool distribution is computed from the level of agreement between the two sets of predictions, following the procedure outlined in arXiv:cs/1901.09118. Predictions are based on Monte Carlo dropout with 100 forward passes through the neural network. Attributes ---------- status_history : list of floats A list which contains a record, for each round of active learning, of the estimated change in F score. name : str, default='delta_F' A string which denotes a name for this metric. In a list of metrics passed to a BFBLearner class, the names of each member of the list should be unique. old_pool : list of np.array(np.float32, np.float32) A list of 2-D NumPy arrays, each of which represents a batch of points generated as part of the pool of candidate points in active learning. This array stores the points that made up the PREVIOUS round's pool of points, so that the current model can make predictions on them. new_pool : list of np.array(np.float32, np.float32) A list of 2-D NumPy arrays, each of which represents a batch of points generated as part of the pool of candidate points in active learning. This array stores the points that made up the CURRENT round's pool of points, so that the current model can make predictions on them. 
After recording the status_history value for this metric, new_pool's values are transferred to old_pool, and then new_pool is cleared. old_preds : list of np.array(np.float32) A list to contain all of the PREVIOUS model's predictions on old_pool. new_preds : list of np.array(np.float32) A list to contain all of the CURRENT model's predictions on old_pool. newer_preds : list of np.array(np.float32) A list to contain all of the CURRENT model's predictions on new_pool. After recording the status_history value for this metric, newer_preds values are transferred to old_preds, and then newer_preds and new_preds are both cleared. old_pool_iter : iter An iterator over old_pool. Replaced whenever old_pool is overwritten. """ def __init__(self, name = 'delta_F'): super().__init__(name = name) # Predictions of the last model on old_pool self.old_preds = [] # Predictions of the current model on old_pool self.new_preds = [] # Predictions of the current model on new_pool self.newer_preds = [] self.old_pool = [] self.new_pool = [] self.old_pool_iter = iter(self.old_pool)
[docs] def record_batch(self, model, L): """Records newly-generated pool points and the current model's prediction on them. After the first active learning iteration, also records the current model's predictions on the corresponding element of old_pool, since after the first active learning iteration the old_pool and new_pool will always have the same number of elements. Parameters ---------- model : tf.keras.model The current Tensorflow model of a BFBLearner object. L : np.array(np.float32, np.float32) A 2-D NumPy array representing a batch of pool points proposed to the neural network as possible training points. """ scores, pool = self.performance_check(model, L) if len(self.old_pool) > 0: old_scores = tf.reshape(MC_call_fast(model, next(self.old_pool_iter), 1000),shape = [-1]).numpy() self.new_preds.append(old_scores) self.newer_preds.append(scores) self.new_pool.append(pool)
[docs] def performance_check(self, model, L): """Computes the predictions of the neural network on an input batch of pool points, and returns a tuple of the predictions and the pool points. Parameters ---------- model : tf.keras.model The current Tensorflow model of a BFBLearner object. L : np.array(np.float32, np.float32) A 2-D NumPy array representing a batch of pool points proposed to the neural network as possible training points. Returns ------- np.array(np.float32) A 1-D NumPy array of model predictions on L np.array(np.float32, np.float32) The array L """ return tf.reshape(MC_call_fast(model, L, 1000), shape = [-1]).numpy(), L.numpy()
[docs] def record_score(self): """Combines the predictions made on individual batches in order to produce an estimate of the change in F score on old_pool, and appends this estimate onto status_history. Then, overwrites old_pool with new_pool, old_preds with newer_preds, and then resets new_pool, new_preds, and newer_preds. Returns ------- float The last element of status_history. """ if len(self.newer_preds) == len(self.new_preds): a, b, c = 0, 0, 0 for i in range(len(self.new_preds)): a += np.count_nonzero(np.logical_and(self.new_preds[i] >= 0.5, self.old_preds[i] >= 0.5)) b += np.count_nonzero(np.logical_and(self.new_preds[i] < 0.5, self.old_preds[i] >=0.5)) c += np.count_nonzero(np.logical_and(self.new_preds[i] >= 0.5, self.old_preds[i] < 0.5)) self.status_history.append(1. - ((2*a)/(2*a + b + c))) else: self.status_history.append(np.inf) self.old_preds = self.newer_preds self.newer_preds = [] self.old_pool = self.new_pool self.new_pool = [] self.new_preds = [] self.old_pool_iter = iter(self.old_pool) return self.status_history[-1]
def reset_data(self):
    """Clear all data held by the metric, including the parent's data."""
    super().reset_data()
    # old_preds:   predictions of the last model on old_pool
    # new_preds:   predictions of the current model on old_pool
    # newer_preds: predictions of the current model on new_pool
    self.old_preds, self.new_preds, self.newer_preds = [], [], []
    self.old_pool, self.new_pool = [], []
    # Iterator used to walk old_pool batch by batch while recording.
    self.old_pool_iter = iter(self.old_pool)
# A metric that simply records the output of model.evaluate() on a TensorFlow dataset. Generally used to check the accuracy of the model on a validation set.
class ModelEvaluation(ValidationMetric):
    """Record the output of model.evaluate() on a labelled dataset.

    The dataset is a labelled Tensorflow dataset (the validation set in
    our context). BFBrain's call to Tensorflow's evaluate method keeps
    track of the model accuracy, false positives, and false negatives.
    Note that this method does NOT use Monte Carlo dropout to compute
    these quantities, but instead approximates the mean of Monte Carlo
    dropout via a single pass through the network with no dropout (and
    all model weights divided by 1 - <dropout probability>).

    Attributes
    ----------
    status_history : list of lists of np.float32
        Entries are lists of the form
        [<binary accuracy>, <false positives>, <false negatives>],
        evaluated over the validation set.
    name : str
        The unique identifier for the metric in the list of metrics traced
        by BFBLearner. By default, this will be 'val_accuracy'.

    Parameters
    ----------
    name : str, default='accuracy'
        Provides a unique identifier for the metric in the list of metrics
        tracked by BFBLearner -- this identifier will be 'val_'+name.
    """

    def __init__(self, name='accuracy'):
        super().__init__(name)

    def performance_check(self, model, ds):
        """Call model.evaluate() on ds and drop its first entry (the loss)."""
        evaluation = model.evaluate(ds, verbose=0)
        return evaluation[1:]

    def perf_message(self):
        """Return the label used for any printed output of the metric."""
        return 'Validation accuracy [accuracy, false positives, false negatives]:'

    def get_metric(self):
        """Return the binary accuracy history as a column for plotting.

        Only the first component (the binary accuracy) of each
        status_history entry is used, and the first recorded entry is
        skipped.
        """
        accuracies = [entry[0] for entry in self.status_history]
        return np.array([accuracies[1:]]).T
# A metric that records the binary accuracy, false positives, and false negatives evaluated with Monte Carlo dropout on a Tensorflow dataset.
class MCModelEvaluation(ValidationMetric):
    """Record accuracy, false positives and false negatives with MC dropout.

    Records the same data as ModelEvaluation on a labelled Tensorflow
    dataset (the validation set in our context), but using Monte Carlo
    dropout with 100 forward passes through the neural network. Otherwise
    functions identically to ModelEvaluation.

    Attributes
    ----------
    name : str
        The unique identifier for the metric in the list of metrics traced
        by BFBLearner. By default, this will be 'val_MC_accuracy'.

    Parameters
    ----------
    name : str, default='MC_accuracy'
        Provides a unique identifier for the metric in the list of metrics
        tracked by BFBLearner -- this identifier will be 'val_'+name.
    """

    def __init__(self, name='MC_accuracy'):
        super().__init__(name)

    def performance_check(self, model, ds):
        """Find binary accuracy, false positives and false negatives on ds.

        Parameters
        ----------
        model : tf.keras.model
            The model to evaluate.
        ds : iterable of (inputs, labels) batches
            The labels must be boolean, since they are negated with ``~``.

        Returns
        -------
        list
            [<binary accuracy>, <false positives>, <false negatives>].
            For a dataset with no batches the accuracy is NaN (0/0) and
            both counts are zero.
        """
        # Tensor-valued counters so that .numpy() below also works when
        # ds yields no batches (plain ints have no .numpy()).
        false_pos = tf.constant(0, dtype=tf.int64)
        false_neg = tf.constant(0, dtype=tf.int64)
        total = tf.constant(0, dtype=tf.int32)
        for x, y in ds:
            total += tf.shape(y)[0]
            # Flatten predictions (as the other MC_call_fast callers in
            # this file do) and labels, so the element-wise comparison
            # cannot silently broadcast if their ranks differ.
            preds = tf.reshape(MC_call_fast(model, x, 100), shape=[-1])
            labels = tf.reshape(y, shape=[-1])
            false_pos += tf.math.count_nonzero(tf.logical_and(preds >= 0.5, ~labels))
            false_neg += tf.math.count_nonzero(tf.logical_and(preds < 0.5, labels))
        error_rate = tf.cast(false_pos + false_neg, tf.float32) / tf.cast(total, tf.float32)
        return [1. - error_rate.numpy(), false_pos.numpy(), false_neg.numpy()]

    def perf_message(self):
        """Return the label used for any printed output of the metric."""
        return 'MC validation accuracy [accuracy, false positives, false negatives]:'

    def get_metric(self):
        """Return the binary accuracy history as a column for plotting.

        Only the first component of each status_history entry is used, and
        the first recorded entry is skipped.
        """
        accuracies = [stat[0] for stat in self.status_history]
        return np.transpose(np.array([accuracies[1:]]))
class DecisionBoundaryScore(ValidationMetric):
    """Record a "decision boundary score" for misclassified validation points.

    For each point that the (non-MC-dropout) neural network classifies
    incorrectly in a validation set, gradient ascent/descent is used to
    determine the angular distance on the hypersphere of quartic
    coefficients to the decision boundary. The metric reports the mean,
    standard deviation, and max of these scores in radians for both false
    positives and false negatives, as well as the number of points in both
    groups which exceed some input number of radians distance from the
    decision boundary.

    This metric can be extremely computationally intensive, and generally
    can reflect the deterministic forward pass's tendency to occasionally
    be incorrect and very overconfident. However, if a user is insistent
    on only using a single forward pass of the neural network to evaluate
    a network, this method enables them to be somewhat confident that any
    points that are mislabelled will be close in parameter space to points
    which are correctly labelled.

    Attributes
    ----------
    status_history : list of tuples of lists of np.float32
        Each entry is a tuple of two lists of the form
        [<mean>, <std>, <max>, # > tol_dist radians], the first for false
        positives and the second for false negatives. The mean, standard
        deviation, and max values are computed from the angular distance
        (in radians) of the incorrectly classified points to points along
        the decision boundary. The final entry is the number of points for
        which this distance is greater than the user-specified cutoff,
        tol_dist.
    name : str
        The unique identifier for the metric in the list of metrics traced
        by BFBLearner. By default, this will be 'val_combined_false_score'.
    tol_dist : float
        The maximum angular distance of an incorrectly-classified point to
        the decision boundary that the user considers acceptable. For small
        (O(0.01)) values of this angle, it roughly corresponds to the
        fractional degree of correction of the quartic coefficients to
        reach the decision boundary -- so an angular deformation of 0.01
        represents approximately a 1% correction to the quartic coupling
        coefficients.

    Parameters
    ----------
    tol_dist : float
        See the attribute of the same name.
    name : str, default='combined_false_score'
        Provides a unique identifier for the metric in the list of metrics
        tracked by BFBLearner -- this identifier will be 'val_'+name.
    """

    def __init__(self, tol_dist, name='combined_false_score'):
        super().__init__(name)
        self.tol_dist = tol_dist

    def performance_check(self, model, ds):
        """Compute angular distances of misclassified points to the boundary.

        If the binary accuracy reported by model.evaluate() is below 0.99,
        the computation is skipped and every score is reported as np.inf.
        """
        accuracy = model.evaluate(ds, verbose=0)[1]
        if accuracy < 0.99:
            return ([np.inf] * 4, [np.inf] * 4)
        return combined_false_score(model, ds, tf.constant(self.tol_dist))

    def perf_message(self):
        """Return the labels used for any printed output of the metric."""
        template = 'false {} score [mean, std, max, # > {}]:'
        return (template.format('positive', self.tol_dist),
                template.format('negative', self.tol_dist))

    def get_metric(self):
        """Return the number of points farther than tol_dist from the boundary.

        For each status_history entry the false-positive and false-negative
        counts are added together, and the result is returned as a column
        for plotting over the course of active learning.
        """
        far_counts = []
        for stat in self.status_history:
            far_counts.append(stat[0][3] + stat[1][3])
        return np.array([far_counts]).T
class ValidationConfusionMatrix(ValidationMetric):
    """Record the confusion matrix of the model on the validation set.

    Finds the elements of the confusion matrix (correctly labelled
    positives, false positives, correctly labelled negatives, false
    negatives) for the validation set. Also calculates the confusion
    matrix with points which score higher than specified quantiles on some
    specified uncertainty metric, evaluated over all points which have the
    same predicted classification, omitted from the validation set. This
    metric has all the information necessary for the extraction of binary
    classifier quality metrics such as precision, recall, or F score.

    Attributes
    ----------
    status_history : list of tuples of lists of ints
        Each entry is a tuple of the form (<true positives>,
        <false positives>, <true negatives>, <false negatives>), where each
        element is a list with one value per entry of quantiles. Each value
        counts only points with an uncertainty score (given by score_fn)
        less than or equal to the corresponding quantile (evaluated for all
        points of the same predicted class).
    name : str
        The unique identifier for the metric in the list of metrics traced
        by BFBLearner. By default this will be 'val_<score_fn>_confusion'.
    score_fn : callable
        A callable of the signature
        (tf.keras.model, tf.tensor(tf.float32, tf.float32))-> tf.tensor(tf.float32)
        or
        (tf.keras.model, tf.tensor(tf.float32, tf.float32), int)-> tf.tensor(tf.float32).
        The pre-implemented functions can be specified in the constructor
        by using any of 'BALD' (mutual information), 'MaxEntropy' (Shannon
        entropy), 'variation_ratios' (variation ratios),
        'predictive_variance' (variance of the prediction distribution), or
        'QBDC' (score*(1-score), where score is the Monte Carlo
        dropout-evaluated prediction for an input).
    tf_score_fn : jit-compiled callable
        Tensorflow jit-compiled version of score_fn.
    n_trials : int or None
    quantiles : list of floats
        A private copy of the quantiles passed to the constructor.

    Parameters
    ----------
    score_fn : {'BALD', 'MaxEntropy', 'variation_ratios', 'random', 'QBDC', 'predictive_variance'} or callable
        Specifies the score function used as the uncertainty metric. A
        custom callable must have one of the signatures listed above,
        depending on whether or not n_trials is specified.
    name : str, optional
        Allows for a custom name of the metric. If not specified, a name
        will be automatically generated as 'val_<score_fn>_confusion'.
    quantiles : list of floats, default=[0.95, 1.]
        The uncertainty quantiles for which the metric is tracked.
    n_trials : int, optional
        For score_fn arguments that take a n_trials argument, which
        includes every pre-implemented score_fn except 'random', this
        argument can be specified here. If it is not specified, the default
        value for the given score_fn is used.
    """

    def __init__(self, score_fn = 'BALD', name = None, quantiles = [0.95, 1.], n_trials = None):
        auto_name = (name is None)
        score_fn, name = process_score_fn(score_fn, name)
        self.score_fn = score_fn
        self.n_trials = n_trials
        self.tf_score_fn = self._compile_score_fn()
        # Copy so that instances never share (or mutate) the default list,
        # and so that .index() is available for any sequence passed in.
        self.quantiles = list(quantiles)
        if auto_name:
            super().__init__(name + '_confusion')
        else:
            super().__init__(name)

    def _compile_score_fn(self):
        """Build the jit-compiled score function from score_fn and n_trials."""
        if self.n_trials is not None:
            return tf.function(partial(self.score_fn, n_trials = self.n_trials), jit_compile = True)
        return tf.function(self.score_fn, jit_compile = True)

    @staticmethod
    def _flat_indices(mask):
        """Return the positions where a 1-D boolean tensor is True."""
        return tf.reshape(tf.where(mask), shape = [-1]).numpy()

    def _class_confusion(self, uncert, f_inds, inds):
        """Count correct and incorrect points of one predicted class.

        Parameters
        ----------
        uncert : np.array
            Uncertainty scores of every validation point.
        f_inds : np.array of ints
            Indices of the points of this predicted class that are wrong.
        inds : np.array of ints
            Indices of all points of this predicted class.

        Returns
        -------
        (list of ints, list of ints)
            The (true, false) counts, one value per entry of quantiles.
        """
        n_quantiles = len(self.quantiles)
        if len(inds) == 0:
            # Nothing was predicted in this class, so no quantile of the
            # uncertainty can be computed; every count is zero.
            return [0] * n_quantiles, [0] * n_quantiles
        quants = np.quantile(uncert[inds], self.quantiles)
        total = [np.count_nonzero(uncert[inds] <= quant) for quant in quants]
        if len(f_inds) == 0:
            return total, [0] * n_quantiles
        false = [np.count_nonzero(uncert[f_inds] <= quant) for quant in quants]
        true = [n_all - n_false for n_all, n_false in zip(total, false)]
        return true, false

    def performance_check(self, model, ds):
        """Find the confusion matrix of the model over the validation set.

        Predictions come from ``MC_call_fast`` with 1000 forward passes
        and the uncertainty scores from tf_score_fn. The labels in ds must
        be boolean, since they are negated with ``~``.

        Returns
        -------
        tuple of lists of ints
            (true positives, false positives, true negatives,
            false negatives), each with one value per entry of quantiles.
            All counts are zero for a dataset with no batches.
        """
        uncert, pos_inds, neg_inds, fp_inds, fn_inds = [], [], [], [], []
        # Offset that turns per-batch indices into dataset-wide indices;
        # kept as a plain int so the index arrays stay NumPy arrays.
        offset = 0
        for x, y in ds:
            labels = tf.reshape(y, shape = [-1])
            uncert.append(self.tf_score_fn(model, x).numpy())
            results = tf.reshape(MC_call_fast(model, x, 1000), shape = [-1])
            pred_pos = results >= 0.5
            pred_neg = results < 0.5
            pos_inds.append(self._flat_indices(pred_pos) + offset)
            neg_inds.append(self._flat_indices(pred_neg) + offset)
            fp_inds.append(self._flat_indices(tf.logical_and(pred_pos, ~labels)) + offset)
            fn_inds.append(self._flat_indices(tf.logical_and(pred_neg, labels)) + offset)
            offset += int(tf.shape(labels)[0])
        if not uncert:
            # np.concatenate cannot handle an empty list of batches.
            return tuple([0] * len(self.quantiles) for _ in range(4))
        uncert = np.concatenate(uncert, axis = 0)
        pos_inds = np.concatenate(pos_inds, axis = 0)
        neg_inds = np.concatenate(neg_inds, axis = 0)
        fp_inds = np.concatenate(fp_inds, axis = 0)
        fn_inds = np.concatenate(fn_inds, axis = 0)
        true_pos, false_pos = self._class_confusion(uncert, fp_inds, pos_inds)
        true_neg, false_neg = self._class_confusion(uncert, fn_inds, neg_inds)
        return (true_pos, false_pos, true_neg, false_neg)

    def perf_message(self):
        """Return the labels used for any printed output of the metric."""
        return (self.name + ' (validation true positives) {}:'.format(list(self.quantiles)),
                self.name + ' (validation false positives) {}:'.format(list(self.quantiles)),
                self.name + ' (validation true negatives) {}:'.format(list(self.quantiles)),
                self.name + ' (validation false negatives) {}:'.format(list(self.quantiles)))

    def get_metric(self, quantile = None):
        """Return false positives and false negatives for plotting.

        Parameters
        ----------
        quantile : float in self.quantiles, optional
            If specified, only the false positives and false negatives
            found with uncertainty scores less than or equal to the given
            quantile are returned (as two columns). Otherwise, the false
            positives for all quantiles followed by the false negatives for
            all quantiles are returned for each entry of status_history.

        Returns
        -------
        np.array
            Represents some plottable set of values from status_history.

        Raises
        ------
        Exception
            If quantile is given but is not one of self.quantiles.
        """
        if quantile is not None:
            if quantile not in self.quantiles:
                raise Exception('Please specify a quantile that the metric has recorded. Options are {}'.format(self.quantiles))
            ind = self.quantiles.index(quantile)
            return np.transpose(np.array([[stat[1][ind] for stat in self.status_history],
                                          [stat[3][ind] for stat in self.status_history]]))
        else:
            return np.array([stat[1] + stat[3] for stat in self.status_history])

    def get_legend(self, quantile = None):
        """Return a plot legend matching get_metric with the same argument.

        Parameters
        ----------
        quantile : float in self.quantiles, optional
            The method will return a legend appropriate for plotting
            get_metric, when get_metric is given the same arguments.

        Returns
        -------
        list of str
            A list of strings representing a plot legend for matplotlib.

        Raises
        ------
        Exception
            If quantile is given but is not one of self.quantiles.
        """
        if quantile is not None:
            if quantile not in self.quantiles:
                raise Exception('Please specify a quantile that the metric has recorded. Options are {}'.format(self.quantiles))
            return [self.name + 'false_positives_quantile_{}'.format(quantile),
                    self.name + 'false_negatives_quantile_{}'.format(quantile)]
        else:
            return ([self.name + 'false_positives_quantile_{}'.format(quant) for quant in self.quantiles]
                    + [self.name + 'false_negatives_quantile_{}'.format(quant) for quant in self.quantiles])

    def __getstate__(self):
        """Used to pickle the metric; the compiled function is dropped."""
        state = self.__dict__.copy()
        del state['tf_score_fn']
        return state

    def __setstate__(self, state):
        """Used to unpickle the metric; the compiled function is rebuilt."""
        self.__dict__.update(state)
        self.tf_score_fn = self._compile_score_fn()
# A metric which gives the precision, recall, and F score with various quantiles of an uncertainty metric excluded from the validation set.
class ValidationFScore(ValidationConfusionMatrix):
    """Record the precision, recall, and F score on the validation set.

    The values are derived from the confusion matrix computed by
    ValidationConfusionMatrix, and are likewise evaluated with points
    which score higher than specified quantiles on some specified
    uncertainty metric, evaluated over all points which have the same
    predicted classification, omitted from the validation set.

    Attributes
    ----------
    status_history : list of tuples of lists of floats
        Each entry has the form (<precision>, <recall>, <F score>), where
        each element is a list with one value per entry of quantiles. Each
        value is computed using only points with an uncertainty score
        (given by score_fn) less than or equal to the corresponding
        quantile (evaluated for all points of the same predicted class).
    name : str
        The unique identifier for the metric in the list of metrics traced
        by BFBLearner. By default this will be 'val_<score_fn>_fscore'.
    score_fn : callable
        A callable of the signature
        (tf.keras.model, tf.tensor(tf.float32, tf.float32))-> tf.tensor(tf.float32)
        or
        (tf.keras.model, tf.tensor(tf.float32, tf.float32), int)-> tf.tensor(tf.float32).
        The pre-implemented functions can be specified in the constructor
        by using any of 'BALD' (mutual information), 'MaxEntropy' (Shannon
        entropy), 'variation_ratios' (variation ratios),
        'predictive_variance' (variance of the prediction distribution), or
        'QBDC' (score*(1-score), where score is the Monte Carlo
        dropout-evaluated prediction for an input).
    tf_score_fn : jit-compiled callable
        Tensorflow jit-compiled version of score_fn.
    n_trials : int or None

    Parameters
    ----------
    score_fn : {'BALD', 'MaxEntropy', 'variation_ratios', 'random', 'QBDC', 'predictive_variance'} or callable
        Specifies the score function used as the uncertainty metric. A
        custom callable must have one of the signatures listed above,
        depending on whether or not n_trials is specified.
    name : str, optional
        Allows for a custom name of the metric. If not specified, a name
        will be automatically generated as 'val_<score_fn>_fscore'.
    quantiles : list of floats, default=[0.95, 1.]
        The uncertainty quantiles for which the metric is tracked.
    n_trials : int, optional
        For score_fn arguments that take a n_trials argument, which
        includes every pre-implemented score_fn except 'random', this
        argument can be specified here. If it is not specified, the default
        value for the given score_fn is used.
    """

    def __init__(self, score_fn = 'BALD', name = None, quantiles = [0.95, 1.], n_trials = None):
        auto_name = (name is None)
        score_fn, name = process_score_fn(score_fn, name)
        if auto_name:
            name = name + '_fscore'
        # The parent must be initialised for custom names as well;
        # previously it was only initialised for auto-generated names,
        # leaving the object without any of its attributes. The quantiles
        # are copied so the default list is never shared between instances.
        super().__init__(score_fn, name, list(quantiles), n_trials)

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is 0."""
        if denominator == 0:
            return 0.0
        return numerator / denominator

    def performance_check(self, model, ds):
        """Compute the precision, recall, and F score over the validation set.

        A ratio whose denominator is zero (for example the precision when
        no point is predicted positive) is reported as 0.0 instead of
        raising ZeroDivisionError.

        Returns
        -------
        tuple of lists of floats
            (precision, recall, F score), each with one value per entry of
            quantiles.
        """
        confusion = super().performance_check(model, ds)
        true_pos, false_pos, false_neg = confusion[0], confusion[1], confusion[3]
        prec = [self._safe_ratio(tp, tp + fp) for tp, fp in zip(true_pos, false_pos)]
        rec = [self._safe_ratio(tp, tp + fn) for tp, fn in zip(true_pos, false_neg)]
        fscore = [self._safe_ratio(2*p*r, p + r) for p, r in zip(prec, rec)]
        return (prec, rec, fscore)

    def perf_message(self):
        """Return the labels used for any printed output of the metric."""
        return (self.name + ' (validation precision) {}:'.format(list(self.quantiles)),
                self.name + ' (validation recall) {}:'.format(list(self.quantiles)),
                self.name + ' (validation F score) {}:'.format(list(self.quantiles)))

    def get_metric(self, quantile = None):
        """Return the F score history for plotting.

        Parameters
        ----------
        quantile : float in self.quantiles, optional
            If specified, only the F score found with uncertainty scores
            less than or equal to the given quantile is returned.
            Otherwise, the F scores for all quantiles are returned.

        Returns
        -------
        np.array
            Represents some plottable set of values from status_history.

        Raises
        ------
        Exception
            If quantile is given but is not one of self.quantiles.
        """
        if quantile is not None:
            if quantile not in self.quantiles:
                raise Exception('Please specify a quantile that the metric has recorded. Options are {}'.format(self.quantiles))
            ind = self.quantiles.index(quantile)
            return np.array([stat[2][ind] for stat in self.status_history])
        else:
            return np.array([stat[2] for stat in self.status_history])

    def get_legend(self, quantile = None):
        """Return a plot legend matching get_metric with the same argument.

        Parameters
        ----------
        quantile : float in self.quantiles, optional
            The method will return a legend appropriate for plotting
            get_metric, when get_metric is given the same arguments.

        Returns
        -------
        list of str
            A list of strings representing a plot legend for matplotlib.

        Raises
        ------
        Exception
            If quantile is given but is not one of self.quantiles.
        """
        if quantile is not None:
            if quantile not in self.quantiles:
                raise Exception('Please specify a quantile that the metric has recorded. Options are {}'.format(self.quantiles))
            return [self.name + ' quantile <= {}'.format(quantile)]
        else:
            return [self.name + ' quantile <= {}'.format(quant) for quant in self.quantiles]
class PoolScore(PoolMetricReduction):
    """Record reductions of score_fn evaluated on the pool of candidates.

    Applies the function score_fn to the pool of candidate points at every
    active learning iteration, before the model is trained on any new data
    drawn from the pool, and records specified reductions of these scores.

    Attributes
    ----------
    status_history : list of lists of floats (or np.float32)
        Each entry corresponds to the score_fn results applied to one
        active learning iteration's pool of candidate points, with the
        reduction(s) specified in the constructor. If the constructor
        specifies multiple reductions, each entry is a list with each
        value's reduction (so an entry will be [<mean>, <max>, <min>], for
        example).
    name : str
        A string which denotes a name for this metric. In a list of metrics
        passed to a BFBLearner class, the names of each member of the list
        should be unique. By default will be 'pool_<score_fn>'.
    score_fn : callable
        A callable of the signature
        (tf.keras.model, tf.tensor(tf.float32, tf.float32))-> tf.tensor(tf.float32)
        or
        (tf.keras.model, tf.tensor(tf.float32, tf.float32), int)-> tf.tensor(tf.float32).
        The pre-implemented functions can be specified in the constructor
        by using any of 'BALD' (mutual information), 'MaxEntropy' (Shannon
        entropy), 'variation_ratios' (variation ratios),
        'predictive_variance' (variance of the prediction distribution), or
        'QBDC' (score*(1-score), where score is the Monte Carlo
        dropout-evaluated prediction for an input).
    tf_score_fn : jit-compiled callable
        Tensorflow jit-compiled version of score_fn.
    n_trials : int or None

    Parameters
    ----------
    score_fn : {'BALD', 'MaxEntropy', 'variation_ratios', 'random', 'QBDC', 'predictive_variance'} or callable
        Specifies the score function that the metric will apply to the pool
        of candidate points. A custom callable must have one of the
        signatures listed above, depending on whether or not n_trials is
        specified.
    name : str, optional
        Allows for a custom name of the metric. If not specified, a name
        will be automatically generated as 'pool_<score_fn>'.
    reduction : {'mean', 'min', 'max'} or a list of these values, default=['mean','min','max']
        Specifies what reductions should be done on the scores computed for
        the pool candidate points. If a list is specified, all reductions
        in the list are computed.
    n_trials : int, optional
        For score_fn arguments that take a n_trials argument, which
        includes every pre-implemented score_fn except 'random', this
        argument can be specified here. If it is not specified, the default
        value for the given score_fn is used.
    """

    def __init__(self, score_fn = 'BALD', name = None, reduction = ['mean', 'min', 'max'], n_trials = None):
        score_fn, name = process_score_fn(score_fn, name)
        super().__init__(name, reduction)
        self.score_fn = score_fn
        self.n_trials = n_trials
        self.tf_score_fn = self._compile_score_fn()

    def _compile_score_fn(self):
        """Build the jit-compiled score function from score_fn and n_trials."""
        target = self.score_fn
        if self.n_trials is not None:
            target = partial(self.score_fn, n_trials=self.n_trials)
        return tf.function(target, jit_compile=True)

    def performance_check(self, *args):
        """Reduce the score function evaluated on a batch of pool candidates.

        Parameters
        ----------
        *args : (tf.keras.Model, tf.tensor(tf.float32, tf.float32)) or tf.tensor(tf.float32)
            If the first argument is a tf.keras.Model, all arguments are
            forwarded to the compiled score function. Otherwise the first
            argument is taken to be a tensor of already-evaluated scores
            (for instance when score_fn is also the acquisition function
            used in active learning) and is used directly.

        Returns
        -------
        list of np.float32
            One value per reduction in self.reduction for this batch. These
            are combined into a single entry of status_history by the
            methods in the parent class.
        """
        first = args[0]
        already_scored = not isinstance(first, tf.keras.Model)
        scores = first.numpy() if already_scored else self.tf_score_fn(*args).numpy()
        return [reduce_fn(scores) for reduce_fn in self.reduction]

    def __getstate__(self):
        """Used to pickle the metric; the compiled function is dropped."""
        state = dict(self.__dict__)
        del state['tf_score_fn']
        return state

    def __setstate__(self, state):
        """Used to unpickle the metric; the compiled function is rebuilt."""
        self.__dict__.update(state)
        self.tf_score_fn = self._compile_score_fn()
[docs] class NewDataScore(TrainMetric): """A metric which applies the function score_fn to the set of points that are added to the training set at every active learning iteration, before the model is trained on the new data. Evaluates score_fn on these points and records specified reductions of these scores. Attributes ---------- status_history : list of lists of floats (or np.float32) Each entry in this status_history object has entries corresponding to the score_fn results applied to each active learning iteration's new training data, with reduction(s) specified in the constructor. If the constructor specifies multiple reductions, each entry is a list with each value's reduction (so an entry will be [<mean>, <max>, <min>], for example). name : str A string which denotes a name for this metric. In a list of metrics passed to a BFBLearner class, the names of each member of the list should be unique. By default will be 'train_<score_fn>' score_fn : callable A callable of the signature (tf.keras.model, tf.tensor(tf.float32, tf.float32))-> tf.tensor(tf.float32) or (tf.keras.model, tf.tensor(tf.float32, tf.float32), int)-> tf.tensor(tf.float32). The pre-implemented functions for different uncertainty metrics can be specified in the constructor by using any of 'BALD' (mutual information), 'MaxEntropy' (Shannon entropy), 'variation_ratios' (variation ratios), 'predictive_variance' (variance of the prediction distribution), or 'QBDC' (score*(1-score), where score is the Monte Carlo dropout-evaluated prediction for an input) tf_score_fn : jit-compiled callable Tensorflow jit-compiled version of score_fn. n_trials : int or None reduction : a list containing elements of {np.mean, np.min, np.max} This is the reduction that is performed on the scores to produce the entries in status_history. If a list of reductions are applied, then the elements of status_history will be lists with each element being a different reduction being applied to the scores. 
red_name : a list containing elements of {'mean', 'min', 'max'} This list of strings will contain the same information as reduction, but is used for labelling purposes. Parameters ---------- score_fn : {'BALD', 'MaxEntropy', 'variation_ratios', 'random', 'QBDC', 'predictive_variance'} or callable Specifies the score function that the metric will apply to the pool of candidate points. If a callable (corresponding to a custom score function) is used, it must have the signature (tf.keras.model, tf.tensor(tf.float32, tf.float32))-> tf.tensor(tf.float32) or (tf.keras.model, tf.tensor(tf.float32, tf.float32), int)-> tf.tensor(tf.float32), depending on whether or not n_trials is specified. name : str, optional Allows for a custom name of the metric. If this argument is not specified, a name will be automatically generated as 'train_<score_fn>'. red : {'mean', 'min', 'max'} or a list of these values, default=['mean','min','max'] Specifies what reductions should be done on the scores computed for the new training data. If a list is specified, all reductions in the list are computed. n_trials : int, optional For score_fn arguments that take a n_trials argument, which includes every pre-implemented score_fn except 'random', this argument can be specified here. If it is not specified, the default value for the given score_fn is used. """ def __init__(self, score_fn = 'BALD', name = None, red = ['mean', 'min', 'max'], n_trials = None): self.reduction, self.red_name = _get_reduction(red) score_fn, name = process_score_fn(score_fn, name) super().__init__(name = name) self.score_fn = score_fn self.n_trials = n_trials if n_trials is not None: self.tf_score_fn = tf.function(partial(score_fn, n_trials = n_trials), jit_compile = True) else: self.tf_score_fn = tf.function(score_fn, jit_compile = True)
[docs] def performance_check(self, model, lams, labels): """Computes score_fn on the points added to the training set with each active learning iteration, and then records the specified reductions. """ score = self.tf_score_fn(model, lams).numpy() return [red(score) for red in self.reduction]
[docs] def perf_message(self): """The perf_message method labels any printed output of the metric. """ return self.name + ' (new queried data) {}:'.format(list(self.red_name))
[docs] def get_metric(self, reduction = None): """In this subclass, get_metric can take a keyword argument. Parameters ---------- reduction : str in self.red_name, optional If specified, get_metric will only return the values corresponding to the specified reduction. If not specified, get_metric will return the status_history object in its entirety. Returns ------- np.array Represents some plottable set of values from status_history. """ if reduction is not None: _check_reduction(reduction, self.red_name) return np.transpose(np.array([[stat[self.red_name.index(reduction)] for stat in self.status_history]])) else: return np.array(self.status_history)
[docs] def get_legend(self, reduction = None): """In this subclass, get_legend can take a keyword argument. Parameters ---------- reduction : str in self.red_name, optional get_legend will return a legend consistent with the get_metric result with the same reduction argument passed. Returns ------- list of str An argument to specify a legend in matplotlib. """ if reduction is not None: return [reduction + '_' + self.name] else: return [red + '_' + self.name for red in self.red_name]
def __getstate__(self):
    """Return the picklable state of the metric.

    The compiled ``tf_score_fn`` wrapper is left out of the state; it is
    rebuilt from ``score_fn`` and ``n_trials`` in ``__setstate__``.
    """
    state = dict(self.__dict__)
    # pop() without a default keeps the KeyError if the attribute is missing.
    state.pop('tf_score_fn')
    return state

def __setstate__(self, state):
    """Restore the metric from pickled state and rebuild ``tf_score_fn``.

    When ``n_trials`` is set it is bound to ``score_fn`` as a keyword
    argument before wrapping, mirroring the construction in ``__init__``.
    """
    self.__dict__.update(state)
    if self.n_trials is None:
        wrapped = self.score_fn
    else:
        wrapped = partial(self.score_fn, n_trials = self.n_trials)
    self.tf_score_fn = tf.function(wrapped, jit_compile = True)
class StoppingCondition:
    """A generic class for implementing early stopping conditions for
    active learning. A StoppingCondition object is called each round
    immediately after the metric it follows is evaluated. If the call
    returns True, the active learning loop is terminated.

    Attributes
    ----------
    metric_name : str
        Name of the performance metric (that is, the ALMetric object's
        name str) that the StoppingCondition should track.

    metric_func : callable
        Takes a metric and an (optional) integer index and returns True
        if the stopping condition has been met, False otherwise. To be
        usable with find_stopping_index, metric_func MUST accept the
        optional integer argument, because that method always passes one.

    Parameters
    ----------
    metric_name : str
        Name of the performance metric that the StoppingCondition should
        track.

    metric_func : callable
        Takes a metric and an (optional) integer index and returns True
        if the stopping condition has been met, False otherwise.
    """

    def __init__(self, metric_name, metric_func):
        self.metric_name, self.metric_func = metric_name, metric_func

    def __call__(self, metrics_dict, ind = None):
        """Apply the stopping condition to the metric it follows.

        Parameters
        ----------
        metrics_dict : dict
            Maps the names of ALMetric objects (or rather child classes
            of that class) to the objects themselves, i.e.
            {metric.name : metric}.

        ind : int, optional
            If specified, it is forwarded to metric_func, which is
            expected to consider only status_history[:ind] instead of the
            full status_history. This is useful for retroactively
            determining if a stopping condition would have eliminated
            unnecessary active learning iterations.

        Returns
        -------
        bool
            True if the StoppingCondition determines we should stop
            active learning, False otherwise.

        Raises
        ------
        Exception
            If metric_name is not a key of metrics_dict.
        """
        try:
            tracked = metrics_dict[self.metric_name]
        except KeyError:
            known = list(metrics_dict.keys())
            raise Exception('metric_name is not within the metrics being recorded by ActiveLearning. Must be one of {}.'.format(known))
        # Only pass the index along when the caller supplied one, so that
        # single-argument metric functions keep working.
        call_args = (tracked,) if ind is None else (tracked, ind)
        return self.metric_func(*call_args)

    def find_stopping_index(self, metrics_dict):
        """Compute the index (active learning iteration) at which this
        StoppingCondition WOULD have stopped active learning if it were
        applied to the metrics of an already-trained BFBLearner object.

        Parameters
        ----------
        metrics_dict : dict
            Maps the names of ALMetric objects to the objects themselves,
            extracted from a trained BFBLearner object.

        Returns
        -------
        int
            The first round (counting from 1) at which the condition is
            met, or -1 if it is never met.
        """
        n_rounds = len(metrics_dict[self.metric_name].status_history)
        candidate_rounds = range(1, n_rounds + 1)
        return next((rnd for rnd in candidate_rounds if self(metrics_dict, rnd)), -1)
class ScoreNotDecreasing(StoppingCondition):
    """A stopping condition based on when an uncertainty score (in
    particular BALD or variation ratios) is not decreasing over some data
    set (usually the pool of candidate points proposed by the classifier
    or the set of training points added as training data). Because mutual
    information, variation ratios, and predictive variance are all in
    theory metrics of epistemic uncertainty (or in the case of the second,
    at least highly sensitive to it), some measurement of these scores
    should be decreasing as more data is added. If it's not, then the
    network probably reached close to the highest performance it's capable
    of attaining.

    Attributes
    ----------
    metric_name : str
        Name of the performance metric (that is, the ALMetric object's
        name str) that the StoppingCondition should track.

    metric_func : callable
        Takes a metric and an (optional) integer index and returns True
        if the stopping condition has been met, False otherwise. Here it
        checks whether the monitored reduction of the score has failed to
        reach a new minimum within the last ``patience`` rounds.

    reduction : {'mean', 'min', 'max', 'std'}
        Must be some reduction over the score that the metric specified
        by metric_name has evaluated. This is the specific quantity that
        the stopping condition monitors.

    patience : int, default=5
        The number of rounds without achieving a new minimum for its
        monitored quantity that the stopping condition tolerates before
        halting active learning.

    Parameters
    ----------
    metric_name : str
        Name of the performance metric that the StoppingCondition should
        track.

    reduction : {'mean', 'min', 'max', 'std'}
        Must be some reduction over the score that the metric specified
        by metric_name has evaluated.

    patience : int, default=5
        The number of rounds without achieving a new minimum for its
        monitored quantity that the stopping condition tolerates before
        halting active learning.
    """

    def __init__(self, metric_name, reduction = 'mean', patience = 5):
        self.reduction = reduction
        self.patience = patience

        def metric_func(metric, ind = None):
            # A scoring metric is recognised by carrying both a list of
            # reduction names and a score function.
            is_scoring_metric = hasattr(metric, 'red_name') and hasattr(metric, 'score_fn')
            if not is_scoring_metric:
                raise Exception('The specified metric is not a scoring metric, and so this stopping condition is not applicable.')

            n_rounds = len(metric.status_history) if ind is None else ind
            if n_rounds < self.patience:
                return False

            column = metric.red_name.index(self.reduction)
            seen = [entry[column] for entry in metric.status_history[:n_rounds]]
            lowest = min(seen)
            window_start = n_rounds - self.patience

            # The minimum was first reached inside the patience window:
            # the score is still making progress.
            if seen.index(lowest) in range(window_start, n_rounds):
                return False
            return all([value >= lowest for value in seen[window_start:]])

        super().__init__(metric_name, metric_func)
class AccuracyNotImproving(StoppingCondition):
    """A stopping condition that monitors either a ModelEvaluation or
    MCModelEvaluation metric and stops the active learning after some
    number of rounds have passed without achieving a new maximum accuracy.

    Attributes
    ----------
    metric_name : str
        Name of the performance metric (that is, the ALMetric object's
        name str) that the StoppingCondition should track.

    metric_func : callable
        Takes a metric and an (optional) integer index and returns True
        if the stopping condition has been met, False otherwise. Here it
        checks whether the accuracy entry (element 0 of every
        ``status_history`` record) has failed to reach a new maximum
        within the last ``patience`` rounds.

    patience : int, default=5
        The number of rounds without achieving a new maximum for its
        monitored quantity that the stopping condition tolerates before
        halting active learning.

    Parameters
    ----------
    metric_name : str
        Name of the performance metric that the StoppingCondition should
        track.

    patience : int, default=5
        The number of rounds without achieving a new maximum for its
        monitored quantity that the stopping condition tolerates before
        halting active learning.
    """

    def __init__(self, metric_name, patience = 5):
        self.patience = patience

        def metric_func(metric, ind = None):
            if (not isinstance(metric, ModelEvaluation)) and (not isinstance(metric, MCModelEvaluation)):
                raise Exception('The specified metric is not a ModelEvaluation or MCModelEvaluation metric, and so this stopping condition is not applicable.')
            if ind is None:
                ind = len(metric.status_history)
            if ind < self.patience:
                return False

            # Extract the accuracy component once and use it for every
            # comparison. Looking the maximum up in the list of whole
            # records (as opposed to their accuracy entries) cannot match
            # a scalar and fails with ValueError.
            accuracies = [stat[0] for stat in metric.status_history[:ind]]
            max_accuracy = max(accuracies)
            max_accuracy_arg = accuracies.index(max_accuracy)
            window_start = ind - self.patience

            # The maximum was first reached inside the patience window:
            # accuracy is still improving.
            if max_accuracy_arg in range(window_start, ind):
                return False
            return all(acc <= max_accuracy for acc in accuracies[window_start:])

        super().__init__(metric_name, metric_func)
class FScoreNotImproving(StoppingCondition):
    """A stopping condition that monitors a ValidationConfusionMatrix or
    ValidationFScore metric and stops the active learning after some
    number of rounds have passed without achieving a new maximum F score.

    Attributes
    ----------
    metric_name : str
        Name of the performance metric (that is, the ALMetric object's
        name str) that the StoppingCondition should track.

    metric_func : callable
        Takes a metric and an (optional) integer index and returns True
        if the stopping condition has been met, False otherwise. Here it
        checks whether the F score evaluated over some uncertainty
        quantile (see ValidationConfusionMatrix and ValidationFScore
        documentation for details) has failed to reach a new maximum in
        ``patience`` rounds.

    quant : float, default=1.0
        The uncertainty quantile that the stopping condition should
        check. Default is 1.0, meaning the entire validation set is
        considered. Read at evaluation time, so changing the attribute
        after construction takes effect.

    patience : int, default=5
        The number of rounds without achieving a new maximum for its
        monitored quantity that the stopping condition tolerates before
        halting active learning.

    Parameters
    ----------
    metric_name : str
        Name of the performance metric that the StoppingCondition should
        track.

    quant : float, default=1.0
        The uncertainty quantile that the stopping condition should
        check.

    patience : int, default=5
        The number of rounds without achieving a new maximum for its
        monitored quantity that the stopping condition tolerates before
        halting active learning.
    """

    def __init__(self, metric_name, quant = 1.0, patience = 5):
        self.patience = patience
        self.quant = quant

        def metric_func(metric, ind = None):
            if (not isinstance(metric, ValidationConfusionMatrix)):
                raise Exception('The specified metric is not ValidationConfusionMatrix, and so this stopping condition is not applicable.')
            try:
                # Use the attribute (not the constructor argument) so that
                # quant behaves like patience when modified later.
                q_index = metric.quantiles.index(self.quant)
            except ValueError:
                raise Exception('the specified quantile is not recorded in the metric. Please specify one of {}'.format(metric.quantiles))
            if ind is None:
                ind = len(metric.status_history)
            if ind < self.patience:
                return False

            # After the transpose, the first axis indexes quantiles and
            # the second indexes active learning rounds.
            if isinstance(metric, ValidationFScore):
                # NOTE(review): presumably element 2 of each record is the
                # F score for this metric type - confirm against
                # ValidationFScore.
                fscore = np.array([stat[2] for stat in metric.status_history]).T
            else:
                # NOTE(review): record layout inferred from the variable
                # names (0: true positives, 1: false positives,
                # 3: false negatives) - confirm against
                # ValidationConfusionMatrix.
                true_pos = np.array([stat[0] for stat in metric.status_history]).T
                false_pos = np.array([stat[1] for stat in metric.status_history]).T
                false_neg = np.array([stat[3] for stat in metric.status_history]).T
                prec = true_pos / (true_pos + false_pos)
                rec = true_pos / (true_pos + false_neg)
                # NOTE(review): rounds with zero true positives produce NaN
                # here, which propagates into max/argmax below.
                fscore = 2*prec*rec / (prec + rec)

            history = fscore[q_index][:ind]
            max_fscore = max(history)
            max_fscore_arg = np.argmax(history)
            window_start = ind - self.patience
            last_status_history = fscore[q_index][window_start:ind]

            # The maximum was first reached inside the patience window:
            # the F score is still improving.
            if max_fscore_arg in range(window_start, ind):
                return False
            # Convert the NumPy boolean to a plain bool, as documented.
            return bool(np.all(last_status_history <= max_fscore))

        super().__init__(metric_name, metric_func)
class DeltaFNotDecreasing(StoppingCondition):
    """A stopping condition that monitors a PoolDeltaF or UnlabelledDeltaF
    metric and stops active learning once the classifier's estimated
    change in F score has not decreased for a specified number of rounds.

    Attributes
    ----------
    metric_name : str
        Name of the performance metric (that is, the ALMetric object's
        name str) that the StoppingCondition should track.

    metric_func : callable
        Takes a metric and an (optional) integer index and returns True
        if the stopping condition has been met, False otherwise. Here it
        checks whether the estimated change in F score has failed to
        reach a new minimum in ``patience`` rounds.

    patience : int, default=5
        The number of rounds without achieving a new minimum for its
        monitored quantity that the stopping condition tolerates before
        halting active learning.

    Parameters
    ----------
    metric_name : str
        Name of the performance metric that the StoppingCondition should
        track.

    patience : int, default=5
        The number of rounds without achieving a new minimum for its
        monitored quantity that the stopping condition tolerates before
        halting active learning.
    """

    def __init__(self, metric_name, patience = 5):
        self.patience = patience

        def metric_func(metric, ind = None):
            if (not isinstance(metric, PoolDeltaF)) and (not isinstance(metric, UnlabelledDeltaF)):
                # Both accepted metric types are named so the message
                # matches the check above.
                raise Exception('The specified metric is not a PoolDeltaF or UnlabelledDeltaF metric, and so this stopping condition is not applicable.')
            if ind is None:
                ind = len(metric.status_history)
            if ind < self.patience:
                return False

            # Each status_history record is compared directly, i.e. it is
            # treated as a single orderable value.
            scores = list(metric.status_history[:ind])
            min_score = min(scores)
            min_score_arg = scores.index(min_score)
            window_start = ind - self.patience

            # The minimum was first reached inside the patience window:
            # the estimate is still decreasing.
            if min_score_arg in range(window_start, ind):
                return False
            last_status_history = metric.status_history[window_start:ind]
            return all(elem >= min_score for elem in last_status_history)

        super().__init__(metric_name, metric_func)