Source code for bfbrain.Data_Manager

"""A module containing the DataManager class, which handles the generation 
and labelling of training and validation data for the BFBLearner class.
"""

from bfbrain.Jax_Oracle import label_func, test_labeller
import jax
import sympy as sym
from sympy import lambdify, re

import tensorflow as tf
import numpy as np

from numpy.random import default_rng
from numpy.random import SeedSequence
from bfbrain.Hypersphere_Formulas import rand_nsphere, convert_from_polar_sym


[docs]
class labeller_wrapper:
    """Wrapper for the labelling function which serves as the active 
    learning oracle.

    Attributes
    ----------
    func : callable
        A numeric function for the potential. This is a numeric function 
        generated by the class DataManager in its init method. It will 
        take numeric arrays (of a format depending on the DataManager 
        class) representing a scalar vev and a set of quartic potential 
        coefficients and return the numerical value of the quartic part 
        of the potential function and its gradient with respect to the vev.

    phi_len : int
        The number of real parameters necessary to uniquely specify a vev 
        in the model.

    lam_len : int
        The number of independent real quartic coefficients in the model.
        Stored on the instance; the wrapper itself does not pass it to 
        the oracle.

    rng : np.random.Generator
        The random number generator which governs any random processes 
        that the oracle may use.

    polar : bool
        If True, then the analysis of the potential will be conducted with 
        a polar coordinate parameterization of the vev parameters. 
        If False, then Cartesian coordinates will be used, albeit with the 
        vev parameters restricted to a phi_len-dimensional unit 
        hypersphere.

    label_fn : callable, optional.
        A function that takes a 2-D NumPy array of quartic coefficients 
        and returns a list of Boolean labels for them. This is for 
        implementing customized oracle functions. Must have the signature 
        (func: Callable, phi_len: int, polar: bool, rng: NumpyGenerator, lam: np.array(np.float, np.float), **kwargs) -> np.array(bool)
        If this argument is not specified, the default oracle 
        BFBrain.Jax_Oracle.label_func is used.

    label_check : callable, optional
        A function that can be used to test the reliability of a custom 
        oracle given by label_fn, or if label_fn is None, the default 
        oracle BFBrain.Jax_Oracle.label_func. Must have the same signature 
        as label_fn, up to additional keyword arguments. If this argument 
        is not specified, a tester for the default oracle is used: 
        BFBrain.Jax_Oracle.test_labeller

    label_kwargs : dict, optional
        A dictionary of additional keyword arguments needed for the 
        labelling function label_fn. If None (the default), a new 
        dict(niter = 250) is created for this instance; that value is 
        applicable for the default oracle function, 
        BFBrain.Jax_Oracle.label_func. A dictionary that is passed in 
        (including an empty one) is stored as-is, without copying.
    """
    def __init__(self, func, phi_len, lam_len, rng, polar, label_fn = None, label_check = None, label_kwargs = None):
        self.func = func
        self.phi_len = phi_len
        self.lam_len = lam_len
        self.rng = rng
        self.polar = polar
        # Build the default per instance. A dict literal in the signature 
        # would be one shared object, so mutating one wrapper's 
        # label_kwargs would silently change every later wrapper's default.
        self.label_kwargs = dict(niter = 250) if label_kwargs is None else label_kwargs
        # Fall back to the default oracle and its tester when no custom 
        # callables are supplied.
        self.label_fn = label_func if label_fn is None else label_fn
        self.label_check = test_labeller if label_check is None else label_check

    def do_labelling(self, lam, label_kwargs = None):
        """Performs labelling using the class's oracle function.

        Parameters
        ----------
        lam : np.array(np.float32, np.float32)
            A 2-D NumPy array of sets of quartic potential coefficients.

        label_kwargs : dict, optional
            An optional alternative set of oracle keyword arguments. If 
            not specified, the class instance's label_kwargs
            attribute is used.

        Returns
        -------
        np.array(bool)
            A 1-D NumPy array of labels for lam, for which points that 
            are bounded from below are labelled "True" and points which 
            are not are labelled "False".
        """
        # A per-call override replaces (does not merge with) the stored 
        # keyword arguments.
        oracle_kwargs = self.label_kwargs if label_kwargs is None else label_kwargs
        return self.label_fn(self.func, self.phi_len, self.polar, self.rng, lam, **oracle_kwargs)

    def check_labeller(self, lam, **tester_kwargs):
        """Tests the reliability of the labelling-- calls 
        self.label_check. Depending on the methodology of label_fn, 
        this function may or may not be useful. For example, a rigorous 
        computation of boundedness-from-below based on resultants would 
        not require any consistency or reliability checks.

        Parameters
        ----------
        lam : np.array(np.float32, np.float32)
            A 2-D NumPy array of sets of quartic potential coefficients.

        tester_kwargs : dict
            A set of keyword arguments for self.label_check. These are 
            merged on top of the instance's label_kwargs, so a key given 
            here overrides the stored value.

        Returns
        -------
        Any
            Will return what self.label_check returns.
        """
        merged_kwargs = {**self.label_kwargs, **tester_kwargs}
        return self.label_check(self.func, self.phi_len, self.polar, self.rng, lam, **merged_kwargs)




[docs]
class np_data:
    """Holds labelled sets of quartic coefficients in CPU memory in a 
    format that's easy to save, load, and manipulate.

    Attributes
    ----------
    pos : np.array(np.float32, np.float32)
        A 2-D NumPy array of sets of quartic coefficients in the 
        potential, which the labeller has determined are 
        bounded-from-below.

    neg : np.array(np.float32, np.float32)
        A 2-D NumPy array of sets of quartic coefficients in the 
        potential, which the labeller has determined are NOT 
        bounded-from-below.
    """
    def __init__(self, pos, neg):
        self.pos = pos
        self.neg = neg

    @classmethod
    def from_file(cls, path):
        """A constructor for loading an np_data object from a .npz file 
        (see NumPy documentation), likely created in a previous BFBrain 
        analysis.

        Parameters
        ----------
        path : str
            A string with a file name. '.npz' is appended to the end of 
            the string, and should not be included in path.

        Returns
        -------
        np_data
        """
        # np.load returns an NpzFile that holds the archive open; the 
        # context manager closes it once the arrays have been copied out.
        with np.load(path+'.npz') as npz_data:
            return cls(pos = np.copy(npz_data['pos']), neg = np.copy(npz_data['neg']))

    def save_data(self, path):
        """Saves the data object to the filepath specified as an npz 
        object (see NumPy documentation)

        Parameters
        ----------
        path : str
            A string with a file name. If .npz is not at the end of the 
            string, it is appended to it.
        """
        np.savez(path, pos = self.pos, neg = self.neg)

    @staticmethod
    def _extend(existing, incoming):
        """Return existing with the rows of incoming appended along 
        axis 0. Helper for append_data.
        """
        if len(incoming) == 0:
            return existing
        if len(existing) == 0 and np.ndim(existing) != np.ndim(incoming):
            # An empty placeholder such as np.array([]) has shape (0,) 
            # and cannot be concatenated with a (k, n) array, so the 
            # incoming rows (copied) simply become the contents.
            return np.array(incoming)
        return np.concatenate((existing, incoming), axis=0)

    def append_data(self, new_data):
        """Given another np_data object, appends its data to this 
        object in place. Empty arrays in new_data leave the 
        corresponding array of this object untouched.

        Parameters
        ----------
        new_data: np_data
        """
        self.pos = self._extend(self.pos, new_data.pos)
        self.neg = self._extend(self.neg, new_data.neg)

    def n_elements(self):
        """
        Computes the total number of sets of quartic coefficients in 
        the object (both bounded-from-below and not bounded-from-below)

        Returns
        -------
        int
            The total number of sets of quartic coefficients in the 
            np_data object.
        """
        return len(self.pos) + len(self.neg)


        

[docs]
class DataManager:
    """A class containing methods which process and generate data. 
    Note that this class contains all the random number generation that's 
    not specifically associated with the neural network and its optimizer.
    Generally one should use the 'from_seed' or 'from_file' constructor 
    rather than constructing from the base initialization method.

    Attributes
    ----------
    phi_len : int
        The number of independent real parameters needed to uniquely 
        specify a vev in the model.

    lam_len : int
        The number of independent real quartic potential coefficients 
        in the model.

    rng : list of numpy.random.Generator
        A list of NumPy random number generators which control all the 
        random generation related to the generation and labelling of data. 
        In total there are 6 random number generators, each differently 
        seeded using NumPy's SeedSequence.spawn method. Each random number 
        generator is used only for one specific task: Generating training 
        data, generating validation data, generating random points in the 
        vicinity of other points (two rng's are used here, one for 
        rotation direction and the other for rotation angle), 
        doing random number generation associated with labelling, and 
        shuffling data for training.

    polar : bool
        If true, the potential is analyzed with the vev coordinates 
        converted to a polar form. If false, they are analyzed in their 
        Cartesian form.

    sym_expr : SymPy expression
        Represents the potential function in a form that is both picklable 
        and can easily be used to generate the gradient symbolically.

    sym_grad_expr : SymPy expression
        Represents the gradient of the potential function in a form that 
        is both picklable and can easily be used to generate the gradient 
        symbolically.

    phisym_var : sympy.Array
        The symbols representing the vev parameters in sym_expr 
        (angular variables if polar is True, Cartesian components 
        otherwise).

    lamsym : sympy.Array
        The symbols representing the quartic potential coefficients in 
        sym_expr.

    lambdify_mode : {'jax', 'numpy', 'scipy', 'math', 'mpmath', 'numexpr', 'sympy', 'tensorflow'}
        Passed directly as the argument "modules" in sympy.lambdify, the 
        function used to generate numerical functions from the symbolic 
        expression for the scalar potential. Default value is 'jax', 
        consistent with the default oracle function, 
        BFBrain.Jax_Oracle.label_func

    label_fn : callable, optional
        A function that takes a 2-D NumPy array of quartic coefficients 
        and returns a list of Boolean labels for them. This is for 
        implementing customized oracle functions. Must have the signature 
        (func: Callable, phi_len: int, polar: bool, rng: NumpyGenerator, lam: np.array(np.float, np.float), **kwargs) -> np.array(bool)
        If this argument is not specified, the default oracle 
        BFBrain.Jax_Oracle.label_func is used.

    label_check : callable, optional
        A function that can be used to test the reliability of a custom 
        oracle given by label_fn, or if label_fn is None, the default 
        oracle BFBrain.Jax_Oracle.label_func. Must have the same signature 
        as label_fn, up to additional keyword arguments. If this argument 
        is not specified, a tester for the default oracle is used: 
        BFBrain.Jax_Oracle.test_labeller

    **label_kwargs : dict, optional
        A dictionary of additional keyword arguments needed for the 
        labelling function label_func.
    """
    def __init__(self, phi_len, lam_len, rng, polar, sym_expr, sym_grad_expr, phisym_var, lamsym, lambdify_mode = 'jax', label_fn = None, label_check = None, **label_kwargs):
        """Stores the model description and builds the labelling oracle.

        The symbolic potential and its gradient are turned into numeric 
        functions with sympy.lambdify, combined into a single callable 
        returning (value, gradient), and handed to a labeller_wrapper 
        together with the last random number generator in rng.
        """
        # Keep every constructor argument; __setstate__ relies on these 
        # to rebuild the labeller after unpickling.
        self.phi_len, self.lam_len = phi_len, lam_len
        self.rng, self.polar = rng, polar
        self.sym_expr, self.sym_grad_expr = sym_expr, sym_grad_expr
        self.phisym_var, self.lamsym = phisym_var, lamsym
        self.lambdify_mode = lambdify_mode
        self.label_fn, self.label_check = label_fn, label_check
        self.label_kwargs = label_kwargs

        def _numeric(expr):
            # Numeric version of a symbolic expression in (vev, lambda).
            return lambdify([phisym_var, lamsym], expr, lambdify_mode)

        # Each backend gets its own compilation wrapper; the tensorflow 
        # branch additionally stacks the gradient components.
        if lambdify_mode == 'jax':
            value_fn = jax.jit(_numeric(sym_expr))
            grad_fn = jax.jit(_numeric(sym_grad_expr))

            def min_func(phi, lam):
                return value_fn(phi, lam), grad_fn(phi, lam)

            min_func = jax.jit(min_func)
        elif lambdify_mode == 'tensorflow':
            value_fn = tf.function(_numeric(sym_expr))
            grad_fn = tf.function(_numeric(sym_grad_expr))

            def min_func(phi, lam):
                return value_fn(phi, lam), tf.stack(grad_fn(phi, lam))

            min_func = tf.function(min_func)
        else:
            value_fn = _numeric(sym_expr)
            grad_fn = _numeric(sym_grad_expr)

            def min_func(phi, lam):
                return value_fn(phi, lam), grad_fn(phi, lam)

        # rng[5] is the generator reserved for the labeller.
        self.labeller = labeller_wrapper(
            min_func,
            self.phi_len,
            self.lam_len,
            self.rng[5],
            self.polar,
            self.label_fn,
            self.label_check,
            self.label_kwargs,
        )
    


[docs]
    @classmethod
    def from_func(cls, sym_func, phi_len, lam_len, seed = None, polar = False, lambdify_mode = 'jax', label_fn = None, label_check = None, **label_kwargs):
        """Preferred constructor for initializing DataManager.

        Parameters
        ----------
        sym_func : SymPy function.
            A SymPy function that expresses the quartic part of the 
            potential. Must have the signature 
            (sympy.Array, sympy.Array) -> sympy.Expr, where the first 
            sympy.Array object corresponds to the vev configuration
            and the second corresponds to the quartic coefficients 
            in the potential.

        phi_len : int
            The number of real parameters needed to uniquely specify the 
            vev in the model.

        lam_len : int
            The number of independent real quartic coupling coefficients 
            in the model's potential function.

        seed : int, optional
            A random number seed. Used to spawn a sequence of six random 
            generators with SeedSequence.

        polar : bool, default=False
            If true, the potential is analyzed with the vev coordinates 
            converted to a polar form. If false, they are analyzed in 
            their Cartesian form.

        lambdify_mode : {'jax', 'numpy', 'scipy', 'math', 'mpmath', 'numexpr', 'sympy', 'tensorflow'}
            The "module" input to sympy.lambdify, used to extract 
            numerical expressions from the symbolic SymPy function. 
            See SymPy documentation for details.

        label_fn : callable, optional
            A function that takes a 2-D NumPy array of quartic 
            coefficients and returns a list of Boolean labels for them. 
            This is for implementing customized oracle functions. Must have the signature 
            (func: Callable, phi_len: int, polar: bool, rng: numpy.random.Generator, lam: np.array(np.float, np.float), **kwargs) -> np.array(bool)
            If this argument is not specified, the default oracle 
            BFBrain.Jax_Oracle.label_func is used.

        label_check : callable, optional
            A function that can be used to test the reliability of a 
            custom oracle given by label_fn, or if label_fn is None, the 
            default oracle BFBrain.Jax_Oracle.label_func. Must have the 
            same signature as label_fn, up to additional keyword arguments.
            If this argument is not specified, a tester for the 
            default oracle is used: BFBrain.Jax_Oracle.test_labeller

        **label_kwargs : dict, optional
            A dictionary of additional keyword arguments needed for the 
            labelling function label_func.

        Returns
        -------
        DataManager
            An instance built from the symbolic potential, its symbolic 
            gradient, and the freshly spawned random number generators.
        """
        # Symbols for the Cartesian vev components and the quartic 
        # coefficients.
        phisym = sym.Array(sym.symbols(f'phi:{phi_len}', real = True))
        lamsym = sym.Array(sym.symbols(f'lambda:{lam_len}', real = True))

        # Six independently seeded generators, one per child seed.
        rng = [default_rng(child) for child in SeedSequence(seed).spawn(6)]

        # Symbolic expression for the quartic part of the potential, in 
        # whichever vev variables the analysis will use.
        if polar:
            # One fewer angular variable than Cartesian components.
            vev_symbols = sym.Array(sym.symbols(f'theta:{phi_len-1}', real = True))
            potential = sym_func(phisym, lamsym)
            in_polar = potential.subs(zip(phisym, convert_from_polar_sym(vev_symbols)))
            # Simplify coefficient-by-coefficient, then contract with 
            # the coefficients again.
            coeff_terms = re(in_polar.diff(lamsym)).applyfunc(sym.simplify)
            sym_expr = coeff_terms.dot(lamsym)
        else:
            vev_symbols = phisym
            sym_expr = re(sym_func(phisym, lamsym))

        # Gradient with respect to the chosen vev variables.
        sym_grad_expr = sym_expr.diff(vev_symbols).applyfunc(sym.simplify)
        return cls(phi_len, lam_len, rng, polar, sym_expr, sym_grad_expr, vev_symbols, lamsym, lambdify_mode, label_fn, label_check, **label_kwargs)

    
    def __getstate__(self):
        """Modify pickle's saving of this class to avoid unpicklable 
        objects.
        """
        state = self.__dict__.copy()
        del state['labeller']
        return state
    
    def __setstate__(self, state):
        """Restore the pickled attributes and rebuild the labeller, 
        which is dropped from the state by __getstate__.
        """
        self.__dict__.update(state)

        mode = self.lambdify_mode

        def _numeric(expr):
            # Numeric version of a symbolic expression in (vev, lambda).
            return lambdify([self.phisym_var, self.lamsym], expr, mode)

        # Regenerate the numeric value and gradient of the quartic part 
        # of the potential, compiled for the selected backend.
        if mode == 'jax':
            value_fn = jax.jit(_numeric(self.sym_expr))
            grad_fn = jax.jit(_numeric(self.sym_grad_expr))

            def min_func(phi, lam):
                return value_fn(phi, lam), grad_fn(phi, lam)

            min_func = jax.jit(min_func)
        elif mode == 'tensorflow':
            value_fn = tf.function(_numeric(self.sym_expr))
            grad_fn = tf.function(_numeric(self.sym_grad_expr))

            def min_func(phi, lam):
                return value_fn(phi, lam), tf.stack(grad_fn(phi, lam))

            min_func = tf.function(min_func)
        else:
            value_fn = _numeric(self.sym_expr)
            grad_fn = _numeric(self.sym_grad_expr)

            def min_func(phi, lam):
                return value_fn(phi, lam), grad_fn(phi, lam)

        # rng[5] is the generator reserved for the labeller.
        self.labeller = labeller_wrapper(
            min_func,
            self.phi_len,
            self.lam_len,
            self.rng[5],
            self.polar,
            self.label_fn,
            self.label_check,
            self.label_kwargs,
        )
    

[docs]
    def create_random_lambdas(self, nlams, validation = False):
        """Draw unlabelled random sets of quartic potential coefficients 
        with rand_nsphere, using separate random number generators for 
        training and validation samples.

        Parameters
        ----------
        nlams : int
            The number of sets of quartic coefficients to generate 
            randomly.

        validation: bool, default=False
            Selects the random number generator: rng[1] (validation set) 
            if True, rng[0] (training set) if False, so the two sets 
            are generated independently.

        Returns
        -------
        np.array(np.float32, np.float32)
            The output of rand_nsphere(nlams, lam_len, rng) cast to 
            np.float32: a list of sets of quartic coefficients for the 
            potential.
        """
        generator = self.rng[1] if validation else self.rng[0]
        return rand_nsphere(nlams, self.lam_len, generator).astype(np.float32)

    

[docs]
    def check_labeller(self, nlams, **tester_kwargs):
        """Run the labeller's reliability check on freshly generated 
        random quartic coefficients.

        Parameters
        ----------
        nlams : int
            The number of sets of quartic coefficients to randomly 
            generate for testing the labeller function's consistency. 
            They are drawn with the validation random number generator.

        tester_kwargs : dict
            Additional keyword arguments forwarded to 
            labeller.check_labeller. If the default oracle and tester are 
            used, possible keyword arguments are niter_step, 
            count_success, max_iter, verbose. See 
            BFBrain.Jax_Oracle.test_labeller for details.

        Returns
        -------
            Whatever labeller.check_labeller returns, which may come 
            from a user-written function.
        """
        return self.labeller.check_labeller(
            self.create_random_lambdas(nlams, validation = True),
            **tester_kwargs,
        )



[docs]
    def checklam_all(self, lams, truth_label_fn = None, label_kwargs = None):
        """Labels sets of quartic potential coefficients with True 
        (for bounded from below) or False (not bounded from below).

        Parameters
        ----------
        lams : np.array(np.float32, np.float32)
            A 2-D NumPy array of quartic coefficients of the potential. 
            Each entry along the 0 axis corresponds to a single set of 
            quartic potential coefficients specifying a potential 
            function.

        truth_label_fn : callable, optional
            Must take a 1-D NumPy array representing a single set of 
            quartic coefficients and return a Boolean True if the 
            potential they describe is bounded from below, False 
            otherwise. If this argument is specified, the method 
            will use this callable to label lams instead of the labeller 
            class. This is used in specific instances when a fast symbolic 
            expression for the bounded-from-below constraints is known, 
            and the performance of the classifier training loop can be 
            evaluated in the absence of noise due to the approximate 
            labeller. Obviously the use case of the classifier is for 
            potentials where such a symbolic expression is NOT known, 
            so the real-world model building usefulness of this option 
            is limited.

        label_kwargs : dict, optional
            If these are specified, the oracle will use the keyword 
            arguments given here instead of the keyword arguments 
            specified in the DataManager constructor.

        Returns
        -------
        np.array(bool)
            A Boolean NumPy array of labels for each set of coefficients 
            in lams.
        """
        if truth_label_fn is not None:
            return np.array([truth_label_fn(lam) for lam in lams])
        else:
            return self.labeller.do_labelling(lams, label_kwargs)

    

[docs]
    def create_data(self, lams, truth_label_fn = None, label_kwargs = None):
        """Label an unlabelled 2-D NumPy array of sets of quartic 
        coefficients and split it into an np_data object.

        Parameters
        ----------
        lams : np.array(np.float32, np.float32)
            A 2-D NumPy array of quartic coefficients of the potential. 
            Each entry along the 0 axis is one set of coefficients 
            specifying a potential function.

        truth_label_fn : callable, optional
            Takes a 1-D NumPy array holding a single set of quartic 
            coefficients and returns True if the potential it describes 
            is bounded from below, False otherwise. When given, it is 
            used for labelling instead of the labeller (see 
            checklam_all).

        label_kwargs : dict, optional
            If specified, passed to the oracle in place of the keyword 
            arguments given to the DataManager constructor.

        Returns
        -------
        np_data
            The rows of lams labelled True as pos and the remaining 
            rows as neg.
        """
        is_bfb = self.checklam_all(lams, truth_label_fn, label_kwargs)
        bounded = lams[is_bfb]
        unbounded = lams[~is_bfb]
        return np_data(bounded, unbounded)



[docs]
    def create_random_data(self, nlams, validation = False, truth_label_fn = None, label_kwargs = None):
        """Generate a random sample of quartic coefficient sets, label 
        them, and return the result as an np_data object.

        Parameters
        ----------
        nlams : int
            The number of sets of quartic potential coefficients to 
            generate.

        validation : bool, default=False
            If True, use the validation random number generator to 
            generate the random coefficients. If False, use the training 
            random number generator.

        truth_label_fn : callable, optional
            Takes a 1-D NumPy array holding a single set of quartic 
            coefficients and returns True if the potential it describes 
            is bounded from below, False otherwise. When given, it is 
            used for labelling instead of the labeller (see 
            checklam_all).

        label_kwargs : dict, optional
            If specified, passed to the oracle in place of the keyword 
            arguments given to the DataManager constructor.

        Returns
        -------
        np_data
            The labelled, randomly generated sets of quartic 
            coefficients.
        """
        return self.create_data(
            self.create_random_lambdas(nlams, validation),
            truth_label_fn,
            label_kwargs,
        )

    

[docs]
    def check_accuracy_with_better_labeller(self, in_data, label_kwargs):
        """A method for evaluating the accuracy of a labeller which is 
        capable of mislabelling some False points as True, like the 
        default oracle, which is based on global minimization of the 
        quartic part of the potential.

        Parameters
        ----------
        in_data : np_data
            An np_data object, labelled with an oracle that can 
            mislabel some False points as True (but not the reverse).

        label_kwargs : dict
            A dictionary which specifies the keyword arguments for the 
            oracle, which must be selected to yield significantly more 
            accurate labels than the ones specified by in_data.

        Returns
        -------
        float
            The precision (fraction of positively labelled points that 
            are true positives) of the oracle which originally labelled 
            in_data.
        """
        lams = in_data.pos
        labels = self.checklam_all(lams, label_kwargs = label_kwargs)
        return 1. - len(labels[~labels]) / len(lams)



[docs]
    def balance_array(self, data):
        """Given an np_data object that has more negative (not 
        bounded-from-below) points than positives (bounded-from-below), 
        rebalance data to include new positive points generated by 
        leveraging the convexity of the space of bounded-from-below points. 
        If there are as many or more positive points as negative points 
        in the np_data object, leaves the np_data object unmodified.

        Parameters
        ----------
        data : np_data
            The np_data object that has many more negatively-labelled 
            points (that is, points which are not bounded-from-below) 
            than positives.
        """
        # Get the positive and negative data sets.
        lam_pos = data.pos
        lam_neg = data.neg

        pos = len(lam_pos)
        neg = len(lam_neg)

        # If there are no members of one label, or there are an exactly 
        # equal number of members of both labels, or there are already 
        # more positive points than negative points, do nothing.
        if(pos == 0 or neg == 0 or pos == neg or pos > neg):
            return
        # Generate new points from the positive points that are already 
        # in the data.
        new_lams = self._create_new_positives(lam_pos, neg - pos)
        new_data = np_data(new_lams, np.array([]))

        # append new data to the np_data object data.
        data.append_data(new_data)


    def _create_new_positives(self, lams, nlams):
        """Generates new bounded-from-below points from existing known 
        ones. Called by balance_array.

        Each new point is a random convex combination of two distinct 
        rows of lams, rescaled to unit Euclidean norm. If lams holds 
        only a single row, there is no second point to interpolate 
        towards, so every new point is a (normalized) copy of that row.

        Parameters
        ----------
        lams : np.array(np.float32, np.float32)
            A 2-D NumPy array of quartic coefficients of the potential. 
            Each entry along the 0 axis corresponds to a single set of 
            quartic potential coefficients specifying a potential function.
            Important that all elements of lams are bounded-from-below.
            Must contain at least one row when nlams > 0.

        nlams : int
            The number of new points to generate

        Returns
        -------
        np.array(np.float32, np.float32)
            A 2-D NumPy array of shape (nlams, self.lam_len) representing 
            nlams sets of new quartic coefficients which are 
            bounded-from-below, since they are sampled along line 
            segments between existing bounded-from-below points.
        """
        new_lams = np.empty((nlams, self.lam_len))
        if len(lams) == 1:
            # Drawing two distinct rows without replacement is impossible 
            # with a single row (NumPy raises ValueError), and the segment 
            # from a point to itself is just that point.
            new_lams[:] = lams[0]
        else:
            for i in range(nlams):
                # Pick two distinct known points and a random position 
                # t in [0, 1) along the segment joining them.
                pair = self.rng[2].choice(lams, size = 2, replace = False)
                t = self.rng[3].random()
                new_lams[i] = t*pair[0] + (1.-t)*pair[1]
        # Rescale every row to unit norm before casting to float32.
        norms = np.linalg.norm(new_lams, axis = 1, keepdims = True)
        return (new_lams / norms).astype(np.float32)
    

[docs]
    def create_dataset(self, data, validation = False):
        """Builds an unbatched Tensorflow dataset of labelled sets of 
        quartic coefficients from an np_data object.

        Parameters
        ----------
        data : np_data
            The object whose pos and neg members supply the sets of 
            quartic coefficients. Entries of pos are labelled True and 
            entries of neg are labelled False.

        validation : bool, default=False
            If True, don't shuffle the dataset: all entries of pos come 
            first, followed by all entries of neg. Useful for keeping 
            track of agreement on the validation set for the model 
            after successive active learning rounds. If False, the 
            entries and their labels are shuffled together.

        Returns
        -------
        tf.data.Dataset
            A dataset of (coefficients, label) pairs. This dataset is 
            NOT batched. To use for training, it is necessary to batch 
            the dataset object with tf.data.Dataset's batch method.
        """
        positives, negatives = data.pos, data.neg
        n_pos, n_neg = len(positives), len(negatives)
        n_total = n_pos + n_neg

        # Stack the positives ahead of the negatives and build the 
        # matching Boolean labels (True for the leading n_pos entries).
        features = np.concatenate((positives, negatives), axis=0)
        labels = np.zeros(shape=n_total, dtype=bool)
        labels[:n_pos] = True

        # Apply one shared random permutation so that every coefficient 
        # set stays paired with its own label.
        if not validation:
            order = self.rng[4].permutation(n_total)
            features, labels = features[order], labels[order]

        return tf.data.Dataset.from_tensor_slices((features, labels))



[docs]
    def generate_L(self, nL, lams, hop_dist, probs = None, rand_fraction = 0.):
        """Randomly sample new points near existing points. Each new 
        point is obtained by picking a row of lams and rotating it, 
        towards a random direction orthogonal to it, by an angle drawn 
        from a zero-mean normal distribution. Optionally, a fraction of 
        the returned points is instead taken directly from 
        create_random_lambdas.

        Parameters
        ----------
        nL : int
            The number of new sets of quartic coefficients to generate.

        lams : np.array(np.float32, np.float32)
            A 2-D NumPy array representing sets of quartic potential 
            coefficients. New points will be sampled in the vicinity 
            of these. NOTE(review): the rotation below presumes each 
            row has unit norm -- confirm against callers.

        hop_dist : float
            The distance scale for sampling around the coefficients in 
            lams: the standard deviation of the normal distribution 
            from which the rotation angles are drawn.

        probs : np.array(np.float32), optional
            A 1-D NumPy array, should be an array of nonnegative floats 
            which sum to 1, representing the probability of selecting 
            each index of lams to generate new points around. If not 
            specified, a uniform selection probability for all points 
            in lams is used.

        rand_fraction : float, optional
            Must be a non-negative float between 0 and 1. The fraction 
            of the nL points (rounded to the nearest integer count) that 
            is produced by create_random_lambdas, intended as uniform 
            draws from the unit hypersphere in quartic coefficient 
            space, instead of being sampled in the vicinity of points 
            in lams. By default, all generated points are sampled in 
            the vicinity of points in lams.

        Returns
        -------
        np.array(np.float32, np.float32)
            A 2-D NumPy array representing a list of sets of quartic 
            coefficients for the potential. Rows sampled around lams 
            come first, followed by any rows from create_random_lambdas.
        """
        # Split the request between draws taken straight from 
        # create_random_lambdas and hops around existing points.
        n_uniform = int(np.rint(nL*rand_fraction))
        n_local = nL - n_uniform

        # Choose, with replacement, the points to hop away from.
        centers = self.rng[2].choice(lams, size = n_local, replace = True, p = probs, axis = 0)

        # Draw random directions, remove their component along the 
        # chosen points, and rescale what is left to unit length.
        directions = self.create_random_lambdas(n_local)
        overlap = np.sum(directions * centers, axis=1)[:,np.newaxis]
        directions = directions - overlap*centers
        lengths = np.sqrt(np.sum(directions * directions, axis=1)[:,np.newaxis])
        directions = (directions/lengths).astype(np.float32)

        # One normally-distributed rotation angle per generated point.
        angles = self.rng[3].normal(loc = 0., scale = hop_dist, size = n_local).astype(np.float32)
        sines = np.sin(angles)[:, np.newaxis]
        cosines = np.cos(angles)[:, np.newaxis]

        # Rotate each chosen point towards its direction by its angle.
        hopped = directions*sines + centers*cosines

        if n_uniform == 0:
            return hopped
        uniform = self.create_random_lambdas(n_uniform)
        return np.concatenate((hopped, uniform), axis=0)