# Source code for catkit.learn

from scipy.optimize import minimize
from scipy.optimize import basinhopping
import matplotlib.pyplot as plt
from sklearn.gaussian_process.kernels import WhiteKernel, DotProduct
from sklearn.metrics import mean_absolute_error
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn import preprocessing
import numpy as np

def optimizer(
        obj_func,
        initial_theta,
        bounds,
        gradient=True,
        minimizer='L-BFGS-B',
        hopping=0,
        **kwargs
):
    """Substitute optimizer in scikit-learn Gaussian Process function.

    Note 'L-BFGS-B' is equivalent to the standard optimizer used in
    scikit-learn. This function allows for more direct control over
    the arguments.

    Parameters
    ----------
    obj_func : function
        scikit-learn objective function. It is invoked as
        ``obj_func(theta, gradient)``; when ``gradient`` is True it must
        return the tuple ``(value, gradient)``, otherwise only the value.
    initial_theta : array (n,)
        Hyperparameters to be optimized against.
    bounds : list of tuples (n, 2)
        Lower and upper bounds for each hyper parameter.
    gradient : bool
        Include the gradient for the optimization function. The flag is
        forwarded both as the extra positional argument of ``obj_func``
        and as scipy's ``jac`` option.
    minimizer : str
        A scipy minimization method.
    hopping : int
        Perform a number of basin hopping steps. A falsy value (the
        default) runs a single local minimization instead.
    **kwargs
        Accepted for call-site compatibility; currently ignored.

    Returns
    -------
    theta_opt : array (n,)
        Optimized hyperparameters.
    func_min : float
        Value of the minimized objective function.
    """
    # Options shared by the local minimizer in both code paths.
    margs = {
        'method': minimizer,
        'args': (gradient, ),
        'jac': gradient,
        'bounds': bounds,
    }

    if hopping:
        # Global search: each basin hopping iteration perturbs the current
        # point and re-runs the local minimizer configured above.
        m = basinhopping(
            obj_func,
            initial_theta,
            minimizer_kwargs=margs,
            niter=hopping,
            T=1e2,
            stepsize=2,
        )
    else:
        m = minimize(obj_func, initial_theta, **margs)

    theta_opt = m.x
    func_min = m.fun

    return theta_opt, func_min
def online_learning(X, y, samples, factors=(1.0, 1.0), nsteps=40, plot=False):
    """A simple utility for performing online learning.

    The main components required are a regression method and a scoring
    technique. Currently, the scoring methodology and regressor are baked
    in. These need to be made modular.

    Each step fits a Gaussian process (DotProduct + WhiteKernel) on the
    currently sampled rows, predicts mean and standard deviation for every
    row of ``X``, and adds the highest-scoring row that is not yet sampled.
    The score is ``scaled_std * factors[1] - scaled_mean * factors[0]``, so
    large predicted uncertainty and small predicted value rank first.

    Minimum 3 samples are required for 3 fold cross validation.
    NOTE(review): no cross validation is performed inside this function;
    confirm whether this requirement still applies.

    Parameters
    ----------
    X : array
        Feature rows; indexed with ``X[samples]``, so integer-array
        indexing must be supported.
    y : array
        Target values, indexed the same way as ``X``.
    samples : array of int
        Indices of the rows used for the initial fit.
    factors : sequence of 2 floats
        Relative importance of the scaled prediction (``factors[0]``) and
        the scaled uncertainty (``factors[1]``) in the score.
    nsteps : int
        Maximum number of fit/select iterations.
    plot : bool
        Save a diagnostic figure after every step.

    Returns
    -------
    samples : array of int
        The input indices followed by the indices selected at each step.
        Returned unchanged when ``nsteps`` is not positive.
    """
    if nsteps <= 0:
        # Nothing to learn: skip building the regressor altogether.
        return samples

    ids = np.arange(len(y))

    kernel = DotProduct() + WhiteKernel()
    regressor = GaussianProcessRegressor(
        kernel=kernel, n_restarts_optimizer=5, alpha=0)

    for _ in range(nsteps):
        regressor.fit(X[samples], y[samples])
        yp, ys = regressor.predict(X, return_std=True)

        # Provides some form of normalization.
        # Multiples denote relative importance
        yp_scale = preprocessing.scale(yp) * factors[0]
        ys_scale = preprocessing.scale(ys) * factors[1]
        score = ys_scale - yp_scale

        # Best candidates first; keep only those not sampled yet.
        ranked = np.argsort(score)[::-1]
        unseen = ranked[~np.isin(ranked, samples)]
        exhausted = unseen.size == 0
        if not exhausted:
            samples = np.concatenate([samples, unseen[:1]])

        if plot:
            _plot_online_learning_step(ids, y, yp, ys, samples)

        if exhausted:
            # Every row is already sampled, so further refits cannot
            # change the result.
            break

    return samples


def _plot_online_learning_step(ids, y, yp, ys, samples):
    """Save a figure comparing targets, predictions and sampled points."""
    # MAE is symmetric in its arguments; pass (true, predicted) by convention.
    mae = np.round(mean_absolute_error(y, yp), 3)
    n = len(samples)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(ids, y, 'o', zorder=0)
    ax.errorbar(ids, yp, yerr=ys, fmt='o', zorder=1)
    ax.plot(samples, y[samples], 'o', zorder=3)

    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    ax.text(xlim[0] / 9.0, ylim[0] / 9.0, mae)

    plt.tight_layout()
    # The file name keeps its historical 'RBF' tag although the regressor
    # uses a DotProduct + WhiteKernel kernel; existing consumers may rely
    # on the name.
    plt.savefig('./online-learning-RBF-{}.png'.format(n))
    plt.close()