import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel
def k(z):
return (1.0 / np.sqrt(2 * np.pi)) * np.exp(-z**2 / 2)
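# Note: k is the standard normal pdf. The 1/sqrt(2*pi) constant is harmless but
# not strictly needed here, since f below rescales the weights to sum to one anyway.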
def f(t, x, y, b): # return f(t) using data (x, y)
assert x.size == y.size
    w = k((t-x)/b) # vector of unnormalized weights; rescaled to sum to one on next line
w /= np.sum(w)
assert np.isclose(np.sum(w), 1.0)
return np.sum(w * y)
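The function f above is the Nadaraya-Watson kernel regression estimate: each prediction is a weighted average of the $y_i$, with weights given by bell curves centered at the $x_i$,

$$\hat{f}(t) = \sum_{i=1}^{N} w_i\, y_i, \qquad w_i = \frac{k\big((t - x_i)/b\big)}{\sum_{j=1}^{N} k\big((t - x_j)/b\big)},$$

where the bandwidth $b$ controls how wide each bell curve is.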
Make fake training data with random $x \in [0, 3]$ and $y = e^x \sin(x) + \epsilon$, where $\epsilon \sim N(0, 1)$:
N = 20
low = 0
high = 3
rng = np.random.default_rng(seed=0)
x = np.sort(rng.uniform(low=low, high=high, size=N)) # sort for left-to-right line plot
noise = rng.normal(loc=0, scale=1, size=N)
y = np.exp(x) * np.sin(x) + noise # I made up a nonlinear function.
X = x.copy().reshape(-1, 1)
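As a quick sanity check, f can be evaluated at a single point (the bandwidth 0.35 here is just a reasonable guess; it reappears as the 'nice fit' value below):

print(f(1.5, x, y, b=0.35)) # kernel-weighted average of y near x = 1.5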
Run this code chunk three times, setting the three flags at its top as follows:

1. show_k_bell_curves = True, show_f_from_formulas = False, show_scikit_learn_model = False
2. show_k_bell_curves = True, show_f_from_formulas = True, show_scikit_learn_model = False
3. show_k_bell_curves = True, show_f_from_formulas = True, show_scikit_learn_model = True
show_k_bell_curves = True
show_f_from_formulas = True
show_scikit_learn_model = True
kernels = [ConstantKernel(constant_value=1.0) + WhiteKernel(noise_level=1.0), # underfit
ConstantKernel() * RBF(length_scale=1.0, length_scale_bounds=(1e-5, 1e5)), # overfit
ConstantKernel() * RBF() + WhiteKernel() # nice fit
]
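# For reference (standard scikit-learn kernel definitions), the third kernel is
#   k(x, x') = sigma^2 * exp(-(x - x')**2 / (2 * length_scale**2)) + noise_level * 1[x == x']
# i.e. a scaled RBF for the smooth signal plus WhiteKernel noise on the diagonal;
# fit() tunes sigma, length_scale, and noise_level by maximizing the log-marginal likelihood.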
titles = ['Underfit', 'Overfit', 'Good fit']
for kernel_i in range(len(kernels)):
plt.plot(x, y, '.', color='black', markersize=10, label='data') # plot data
plt.plot(x, y - noise, linestyle='dotted', color='gray', label='(unknown truth)') # plot unknown truth
if show_k_bell_curves:
# for understanding, add plots of k(z) bell curves to understand weights {w_i}
        b = [5, .05, .35] # I chose these by trial-and-error; a data-driven
        # alternative (leave-one-out CV) is sketched after this chunk.
        # - The first widens the bell curves so much that we get the underfit f(x) = np.mean(y).
        # - The second makes them so narrow that we get the overfit f(x) =~ 1NN(x) (kNN with k=1) for most x.
        # - The third gives a nice fit.
        x_plot_bells = np.linspace(start=low, stop=high, num=1000)
        y_scale_hack = 8 # make bell curves a nice height; it is irrelevant to the math
        for i in np.arange(N):
            y_plot_bells = y_scale_hack * k((x[i] - x_plot_bells) / b[kernel_i])
            label = 'k(x) curves' if i == 0 else '' # only label first of N curves for legend
            plt.plot(x_plot_bells, y_plot_bells, linestyle='dashdot', label=label)
if show_f_from_formulas:
# for understanding, add f(x) calculated from formulas
y_hat_formulas = np.zeros(shape=N*3) # Use N*3 to see more detail.
x_plot_formulas = np.linspace(low, high, N*3)
for i in np.arange(N*3):
y_hat_formulas[i] = f(x_plot_formulas[i], x, y, b[kernel_i])
plt.plot(x_plot_formulas, y_hat_formulas, linestyle='dashed',
                 color='lime', label='f(x) from formulas')
if show_scikit_learn_model:
model = GaussianProcessRegressor(kernel=kernels[kernel_i], n_restarts_optimizer=10, random_state=0)
model.fit(X, y)
print(f'model={model},\nmodel.kernel_={model.kernel_},\nmodel.score(X, y)={model.score(X, y):.3}')
y_hat = model.predict(X)
# plot model
# next line: for kernels[1], plot at x and some other points to see wild overfit
x_plot = np.sort(np.concatenate([x, np.linspace(start=low, stop=high, num=N)]))
y_hat_plot = model.predict(x_plot.reshape(-1, 1))
        plt.plot(x_plot, y_hat_plot, '-', color='black', label='model')
plt.plot(x, y_hat, '.', color='red', label=r'$\hat{y}$')
plt.ylim(-1, 8)
plt.title(titles[kernel_i])
plt.legend()
plt.show()
model=GaussianProcessRegressor(kernel=1**2 + WhiteKernel(noise_level=1), n_restarts_optimizer=10, random_state=0),
model.kernel_=4.29**2 + WhiteKernel(noise_level=8.68),
model.score(X, y)=-0.00121

model=GaussianProcessRegressor(kernel=1**2 * RBF(length_scale=1), n_restarts_optimizer=10, random_state=0),
model.kernel_=5.06**2 * RBF(length_scale=0.00618),
model.score(X, y)=1.0

model=GaussianProcessRegressor(kernel=1**2 * RBF(length_scale=1) + WhiteKernel(noise_level=1), n_restarts_optimizer=10, random_state=0),
model.kernel_=3.85**2 * RBF(length_scale=1.05) + WhiteKernel(noise_level=0.388),
model.score(X, y)=0.963
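(model.score is R² on the training data, so the overfit kernel's score of 1.0 just says it interpolates the training points; its tiny fitted length_scale of 0.00618 plays the same role as the too-narrow b = .05 bell curves above.)

The bandwidths in b were picked by eye. A data-driven alternative is leave-one-out cross-validation: drop each $(x_i, y_i)$ in turn, predict it from the remaining points, and keep the bandwidth with the smallest squared error. A minimal sketch (the function name loocv_sse and the candidate grid are my own choices):

def loocv_sse(b, x, y):
    # sum of squared leave-one-out prediction errors for bandwidth b
    idx = np.arange(x.size)
    return sum((y[i] - f(x[i], x[idx != i], y[idx != i], b)) ** 2 for i in idx)

candidates = np.linspace(0.05, 1.0, 25)
b_best = min(candidates, key=lambda b_: loocv_sse(b_, x, y))
print(f'b_best={b_best:.3}')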