import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from localreg import localreg, rbf # run '!pip install localreg' once first
def k(z):
return (1.0 / np.sqrt(2 * np.pi)) * np.exp(-z**2 / 2)
def f(t, x, y, b): # return f(t) using data (x, y)
assert x.size == y.size
w = k((t-x)/b) # vector of numerators; rescaled to sum to zero on next line
w /= np.sum(w)
assert np.isclose(np.sum(w), 1.0)
return np.sum(w * y)
with random $x \in [0, 3]$ and $y = e^x + \sin(x) + \epsilon$, where $\epsilon \sim N(0, 1)$
N = 20
low = 0
high = 3
rng = np.random.default_rng(seed=0)
x = np.sort(rng.uniform(low=low, high=high, size=N)) # sort for left-to-right line plot
noise = rng.normal(loc=0, scale=1, size=N)
y = np.exp(x) * np.sin(x) + noise # I made up a nonlinear function.
X = x.copy().reshape(-1, 1)
Run this code chunk three times:
show_k_bell_curves = True show_f_from_formulas = False show_scikit_learn_model = False
show_k_bell_curves = True show_f_from_formulas = True show_scikit_learn_model = False
show_k_bell_curves = True show_f_from_formulas = True show_localreg_model = True
show_k_bell_curves = True
show_f_from_formulas = True
show_localreg_model = True
b = [5, .05, .35] # I chose these by trial-and-error.
# - The first widens the bell curves so much we get the underfit f(x) = np.mean(y).
# - The second makes them so narrow enough we get overfit f(x) =~ 1NN(x) (kNN with k=1) for most x
# - The third gives a nice fit.
titles = ['Underfit', 'Overfit', 'Good fit']
for kernel_i in range(len(b)):
plt.plot(x, y, '.', color='black', markersize=10, label='data') # plot data
plt.plot(x, y - noise, linestyle='dotted', color='gray', label='(unknown truth)') # plot unknown truth
if show_k_bell_curves:
# for understanding, add plots of k(z) bell curves to understand weights {w_i}
for i in np.arange(N):
x_plot = np.linspace(start=low, stop=high, num=100)
y_scale_hack = 8 # make bell curves a nice height; it is irrelevant to the math
y_plot_bells = y_scale_hack * k((x[i] - x_plot) / b[kernel_i])
label = np.where(i == 0, 'k(x) curves', '') # only label first of N curves for legend
plt.plot(x_plot, y_plot_bells, linestyle='dashed', label=label)
if show_f_from_formulas:
# for understanding, add f(x) calculated from formulas
y_hat_formulas = np.zeros(shape=x_plot.shape[0])
for i in np.arange(x_plot.shape[0]):
y_hat_formulas[i] = f(x_plot[i], x, y, b[kernel_i])
plt.plot(x_plot, y_hat_formulas, linestyle='dashed',
color='lime', label=f'f(x) from formulas')
if show_localreg_model:
y_plot_localreg = localreg(x=X, y=y, x0=x_plot, degree=0, kernel=rbf.gaussian, radius=b[kernel_i])
plt.plot(x_plot, y_plot_localreg, linestyle='dotted', color='black', label=r'localreg()')
y_hat = localreg(x=X, y=y, x0=X, degree=0, kernel=rbf.gaussian, radius=b[kernel_i])
plt.plot(x, y_hat, '.', color='red', label=r'$\hat{y}$')
plt.ylim(-1, 8)
plt.title(titles[kernel_i])
plt.legend()
plt.show()