Kernel Regression¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from localreg import localreg, rbf # run '!pip install localreg' once first

Implement $k(z)$ and $f(x)$ to understand the method.¶

In [2]:
def k(z):
    """Standard normal (Gaussian) kernel: the N(0, 1) density evaluated at z."""
    return np.exp(-0.5 * z**2) / np.sqrt(2.0 * np.pi)

def f(t, x, y, b):
    """Nadaraya-Watson kernel-regression estimate of f(t) from data (x, y).

    t: point at which to estimate f; x, y: data arrays; b: bandwidth.
    """
    assert x.size == y.size
    weights = k((t - x) / b)           # unnormalized kernel weights
    weights = weights / np.sum(weights)  # rescale so the weights sum to one
    assert np.isclose(np.sum(weights), 1.0)
    return np.dot(weights, y)          # weighted average of the y values

Make nonlinear toy data¶

with random $x \in [0, 3]$ and $y = e^x \sin(x) + \epsilon$, where $\epsilon \sim N(0, 1)$

In [3]:
N = 20    # number of data points
low = 0   # lower end of the x range
high = 3  # upper end of the x range
rng = np.random.default_rng(seed=0)  # seeded generator so the notebook is reproducible
x = np.sort(rng.uniform(low=low, high=high, size=N)) # sort for left-to-right line plot
noise = rng.normal(loc=0, scale=1, size=N)  # epsilon ~ N(0, 1)
y = np.exp(x) * np.sin(x) + noise # made-up nonlinear truth: y = e^x sin(x) + noise
X = x.copy().reshape(-1, 1)  # (N, 1) column-vector form, passed to localreg() below

Look at what math formulas do; see that localreg() does the same thing¶

Run this code chunk three times:

  • To see the k(x) bell curves that determine the weights $w_i$, set the top three lines of code like this:

show_k_bell_curves = True show_f_from_formulas = False show_localreg_model = False

  • To see also f(x) calculated from the formulas, set:

show_k_bell_curves = True show_f_from_formulas = True show_localreg_model = False

  • To see also f(x) calculated by localreg(), set:

show_k_bell_curves = True show_f_from_formulas = True show_localreg_model = True

In [4]:
show_k_bell_curves = True
show_f_from_formulas = True
show_localreg_model = True

b = [5, .05, .35] # bandwidths chosen by trial-and-error:
# - The first widens the bell curves so much we get the underfit f(x) = np.mean(y).
# - The second makes them so narrow that we get the overfit f(x) =~ 1NN(x) (kNN with k=1) for most x.
# - The third gives a nice fit.
titles = ['Underfit', 'Overfit', 'Good fit']

# Evaluation grid, defined once up front (not inside the bell-curve branch) so
# the f(x) and localreg() branches still work when show_k_bell_curves is False.
x_plot = np.linspace(start=low, stop=high, num=100)
y_scale_hack = 8 # make bell curves a nice height; it is irrelevant to the math

for kernel_i in range(len(b)):
    plt.plot(x, y, '.', color='black', markersize=10, label='data') # plot data
    plt.plot(x, y - noise, linestyle='dotted', color='gray', label='(unknown truth)') # plot unknown truth

    if show_k_bell_curves:
        # for understanding, add plots of k(z) bell curves to understand weights {w_i}
        for i in np.arange(N):
            y_plot_bells = y_scale_hack * k((x[i] - x_plot) / b[kernel_i])
            label = np.where(i == 0, 'k(x) curves', '') # only label first of N curves for legend
            plt.plot(x_plot, y_plot_bells, linestyle='dashed', label=label)

    if show_f_from_formulas:
        # for understanding, add f(x) calculated from the formulas at each grid point
        y_hat_formulas = np.zeros(shape=x_plot.shape[0])
        for i in np.arange(x_plot.shape[0]):
            y_hat_formulas[i] = f(x_plot[i], x, y, b[kernel_i])
        plt.plot(x_plot, y_hat_formulas, linestyle='dashed',
                 color='lime', label='f(x) from formulas')

    if show_localreg_model:
        # degree=0 with a Gaussian kernel is the same estimator as f() above,
        # so this curve should overlay the lime one
        y_plot_localreg = localreg(x=X, y=y, x0=x_plot, degree=0, kernel=rbf.gaussian, radius=b[kernel_i])
        plt.plot(x_plot, y_plot_localreg, linestyle='dotted', color='black', label=r'localreg()')
        y_hat = localreg(x=X, y=y, x0=X, degree=0, kernel=rbf.gaussian, radius=b[kernel_i])
        plt.plot(x, y_hat, '.', color='red', label=r'$\hat{y}$')

    plt.ylim(-1, 8)
    plt.title(titles[kernel_i])
    plt.legend()
    plt.show()
In [ ]: