import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity
def k(x, mu=0, sigma=1):
    """Gaussian kernel: normal pdf with mean mu and standard deviation sigma."""
    return (1.0 / (sigma * np.sqrt(2 * np.pi))) * np.exp(-(1/2) * ((x - mu) / sigma)**2)
def f(t, x, b):  # return f(t) using data x and bandwidth b
    N = x.size
    return (1/N) * np.sum((1/b) * k((t - x) / b))
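For reference, `f` implements the kernel density estimate built from `k`:

$$f(t) = \frac{1}{N} \sum_{i=1}^{N} \frac{1}{b} \, k\!\left(\frac{t - x_i}{b}\right)$$

where $k$ is the Gaussian kernel defined above and $b$ is the bandwidth.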
Generate $N$ data points with random $x \in [0, 3]$:
N = 6
low = 0
high = 3
rng = np.random.default_rng(seed=0)
x = np.sort(rng.uniform(low=low, high=high, size=N)) # sort for left-to-right line plot
X = x.reshape(-1, 1)  # KernelDensity expects a 2-D array of shape (n_samples, n_features)
plt.plot(x, np.zeros(x.shape), '.', color='black', markersize=10, label='data') # plot data
# for understanding, add plots of k(z) bell curves to understand their average, f()
b = 1 # also try lower and higher values to see over- and under-fitting
x_plot_bells = np.linspace(start=low - 1, stop=high + 1, num=1000)
for i in range(N):
    y_plot_bells = k(x_plot_bells, mu=x[i], sigma=b)
    # only label the first of the N curves for the legend
    label = r'$\frac{1}{b} k(\frac{x-x_i}{b})$ curves' if i == 0 else ''
    plt.plot(x_plot_bells, y_plot_bells, linestyle='dashdot', label=label)
# for understanding, add f(x) calculated from formulas
x_plot = np.linspace(low-1, high+1, 100)
y_hat_formulas = np.zeros(x_plot.shape[0])
for i in np.arange(x_plot.shape[0]):
    y_hat_formulas[i] = f(x_plot[i], x, b)
plt.plot(x_plot, y_hat_formulas, linestyle='solid',
         color='black', label='f(x) from formulas')
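# aside: the loop above can also be written as a broadcasting one-liner that gives
# the same values as y_hat_formulas; (x_plot[:, None] - x[None, :]) has shape (len(x_plot), N)
y_hat_vectorized = k((x_plot[:, None] - x[None, :]) / b).mean(axis=1) / b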
# now use KernelDensity()
kde = KernelDensity(bandwidth=b, kernel='gaussian')  # reuse the same bandwidth as the formula-based estimate
kde.fit(X)
log_densities = kde.score_samples(x_plot.reshape(-1, 1))
y_hat = np.exp(log_densities)
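# sanity check: with the same bandwidth, the sklearn densities should agree with the
# formula-based densities up to floating-point error
print('max |KDE - formulas|:', np.max(np.abs(y_hat - y_hat_formulas)))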
# plot model
epsilon = 0.005 # visually separate the kde curve from the formulas curve
plt.plot(x_plot, y_hat + epsilon, linestyle='dashed', color='black', label='KDE')
#plt.ylim(-1, 8)
plt.title(f'KDE (Gaussian kernel, b = {b})')
_ = plt.legend()
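Rather than guessing the bandwidth, it can also be chosen by cross-validated log-likelihood with `GridSearchCV`; a minimal sketch (the grid and the 3-fold split are arbitrary choices here, and with only 6 points the selected value is noisy):

from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    param_grid={'bandwidth': np.linspace(0.1, 2.0, 20)},
                    cv=3)  # only 3 folds because N is just 6
grid.fit(X)
print(grid.best_params_)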