import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity
def k(x, mu=0, sigma=1):
    """Gaussian kernel: normal pdf with mean mu and standard deviation sigma."""
    return (1.0 / (sigma * np.sqrt(2 * np.pi))) * np.exp(-(1/2) * ((x - mu) / sigma)**2)
def f(t, x, b):  # return f(t) using data x and bandwidth b
    N = x.size
    return (1/N) * np.sum((1/b) * k((t - x) / b))
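For reference, `f` implements the kernel density estimate built from `k`:

$$f(t) = \frac{1}{N} \sum_{i=1}^{N} \frac{1}{b} \, k\!\left(\frac{t - x_i}{b}\right)$$

where $k$ is the Gaussian kernel defined above and $b$ is the bandwidth.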
Generate $N$ data points with random $x \in [0, 3]$:
N = 6
low = 0
high = 3
rng = np.random.default_rng(seed=0)
x = np.sort(rng.uniform(low=low, high=high, size=N)) # sort for left-to-right line plot
X = x.reshape(-1, 1)  # KernelDensity expects a 2-D array of shape (n_samples, n_features)
plt.plot(x, np.zeros(x.shape), '.', color='black', markersize=10, label='data') # plot data
# for understanding, add plots of k(z) bell curves to understand their average, f()
b = 1 # also try lower and higher values to see over- and under-fitting
x_plot_bells = np.linspace(start=low - 1, stop=high + 1, num=1000)
for i in range(N):
    y_plot_bells = k(x_plot_bells, mu=x[i], sigma=b)
    # only label the first of the N curves for the legend
    label = r'$\frac{1}{b} k(\frac{x-x_i}{b})$ curves' if i == 0 else ''
    plt.plot(x_plot_bells, y_plot_bells, linestyle='dashdot', label=label)
# for understanding, add f(x) calculated from formulas
x_plot = np.linspace(low-1, high+1, 100)
y_hat_formulas = np.zeros(x_plot.shape[0])
for i in np.arange(x_plot.shape[0]):
    y_hat_formulas[i] = f(x_plot[i], x, b)
plt.plot(x_plot, y_hat_formulas, linestyle='solid',
         color='black', label='f(x) from formulas')
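# aside: the loop above can also be written as a broadcasting one-liner that gives
# the same values as y_hat_formulas; (x_plot[:, None] - x[None, :]) has shape (len(x_plot), N)
y_hat_vectorized = k((x_plot[:, None] - x[None, :]) / b).mean(axis=1) / b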
# now use KernelDensity()
kde = KernelDensity(bandwidth=b, kernel='gaussian')  # reuse the same bandwidth as the formula-based estimate
kde.fit(X)
log_densities = kde.score_samples(x_plot.reshape(-1, 1))
y_hat = np.exp(log_densities)
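# sanity check: with the same bandwidth, the sklearn densities should agree with the
# formula-based densities up to floating-point error
print('max |KDE - formulas|:', np.max(np.abs(y_hat - y_hat_formulas)))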
# plot model
epsilon = 0.005 # visually separate the kde curve from the formulas curve
plt.plot(x_plot, y_hat + epsilon, linestyle='dashed', color='black', label='KDE')
#plt.ylim(-1, 8)
plt.title(f'KDE (Gaussian kernel, b = {b})')
_ = plt.legend()
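Rather than guessing the bandwidth, it can also be chosen by cross-validated log-likelihood with `GridSearchCV`; a minimal sketch (the grid and the 3-fold split are arbitrary choices here, and with only 6 points the selected value is noisy):

from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    param_grid={'bandwidth': np.linspace(0.1, 2.0, 20)},
                    cv=3)  # only 3 folds because N is just 6
grid.fit(X)
print(grid.best_params_)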