import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from io import StringIO
data_string = """
x1, x2, y
.1, -.1, 0
-.3, .2, 1
-.4, 0, 1
-.7, .3, 0
.1, .7, 0
-.7, .9, 1
-.8, .8, 1
"""
df = pd.read_csv(StringIO(data_string), sep=r'\s*,\s*', engine='python')  # regex separator strips spaces around commas
X = df[['x1', 'x2']]
y = df.y
print(f'df=\n{df},\nX=\n{X},\ny={y}')
df=
    x1   x2  y
0  0.1 -0.1  0
1 -0.3  0.2  1
2 -0.4  0.0  1
3 -0.7  0.3  0
4  0.1  0.7  0
5 -0.7  0.9  1
6 -0.8  0.8  1,
X=
    x1   x2
0  0.1 -0.1
1 -0.3  0.2
2 -0.4  0.0
3 -0.7  0.3
4  0.1  0.7
5 -0.7  0.9
6 -0.8  0.8,
y=0    0
1    1
2    1
3    0
4    0
5    1
6    1
Name: y, dtype: int64
# draw points
plt.plot(df.x1[df.y == 0], df.x2[df.y == 0], '^r', label='0') # red triangles
plt.plot(df.x1[df.y == 1], df.x2[df.y == 1], 'sb', label='1') # blue squares
plt.plot(0, 0, 'og', label='unknown') # green dot
plt.text(x=0, y=.07, s='?', color='green', fontsize='x-large') # green question mark
# draw circles to contain 1, 3, and 5 points
theta = np.linspace(start=0, stop=2*np.pi, num=100)
radius = [.25, .5, 1]
linestyle = ['solid', 'dashed', 'dashdot', 'dotted']
circle_color = ['red', 'blue', 'red', 'blue']
for i in range(len(radius)):
plt.plot(radius[i] * np.cos(theta), radius[i] * np.sin(theta),
linestyle=linestyle[i], color=circle_color[i])
plt.axis('square')
plt.legend(loc='lower right')
plt.savefig(fname='kNN.png')
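As a quick sanity check (a sketch added here, not part of the original figure code), we can compute each training point's Euclidean distance from the green query point at the origin and count how many fall inside each circle; the counts should be 1, 3, and 5.
dist_from_green = np.sqrt(df.x1**2 + df.x2**2)  # Euclidean distance from (0, 0)
for r in radius:
    print(f'radius {r} encloses {np.sum(dist_from_green < r)} points')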
k_values = [1, 3, 5, 7]
for k in k_values:
clf = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
clf.fit(X, y)
X_green = pd.DataFrame({'x1': [0], 'x2': [0]})
print(f'For k={k}, predict green is {clf.predict(X_green)[0]}.')
For k=1, predict green is 0.
For k=3, predict green is 1.
For k=5, predict green is 0.
For k=7, predict green is 1.
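Under the hood, the unweighted classifier takes a majority vote among the k nearest labels. Here is a minimal sketch (not part of the original code) that reproduces the k=3 vote by hand:
clf3 = KNeighborsClassifier(n_neighbors=3, metric='euclidean').fit(X, y)
_, idx = clf3.kneighbors(X_green)  # indices of the 3 nearest training points
neighbor_labels = y.iloc[idx[0]]   # their labels: 0, 1, 1
print(f'3-NN labels: {neighbor_labels.to_list()}, majority vote: {neighbor_labels.mode()[0]}')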
See a more natural regression example below. (This example has $y \in \{0, 1\}$, not $y \in \mathbb{R}$.)
k_values = [1, 3, 5, 7]
for k in k_values:
model = KNeighborsRegressor(n_neighbors=k, metric='euclidean')
model.fit(X, y)
print(f'For k={k}, predict green is {model.predict(X_green)[0]:.3}.')
For k=1, predict green is 0.0.
For k=3, predict green is 0.667.
For k=5, predict green is 0.4.
For k=7, predict green is 0.571.
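These predictions are just averages of the $k$ nearest labels: since $y \in \{0, 1\}$, the regressor returns the fraction of class-1 neighbors, e.g. $k=3$ gives $(0 + 1 + 1)/3 \approx 0.667$ and $k=7$ gives $4/7 \approx 0.571$.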
Recall that with the unweighted kNN classifier above, we saw "For k=3, predict green is 1."
Now we try weighted kNN.
k = 3
clf = KNeighborsClassifier(n_neighbors=k, weights='distance', metric='euclidean')
clf.fit(X, y)
print(f'For k={k}, predict green is {clf.predict(X_green)[0]}.')
For k=3, predict green is 0.
Inspect the distances:
distances, indices = clf.kneighbors(X_green) # retrieve distances to and indices of kNN of green point
# each of distances and indices is a 2D array; use row [0] below
with np.printoptions(precision=2): # set precision for this block only
    print(f'indices[0]={indices[0]}\n' + f'y[indices[0]]={y[indices[0]].to_numpy()}\n' +
          f'distances[0]={distances[0]}\n' + f'1/distances[0]={1/distances[0]}')
indices[0]=[0 1 2]
y[indices[0]]=[0 1 1]
distances[0]=[0.14 0.36 0.4 ]
1/distances[0]=[7.07 2.77 2.5 ]
With unweighted 3-NN, we get 1 (blue). With weighted 3-NN, we get 0 (red) because the red point's weight (7.07) is greater than the sum of the two blue weights (2.77 + 2.50 = 5.27).
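To make that explicit, here is a small sketch (added here, not in the original notes) that sums the inverse-distance weights per class using the arrays above:
weights = 1 / distances[0]               # inverse-distance weights of the 3 neighbors
labels = y[indices[0]].to_numpy()        # their labels
weight_red = weights[labels == 0].sum()  # total weight for class 0 (red)
weight_blue = weights[labels == 1].sum() # total weight for class 1 (blue)
print(f'red weight {weight_red:.2f} vs. blue weight {weight_blue:.2f}'
      f' => predict {0 if weight_red > weight_blue else 1}')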
# Here is a more natural example of kNN regression.
x = np.array([1, 2, 3, 5])
y = np.array([1, 3, 2, 4])
k_values = [1, 2, 3, 4]
for k in k_values:
    plt.figure()  # start a fresh figure for each k so curves from different k don't overlap
    plt.plot(x, y, 'o')
plt.title(f'k={k}')
model = KNeighborsRegressor(n_neighbors=k, metric='euclidean')
X = x.reshape(-1, 1)
model.fit(X, y)
xplot = np.linspace(start=0, stop=6)
yplot = model.predict(xplot.reshape(-1, 1))
plt.plot(xplot, yplot)
plt.xlim(0, 6)
plt.ylim(0, 5)
if k == 2:  # make labels larger for lecture notes figure
ax = plt.gca()
ax.title.set_fontsize(20)
for label in ax.get_xticklabels() + ax.get_yticklabels():
label.set_fontsize(20)
plt.savefig('kNN_regression.png')
plt.show(block=False)
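The fitted curve is piecewise constant, since every prediction is the mean of the k nearest targets. In particular, when k equals the number of training points, the prediction is the overall mean everywhere; a quick check (a sketch added here, not part of the original code):
model4 = KNeighborsRegressor(n_neighbors=4, metric='euclidean').fit(X, y)
# every query sees all four training points, so each prediction is mean(y) = 2.5
assert np.allclose(model4.predict(np.array([[0.0], [3.0], [6.0]])), y.mean())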
Investigate the effect of $p$ on the distance from $a = (0, 0)$ to $b = (3, 4)$.
def minkowski(a, b, p):
    # Minkowski distance (sum_i |a_i - b_i|^p)^(1/p); p = 0 is undefined, so return a placeholder value
    if p == 0:
        return 1
    return np.sum(np.abs(a - b)**p)**(1/p)
a = np.array([0.0, 0])
b = np.array([3.0, 4])
assert np.isclose(minkowski(a, b, p=1), 7)
assert np.isclose(minkowski(a, b, p=2), 5)
p = np.array([-5, -4, -3, -2, -1, 1/2, 1, 2, 3, 4, 5])
n = p.size
d = np.zeros(n)
for i in np.arange(n):
d[i] = minkowski(a=a, b=b, p=p[i])
print(f'{p[i]}, {d[i]}')
plt.figure(figsize=(8, 8)) # (width, height) in inches
plt.plot(p, d, '.k')
plt.title(r'Minkowski distance $\left( \sum_{i=1}^n |a_i - b_i|^p \right)^\frac{1}{p}$ vs. p')
plt.xlabel('p')
plt.ylabel(r'distance from $a = (0, 0)$ to $b = (3, 4)$')
plt.axhline(y=3, linestyle='dotted')
plt.axhline(y=4, linestyle='dotted')
plt.axvline(x=0)
plt.axhline(y=0)
plt.xlim(-7, 7)
plt.ylim(0, 15)
plt.text(x=1.0, y=13.9, s=r'$p=\frac{1}{2}$')
plt.text(x=1.4, y=6.9, s=r'$p=1$: Manhattan')
plt.text(x=2.3, y=4.9, s=r'$p=2$: Euclidean')
plt.text(x=4.8, y=3.7, s=r'$p=\infty$: max $|a_i - b_i|$')
plt.text(x=4.8, y=3.30, s=r'Chebyshev / Chessboard')
plt.text(x=-6.6, y=3.1, s=r'$p=-\infty$: min $|a_i - b_i|$')
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
-5.0, 2.87492105207305
-4.0, 2.800746299492502
-3.0, 2.6678871092474226
-2.0, 2.4
-1.0, 1.7142857142857144
0.5, 13.928203230275509
1.0, 7.0
2.0, 5.0
3.0, 4.497941445275415
4.0, 4.284572294953817
5.0, 4.174027662897746
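As the plot's annotations suggest, for large $|p|$ the Minkowski distance approaches the Chebyshev distance $\max_i |a_i - b_i| = 4$ as $p \to \infty$, and $\min_i |a_i - b_i| = 3$ as $p \to -\infty$. A quick numerical check (a sketch added here, using $p = \pm 50$ as stand-ins for $\pm\infty$):
assert np.isclose(minkowski(a, b, p=50), 4, atol=1e-3)   # near the Chebyshev max
assert np.isclose(minkowski(a, b, p=-50), 3, atol=1e-3)  # near the minimum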