import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from io import StringIO
data_string = """
x1, x2, y
.1, -.1, 0
-.3, .2, 1
-.4, 0, 1
-.7, .3, 0
.1, .7, 0
-.7, .9, 1
-.8, .8, 1
"""
df = pd.read_csv(StringIO(data_string), sep=r'\s*,\s*', engine='python')  # regex separator strips spaces around commas
X = df[['x1', 'x2']]
y = df.y
print(f'df=\n{df},\nX=\n{X},\ny={y}')
df=
    x1   x2  y
0  0.1 -0.1  0
1 -0.3  0.2  1
2 -0.4  0.0  1
3 -0.7  0.3  0
4  0.1  0.7  0
5 -0.7  0.9  1
6 -0.8  0.8  1,
X=
    x1   x2
0  0.1 -0.1
1 -0.3  0.2
2 -0.4  0.0
3 -0.7  0.3
4  0.1  0.7
5 -0.7  0.9
6 -0.8  0.8,
y=0    0
1    1
2    1
3    0
4    0
5    1
6    1
Name: y, dtype: int64
# draw points
plt.plot(df.x1[df.y == 0], df.x2[df.y == 0], '^r', label='0') # red triangles
plt.plot(df.x1[df.y == 1], df.x2[df.y == 1], 'sb', label='1') # blue squares
plt.plot(0, 0, 'og', label='unknown') # green dot
plt.text(x=0, y=.07, s='?', color='green', fontsize='x-large') # green question mark
# draw circles to contain 1, 3, and 5 points
theta = np.linspace(start=0, stop=2*np.pi, num=100)
radius = [.25, .5, 1]
linestyle = ['solid', 'dashed', 'dashdot', 'dotted']
circle_color = ['red', 'blue', 'red', 'blue']
for i in range(len(radius)):
plt.plot(radius[i] * np.cos(theta), radius[i] * np.sin(theta),
linestyle=linestyle[i], color=circle_color[i])
plt.axis('square')
plt.legend(loc='lower right')
plt.savefig(fname='kNN.png')
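As a quick sanity check (a sketch added here, not part of the original figure code), we can compute each training point's Euclidean distance from the green query point at the origin and count how many fall inside each circle; the counts should be 1, 3, and 5.
dist_from_green = np.sqrt(df.x1**2 + df.x2**2)  # Euclidean distance from (0, 0)
for r in radius:
    print(f'radius {r} encloses {np.sum(dist_from_green < r)} points')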
k_values = [1, 3, 5, 7]
for k in k_values:
clf = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
clf.fit(X, y)
X_green = pd.DataFrame({'x1': [0], 'x2': [0]})
print(f'For k={k}, predict green is {clf.predict(X_green)[0]}.')
For k=1, predict green is 0.
For k=3, predict green is 1.
For k=5, predict green is 0.
For k=7, predict green is 1.
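Under the hood, the unweighted classifier takes a majority vote among the k nearest labels. Here is a minimal sketch (not part of the original code) that reproduces the k=3 vote by hand:
clf3 = KNeighborsClassifier(n_neighbors=3, metric='euclidean').fit(X, y)
_, idx = clf3.kneighbors(X_green)  # indices of the 3 nearest training points
neighbor_labels = y.iloc[idx[0]]   # their labels: 0, 1, 1
print(f'3-NN labels: {neighbor_labels.to_list()}, majority vote: {neighbor_labels.mode()[0]}')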
See a more natural regression example below. (This example has $y \in \{0, 1\}$, not $y \in \mathbb{R}$.)
k_values = [1, 3, 5, 7]
for k in k_values:
model = KNeighborsRegressor(n_neighbors=k, metric='euclidean')
model.fit(X, y)
print(f'For k={k}, predict green is {model.predict(X_green)[0]:.3}.')
For k=1, predict green is 0.0.
For k=3, predict green is 0.667.
For k=5, predict green is 0.4.
For k=7, predict green is 0.571.
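These predictions are just averages of the $k$ nearest labels: since $y \in \{0, 1\}$, the regressor returns the fraction of class-1 neighbors, e.g. $k=3$ gives $(0 + 1 + 1)/3 \approx 0.667$ and $k=7$ gives $4/7 \approx 0.571$.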
Recall that with the unweighted kNN classifier above, we saw "For k=3, predict green is 1."
Now we try weighted kNN.
k = 3
clf = KNeighborsClassifier(n_neighbors=k, weights='distance', metric='euclidean')
clf.fit(X, y)
print(f'For k={k}, predict green is {clf.predict(X_green)[0]}.')
For k=3, predict green is 0.
Inspect the distances:
distances, indices = clf.kneighbors(X_green) # retrieve distances to and indices of kNN of green point
# each of distances and indices is a 2D array; use row [0] below
with np.printoptions(precision=2): # set precision for this block only
    print(f'indices[0]={indices[0]}\n' + f'y[indices[0]]={y[indices[0]].to_numpy()}\n' +
          f'distances[0]={distances[0]}\n' + f'1/distances[0]={1/distances[0]}')
indices[0]=[0 1 2]
y[indices[0]]=[0 1 1]
distances[0]=[0.14 0.36 0.4 ]
1/distances[0]=[7.07 2.77 2.5 ]
With unweighted 3-NN, we get 1 (blue). With weighted 3-NN, we get 0 (red) because the red point's weight (7.07) is greater than the sum of the two blue weights (2.77 + 2.50 = 5.27).
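To make that explicit, here is a small sketch (added here, not in the original notes) that sums the inverse-distance weights per class using the arrays above:
weights = 1 / distances[0]               # inverse-distance weights of the 3 neighbors
labels = y[indices[0]].to_numpy()        # their labels
weight_red = weights[labels == 0].sum()  # total weight for class 0 (red)
weight_blue = weights[labels == 1].sum() # total weight for class 1 (blue)
print(f'red weight {weight_red:.2f} vs. blue weight {weight_blue:.2f}'
      f' => predict {0 if weight_red > weight_blue else 1}')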
# Here is a more natural example of kNN regression.
x = np.array([1, 2, 3, 5])
y = np.array([1, 3, 2, 4])
k_values = [1, 2, 3, 4]
for k in k_values:
    plt.figure()  # start a fresh figure for each k so curves from different k don't overlap
    plt.plot(x, y, 'o')
plt.title(f'k={k}')
model = KNeighborsRegressor(n_neighbors=k, metric='euclidean')
X = x.reshape(-1, 1)
model.fit(X, y)
xplot = np.linspace(start=0, stop=6)
yplot = model.predict(xplot.reshape(-1, 1))
plt.plot(xplot, yplot)
plt.xlim(0, 6)
plt.ylim(0, 5)
if k == 2:  # make labels larger for lecture notes figure
ax = plt.gca()
ax.title.set_fontsize(20)
for label in ax.get_xticklabels() + ax.get_yticklabels():
label.set_fontsize(20)
plt.savefig('kNN_regression.png')
plt.show(block=False)
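The fitted curve is piecewise constant, since every prediction is the mean of the k nearest targets. In particular, when k equals the number of training points, the prediction is the overall mean everywhere; a quick check (a sketch added here, not part of the original code):
model4 = KNeighborsRegressor(n_neighbors=4, metric='euclidean').fit(X, y)
# every query sees all four training points, so each prediction is mean(y) = 2.5
assert np.allclose(model4.predict(np.array([[0.0], [3.0], [6.0]])), y.mean())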
Investigate the effect of $p$ on the distance from $a = (0, 0)$ to $b = (3, 4)$.
def minkowski(a, b, p):
    # Minkowski distance (sum_i |a_i - b_i|^p)^(1/p); p = 0 is undefined, so return a placeholder value
    if p == 0:
        return 1
    return np.sum(np.abs(a - b)**p)**(1/p)
a = np.array([0.0, 0])
b = np.array([3.0, 4])
assert np.isclose(minkowski(a, b, p=1), 7)
assert np.isclose(minkowski(a, b, p=2), 5)
p = np.array([-5, -4, -3, -2, -1, 1/2, 1, 2, 3, 4, 5])
n = p.size
d = np.zeros(n)
for i in np.arange(n):
d[i] = minkowski(a=a, b=b, p=p[i])
print(f'{p[i]}, {d[i]}')
plt.figure(figsize=(8, 8)) # (width, height) in inches
plt.plot(p, d, '.k')
plt.title(r'Minkowski distance $\left( \sum_{i=1}^n |a_i - b_i|^p \right)^\frac{1}{p}$ vs. p')
plt.xlabel('p')
plt.ylabel(r'distance from $a = (0, 0)$ to $b = (3, 4)$')
plt.axhline(y=3, linestyle='dotted')
plt.axhline(y=4, linestyle='dotted')
plt.axvline(x=0)
plt.axhline(y=0)
plt.xlim(-7, 7)
plt.ylim(0, 15)
plt.text(x=1.0, y=13.9, s=r'$p=\frac{1}{2}$')
plt.text(x=1.4, y=6.9, s=r'$p=1$: Manhattan')
plt.text(x=2.3, y=4.9, s=r'$p=2$: Euclidean')
plt.text(x=4.8, y=3.7, s=r'$p=\infty$: max $|a_i - b_i|$')
plt.text(x=4.8, y=3.30, s=r'Chebyshev / Chessboard')
plt.text(x=-6.6, y=3.1, s=r'$p=-\infty$: min $|a_i - b_i|$')
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
-5.0, 2.87492105207305
-4.0, 2.800746299492502
-3.0, 2.6678871092474226
-2.0, 2.4
-1.0, 1.7142857142857144
0.5, 13.928203230275509
1.0, 7.0
2.0, 5.0
3.0, 4.497941445275415
4.0, 4.284572294953817
5.0, 4.174027662897746
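As the plot's annotations suggest, for large $|p|$ the Minkowski distance approaches the Chebyshev distance $\max_i |a_i - b_i| = 4$ as $p \to \infty$, and $\min_i |a_i - b_i| = 3$ as $p \to -\infty$. A quick numerical check (a sketch added here, using $p = \pm 50$ as stand-ins for $\pm\infty$):
assert np.isclose(minkowski(a, b, p=50), 4, atol=1e-3)   # near the Chebyshev max
assert np.isclose(minkowski(a, b, p=-50), 3, atol=1e-3)  # near the minimum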