Clustering via $k$-Means and DBSCAN¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

from sklearn.cluster import KMeans, DBSCAN

$k$-Means Clustering¶

Toy example from lecture¶

In [2]:
# Toy example: cluster eight 2-D points into k=3 clusters, then plot the
# points ('o') and the cluster centers ('+'), one color per cluster ID.
X = np.array([(0,2), (1,1), (1,2), (2,3), (2,4), (3,3), (3,0), (4,0)])
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
kmeans.fit(X)
print(f'kmeans.labels_={kmeans.labels_}') # cluster IDs, not digit labels
print(f'kmeans.cluster_centers_={kmeans.cluster_centers_}')
colors = {0:'red', 1:'blue', 2:'green'}
# Iterate over the distinct cluster IDs (not over every sample's label) so that
# each cluster's points are drawn exactly once.
for label in np.unique(kmeans.labels_):
    keep_row = (kmeans.labels_ == label) # boolean mask selecting this cluster's rows
    plt.plot(X[keep_row, 0], X[keep_row, 1], marker='o', linestyle='none', color=colors[label])
k = kmeans.cluster_centers_.shape[0] # one row per cluster center
for i in range(k):
    plt.plot(kmeans.cluster_centers_[i][0], kmeans.cluster_centers_[i][1], marker='+', linestyle='none', color=colors[i])
kmeans.labels_=[2 2 2 0 0 0 1 1]
kmeans.cluster_centers_=[[2.33333333 3.33333333]
 [3.5        0.        ]
 [0.66666667 1.66666667]]

Try clustering digits (from $\mathbf{x}=$ 8x8 image, ignoring $y=$ digit label).¶

Check whether clusters correspond to labels.

In [3]:
# Load the digit images. Only the features X are clustered; the digit labels y
# are kept aside so the clusters can be inspected afterwards.
digits = load_digits()
X, y = digits.data, digits.target
N = len(X) # number of samples (rows of X)

# Where did n_clusters=10 come from? Cheating! We happen to know these are
# images of 10 digits, but clustering is normally done on unlabeled data.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0).fit(X) # .fit() returns the fitted estimator
print(f'kmeans.labels_={kmeans.labels_}') # cluster IDs, not digit labels
# Uncomment to also print the cluster centers:
#print(f'kmeans.cluster_centers_={kmeans.cluster_centers_}')
kmeans.labels_=[0 1 1 ... 1 8 8]
In [4]:
# Check that '.predict()' called on the training data gives the same values as '.labels_'.
clusterIDs = kmeans.predict(X) # cluster IDs, not digit labels
labels_match = (clusterIDs == kmeans.labels_).all() # X is the same data kmeans was fit on
assert labels_match
print(f'clusterIDs={clusterIDs}')
clusterIDs=[0 1 1 ... 1 8 8]

These IDs don't mean anything to me. However, since we clustered images that have $y$ labels (even though $k$-means didn't see the labels), let's inspect the $y$ labels in each cluster and visualize the centroids.

In [5]:
# For each cluster: tally the true digit labels y of its members, print the tally,
# and draw the cluster center as an 8x8 image titled with the most-frequent digit.
# Finally print the full cluster-by-digit count table.
n_clusters = kmeans.cluster_centers_.shape[0] # one row per cluster center
n_digits = int(y.max()) + 1 # digit labels are used directly as column indices below
n_cols = 5
n_rows = -(-n_clusters // n_cols) # ceiling division: enough rows for all clusters
fig = plt.figure(figsize=(10, 5)) # new blank figure
gs = fig.add_gridspec(nrows=n_rows, ncols=n_cols) # grid of plot axes
cluster_digit_counts = np.zeros(shape=(n_clusters, n_digits))
for i in np.arange(n_clusters): # for each cluster ID
    clusterMembers = y[clusterIDs == i] # true digit labels of the samples in cluster i
    values, counts = np.unique(ar=clusterMembers, return_counts=True)
    cluster_digit_counts[i, values] = counts # scatter counts into the digit columns; absent digits stay 0
    print(f'i={i}, values={values}, counts={counts}')
    ax = fig.add_subplot(gs[i // n_cols, i % n_cols])
    ax.imshow(kmeans.cluster_centers_[i].reshape(8, 8), cmap='binary')
    ax.set_axis_off()
    # Title the axes object directly instead of relying on pyplot's "current axes".
    ax.set_title(f'ID i={i}, n={values[np.argmax(counts)]}')
print('\nRows are labeled with cluster IDs. Columns are labeled with y=digit.')
df = pd.DataFrame(cluster_digit_counts, columns=np.arange(n_digits), index=np.arange(n_clusters))
print(df)
print('\nImages of cluster centers are below. Each title shows cluster ID i and most-frequent digit n.')
i=0, values=[0 2 6], counts=[176   1   1]
i=1, values=[1 2 3 4 6 7 8 9], counts=[100   8   7   2   3   2 100   1]
i=2, values=[2 3 4 7 8 9], counts=[  3   7  11 174   5   8]
i=3, values=[1 2 4 7 8 9], counts=[54  2  3  2  6 20]
i=4, values=[1 2 3 5 8 9], counts=[  1  13 154   2   2   6]
i=5, values=[1 5 6 8], counts=[  2   1 177   2]
i=6, values=[0 4 5], counts=[  2 165   2]
i=7, values=[1 3 5 7 8 9], counts=[  1   2 136   1   4   6]
i=8, values=[2 3 5 8 9], counts=[  2  13  41  52 139]
i=9, values=[1 2 8], counts=[ 24 148   3]

Rows are labeled with cluster IDs. Columns are labeled with y=digit.
       0      1      2      3      4      5      6      7      8      9
0  176.0    0.0    1.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0
1    0.0  100.0    8.0    7.0    2.0    0.0    3.0    2.0  100.0    1.0
2    0.0    0.0    3.0    7.0   11.0    0.0    0.0  174.0    5.0    8.0
3    0.0   54.0    2.0    0.0    3.0    0.0    0.0    2.0    6.0   20.0
4    0.0    1.0   13.0  154.0    0.0    2.0    0.0    0.0    2.0    6.0
5    0.0    2.0    0.0    0.0    0.0    1.0  177.0    0.0    2.0    0.0
6    2.0    0.0    0.0    0.0  165.0    2.0    0.0    0.0    0.0    0.0
7    0.0    1.0    0.0    2.0    0.0  136.0    0.0    1.0    4.0    6.0
8    0.0    0.0    2.0   13.0    0.0   41.0    0.0    0.0   52.0  139.0
9    0.0   24.0  148.0    0.0    0.0    0.0    0.0    0.0    3.0    0.0

Images of cluster centers are below. Each title shows cluster ID i and most-frequent digit n.

Again, remember that clustering is normally done on unlabeled data, not on these labeled images of digits; and remember that I cheated to choose $k=10$.

DBSCAN: toy example from lecture:¶

First try $k$-means, then try DBSCAN.

In [6]:
# Toy example: cluster the same 15 points first with k-means (k=2), then with
# DBSCAN, and plot each result with one color per cluster ID.
X = np.array([(0,1), (0,2), (0,3), (0,4),
              (.5,.5),
              (1,0), (2,0), (3,0), (4,0),
              (2,2), (2,3), (2,4), (3,2), (3,3), (4,4)])

# --- k-means ---
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
kmeans.fit(X)
print(f'kmeans.labels_={kmeans.labels_}') # cluster IDs, not digit labels
print(f'kmeans.cluster_centers_={kmeans.cluster_centers_}')
colors = {0:'red', 1:'blue'}
# Iterate over the distinct cluster IDs (not over every sample's label) so that
# each cluster's points are drawn exactly once.
for label in np.unique(kmeans.labels_):
    keep_row = (kmeans.labels_ == label) # boolean mask selecting this cluster's rows
    plt.plot(X[keep_row, 0], X[keep_row, 1], marker='o', linestyle='none', color=colors[label])
k = kmeans.cluster_centers_.shape[0] # one row per cluster center
for i in range(k):
    plt.plot(kmeans.cluster_centers_[i][0], kmeans.cluster_centers_[i][1], marker='+', linestyle='none', color=colors[i])
plt.title('k-means(k=2)')
plt.show(block=False)

# --- DBSCAN ---
min_samples=2
epsilon=1
db = DBSCAN(eps=epsilon, min_samples=min_samples)
db.fit(X)
print(f'db.labels_={db.labels_}') # cluster IDs, not digit labels
colors = {0:'red', 1:'blue', -1:'green'} # the -1 entry colors points that DBSCAN labels -1
for label in np.unique(db.labels_): # again, one plot call per distinct label
    keep_row = (db.labels_ == label)
    plt.plot(X[keep_row, 0], X[keep_row, 1], marker='o', linestyle='none', color=colors[label])
plt.title(f'DBSCAN(eps={epsilon}, min_samples={min_samples})')
core_indices = np.array(db.core_sample_indices_)
# mark core samples with a white dot on top of their colored marker
_ = plt.plot(X[core_indices, 0], X[core_indices, 1], marker='.', linestyle='none', color='white')
kmeans.labels_=[0 1 1 1 0 0 0 0 0 1 1 1 1 1 1]
kmeans.cluster_centers_=[[1.75       0.25      ]
 [1.77777778 3.        ]]
db.labels_=[ 0  0  0  0  0  0  0  0  0  1  1  1  1  1 -1]