import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, DBSCAN
# Toy example: cluster eight 2-D points into three groups with k-means
# and plot the points and centroids, colored by cluster.
X = np.array([(0,2), (1,1), (1,2), (2,3), (2,4), (3,3), (3,0), (4,0)])
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
kmeans.fit(X)
print(f'kmeans.labels_={kmeans.labels_}') # cluster ID assigned to each point
print(f'kmeans.cluster_centers_={kmeans.cluster_centers_}')

colors = {0:'red', 1:'blue', 2:'green'}
# Draw each cluster exactly once: loop over the distinct cluster IDs, not
# over every sample's label (which would redraw a cluster once per member).
for label in np.unique(kmeans.labels_):
    keep_row = (kmeans.labels_ == label)
    plt.plot(X[keep_row, 0], X[keep_row, 1], marker='o', linestyle='none', color=colors[label])

# Mark each centroid with a '+' in the color of its cluster.
k = kmeans.cluster_centers_.shape[0]
for i, (center_x, center_y) in enumerate(kmeans.cluster_centers_):
    plt.plot(center_x, center_y, marker='+', linestyle='none', color=colors[i])
kmeans.labels_=[2 2 2 0 0 0 1 1] kmeans.cluster_centers_=[[2.33333333 3.33333333] [3.5 0. ] [0.66666667 1.66666667]]
Check whether the clusters correspond to the digit labels.
# Load the 8x8 digit images and cluster them with k-means; the labels y are
# kept only for inspecting the clusters afterwards and are never shown to k-means.
digits = load_digits() # reset the data (unnecessary) for clarity
X, y = digits.data, digits.target
N = len(X)
# Where did I get the n_clusters=10? Cheating! I know these are
# images of 10 digits, but normal clustering is on unlabeled data.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0).fit(X) # .fit() returns the estimator itself
print(f'kmeans.labels_={kmeans.labels_}') # cluster IDs, not digit labels
#print(f'kmeans.cluster_centers_={kmeans.cluster_centers_}')
kmeans.labels_=[0 1 1 ... 1 8 8]
# Confirm that '.labels_' gives the same values as '.predict()' called on the training data.
clusterIDs = kmeans.predict(X) # cluster IDs, not digit labels
labels_agree = (clusterIDs == kmeans.labels_).all() # holds because .predict() was called on the training X
assert labels_agree
print(f'clusterIDs={clusterIDs}')
clusterIDs=[0 1 1 ... 1 8 8]
These IDs don't mean anything to me. However, since we clustered images that have $y$ labels (even though $k$-means didn't see the labels) let's inspect the $y$ labels in each cluster and visualize the centroids.
# For every cluster, tally the true digit labels of its members and draw the
# cluster centroid as an 8x8 image on a 2x5 grid of axes.
fig = plt.figure(figsize=(10, 5)) # new blank figure
gs = fig.add_gridspec(nrows=2, ncols=5) # grid of plot axes
cluster_digit_counts = np.zeros(shape=(10, 10))
for i in range(10): # for each cluster ID
    digits_in_cluster = y[clusterIDs == i]
    values, counts = np.unique(digits_in_cluster, return_counts=True)
    # 'values' holds distinct digits, so one indexed assignment fills row i.
    cluster_digit_counts[i, values] = counts
    print(f'i={i}, values={values}, counts={counts}')

    grid_row, grid_col = divmod(i, 5) # position of cluster i on the 2x5 grid
    ax = fig.add_subplot(gs[grid_row, grid_col])
    ax.imshow(kmeans.cluster_centers_[i].reshape(8, 8), cmap='binary')
    ax.set_axis_off()
    ax.set_title(f'ID i={i}, n={values[counts.argmax()]}') # n = most frequent digit in the cluster

print('\nRows are labeled with cluster IDs. Columns are labeled with y=digit.')
id_axis = np.arange(10)
df = pd.DataFrame(cluster_digit_counts, columns=id_axis, index=id_axis)
print(df)
print('\nImages of cluster centers are below. Each title shows cluster ID i and most-frequent digit n.')
i=0, values=[0 2 6], counts=[176 1 1]
i=1, values=[1 2 3 4 6 7 8 9], counts=[100 8 7 2 3 2 100 1]
i=2, values=[2 3 4 7 8 9], counts=[ 3 7 11 174 5 8]
i=3, values=[1 2 4 7 8 9], counts=[54 2 3 2 6 20]
i=4, values=[1 2 3 5 8 9], counts=[ 1 13 154 2 2 6]
i=5, values=[1 5 6 8], counts=[ 2 1 177 2]
i=6, values=[0 4 5], counts=[ 2 165 2]
i=7, values=[1 3 5 7 8 9], counts=[ 1 2 136 1 4 6]
i=8, values=[2 3 5 8 9], counts=[ 2 13 41 52 139]
i=9, values=[1 2 8], counts=[ 24 148 3]
Rows are labeled with cluster IDs. Columns are labeled with y=digit.
0 1 2 3 4 5 6 7 8 9
0 176.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
1 0.0 100.0 8.0 7.0 2.0 0.0 3.0 2.0 100.0 1.0
2 0.0 0.0 3.0 7.0 11.0 0.0 0.0 174.0 5.0 8.0
3 0.0 54.0 2.0 0.0 3.0 0.0 0.0 2.0 6.0 20.0
4 0.0 1.0 13.0 154.0 0.0 2.0 0.0 0.0 2.0 6.0
5 0.0 2.0 0.0 0.0 0.0 1.0 177.0 0.0 2.0 0.0
6 2.0 0.0 0.0 0.0 165.0 2.0 0.0 0.0 0.0 0.0
7 0.0 1.0 0.0 2.0 0.0 136.0 0.0 1.0 4.0 6.0
8 0.0 0.0 2.0 13.0 0.0 41.0 0.0 0.0 52.0 139.0
9 0.0 24.0 148.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0
Images of cluster centers are below. Each title shows cluster ID i and most-frequent digit n.
Again, remember that clustering is normally done on unlabeled data, not on these labeled images of digits; and remember that I cheated to choose $k=10$.
First try $k$-means, then try DBSCAN.
# Second toy data set: an L-shaped group of points plus a separate blob.
# Cluster it with k-means (k=2) and plot the points and centroids.
X = np.array([(0,1), (0,2), (0,3), (0,4),
              (.5,.5),
              (1,0), (2,0), (3,0), (4,0),
              (2,2), (2,3), (2,4), (3,2), (3,3), (4,4)])
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
kmeans.fit(X)
print(f'kmeans.labels_={kmeans.labels_}') # cluster ID assigned to each point
print(f'kmeans.cluster_centers_={kmeans.cluster_centers_}')

colors = {0:'red', 1:'blue'}
# Draw each cluster exactly once: loop over the distinct cluster IDs, not
# over every sample's label (which would redraw a cluster once per member).
for label in np.unique(kmeans.labels_):
    keep_row = (kmeans.labels_ == label)
    plt.plot(X[keep_row, 0], X[keep_row, 1], marker='o', linestyle='none', color=colors[label])

# Mark each centroid with a '+' in the color of its cluster.
k = kmeans.cluster_centers_.shape[0]
for i, (center_x, center_y) in enumerate(kmeans.cluster_centers_):
    plt.plot(center_x, center_y, marker='+', linestyle='none', color=colors[i])
plt.title('k-means(k=2)')
plt.show(block=False)
# Cluster the same points X with DBSCAN and plot them colored by cluster.
min_samples=2
epsilon=1
db = DBSCAN(eps=epsilon, min_samples=min_samples)
db.fit(X)
print(f'db.labels_={db.labels_}') # cluster IDs; -1 is the label DBSCAN gives to noise points

colors = {0:'red', 1:'blue', -1:'green'}
# Draw each group exactly once: loop over the distinct labels, not over
# every sample's label (which would redraw a group once per member).
for label in np.unique(db.labels_):
    keep_row = (db.labels_ == label)
    plt.plot(X[keep_row, 0], X[keep_row, 1], marker='o', linestyle='none', color=colors[label])
plt.title(f'DBSCAN(eps={epsilon}, min_samples={min_samples})')

# mark core samples with a small white dot on top of their colored marker
core_indices = np.array(db.core_sample_indices_)
_ = plt.plot(X[core_indices, 0], X[core_indices, 1], marker='.', linestyle='none', color='white')
kmeans.labels_=[0 1 1 1 0 0 0 0 0 1 1 1 1 1 1] kmeans.cluster_centers_=[[1.75 0.25 ] [1.77777778 3. ]]
db.labels_=[ 0 0 0 0 0 0 0 0 0 1 1 1 1 1 -1]