import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
# Reload the scikit-learn digits dataset. This is redundant, but it keeps
# this cell readable on its own.
digits = load_digits()
X, y = digits.data, digits.target
N = X.shape[0]  # number of rows (samples) in X

# Choosing n_clusters=10 is cheating: I know these are images of the 10
# digits, whereas clustering is normally applied to unlabeled data.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0).fit(X)

# labels_ holds the cluster ID assigned to each row of X, not a digit label.
print(f'kmeans.labels_={kmeans.labels_}')
#print(f'kmeans.cluster_centers_={kmeans.cluster_centers_}')
kmeans.labels_=[5 7 7 ... 7 3 3]
Normally we cannot check the accuracy of an unsupervised method like $k$-means clustering because there is no $y$.
However, I chose to cluster the digits images, for which we have $y$ labels. Here I check whether the clusters correspond to the $y$ labels I know.
# Ask the fitted model for the cluster of every training row. The result is
# an array of cluster IDs, not digit labels.
clusterIDs = kmeans.predict(X)

# Sanity check: predicting on the same X used for fitting should reproduce
# the labels stored on the model.
assert (clusterIDs == kmeans.labels_).all()

print(f'clusterIDs={clusterIDs}')
clusterIDs=[5 7 7 ... 7 3 3]
# Filled in by hand after inspecting the clustering output so the clusters can
# be presented in digit order. Despite the name, entry i is the ID of the
# cluster associated with digit i (it is indexed by digit, it yields a cluster).
clusterToDigit = np.array([5, 6, 8, 1, 0, 2, 9, 4, 7, 3])

for i, cluster in enumerate(clusterToDigit):  # one pass per digit
    # True digit labels of every image that k-means placed in this cluster.
    clusterMembers = y[clusterIDs == cluster]
    # Distinct digit labels in the cluster and how often each one occurs.
    values, counts = np.unique(clusterMembers, return_counts=True)
    print(f'i={i}, clusterToDigit[i]={cluster}, values={values}, counts={counts}')
i=0, clusterToDigit[i]=5, values=[0 2 6], counts=[177 1 1]
i=1, clusterToDigit[i]=6, values=[1 2 4 6 8 9], counts=[55 2 7 1 5 20]
i=2, clusterToDigit[i]=8, values=[1 2 8], counts=[ 24 148 3]
i=3, clusterToDigit[i]=1, values=[1 2 3 8 9], counts=[ 1 13 154 2 6]
i=4, clusterToDigit[i]=0, values=[0 4 5], counts=[ 1 163 2]
i=5, clusterToDigit[i]=2, values=[1 3 5 8 9], counts=[ 1 2 136 4 6]
i=6, clusterToDigit[i]=9, values=[1 5 6 8], counts=[ 2 1 177 2]
i=7, clusterToDigit[i]=4, values=[2 3 4 7 8 9], counts=[ 3 7 7 177 5 7]
i=8, clusterToDigit[i]=7, values=[1 2 3 4 6 7 8 9], counts=[ 99 8 7 4 2 2 100 2]
i=9, clusterToDigit[i]=3, values=[2 3 5 8 9], counts=[ 2 13 43 53 139]
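Notice that the clusters are such that for each digit $i$ there is a cluster dominated by that digit. Some details: several clusters are nearly pure (for example, 177 of the 179 images in the cluster for digit 0 are 0s), but the cluster for digit 8 is almost evenly split between 1s (99) and 8s (100), and the cluster for digit 9 (139 9s) also contains many 8s (53) and 5s (43).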
Again, remember that clustering is normally done on unlabeled data, not on these labeled images of digits; and remember that I cheated to choose $k=10$.