import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

from sklearn.cluster import KMeans


# Reload the digits data set. Not strictly necessary, but it keeps this
# section readable on its own.
digits = load_digits()
X, y = digits.data, digits.target
N = X.shape[0]  # size of X's first axis (number of samples)

# n_clusters=10 is cheating: we happen to know these are images of the 10
# digits, whereas clustering is normally run on unlabeled data where the
# number of groups is unknown.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0).fit(X)

# labels_ holds the cluster ID assigned to each sample, NOT its digit label.
print(f'kmeans.labels_={kmeans.labels_}')
#print(f'kmeans.cluster_centers_={kmeans.cluster_centers_}')

kmeans.labels_=[5 7 7 ... 7 3 3]


# predict() returns cluster IDs, not digit labels. Because it is called on
# the same X the model was fitted on, the result must match kmeans.labels_.
clusterIDs = kmeans.predict(X)
assert (clusterIDs == kmeans.labels_).all()
print(f'clusterIDs={clusterIDs}')

clusterIDs=[5 7 7 ... 7 3 3]


# Hand-built lookup, filled in after inspecting the clusters, so the clusters
# can be presented in digit order. Despite its name, the array is indexed by
# digit: clusterToDigit[d] is the ID of the cluster chosen to stand for digit d.
# The values are only meaningful for the fitted model above.
clusterToDigit = np.array([5, 6, 8, 1, 0, 2, 9, 4, 7, 3])
for i, cluster in enumerate(clusterToDigit):  # i runs over the digits 0..9
    inCluster = clusterIDs == cluster  # boolean mask of samples in this cluster
    clusterMembers = y[inCluster]  # true digit labels of those samples
    # Tally how many samples of each true digit ended up in this cluster.
    values, counts = np.unique(clusterMembers, return_counts=True)
    print(f'i={i}, clusterToDigit[i]={cluster}, values={values}, counts={counts}')

i=0, clusterToDigit[i]=5, values=[0 2 6], counts=[177   1   1]
i=1, clusterToDigit[i]=6, values=[1 2 4 6 8 9], counts=[55  2  7  1  5 20]
i=2, clusterToDigit[i]=8, values=[1 2 8], counts=[ 24 148   3]
i=3, clusterToDigit[i]=1, values=[1 2 3 8 9], counts=[  1  13 154   2   6]
i=4, clusterToDigit[i]=0, values=[0 4 5], counts=[  1 163   2]
i=5, clusterToDigit[i]=2, values=[1 3 5 8 9], counts=[  1   2 136   4   6]
i=6, clusterToDigit[i]=9, values=[1 5 6 8], counts=[  2   1 177   2]
i=7, clusterToDigit[i]=4, values=[2 3 4 7 8 9], counts=[  3   7   7 177   5   7]
i=8, clusterToDigit[i]=7, values=[1 2 3 4 6 7 8 9], counts=[ 99   8   7   4   2   2 100   2]
i=9, clusterToDigit[i]=3, values=[2 3 5 8 9], counts=[  2  13  43  53 139]

$k$-Means Clustering

$k$-Means Clustering

Normally we would not (and could not) compute accuracy for an unsupervised algorithm