import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier


# Load scikit-learn's bundled handwritten-digits dataset. Each row of
# digits.data is one image flattened to 64 values; digits.target holds the
# digit label for the corresponding row.
digits = datasets.load_digits()


# Report the array shapes, then show the first sample both as pixels
# (reshaped to its 8x8 layout) and as its label.
print(f'digits.data.shape={digits.data.shape}')
print(f'digits.target.shape={digits.target.shape}')
row_0 = digits.data[0]
print('The first image is digits.data[0] (shown here as 8x8 instead of 1x64):')
print(row_0.reshape(8, 8))
print(f'The first image is of the digit given by digits.target[0]={digits.target[0]}:')

digits.data.shape=(1797, 64)
digits.target.shape=(1797,)
The first image is digits.data[0] (shown here as 8x8 instead of 1x64):
[[ 0.  0.  5. 13.  9.  1.  0.  0.]
 [ 0.  0. 13. 15. 10. 15.  5.  0.]
 [ 0.  3. 15.  2.  0. 11.  8.  0.]
 [ 0.  4. 12.  0.  0.  8.  8.  0.]
 [ 0.  5.  8.  0.  0.  9.  8.  0.]
 [ 0.  4. 11.  0.  1. 12.  7.  0.]
 [ 0.  2. 14.  5. 10. 12.  0.  0.]
 [ 0.  0.  6. 13. 10.  0.  0.  0.]]
The first image is of the digit given by digits.target[0]=0:


# Draw the first 40 images on a grid of 4 rows by 10 columns.
fig = plt.figure(figsize=(10, 4))
gs = fig.add_gridspec(nrows=4, ncols=10)
for i in range(40):
    # Reading i as a two-digit number (00..39), the tens digit picks the row
    # and the ones digit picks the column, e.g. image 23 -> row 2, column 3.
    row, col = divmod(i, 10)
    ax = fig.add_subplot(gs[row, col])
    # Each sample is stored flat (64 values); restore the 8x8 pixel layout.
    ax.matshow(digits.data[i].reshape(8, 8))


# Labels of the first 40 samples, copied so the dataset's own array is not
# aliased.
first_40_y_values = np.copy(digits.target[:40])
# Bare expression: a notebook displays this 4x10 view (same layout as the
# image grid); the reshaped array is not stored anywhere.
first_40_y_values.reshape(4, 10)

array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 9, 5, 5, 6, 5, 0, 9, 8, 9]])


X, y = digits.data, digits.target

# Stage 1: keep 80% for training and set aside 20% ("_tmp") to be divided
# between validation and test. Stratifying on y keeps the digit proportions
# the same in both parts.
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)
# Stage 2: cut the set-aside 20% in half, giving 10% validation and 10% test
# of the full dataset, again stratified by digit.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, random_state=0, stratify=y_tmp
)


# Baseline: a single entropy-criterion decision tree (as in unit 3), limited
# to depth 8. Note that scikit-learn's trees are an optimized CART, not ID3.
# random_state is pinned, like the ensemble models in this file, because
# scikit-learn permutes the candidate features at every split, so tied splits
# can otherwise resolve differently from run to run.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=8, random_state=0)
clf.fit(X_train, y_train)
# Mean accuracy on the held-out validation split.
validation_score = clf.score(X_valid, y_valid)
print(f'DecisionTree validation_score={validation_score:.3}')

# Here is Bagging of 100 depth-8 decision trees; all other settings are the
# library defaults (so these trees use the default split criterion, not
# entropy).
# The base tree is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, while the first positional slot is the base estimator
# on both sides of the rename.
clf = BaggingClassifier(DecisionTreeClassifier(max_depth=8),
                        n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
# Mean accuracy on the held-out validation split.
validation_score = clf.score(X_valid, y_valid)
print(f'Bagging validation_score={validation_score:.3}')

# Random forest with trees capped at depth 8. The depth was tuned by hand on
# the validation split: depths 2 through 9 were tried (2 gave lousy results)
# and 8 scored best.
clf = RandomForestClassifier(random_state=0, max_depth=8).fit(X_train, y_train)
# Mean accuracy on the held-out validation split.
validation_score = clf.score(X_valid, y_valid)
print(f'RandomForest validation_score={validation_score:.3}')

# Gradient boosting over 100 depth-1 trees (stumps) with learning_rate=1.0.
# NOTE: these are NOT the library defaults. Only n_estimators=100 matches
# its default; the defaults for the other two are learning_rate=0.1 and
# max_depth=3.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.00,
    max_depth=1,
    random_state=0,
).fit(X_train, y_train)
# Mean accuracy on the held-out validation split.
validation_score = clf.score(X_valid, y_valid)
print(f'GradientBoosting validation_score={validation_score:.3}')

DecisionTree validation_score=0.861
Bagging validation_score=0.956
RandomForest validation_score=0.972
GradientBoosting validation_score=0.183

Ensemble learning

Classification by optical recognition of handwritten digits

Here we try ensemble learning.