import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree # for tree.plot_tree()
from sklearn.tree import export_text # for export_text()
X = np.array([0, 1])  # possible outcomes; we care only that there are two of them
xplot = np.linspace(0, 1)  # 50 values of p = P(X = 1) from 0 to 1
H_X = np.zeros(len(xplot))
for i in np.arange(0, len(xplot)):
    p = xplot[i]
    P = np.array([p, 1 - p])
    if 0 < p < 1:
        H_X[i] = -np.sum(P * np.log2(P))
    else:
        H_X[i] = 0  # define 0 * log2(0) = 0, so H = 0 at p = 0 and p = 1
plt.plot(xplot, H_X)
plt.title('Entropy of coin flip (Bernoulli trial) is\n' +
          'average information in bits given by outcome.')
plt.xlabel('$P(X = 1)$ (heads)')
plt.ylabel('Entropy $H(X)$')
plt.show(block=False)
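As a sanity check (assuming SciPy is installed), scipy.stats.entropy with base=2 reproduces this curve; it treats 0 * log(0) as 0, so the endpoints need no special case.

from scipy.stats import entropy
print(entropy([0.5, 0.5], base=2))  # 1.0, the maximum: a fair coin
print(entropy([0.9, 0.1], base=2))  # about 0.469: a biased coin's outcome carries less information on average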
df_all = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/mtcars.csv', index_col=0)
df = df_all.head(n=8)
df = df[['mpg', 'cyl', 'vs']]
df
 | mpg | cyl | vs |
---|---|---|---|
Mazda RX4 | 21.0 | 6 | 0 |
Mazda RX4 Wag | 21.0 | 6 | 0 |
Datsun 710 | 22.8 | 4 | 1 |
Hornet 4 Drive | 21.4 | 6 | 1 |
Hornet Sportabout | 18.7 | 8 | 0 |
Valiant | 18.1 | 6 | 1 |
Duster 360 | 14.3 | 8 | 0 |
Merc 240D | 24.4 | 4 | 1 |
print(f"Sorted by mpg:\n{df.sort_values('mpg')}\n")
print(f"Sorted by cyl:\n{df.sort_values('cyl')}\n")
feature_names = ['mpg', 'cyl']
X = df[feature_names]
y = df.vs
clf = DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=0)
clf.fit(X, y)
plt.rcParams["figure.figsize"] = (8, 7) # (width, height) https://matplotlib.org/stable/api/figure_api.html
tree.plot_tree(clf, feature_names=feature_names, filled=True)
# export_text() is from https://scikit-learn.org/stable/modules/tree.html
print(export_text(clf, feature_names=feature_names))
print(f'Accuracy on training data is clf.score(X, y)={clf.score(X, y)}.')
Sorted by mpg:
                    mpg  cyl  vs
Duster 360         14.3    8   0
Valiant            18.1    6   1
Hornet Sportabout  18.7    8   0
Mazda RX4          21.0    6   0
Mazda RX4 Wag      21.0    6   0
Hornet 4 Drive     21.4    6   1
Datsun 710         22.8    4   1
Merc 240D          24.4    4   1

Sorted by cyl:
                    mpg  cyl  vs
Datsun 710         22.8    4   1
Merc 240D          24.4    4   1
Mazda RX4          21.0    6   0
Mazda RX4 Wag      21.0    6   0
Hornet 4 Drive     21.4    6   1
Valiant            18.1    6   1
Hornet Sportabout  18.7    8   0
Duster 360         14.3    8   0

|--- mpg <= 21.20
|   |--- mpg <= 18.40
|   |   |--- mpg <= 16.20
|   |   |   |--- class: 0
|   |   |--- mpg > 16.20
|   |   |   |--- class: 1
|   |--- mpg > 18.40
|   |   |--- class: 0
|--- mpg > 21.20
|   |--- class: 1

Accuracy on training data is clf.score(X, y)=1.0.
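The fitted tree can classify new cars. The two cars below are hypothetical (not from mtcars), chosen so each follows a different branch of the tree above.

X_new = pd.DataFrame({'mpg': [22.0, 15.0], 'cyl': [4, 8]})  # hypothetical cars
print(clf.predict(X_new))        # [1 0]: 22 mpg > 21.2 gives class 1; 15 mpg <= 16.2 gives class 0
print(clf.predict_proba(X_new))  # class proportions in each car's leaf; pure leaves give 0s and 1s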
Next is my own code implementing the ID3 algorithm. It is simpler and slower than scikit-learn's, but I think it may be helpful in understanding how ID3 works. Start with helper functions for proportion, information, and entropy:
def f_ID3(y):  # y is a vector of 0s and 1s; returns proportion of 1s
    return (1 / y.size) * np.sum(y)

def I(p):  # p is the probability of an outcome; returns its information in bits
    return -np.log2(p)

def H(y):  # y is a vector of 0s and 1s; returns its entropy in bits
    p = f_ID3(y)
    if 0 < p < 1:
        return p * I(p) + (1 - p) * I(1 - p)
    else:
        return 0.0  # float, not int 0, so the ':.3' format specifier used below doesn't complain

def H_weighted(y_minus, y_plus):  # y_minus and y_plus are each vectors of 0s and 1s
    n_minus = y_minus.size
    n_plus = y_plus.size
    n = n_minus + n_plus
    return (n_minus / n) * H(y_minus) + (n_plus / n) * H(y_plus)
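A quick check of these helpers on the 8-car example: 4 of its 8 cars have vs=1, so H(y) should be exactly 1 bit, and the mpg <= 21.2 split should give the weighted entropy 0.451 seen at the root of scikit-learn's tree above.

print(f'{H(y):.3}')                    # 1.0: four 1s out of eight
print(f'{H(np.array([1, 1, 1])):.3}')  # 0.0: a pure set has zero entropy
le = (X.mpg <= 21.2)
print(f'{H_weighted(y[le], y[~le]):.3}')  # 0.451 = (5/8)*0.722 + (3/8)*0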
Next, define a function split(X, y, feature_names) that gives the best feature to split on, the best threshold, and the minimum entropy of that split.
def split(X, y, feature_names, debug=False):
    # X=array of features, y=vector of 0s and 1s, feature_names=strings
    min_H = np.inf
    best_j = np.nan
    best_t = np.nan
    n_features = X.shape[1]
    assert n_features == len(feature_names)
    for j in range(n_features):
        feature = X.iloc[:, j]
        unique = np.unique(feature)
        thresholds = (unique[1:] + unique[:-1]) / 2  # averages each adjacent pair of unique values
        if debug:
            print(f'feature_names[{j}]={feature_names[j]}')
            print(f'  unique    ={unique}')
            print(f'  thresholds={thresholds}')
        for t in thresholds:
            S_minus = y[feature <= t]
            S_plus = y[feature > t]
            H_split = H_weighted(S_minus, S_plus)
            if debug:
                print(f'  t={t:.3}, S_minus={S_minus}, S_plus={S_plus}, ' +
                      f'H_minus={H(S_minus):.3}, H_plus={H(S_plus):.3}, H_split={H_split:.3}')
            if H_split < min_H:
                min_H = H_split
                best_j = j
                best_t = t
    return (best_j, best_t, min_H)
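split() can also be run by itself on the 8-car example; it should agree with the root split of scikit-learn's tree above.

best_j, best_t, min_H = split(X, y, feature_names)
print(f'best feature={feature_names[best_j]}, best_t={best_t:.3}, min_H={min_H:.3}')
# best feature=mpg, best_t=21.2, min_H=0.451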
Finally, define a recursive function decision_tree(X, y, feature_names) that just returns on a zero-entropy set S = (X, y) of examples but otherwise calls split() to get the best split and then recursively calls decision_tree() on each of the left and right subsets.
def decision_tree(X, y, feature_names, depth=0, debug=False):
    # X=array of features, y=vector of 0s and 1s, feature_names=strings;
    # depth is the recursion depth (used to indent output), debug activates debugging output
    if H(y) == 0:  # pure node: nothing left to split
        return
    padding = ' ' * depth * 2  # indent output by depth * 2 spaces
    if debug:
        print(f'{padding}decision_tree(X=------------------------------------------------------------')
        print(f'{padding}{X}, y={y}, feature_names={feature_names}')
    best_j, best_t, min_H = split(X, y, feature_names, debug)
    print(f'{padding}########## best_j={best_j}, best_feature={feature_names[best_j]}, ' +
          f'best_t={best_t:.3}, min_H={min_H:.3}')
    le = (X.iloc[:, best_j] <= best_t)  # 'le' = 'less than or equal to'
    decision_tree(X[ le], y[ le], feature_names, depth+1, debug)  # left branch
    decision_tree(X[~le], y[~le], feature_names, depth+1, debug)  # right branch

decision_tree(X, y, feature_names, debug=True)  # call it on the small 8-car example
decision_tree: all 8 cars, y={Mazda RX4: 0, Mazda RX4 Wag: 0, Datsun 710: 1, Hornet 4 Drive: 1, Hornet Sportabout: 0, Valiant: 1, Duster 360: 0, Merc 240D: 1}, feature_names=['mpg', 'cyl']
feature_names[0]=mpg
  unique    =[14.3 18.1 18.7 21.  21.4 22.8 24.4]
  thresholds=[16.2  18.4  19.85 21.2  22.1  23.6 ]
  t=16.2, S_minus={Duster 360: 0}, S_plus={Mazda RX4: 0, Mazda RX4 Wag: 0, Datsun 710: 1, Hornet 4 Drive: 1, Hornet Sportabout: 0, Valiant: 1, Merc 240D: 1}, H_minus=0.0, H_plus=0.985, H_split=0.862
  t=18.4, S_minus={Valiant: 1, Duster 360: 0}, S_plus={Mazda RX4: 0, Mazda RX4 Wag: 0, Datsun 710: 1, Hornet 4 Drive: 1, Hornet Sportabout: 0, Merc 240D: 1}, H_minus=1.0, H_plus=1.0, H_split=1.0
  t=19.9, S_minus={Hornet Sportabout: 0, Valiant: 1, Duster 360: 0}, S_plus={Mazda RX4: 0, Mazda RX4 Wag: 0, Datsun 710: 1, Hornet 4 Drive: 1, Merc 240D: 1}, H_minus=0.918, H_plus=0.971, H_split=0.951
  t=21.2, S_minus={Mazda RX4: 0, Mazda RX4 Wag: 0, Hornet Sportabout: 0, Valiant: 1, Duster 360: 0}, S_plus={Datsun 710: 1, Hornet 4 Drive: 1, Merc 240D: 1}, H_minus=0.722, H_plus=0.0, H_split=0.451
  t=22.1, S_minus={Mazda RX4: 0, Mazda RX4 Wag: 0, Hornet 4 Drive: 1, Hornet Sportabout: 0, Valiant: 1, Duster 360: 0}, S_plus={Datsun 710: 1, Merc 240D: 1}, H_minus=0.918, H_plus=0.0, H_split=0.689
  t=23.6, S_minus={Mazda RX4: 0, Mazda RX4 Wag: 0, Datsun 710: 1, Hornet 4 Drive: 1, Hornet Sportabout: 0, Valiant: 1, Duster 360: 0}, S_plus={Merc 240D: 1}, H_minus=0.985, H_plus=0.0, H_split=0.862
feature_names[1]=cyl
  unique    =[4 6 8]
  thresholds=[5. 7.]
  t=5.0, S_minus={Datsun 710: 1, Merc 240D: 1}, S_plus={Mazda RX4: 0, Mazda RX4 Wag: 0, Hornet 4 Drive: 1, Hornet Sportabout: 0, Valiant: 1, Duster 360: 0}, H_minus=0.0, H_plus=0.918, H_split=0.689
  t=7.0, S_minus={Mazda RX4: 0, Mazda RX4 Wag: 0, Datsun 710: 1, Hornet 4 Drive: 1, Valiant: 1, Merc 240D: 1}, S_plus={Hornet Sportabout: 0, Duster 360: 0}, H_minus=0.918, H_plus=0.0, H_split=0.689
########## best_j=0, best_feature=mpg, best_t=21.2, min_H=0.451
  decision_tree: the 5 cars with mpg <= 21.2, y={Mazda RX4: 0, Mazda RX4 Wag: 0, Hornet Sportabout: 0, Valiant: 1, Duster 360: 0}, feature_names=['mpg', 'cyl']
  feature_names[0]=mpg
    unique    =[14.3 18.1 18.7 21. ]
    thresholds=[16.2  18.4  19.85]
    t=16.2, S_minus={Duster 360: 0}, S_plus={Mazda RX4: 0, Mazda RX4 Wag: 0, Hornet Sportabout: 0, Valiant: 1}, H_minus=0.0, H_plus=0.811, H_split=0.649
    t=18.4, S_minus={Valiant: 1, Duster 360: 0}, S_plus={Mazda RX4: 0, Mazda RX4 Wag: 0, Hornet Sportabout: 0}, H_minus=1.0, H_plus=0.0, H_split=0.4
    t=19.9, S_minus={Hornet Sportabout: 0, Valiant: 1, Duster 360: 0}, S_plus={Mazda RX4: 0, Mazda RX4 Wag: 0}, H_minus=0.918, H_plus=0.0, H_split=0.551
  feature_names[1]=cyl
    unique    =[6 8]
    thresholds=[7.]
    t=7.0, S_minus={Mazda RX4: 0, Mazda RX4 Wag: 0, Valiant: 1}, S_plus={Hornet Sportabout: 0, Duster 360: 0}, H_minus=0.918, H_plus=0.0, H_split=0.551
  ########## best_j=0, best_feature=mpg, best_t=18.4, min_H=0.4
    decision_tree: the 2 cars with mpg <= 18.4, y={Valiant: 1, Duster 360: 0}, feature_names=['mpg', 'cyl']
    feature_names[0]=mpg
      unique    =[14.3 18.1]
      thresholds=[16.2]
      t=16.2, S_minus={Duster 360: 0}, S_plus={Valiant: 1}, H_minus=0.0, H_plus=0.0, H_split=0.0
    feature_names[1]=cyl
      unique    =[6 8]
      thresholds=[7.]
      t=7.0, S_minus={Valiant: 1}, S_plus={Duster 360: 0}, H_minus=0.0, H_plus=0.0, H_split=0.0
    ########## best_j=0, best_feature=mpg, best_t=16.2, min_H=0.0
df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/mtcars.csv', index_col=0)
feature_names = ['mpg', 'cyl']
X = df[feature_names]
y = df.vs
class_names = ['V', 'straight']
clf = DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=0)
clf.fit(X, y)
plt.figure(figsize=(8, 8)) # (width, height) in inches
tree.plot_tree(clf, feature_names=feature_names, class_names=class_names, filled=True)
plt.title('Classify cars from mtcars as 0=V or 1=straight engine\n' +
          'from mpg and cyl (so y is vs and X includes mpg and cyl)\n')
plt.savefig(fname='mtcarsDecision.png')
plt.show(block=False)
print(export_text(clf, feature_names=feature_names))
print(f'Accuracy on training data is clf.score(X, y)={clf.score(X, y)}.')
|--- cyl <= 7.00
|   |--- mpg <= 21.20
|   |   |--- mpg <= 19.45
|   |   |   |--- class: 1
|   |   |--- mpg > 19.45
|   |   |   |--- class: 0
|   |--- mpg > 21.20
|   |   |--- mpg <= 25.20
|   |   |   |--- class: 1
|   |   |--- mpg > 25.20
|   |   |   |--- mpg <= 26.65
|   |   |   |   |--- class: 0
|   |   |   |--- mpg > 26.65
|   |   |   |   |--- class: 1
|--- cyl > 7.00
|   |--- class: 0

Accuracy on training data is clf.score(X, y)=1.0.
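That 1.0 is accuracy on the training data, which an unpruned tree can simply memorize. A fairer sketch of out-of-sample accuracy uses 5-fold cross-validation (the exact scores depend on how the folds fall).

from sklearn.model_selection import cross_val_score
scores = cross_val_score(DecisionTreeClassifier(criterion='entropy', random_state=0),
                         X, y, cv=5)  # 5-fold cross-validation on the full mtcars data
print(f'mean CV accuracy: {scores.mean():.3}')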
decision_tree(X, y, feature_names=feature_names) # check that my implementation gives the same tree
########## best_j=1, best_feature=cyl, best_t=7.0, min_H=0.43
  ########## best_j=0, best_feature=mpg, best_t=21.2, min_H=0.609
    ########## best_j=0, best_feature=mpg, best_t=19.4, min_H=0.0
    ########## best_j=0, best_feature=mpg, best_t=25.2, min_H=0.325
      ########## best_j=0, best_feature=mpg, best_t=26.6, min_H=0.0
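These five splits match scikit-learn's tree above (up to the 3-significant-digit display). One more way to check is to read the split features and thresholds directly from the fitted estimator's tree_ arrays, where a negative feature index marks a leaf.

inner = clf.tree_.feature >= 0  # mask of internal (non-leaf) nodes
for f, t in zip(clf.tree_.feature[inner], clf.tree_.threshold[inner]):
    print(f'{feature_names[f]} <= {t:.4}')
# prints, in node order: cyl <= 7.0, mpg <= 21.2, mpg <= 19.45, mpg <= 25.2, mpg <= 26.65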